diff --git a/.travis.yml b/.travis.yml index a540380d..87ee43b9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,19 +1,21 @@ sudo: required -dist: trusty +dist: precise addons: apt: sources: - ubuntu-toolchain-r-test - kalakris-cmake + - llvm-toolchain-precise-3.7 + - boost-latest packages: + - clang-format-3.7 + - clang-tidy-3.7 - gcc-4.9 - g++-4.9 - gcov - cmake - valgrind - - libboost-dev #needed for thrift cpp compilation - - libboost-program-options-dev #needed for thrift cpp compilation - - libboost-test-dev #needed for thrift cpp compilation + - libboost1.55-all-dev #needed for thrift cpp compilation - libssl-dev #needed for thrift cpp compilation - libtool #needed for thrift cpp compilation - bison #needed for thrift cpp compilation @@ -28,24 +30,11 @@ matrix: - source $TRAVIS_BUILD_DIR/ci/before_script_travis.sh - cmake -DCMAKE_CXX_FLAGS="-Werror" -DPARQUET_TEST_MEMCHECK=ON -DPARQUET_GENERATE_COVERAGE=1 $TRAVIS_BUILD_DIR - export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/data - script: - - make lint - - make -j4 || exit 1 - - ctest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1; } - - sudo pip install cpp_coveralls - - export PARQUET_ROOT=$TRAVIS_BUILD_DIR - - $TRAVIS_BUILD_DIR/ci/upload_coverage.sh - compiler: clang os: linux - script: - - make -j4 || exit 1 - - ctest - os: osx compiler: clang addons: - script: - - make -j4 || exit 1 - - ctest language: cpp before_install: @@ -56,3 +45,6 @@ before_script: - source $TRAVIS_BUILD_DIR/ci/before_script_travis.sh - cmake -DCMAKE_CXX_FLAGS="-Werror" $TRAVIS_BUILD_DIR - export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/data + +script: +- $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 7764cc0f..56e9dea8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,21 @@ enable_testing() # where to find cmake modules set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules") -set(BUILD_SUPPORT_DIR 
${CMAKE_CURRENT_SOURCE_DIR}/build-support)
+set(BUILD_SUPPORT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/build-support")
+
+find_package(ClangTools)
+if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND)
+  # Generate a compile_commands.json "compilation database" in the build tree.
+  # The clang-tidy targets below hand it to run-clang-tidy.sh; editor tooling such
+  # as YouCompleteMe can use it too. See http://clang.llvm.org/docs/JSONCompilationDatabase.html
+  set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
+endif()
+
+find_program(CCACHE_FOUND ccache)
+if(CCACHE_FOUND)  # launch compile and link rules through ccache when it is installed
+  set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+  set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
+endif()
 
 if(APPLE)
   set(CMAKE_MACOSX_RPATH 1)
@@ -196,7 +210,7 @@ set_target_properties(lz4static PROPERTIES IMPORTED_LOCATION ${LZ4_STATIC_LIB})
 find_package(ZLIB REQUIRED)
 include_directories(SYSTEM ${ZLIB_INCLUDE_DIRS})
 add_library(zlibstatic STATIC IMPORTED)
-set_target_properties(zlibstatic PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES})
+set_target_properties(zlibstatic PROPERTIES IMPORTED_LOCATION ${ZLIB_STATIC_LIB})
 
 ## GTest
 find_package(GTest REQUIRED)
@@ -280,6 +294,38 @@ if (UNIX)
   `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`)
 endif (UNIX)
 
+############################################################
+# "make format" and "make check-format" targets
+############################################################
+
+if (CLANG_FORMAT_FOUND)
+  # The backquoted `find` pipelines below are expanded by the shell that runs
+  # the target. sed's 'g' command replaces each matching path with an empty
+  # line, so files matching the pattern never reach the script.
+  # runs clang format and updates files in place.
+  add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1
+    `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
+
+  # runs clang format and exits with a non-zero exit code if any files need to be reformatted
+  add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0
+    `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
+endif()
+
+
+############################################################
+# "make clang-tidy" and "make check-clang-tidy" targets
+############################################################
+
+if (CLANG_TIDY_FOUND)
+  # runs clang-tidy and attempts to fix any warning automatically
+  add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1
+    `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_types/g' | sed -e '/_constants/g'`)
+  # runs clang-tidy and exits with a non-zero exit code if any errors are found.
+  add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json
+    0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/src/.clang-tidy-ignore`)
+
+endif()
+
 #############################################################
 # Test linking
 
diff --git a/build-support/run-clang-format.sh b/build-support/run-clang-format.sh
new file mode 100755
index 00000000..01ddab26
--- /dev/null
+++ b/build-support/run-clang-format.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Runs clang-format on the given files, from inside the source tree
+# Arguments:
+#   $1 - Path to the source tree
+#   $2 - Path to the clang format binary
+#   $3 - Apply fixes: "1" rewrites the files in place; any other value only checks and exits 1 if changes are needed
+#   $ARGN - Files to run clang format on
+#
+SOURCE_DIR=$1
+shift
+CLANG_FORMAT=$1
+shift
+APPLY_FIXES=$1
+shift
+
+# clang format will only find its configuration if we are in
+# the source tree or in a path relative to the source tree
+pushd "$SOURCE_DIR" || exit 1
+if [ "$APPLY_FIXES" == "1" ]; then
+  "$CLANG_FORMAT" -i "$@"
+else
+  # each line of the XML output that mentions "offset" is one pending replacement
+  NUM_CORRECTIONS=$("$CLANG_FORMAT" -output-replacements-xml "$@" | grep -c offset)
+  if [ "$NUM_CORRECTIONS" -gt "0" ]; then
+    echo "clang-format suggested changes, please run 'make format'!!!!"
+    exit 1
+  fi
+fi
+popd
diff --git a/build-support/run-clang-tidy.sh b/build-support/run-clang-tidy.sh
new file mode 100755
index 00000000..2a4b1c06
--- /dev/null
+++ b/build-support/run-clang-tidy.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Runs clang-tidy on the given files
+# Arguments:
+#   $1 - Path to the clang tidy binary
+#   $2 - Path to the compile_commands.json to use
+#   $3 - Apply fixes: "1" runs clang-tidy with -fix; any other value only checks and exits 1 if warnings are reported
+#   $ARGN - Files to run clang-tidy on
+#
+CLANG_TIDY=$1
+shift
+COMPILE_COMMANDS=$1
+shift
+APPLY_FIXES=$1
+shift
+
+# clang-tidy looks for a .clang-tidy file in the parent directories of each
+# source file, so no directory change is needed before invoking it
+if [ "$APPLY_FIXES" == "1" ]; then
+  "$CLANG_TIDY" -p "$COMPILE_COMMANDS" -fix "$@"
+else
+  NUM_CORRECTIONS=$("$CLANG_TIDY" -p "$COMPILE_COMMANDS" "$@" 2>&1 | grep -v Skipping | grep -c "warnings* generated")
+  if [ "$NUM_CORRECTIONS" -gt "0" ]; then
+    echo "clang-tidy had suggested fixes. Please fix these!!!"
+    exit 1
+  fi
+fi
diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh
new file mode 100755
index 00000000..c8c0ac07
--- /dev/null
+++ b/ci/travis_script_cpp.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+#
+# Travis "script" step: runs lint, build and tests from the build directory.
+# Linux jobs also run the clang-format / clang-tidy checks and upload coverage.
+
+set -e
+
+: "${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/parquet-build}"
+
+pushd "$CPP_BUILD_DIR"
+
+make lint
+if [ "$TRAVIS_OS_NAME" == "linux" ]; then
+  make check-format
+  make check-clang-tidy
+fi
+
+if [ "$TRAVIS_OS_NAME" == "linux" ]; then
+  make -j4 || exit 1
+  ctest || { cat "$CPP_BUILD_DIR/Testing/Temporary/LastTest.log"; exit 1; }
+  sudo pip install cpp_coveralls
+  export PARQUET_ROOT=$TRAVIS_BUILD_DIR
+  "$TRAVIS_BUILD_DIR/ci/upload_coverage.sh"
+else
+  make -j4 || exit 1
+  ctest || { cat "$CPP_BUILD_DIR/Testing/Temporary/LastTest.log"; exit 1; }
+fi
+
+popd
diff --git a/cmake_modules/FindClangTools.cmake b/cmake_modules/FindClangTools.cmake
new file mode 100644
index 00000000..c07c7d24
--- /dev/null
+++ b/cmake_modules/FindClangTools.cmake
@@ -0,0 +1,60 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Tries to find the clang-tidy and clang-format modules
+#
+# Usage of this module as follows:
+#
+#  find_package(ClangTools)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  ClangTools_PATH (CMake variable) or CLANG_TOOLS_PATH (environment variable) -
+#   When set, this directory is searched in addition to /usr/local/bin and
+#   /usr/bin to find clang-tidy and clang-format
+#
+# This module defines
+#  CLANG_TIDY_BIN, The  path to the clang tidy binary
+#  CLANG_TIDY_FOUND, Whether clang tidy was found
+#  CLANG_FORMAT_BIN, The path to the clang format binary
+#  CLANG_FORMAT_FOUND, Whether clang format was found
+
+find_program(CLANG_TIDY_BIN
+  NAMES clang-tidy-3.8 clang-tidy-3.7 clang-tidy-3.6 clang-tidy
+  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
+  NO_DEFAULT_PATH
+)
+
+if (NOT CLANG_TIDY_BIN)
+  set(CLANG_TIDY_FOUND 0)
+  message(STATUS "clang-tidy not found")
+else()
+  set(CLANG_TIDY_FOUND 1)
+  message(STATUS "clang-tidy found at ${CLANG_TIDY_BIN}")
+endif()
+
+find_program(CLANG_FORMAT_BIN
+  NAMES clang-format-3.8 clang-format-3.7 clang-format-3.6 clang-format
+  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
+  NO_DEFAULT_PATH
+)
+
+if (NOT CLANG_FORMAT_BIN)
+  set(CLANG_FORMAT_FOUND 0)
+  message(STATUS "clang-format not found")
+else()
+  set(CLANG_FORMAT_FOUND 1)
+  message(STATUS "clang-format found at ${CLANG_FORMAT_BIN}")
+endif()
+
diff --git a/src/.clang-format b/src/.clang-format
new file mode 100644
index 00000000..7d5b3cf3
--- /dev/null
+++ b/src/.clang-format
@@ -0,0 +1,65 @@
+---
+Language: Cpp
+# BasedOnStyle: Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: false
+AlignConsecutiveAssignments: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit: 90
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IndentCaseLabels: true
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1000
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
diff --git a/src/.clang-tidy b/src/.clang-tidy
new file mode 100644
index 00000000..6fc3742a
--- /dev/null
+++ b/src/.clang-tidy
@@ -0,0 +1,14 @@
+---
+# Checks is a comma-separated list of globs ('*' is the wildcard, a leading '-' disables); it is not a regex.
+Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-*,modernize-*,readability-*'
+HeaderFilterRegex: 'parquet/.*'
+AnalyzeTemporaryDtors: true
+CheckOptions:
+  - key: google-readability-braces-around-statements.ShortStatementLines
+    value: '1'
+  - key: google-readability-function-size.StatementThreshold
+    value: '800'
+  - key: google-readability-namespace-comments.ShortNamespaceLines
+    value: '10'
+  - key: google-readability-namespace-comments.SpacesBeforeComments
+    value: '2'
diff --git a/src/.clang-tidy-ignore b/src/.clang-tidy-ignore
new file mode 100644
index 00000000..e69de29b
diff --git a/src/parquet/api/io.h b/src/parquet/api/io.h
index 3a9b148c..683dae27 100644
--- a/src/parquet/api/io.h
+++ b/src/parquet/api/io.h
@@ -24,4 +24,4 @@
 #include "parquet/util/mem-allocator.h"
 #include "parquet/util/output.h"
 
-#endif // PARQUET_API_IO_H
+#endif  // PARQUET_API_IO_H
diff --git a/src/parquet/api/reader.h b/src/parquet/api/reader.h
index 41cb06b3..572ecf5d 100644
--- a/src/parquet/api/reader.h
+++ b/src/parquet/api/reader.h
@@ -29,4 +29,4 @@
 // IO
 #include "parquet/api/io.h"
 
-#endif // PARQUET_API_READER_H
+#endif  // PARQUET_API_READER_H
diff --git a/src/parquet/api/schema.h b/src/parquet/api/schema.h
index aca6c99a..523d046a 100644
--- a/src/parquet/api/schema.h
+++ b/src/parquet/api/schema.h
@@ -23,4 +23,4 @@
 #include "parquet/schema/printer.h"
 #include "parquet/schema/types.h"
 
-#endif // PARQUET_API_SCHEMA_H
+#endif  // PARQUET_API_SCHEMA_H
diff --git a/src/parquet/column/column-reader-test.cc b/src/parquet/column/column-reader-test.cc
index 1db06132..524ec50c 100644
--- a/src/parquet/column/column-reader-test.cc
+++ b/src/parquet/column/column-reader-test.cc
@@ -76,19 +76,15 @@ class TestPrimitiveReader : public ::testing::Test { ASSERT_EQ(num_levels_, batch_actual); ASSERT_EQ(num_values_, total_values_read); ASSERT_TRUE(vector_equal(values_, vresult)); - if (max_def_level_ > 0) { - ASSERT_TRUE(vector_equal(def_levels_, dresult)); - } - if (max_rep_level_ > 0) { - ASSERT_TRUE(vector_equal(rep_levels_, rresult)); - } + if (max_def_level_ > 0) { ASSERT_TRUE(vector_equal(def_levels_, dresult)); } + if (max_rep_level_ > 0) { ASSERT_TRUE(vector_equal(rep_levels_, rresult)); } // catch improper writes at EOS batch_actual = reader->ReadBatch(5, nullptr, nullptr, nullptr, &values_read); ASSERT_EQ(0, batch_actual); ASSERT_EQ(0, values_read); } - void ExecutePlain(int num_pages, int levels_per_page, const ColumnDescriptor *d) { + void ExecutePlain(int num_pages, int levels_per_page, const ColumnDescriptor* d) { num_values_ = MakePages(d, num_pages, levels_per_page, def_levels_, rep_levels_, values_, data_buffer_, pages_, Encoding::PLAIN); num_levels_ = num_pages * levels_per_page; @@ -101,7 +97,7 @@ class TestPrimitiveReader : public ::testing::Test { reader_.reset(); } - void ExecuteDict(int num_pages, int levels_per_page, const ColumnDescriptor *d) { + void ExecuteDict(int num_pages, int levels_per_page, const ColumnDescriptor* d) { num_values_ = MakePages(d, num_pages, levels_per_page, def_levels_, rep_levels_, values_, data_buffer_, pages_, Encoding::RLE_DICTIONARY); num_levels_ = num_pages * levels_per_page; @@ -114,12 +110,12 @@ class TestPrimitiveReader : public ::testing::Test { int num_values_; int16_t max_def_level_; int16_t max_rep_level_; - vector > pages_; + vector> pages_; std::shared_ptr reader_; vector values_; vector def_levels_; vector rep_levels_; - vector data_buffer_; // For BA and FLBA + vector data_buffer_; // For BA and FLBA }; TEST_F(TestPrimitiveReader, TestInt32FlatRequired) { @@ -162,10 +158,10 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { const ColumnDescriptor descr(type, 
max_def_level_, max_rep_level_); shared_ptr dummy = std::make_shared(); - shared_ptr dict_page = std::make_shared(dummy, - 0, Encoding::PLAIN); - shared_ptr data_page = MakeDataPage(&descr, {}, 0, - Encoding::RLE_DICTIONARY, {}, 0, {}, 0, {}, 0); + shared_ptr dict_page = + std::make_shared(dummy, 0, Encoding::PLAIN); + shared_ptr data_page = MakeDataPage( + &descr, {}, 0, Encoding::RLE_DICTIONARY, {}, 0, {}, 0, {}, 0); pages_.push_back(dict_page); pages_.push_back(data_page); InitReader(&descr); @@ -173,10 +169,9 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { ASSERT_NO_THROW(reader_->HasNext()); pages_.clear(); - dict_page = std::make_shared(dummy, - 0, Encoding::PLAIN_DICTIONARY); - data_page = MakeDataPage(&descr, {}, 0, - Encoding::PLAIN_DICTIONARY, {}, 0, {}, 0, {}, 0); + dict_page = std::make_shared(dummy, 0, Encoding::PLAIN_DICTIONARY); + data_page = MakeDataPage( + &descr, {}, 0, Encoding::PLAIN_DICTIONARY, {}, 0, {}, 0, {}, 0); pages_.push_back(dict_page); pages_.push_back(data_page); InitReader(&descr); @@ -184,26 +179,25 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { ASSERT_NO_THROW(reader_->HasNext()); pages_.clear(); - data_page = MakeDataPage(&descr, {}, 0, - Encoding::RLE_DICTIONARY, {}, 0, {}, 0, {}, 0); + data_page = MakeDataPage( + &descr, {}, 0, Encoding::RLE_DICTIONARY, {}, 0, {}, 0, {}, 0); pages_.push_back(data_page); InitReader(&descr); // Tests dictionary page must occur before data page ASSERT_THROW(reader_->HasNext(), ParquetException); pages_.clear(); - dict_page = std::make_shared(dummy, - 0, Encoding::DELTA_BYTE_ARRAY); + dict_page = std::make_shared(dummy, 0, Encoding::DELTA_BYTE_ARRAY); pages_.push_back(dict_page); InitReader(&descr); // Tests only RLE_DICTIONARY is supported ASSERT_THROW(reader_->HasNext(), ParquetException); pages_.clear(); - shared_ptr dict_page1 = std::make_shared(dummy, - 0, Encoding::PLAIN_DICTIONARY); - shared_ptr dict_page2 = std::make_shared(dummy, - 0, Encoding::PLAIN); + 
shared_ptr dict_page1 = + std::make_shared(dummy, 0, Encoding::PLAIN_DICTIONARY); + shared_ptr dict_page2 = + std::make_shared(dummy, 0, Encoding::PLAIN); pages_.push_back(dict_page1); pages_.push_back(dict_page2); InitReader(&descr); @@ -211,8 +205,8 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { ASSERT_THROW(reader_->HasNext(), ParquetException); pages_.clear(); - data_page = MakeDataPage(&descr, {}, 0, - Encoding::DELTA_BYTE_ARRAY, {}, 0, {}, 0, {}, 0); + data_page = MakeDataPage( + &descr, {}, 0, Encoding::DELTA_BYTE_ARRAY, {}, 0, {}, 0, {}, 0); pages_.push_back(data_page); InitReader(&descr); // unsupported encoding @@ -220,5 +214,5 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { pages_.clear(); } -} // namespace test -} // namespace parquet +} // namespace test +} // namespace parquet diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc index df5aa9a7..653572ae 100644 --- a/src/parquet/column/column-writer-test.cc +++ b/src/parquet/column/column-writer-test.cc @@ -70,8 +70,8 @@ class TestPrimitiveWriter : public ::testing::Test { sink_.reset(new InMemoryOutputStream()); std::unique_ptr pager( new SerializedPageWriter(sink_.get(), Compression::UNCOMPRESSED, &metadata_)); - return std::unique_ptr(new Int64Writer(schema_.get(), std::move(pager), - output_size)); + return std::unique_ptr( + new Int64Writer(schema_.get(), std::move(pager), output_size)); } void ReadColumn() { @@ -138,8 +138,8 @@ TEST_F(TestPrimitiveWriter, OptionalRepeated) { std::vector repetition_levels(100, 0); auto writer = BuildWriter(); - writer->WriteBatch(values.size(), definition_levels.data(), - repetition_levels.data(), values.data()); + writer->WriteBatch( + values.size(), definition_levels.data(), repetition_levels.data(), values.data()); writer->Close(); ReadColumn(); @@ -176,8 +176,8 @@ TEST_F(TestPrimitiveWriter, OptionalRepeatedTooFewRows) { repetition_levels[3] = 1; auto writer = BuildWriter(); - 
writer->WriteBatch(values.size(), definition_levels.data(), - repetition_levels.data(), values.data()); + writer->WriteBatch( + values.size(), definition_levels.data(), repetition_levels.data(), values.data()); ASSERT_THROW(writer->Close(), ParquetException); } @@ -196,7 +196,5 @@ TEST_F(TestPrimitiveWriter, RequiredNonRepeatedLargeChunk) { ASSERT_EQ(values, values_out_); } -} // namespace test -} // namespace parquet - - +} // namespace test +} // namespace parquet diff --git a/src/parquet/column/levels-test.cc b/src/parquet/column/levels-test.cc index 87e596d6..b15c0af9 100644 --- a/src/parquet/column/levels-test.cc +++ b/src/parquet/column/levels-test.cc @@ -28,8 +28,8 @@ using std::string; namespace parquet { -void GenerateLevels(int min_repeat_factor, int max_repeat_factor, - int max_level, std::vector& input_levels) { +void GenerateLevels(int min_repeat_factor, int max_repeat_factor, int max_level, + std::vector& input_levels) { // for each repetition count upto max_repeat_factor for (int repeat = min_repeat_factor; repeat <= max_repeat_factor; repeat++) { // repeat count increases by a factor of 2 for every iteration @@ -56,14 +56,13 @@ void EncodeLevels(Encoding::type encoding, int max_level, int num_levels, // encode levels if (encoding == Encoding::RLE) { // leave space to write the rle length value - encoder.Init(encoding, max_level, num_levels, - bytes.data() + sizeof(uint32_t), bytes.size()); + encoder.Init( + encoding, max_level, num_levels, bytes.data() + sizeof(uint32_t), bytes.size()); levels_count = encoder.Encode(num_levels, input_levels); (reinterpret_cast(bytes.data()))[0] = encoder.len(); } else { - encoder.Init(encoding, max_level, num_levels, - bytes.data(), bytes.size()); + encoder.Init(encoding, max_level, num_levels, bytes.data(), bytes.size()); levels_count = encoder.Encode(num_levels, input_levels); } ASSERT_EQ(num_levels, levels_count); @@ -94,7 +93,7 @@ void VerifyDecodingLevels(Encoding::type encoding, int max_level, } // check the 
remaining levels int num_levels_completed = decode_count * (num_levels / decode_count); - int num_remaining_levels = num_levels - num_levels_completed; + int num_remaining_levels = num_levels - num_levels_completed; if (num_remaining_levels > 0) { levels_count = decoder.Decode(num_remaining_levels, output_levels.data()); ASSERT_EQ(num_remaining_levels, levels_count); @@ -102,7 +101,7 @@ void VerifyDecodingLevels(Encoding::type encoding, int max_level, EXPECT_EQ(input_levels[i + num_levels_completed], output_levels[i]); } } - //Test zero Decode values + // Test zero Decode values ASSERT_EQ(0, decoder.Decode(1, output_levels.data())); } @@ -133,12 +132,11 @@ void VerifyDecodingMultipleSetData(Encoding::type encoding, int max_level, // increase the repetition count for each iteration by a factor of 2 TEST(TestLevels, TestLevelsDecodeMultipleBitWidth) { int min_repeat_factor = 0; - int max_repeat_factor = 7; // 128 + int max_repeat_factor = 7; // 128 int max_bit_width = 8; std::vector input_levels; std::vector bytes; - Encoding::type encodings[2] = {Encoding::RLE, - Encoding::BIT_PACKED}; + Encoding::type encodings[2] = {Encoding::RLE, Encoding::BIT_PACKED}; // for each encoding for (int encode = 0; encode < 2; encode++) { @@ -150,8 +148,7 @@ TEST(TestLevels, TestLevelsDecodeMultipleBitWidth) { // find the maximum level for the current bit_width int max_level = (1 << bit_width) - 1; // Generate levels - GenerateLevels(min_repeat_factor, max_repeat_factor, - max_level, input_levels); + GenerateLevels(min_repeat_factor, max_repeat_factor, max_level, input_levels); EncodeLevels(encoding, max_level, input_levels.size(), input_levels.data(), bytes); VerifyDecodingLevels(encoding, max_level, input_levels, bytes); input_levels.clear(); @@ -162,15 +159,13 @@ TEST(TestLevels, TestLevelsDecodeMultipleBitWidth) { // Test multiple decoder SetData calls TEST(TestLevels, TestLevelsDecodeMultipleSetData) { int min_repeat_factor = 3; - int max_repeat_factor = 7; // 128 + int 
max_repeat_factor = 7; // 128 int bit_width = 8; int max_level = (1 << bit_width) - 1; std::vector input_levels; std::vector> bytes; - Encoding::type encodings[2] = {Encoding::RLE, - Encoding::BIT_PACKED}; - GenerateLevels(min_repeat_factor, max_repeat_factor, - max_level, input_levels); + Encoding::type encodings[2] = {Encoding::RLE, Encoding::BIT_PACKED}; + GenerateLevels(min_repeat_factor, max_repeat_factor, max_level, input_levels); int num_levels = input_levels.size(); int setdata_factor = 8; int split_level_size = num_levels / setdata_factor; @@ -188,4 +183,4 @@ TEST(TestLevels, TestLevelsDecodeMultipleSetData) { } } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/column/levels.h b/src/parquet/column/levels.h index f57708d2..fd84ec9d 100644 --- a/src/parquet/column/levels.h +++ b/src/parquet/column/levels.h @@ -32,8 +32,8 @@ class LevelEncoder { LevelEncoder() {} // Initialize the LevelEncoder. - void Init(Encoding::type encoding, int16_t max_level, - int num_buffered_values, uint8_t* data, int data_size) { + void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values, + uint8_t* data, int data_size) { bit_width_ = BitUtil::Log2(max_level + 1); encoding_ = encoding; switch (encoding) { @@ -60,18 +60,14 @@ class LevelEncoder { if (encoding_ == Encoding::RLE) { for (int i = 0; i < batch_size; ++i) { - if (!rle_encoder_->Put(*(levels + i))) { - break; - } + if (!rle_encoder_->Put(*(levels + i))) { break; } ++num_encoded; } rle_encoder_->Flush(); rle_length_ = rle_encoder_->len(); } else { for (int i = 0; i < batch_size; ++i) { - if (!bit_packed_encoder_->PutValue(*(levels + i), bit_width_)) { - break; - } + if (!bit_packed_encoder_->PutValue(*(levels + i), bit_width_)) { break; } ++num_encoded; } bit_packed_encoder_->Flush(); @@ -94,15 +90,14 @@ class LevelEncoder { std::unique_ptr bit_packed_encoder_; }; - class LevelDecoder { public: LevelDecoder() : num_values_remaining_(0) {} // Initialize the LevelDecoder state 
with new data // and return the number of bytes consumed - int SetData(Encoding::type encoding, int16_t max_level, - int num_buffered_values, const uint8_t* data) { + int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, + const uint8_t* data) { uint32_t num_bytes = 0; encoding_ = encoding; num_values_remaining_ = num_buffered_values; @@ -140,16 +135,12 @@ class LevelDecoder { int num_values = std::min(num_values_remaining_, batch_size); if (encoding_ == Encoding::RLE) { for (int i = 0; i < num_values; ++i) { - if (!rle_decoder_->Get(levels + i)) { - break; - } + if (!rle_decoder_->Get(levels + i)) { break; } ++num_decoded; } } else { for (int i = 0; i < num_values; ++i) { - if (!bit_packed_decoder_->GetValue(bit_width_, levels + i)) { - break; - } + if (!bit_packed_decoder_->GetValue(bit_width_, levels + i)) { break; } ++num_decoded; } } @@ -165,5 +156,5 @@ class LevelDecoder { std::unique_ptr bit_packed_decoder_; }; -} // namespace parquet -#endif // PARQUET_COLUMN_LEVELS_H +} // namespace parquet +#endif // PARQUET_COLUMN_LEVELS_H diff --git a/src/parquet/column/page.h b/src/parquet/column/page.h index 709f7c84..8bad57f2 100644 --- a/src/parquet/column/page.h +++ b/src/parquet/column/page.h @@ -40,67 +40,46 @@ namespace parquet { // here, both on the read and write path class Page { public: - Page(const std::shared_ptr& buffer, PageType::type type) : - buffer_(buffer), - type_(type) {} + Page(const std::shared_ptr& buffer, PageType::type type) + : buffer_(buffer), type_(type) {} - PageType::type type() const { - return type_; - } + PageType::type type() const { return type_; } // @returns: a pointer to the page's data - const uint8_t* data() const { - return buffer_->data(); - } + const uint8_t* data() const { return buffer_->data(); } // @returns: the total size in bytes of the page's data buffer - int32_t size() const { - return buffer_->size(); - } + int32_t size() const { return buffer_->size(); } private: std::shared_ptr buffer_; 
PageType::type type_; }; - class DataPage : public Page { public: - DataPage(const std::shared_ptr& buffer, - int32_t num_values, Encoding::type encoding, - Encoding::type definition_level_encoding, - Encoding::type repetition_level_encoding) : - Page(buffer, PageType::DATA_PAGE), - num_values_(num_values), - encoding_(encoding), - definition_level_encoding_(definition_level_encoding), - repetition_level_encoding_(repetition_level_encoding) {} + DataPage(const std::shared_ptr& buffer, int32_t num_values, + Encoding::type encoding, Encoding::type definition_level_encoding, + Encoding::type repetition_level_encoding) + : Page(buffer, PageType::DATA_PAGE), + num_values_(num_values), + encoding_(encoding), + definition_level_encoding_(definition_level_encoding), + repetition_level_encoding_(repetition_level_encoding) {} - int32_t num_values() const { - return num_values_; - } + int32_t num_values() const { return num_values_; } - Encoding::type encoding() const { - return encoding_; - } + Encoding::type encoding() const { return encoding_; } - Encoding::type repetition_level_encoding() const { - return repetition_level_encoding_; - } + Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; } - Encoding::type definition_level_encoding() const { - return definition_level_encoding_; - } + Encoding::type definition_level_encoding() const { return definition_level_encoding_; } // DataPageHeader::statistics::max field, if it was set - const uint8_t* max() const { - return reinterpret_cast(max_.c_str()); - } + const uint8_t* max() const { return reinterpret_cast(max_.c_str()); } // DataPageHeader::statistics::min field, if it was set - const uint8_t* min() const { - return reinterpret_cast(min_.c_str()); - } + const uint8_t* min() const { return reinterpret_cast(min_.c_str()); } private: int32_t num_values_; @@ -114,50 +93,33 @@ class DataPage : public Page { std::string min_; }; - class DataPageV2 : public Page { public: - DataPageV2(const 
std::shared_ptr& buffer, - int32_t num_values, int32_t num_nulls, int32_t num_rows, - Encoding::type encoding, - int32_t definition_levels_byte_length, - int32_t repetition_levels_byte_length, bool is_compressed = false) : - Page(buffer, PageType::DATA_PAGE_V2), - num_values_(num_values), - num_nulls_(num_nulls), - num_rows_(num_rows), - encoding_(encoding), - definition_levels_byte_length_(definition_levels_byte_length), - repetition_levels_byte_length_(repetition_levels_byte_length), - is_compressed_(is_compressed) {} - - int32_t num_values() const { - return num_values_; - } - - int32_t num_nulls() const { - return num_nulls_; - } - - int32_t num_rows() const { - return num_rows_; - } - - Encoding::type encoding() const { - return encoding_; - } - - int32_t definition_levels_byte_length() const { - return definition_levels_byte_length_; - } - - int32_t repetition_levels_byte_length() const { - return repetition_levels_byte_length_; - } - - bool is_compressed() const { - return is_compressed_; - } + DataPageV2(const std::shared_ptr& buffer, int32_t num_values, int32_t num_nulls, + int32_t num_rows, Encoding::type encoding, int32_t definition_levels_byte_length, + int32_t repetition_levels_byte_length, bool is_compressed = false) + : Page(buffer, PageType::DATA_PAGE_V2), + num_values_(num_values), + num_nulls_(num_nulls), + num_rows_(num_rows), + encoding_(encoding), + definition_levels_byte_length_(definition_levels_byte_length), + repetition_levels_byte_length_(repetition_levels_byte_length), + is_compressed_(is_compressed) {} + + int32_t num_values() const { return num_values_; } + + int32_t num_nulls() const { return num_nulls_; } + + int32_t num_rows() const { return num_rows_; } + + Encoding::type encoding() const { return encoding_; } + + int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; } + + int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; } + + bool is_compressed() const { return 
is_compressed_; } private: int32_t num_values_; @@ -171,27 +133,20 @@ class DataPageV2 : public Page { // TODO(wesm): format::DataPageHeaderV2.statistics }; - class DictionaryPage : public Page { public: - DictionaryPage(const std::shared_ptr& buffer, - int32_t num_values, Encoding::type encoding, bool is_sorted = false) : - Page(buffer, PageType::DICTIONARY_PAGE), - num_values_(num_values), - encoding_(encoding), - is_sorted_(is_sorted) {} + DictionaryPage(const std::shared_ptr& buffer, int32_t num_values, + Encoding::type encoding, bool is_sorted = false) + : Page(buffer, PageType::DICTIONARY_PAGE), + num_values_(num_values), + encoding_(encoding), + is_sorted_(is_sorted) {} - int32_t num_values() const { - return num_values_; - } + int32_t num_values() const { return num_values_; } - Encoding::type encoding() const { - return encoding_; - } + Encoding::type encoding() const { return encoding_; } - bool is_sorted() const { - return is_sorted_; - } + bool is_sorted() const { return is_sorted_; } private: int32_t num_values_; @@ -220,10 +175,10 @@ class PageWriter { const std::shared_ptr& definition_levels, Encoding::type definition_level_encoding, const std::shared_ptr& repetition_levels, - Encoding::type repetition_level_encoding, - const std::shared_ptr& values, Encoding::type encoding) = 0; + Encoding::type repetition_level_encoding, const std::shared_ptr& values, + Encoding::type encoding) = 0; }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_COLUMN_PAGE_H +#endif // PARQUET_COLUMN_PAGE_H diff --git a/src/parquet/column/reader.cc b/src/parquet/column/reader.cc index 4598dfbe..a13dfd33 100644 --- a/src/parquet/column/reader.cc +++ b/src/parquet/column/reader.cc @@ -30,11 +30,11 @@ namespace parquet { ColumnReader::ColumnReader(const ColumnDescriptor* descr, std::unique_ptr pager, MemoryAllocator* allocator) - : descr_(descr), - pager_(std::move(pager)), - num_buffered_values_(0), - num_decoded_values_(0), - allocator_(allocator) {} + : 
descr_(descr), + pager_(std::move(pager)), + num_buffered_values_(0), + num_decoded_values_(0), + allocator_(allocator) {} template void TypedColumnReader::ConfigureDictionary(const DictionaryPage* page) { @@ -60,7 +60,7 @@ void TypedColumnReader::ConfigureDictionary(const DictionaryPage* page) { // TODO(wesm): investigate whether this all-or-nothing decoding of the // dictionary makes sense and whether performance can be improved - auto decoder = std::make_shared >(descr_, allocator_); + auto decoder = std::make_shared>(descr_, allocator_); decoder->SetDict(&dictionary); decoders_[encoding] = decoder; } else { @@ -73,8 +73,7 @@ void TypedColumnReader::ConfigureDictionary(const DictionaryPage* page) { // PLAIN_DICTIONARY is deprecated but used to be used as a dictionary index // encoding. static bool IsDictionaryIndexEncoding(const Encoding::type& e) { - return e == Encoding::RLE_DICTIONARY || - e == Encoding::PLAIN_DICTIONARY; + return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY; } template @@ -108,24 +107,24 @@ bool TypedColumnReader::ReadNewPage() { // the page size to determine the number of bytes in the encoded data. int64_t data_size = page->size(); - //Data page Layout: Repetition Levels - Definition Levels - encoded values. - //Levels are encoded as rle or bit-packed. - //Init repetition levels + // Data page Layout: Repetition Levels - Definition Levels - encoded values. + // Levels are encoded as rle or bit-packed. 
+ // Init repetition levels if (descr_->max_repetition_level() > 0) { - int64_t rep_levels_bytes = repetition_level_decoder_.SetData( - page->repetition_level_encoding(), descr_->max_repetition_level(), - num_buffered_values_, buffer); + int64_t rep_levels_bytes = + repetition_level_decoder_.SetData(page->repetition_level_encoding(), + descr_->max_repetition_level(), num_buffered_values_, buffer); buffer += rep_levels_bytes; data_size -= rep_levels_bytes; } - //TODO figure a way to set max_definition_level_ to 0 - //if the initial value is invalid + // TODO figure a way to set max_definition_level_ to 0 + // if the initial value is invalid - //Init definition levels + // Init definition levels if (descr_->max_definition_level() > 0) { - int64_t def_levels_bytes = definition_level_decoder_.SetData( - page->definition_level_encoding(), descr_->max_definition_level(), - num_buffered_values_, buffer); + int64_t def_levels_bytes = + definition_level_decoder_.SetData(page->definition_level_encoding(), + descr_->max_definition_level(), num_buffered_values_, buffer); buffer += def_levels_bytes; data_size -= def_levels_bytes; } @@ -134,14 +133,12 @@ bool TypedColumnReader::ReadNewPage() { // first page with this encoding. 
Encoding::type encoding = page->encoding(); - if (IsDictionaryIndexEncoding(encoding)) { - encoding = Encoding::RLE_DICTIONARY; - } + if (IsDictionaryIndexEncoding(encoding)) { encoding = Encoding::RLE_DICTIONARY; } auto it = decoders_.find(static_cast(encoding)); if (it != decoders_.end()) { if (encoding == Encoding::RLE_DICTIONARY) { - DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY); + DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY); } current_decoder_ = it->second.get(); } else { @@ -179,26 +176,20 @@ bool TypedColumnReader::ReadNewPage() { // Batch read APIs int64_t ColumnReader::ReadDefinitionLevels(int64_t batch_size, int16_t* levels) { - if (descr_->max_definition_level() == 0) { - return 0; - } + if (descr_->max_definition_level() == 0) { return 0; } return definition_level_decoder_.Decode(batch_size, levels); } int64_t ColumnReader::ReadRepetitionLevels(int64_t batch_size, int16_t* levels) { - if (descr_->max_repetition_level() == 0) { - return 0; - } + if (descr_->max_repetition_level() == 0) { return 0; } return repetition_level_decoder_.Decode(batch_size, levels); } // ---------------------------------------------------------------------- // Dynamic column reader constructor -std::shared_ptr ColumnReader::Make( - const ColumnDescriptor* descr, - std::unique_ptr pager, - MemoryAllocator* allocator) { +std::shared_ptr ColumnReader::Make(const ColumnDescriptor* descr, + std::unique_ptr pager, MemoryAllocator* allocator) { switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared(descr, std::move(pager), allocator); @@ -215,8 +206,8 @@ std::shared_ptr ColumnReader::Make( case Type::BYTE_ARRAY: return std::make_shared(descr, std::move(pager), allocator); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared(descr, - std::move(pager), allocator); + return std::make_shared( + descr, std::move(pager), allocator); default: ParquetException::NYI("type reader not implemented"); } @@ -236,4 +227,4 @@ 
template class TypedColumnReader; template class TypedColumnReader; template class TypedColumnReader; -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/column/reader.h b/src/parquet/column/reader.h index 7704c52a..926a2fdb 100644 --- a/src/parquet/column/reader.h +++ b/src/parquet/column/reader.h @@ -47,20 +47,14 @@ class ColumnReader { // Either there is no data page available yet, or the data page has been // exhausted if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) { - if (!ReadNewPage() || num_buffered_values_ == 0) { - return false; - } + if (!ReadNewPage() || num_buffered_values_ == 0) { return false; } } return true; } - Type::type type() const { - return descr_->physical_type(); - } + Type::type type() const { return descr_->physical_type(); } - const ColumnDescriptor* descr() const { - return descr_; - } + const ColumnDescriptor* descr() const { return descr_; } protected: virtual bool ReadNewPage() = 0; @@ -107,12 +101,9 @@ class TypedColumnReader : public ColumnReader { public: typedef typename DType::c_type T; - TypedColumnReader(const ColumnDescriptor* schema, - std::unique_ptr pager, - MemoryAllocator* allocator = default_allocator()) : - ColumnReader(schema, std::move(pager), allocator), - current_decoder_(NULL) { - } + TypedColumnReader(const ColumnDescriptor* schema, std::unique_ptr pager, + MemoryAllocator* allocator = default_allocator()) + : ColumnReader(schema, std::move(pager), allocator), current_decoder_(NULL) {} // Read a batch of repetition levels, definition levels, and values from the // column. @@ -145,14 +136,13 @@ class TypedColumnReader : public ColumnReader { // Map of encoding type to the respective decoder object. For example, a // column chunk's data pages may include both dictionary-encoded and // plain-encoded data. 
- std::unordered_map > decoders_; + std::unordered_map> decoders_; void ConfigureDictionary(const DictionaryPage* page); DecoderType* current_decoder_; }; - template inline int64_t TypedColumnReader::ReadValues(int64_t batch_size, T* out) { int64_t num_decoded = current_decoder_->Decode(out, batch_size); @@ -183,9 +173,7 @@ inline int64_t TypedColumnReader::ReadBatch(int batch_size, int16_t* def_ // TODO(wesm): this tallying of values-to-decode can be performed with better // cache-efficiency if fused with the level decoding. for (int64_t i = 0; i < num_def_levels; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { - ++values_to_read; - } + if (def_levels[i] == descr_->max_definition_level()) { ++values_to_read; } } } else { // Required field, read all values @@ -207,7 +195,6 @@ inline int64_t TypedColumnReader::ReadBatch(int batch_size, int16_t* def_ return total_values; } - typedef TypedColumnReader BoolReader; typedef TypedColumnReader Int32Reader; typedef TypedColumnReader Int64Reader; @@ -217,6 +204,6 @@ typedef TypedColumnReader DoubleReader; typedef TypedColumnReader ByteArrayReader; typedef TypedColumnReader FixedLenByteArrayReader; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_COLUMN_READER_H +#endif // PARQUET_COLUMN_READER_H diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc index 78bc3c97..1e3ce745 100644 --- a/src/parquet/column/scanner-test.cc +++ b/src/parquet/column/scanner-test.cc @@ -48,59 +48,52 @@ bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) { namespace test { -template<> -void InitValues(int num_values, vector& values, - vector& buffer) { +template <> +void InitValues(int num_values, vector& values, vector& buffer) { values = flip_coins(num_values, 0); } -template<> -void InitValues(int num_values, vector& values, - vector& buffer) { +template <> +void InitValues(int num_values, vector& values, vector& buffer) { random_Int96_numbers(num_values, 0, 
std::numeric_limits::min(), std::numeric_limits::max(), values.data()); } -template<> -void InitValues(int num_values, vector& values, - vector& buffer) { +template <> +void InitValues( + int num_values, vector& values, vector& buffer) { int max_byte_array_len = 12; int num_bytes = max_byte_array_len + sizeof(uint32_t); size_t nbytes = num_values * num_bytes; buffer.resize(nbytes); - random_byte_array(num_values, 0, buffer.data(), values.data(), - max_byte_array_len); + random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len); } -template<> -void InitValues(int num_values, vector& values, - vector& buffer) { +template <> +void InitValues(int num_values, vector& values, vector& buffer) { size_t nbytes = num_values * FLBA_LENGTH; buffer.resize(nbytes); - random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, - values.data()); + random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data()); } -template<> -void InitDictValues(int num_values, int dict_per_page, - vector& values, vector& buffer) { +template <> +void InitDictValues( + int num_values, int dict_per_page, vector& values, vector& buffer) { // No op for bool } - template class TestFlatScanner : public ::testing::Test { public: typedef typename Type::c_type T; - void InitScanner(const ColumnDescriptor *d) { + void InitScanner(const ColumnDescriptor* d) { std::unique_ptr pager(new test::MockPageReader(pages_)); scanner_ = Scanner::Make(ColumnReader::Make(d, std::move(pager))); } - void CheckResults(int batch_size, const ColumnDescriptor *d) { - TypedScanner* scanner = - reinterpret_cast* >(scanner_.get()); + void CheckResults(int batch_size, const ColumnDescriptor* d) { + TypedScanner* scanner = reinterpret_cast*>(scanner_.get()); T val; bool is_null = false; int16_t def_level; @@ -110,14 +103,14 @@ class TestFlatScanner : public ::testing::Test { for (int i = 0; i < num_levels_; i++) { ASSERT_TRUE(scanner->Next(&val, &def_level, &rep_level, &is_null)) << 
i << j; if (!is_null) { - ASSERT_EQ(values_[j], val) << i <<"V"<< j; + ASSERT_EQ(values_[j], val) << i << "V" << j; j++; } if (d->max_definition_level() > 0) { - ASSERT_EQ(def_levels_[i], def_level) << i <<"D"<< j; + ASSERT_EQ(def_levels_[i], def_level) << i << "D" << j; } if (d->max_repetition_level() > 0) { - ASSERT_EQ(rep_levels_[i], rep_level) << i <<"R"<< j; + ASSERT_EQ(rep_levels_[i], rep_level) << i << "R" << j; } } ASSERT_EQ(num_values_, j); @@ -132,7 +125,7 @@ class TestFlatScanner : public ::testing::Test { } void Execute(int num_pages, int levels_per_page, int batch_size, - const ColumnDescriptor *d, Encoding::type encoding) { + const ColumnDescriptor* d, Encoding::type encoding) { num_values_ = MakePages(d, num_pages, levels_per_page, def_levels_, rep_levels_, values_, data_buffer_, pages_, encoding); num_levels_ = num_pages * levels_per_page; @@ -145,14 +138,14 @@ class TestFlatScanner : public ::testing::Test { std::shared_ptr& d2, std::shared_ptr& d3, int length) { NodePtr type; - type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num, - LogicalType::NONE, length); + type = schema::PrimitiveNode::Make( + "c1", Repetition::REQUIRED, Type::type_num, LogicalType::NONE, length); d1.reset(new ColumnDescriptor(type, 0, 0)); - type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num, - LogicalType::NONE, length); + type = schema::PrimitiveNode::Make( + "c2", Repetition::OPTIONAL, Type::type_num, LogicalType::NONE, length); d2.reset(new ColumnDescriptor(type, 4, 0)); - type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num, - LogicalType::NONE, length); + type = schema::PrimitiveNode::Make( + "c3", Repetition::REPEATED, Type::type_num, LogicalType::NONE, length); d3.reset(new ColumnDescriptor(type, 4, 2)); } @@ -173,12 +166,12 @@ class TestFlatScanner : public ::testing::Test { protected: int num_levels_; int num_values_; - vector > pages_; + vector> pages_; std::shared_ptr scanner_; vector 
values_; vector def_levels_; vector rep_levels_; - vector data_buffer_; // For BA and FLBA + vector data_buffer_; // For BA and FLBA }; typedef TestFlatScanner TestFlatFLBAScanner; @@ -187,8 +180,8 @@ static int num_levels_per_page = 100; static int num_pages = 20; static int batch_size = 32; -typedef ::testing::Types TestTypes; +typedef ::testing::Types TestTypes; typedef TestFlatScanner TestBooleanFlatScanner; typedef TestFlatScanner TestFLBAFlatScanner; @@ -200,8 +193,8 @@ TYPED_TEST(TestFlatScanner, TestPlainScanner) { } TYPED_TEST(TestFlatScanner, TestDictScanner) { - this->ExecuteAll(num_pages, num_levels_per_page, batch_size, 0, - Encoding::RLE_DICTIONARY); + this->ExecuteAll( + num_pages, num_levels_per_page, batch_size, 0, Encoding::RLE_DICTIONARY); } TEST_F(TestBooleanFlatScanner, TestPlainScanner) { @@ -213,8 +206,8 @@ TEST_F(TestFLBAFlatScanner, TestPlainScanner) { } TEST_F(TestFLBAFlatScanner, TestDictScanner) { - this->ExecuteAll(num_pages, num_levels_per_page, batch_size, FLBA_LENGTH, - Encoding::RLE_DICTIONARY); + this->ExecuteAll( + num_pages, num_levels_per_page, batch_size, FLBA_LENGTH, Encoding::RLE_DICTIONARY); } TEST_F(TestFLBAFlatScanner, TestPlainDictScanner) { @@ -222,14 +215,13 @@ TEST_F(TestFLBAFlatScanner, TestPlainDictScanner) { Encoding::PLAIN_DICTIONARY); } - -//PARQUET 502 +// PARQUET 502 TEST_F(TestFlatFLBAScanner, TestSmallBatch) { NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); const ColumnDescriptor d(type, 0, 0); - num_values_ = MakePages(&d, 1, 100, def_levels_, rep_levels_, values_, - data_buffer_, pages_); + num_values_ = MakePages( + &d, 1, 100, def_levels_, rep_levels_, values_, data_buffer_, pages_); num_levels_ = 1 * 100; InitScanner(&d); CheckResults(1, &d); @@ -239,12 +231,12 @@ TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) { NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, 
LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); const ColumnDescriptor d(type, 4, 0); - num_values_ = MakePages(&d, 1, 100, def_levels_, rep_levels_, values_, - data_buffer_, pages_); + num_values_ = MakePages( + &d, 1, 100, def_levels_, rep_levels_, values_, data_buffer_, pages_); num_levels_ = 1 * 100; InitScanner(&d); TypedScanner* scanner = - reinterpret_cast* >(scanner_.get()); + reinterpret_cast*>(scanner_.get()); ASSERT_EQ(10, scanner->descr()->type_precision()); ASSERT_EQ(2, scanner->descr()->type_scale()); ASSERT_EQ(FLBA_LENGTH, scanner->descr()->type_length()); @@ -254,12 +246,12 @@ TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) { NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); const ColumnDescriptor d(type, 4, 0); - num_values_ = MakePages(&d, 1, 100, def_levels_, rep_levels_, values_, - data_buffer_, pages_); + num_values_ = MakePages( + &d, 1, 100, def_levels_, rep_levels_, values_, data_buffer_, pages_); num_levels_ = 1 * 100; InitScanner(&d); TypedScanner* scanner = - reinterpret_cast* >(scanner_.get()); + reinterpret_cast*>(scanner_.get()); scanner->SetBatchSize(batch_size); std::stringstream ss_fail; for (int i = 0; i < num_levels_; i++) { @@ -271,5 +263,5 @@ TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) { ASSERT_THROW(scanner->PrintNext(ss_fail, 17), ParquetException); } -} // namespace test -} // namespace parquet +} // namespace test +} // namespace parquet diff --git a/src/parquet/column/scanner.cc b/src/parquet/column/scanner.cc index 5397751c..8db3d2bb 100644 --- a/src/parquet/column/scanner.cc +++ b/src/parquet/column/scanner.cc @@ -42,8 +42,8 @@ std::shared_ptr Scanner::Make(std::shared_ptr col_reader, case Type::BYTE_ARRAY: return std::make_shared(col_reader, batch_size, allocator); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared(col_reader, - batch_size, allocator); + return std::make_shared( + col_reader, batch_size, allocator); 
default: ParquetException::NYI("type reader not implemented"); } @@ -51,4 +51,4 @@ std::shared_ptr Scanner::Make(std::shared_ptr col_reader, return std::shared_ptr(nullptr); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h index d52838e7..f27c2d38 100644 --- a/src/parquet/column/scanner.h +++ b/src/parquet/column/scanner.h @@ -39,14 +39,14 @@ class Scanner { public: explicit Scanner(std::shared_ptr reader, int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE, - MemoryAllocator* allocator = default_allocator()) : - batch_size_(batch_size), - level_offset_(0), - levels_buffered_(0), - value_buffer_(0, allocator), - value_offset_(0), - values_buffered_(0), - reader_(reader) { + MemoryAllocator* allocator = default_allocator()) + : batch_size_(batch_size), + level_offset_(0), + levels_buffered_(0), + value_buffer_(0, allocator), + value_offset_(0), + values_buffered_(0), + reader_(reader) { // TODO: don't allocate for required fields def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0); rep_levels_.resize(descr()->max_repetition_level() > 0 ? 
batch_size_ : 0); @@ -60,19 +60,13 @@ class Scanner { virtual void PrintNext(std::ostream& out, int width) = 0; - bool HasNext() { - return level_offset_ < levels_buffered_ || reader_->HasNext(); - } + bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); } - const ColumnDescriptor* descr() const { - return reader_->descr(); - } + const ColumnDescriptor* descr() const { return reader_->descr(); } - int64_t batch_size() const { return batch_size_;} + int64_t batch_size() const { return batch_size_; } - void SetBatchSize(int64_t batch_size) { - batch_size_ = batch_size; - } + void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; } protected: int64_t batch_size_; @@ -90,7 +84,6 @@ class Scanner { std::shared_ptr reader_; }; - template class TypedScanner : public Scanner { public: @@ -98,8 +91,8 @@ class TypedScanner : public Scanner { explicit TypedScanner(std::shared_ptr reader, int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE, - MemoryAllocator* allocator = default_allocator()) : - Scanner(reader, batch_size, allocator) { + MemoryAllocator* allocator = default_allocator()) + : Scanner(reader, batch_size, allocator) { typed_reader_ = static_cast*>(reader.get()); int value_byte_size = type_traits::value_byte_size; value_buffer_.Resize(batch_size_ * value_byte_size); @@ -110,14 +103,12 @@ class TypedScanner : public Scanner { bool NextLevels(int16_t* def_level, int16_t* rep_level) { if (level_offset_ == levels_buffered_) { - levels_buffered_ = typed_reader_->ReadBatch(batch_size_, &def_levels_[0], - &rep_levels_[0], values_, &values_buffered_); + levels_buffered_ = typed_reader_->ReadBatch( + batch_size_, &def_levels_[0], &rep_levels_[0], values_, &values_buffered_); value_offset_ = 0; level_offset_ = 0; - if (!levels_buffered_) { - return false; - } + if (!levels_buffered_) { return false; } } *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0; *rep_level = descr()->max_repetition_level() > 0 ? 
rep_levels_[level_offset_] : 0; @@ -126,7 +117,7 @@ class TypedScanner : public Scanner { } bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) { - if (level_offset_ == levels_buffered_) { + if (level_offset_ == levels_buffered_) { if (!HasNext()) { // Out of data pages return false; @@ -136,9 +127,7 @@ class TypedScanner : public Scanner { NextLevels(def_level, rep_level); *is_null = *def_level < descr()->max_definition_level(); - if (*is_null) { - return true; - } + if (*is_null) { return true; } if (value_offset_ == values_buffered_) { throw ParquetException("Value was non-null, but has not been buffered"); @@ -162,9 +151,7 @@ class TypedScanner : public Scanner { NextLevels(&def_level, &rep_level); *is_null = def_level < descr()->max_definition_level(); - if (*is_null) { - return true; - } + if (*is_null) { return true; } if (value_offset_ == values_buffered_) { throw ParquetException("Value was non-null, but has not been buffered"); @@ -178,9 +165,7 @@ class TypedScanner : public Scanner { bool is_null = false; char buffer[25]; - if (!NextValue(&val, &is_null)) { - throw ParquetException("No more values buffered"); - } + if (!NextValue(&val, &is_null)) { throw ParquetException("No more values buffered"); } if (is_null) { std::string null_fmt = format_fwf(width); @@ -200,10 +185,9 @@ class TypedScanner : public Scanner { T* values_; }; - template -inline void TypedScanner::FormatValue(void* val, char* buffer, - int bufsize, int width) { +inline void TypedScanner::FormatValue( + void* val, char* buffer, int bufsize, int width) { std::string fmt = format_fwf(width); snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast(val)); } @@ -229,8 +213,7 @@ inline void TypedScanner::FormatValue( void* val, char* buffer, int bufsize, int width) { std::string fmt = format_fwf(width); std::string result = FixedLenByteArrayToString( - *reinterpret_cast(val), - descr()->type_length()); + *reinterpret_cast(val), descr()->type_length()); snprintf(buffer, 
bufsize, fmt.c_str(), result.c_str()); } @@ -243,6 +226,6 @@ typedef TypedScanner DoubleScanner; typedef TypedScanner ByteArrayScanner; typedef TypedScanner FixedLenByteArrayScanner; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_COLUMN_SCANNER_H +#endif // PARQUET_COLUMN_SCANNER_H diff --git a/src/parquet/column/test-util.h b/src/parquet/column/test-util.h index 95b19816..d13f0527 100644 --- a/src/parquet/column/test-util.h +++ b/src/parquet/column/test-util.h @@ -45,15 +45,14 @@ namespace parquet { namespace test { template -static void InitValues(int num_values, vector& values, - vector& buffer) { +static void InitValues(int num_values, vector& values, vector& buffer) { random_numbers(num_values, 0, std::numeric_limits::min(), std::numeric_limits::max(), values.data()); } template -static void InitDictValues(int num_values, int num_dicts, - vector& values, vector& buffer) { +static void InitDictValues( + int num_values, int num_dicts, vector& values, vector& buffer) { int repeat_factor = num_values / num_dicts; InitValues(num_dicts, values, buffer); // add some repeated values @@ -71,9 +70,8 @@ static void InitDictValues(int num_values, int num_dicts, class MockPageReader : public PageReader { public: - explicit MockPageReader(const vector >& pages) : - pages_(pages), - page_index_(0) {} + explicit MockPageReader(const vector>& pages) + : pages_(pages), page_index_(0) {} // Implement the PageReader interface virtual shared_ptr NextPage() { @@ -85,7 +83,7 @@ class MockPageReader : public PageReader { } private: - vector > pages_; + vector> pages_; int page_index_; }; @@ -97,16 +95,15 @@ class DataPageBuilder { typedef typename Type::c_type T; // This class writes data and metadata to the passed inputs - explicit DataPageBuilder(InMemoryOutputStream* sink) : - sink_(sink), - num_values_(0), - encoding_(Encoding::PLAIN), - definition_level_encoding_(Encoding::RLE), - repetition_level_encoding_(Encoding::RLE), - have_def_levels_(false), - 
have_rep_levels_(false), - have_values_(false) { - } + explicit DataPageBuilder(InMemoryOutputStream* sink) + : sink_(sink), + num_values_(0), + encoding_(Encoding::PLAIN), + definition_level_encoding_(Encoding::RLE), + repetition_level_encoding_(Encoding::RLE), + have_def_levels_(false), + have_rep_levels_(false), + have_values_(false) {} void AppendDefLevels(const vector& levels, int16_t max_level, Encoding::type encoding = Encoding::RLE) { @@ -126,7 +123,7 @@ class DataPageBuilder { have_rep_levels_ = true; } - void AppendValues(const ColumnDescriptor *d, const vector& values, + void AppendValues(const ColumnDescriptor* d, const vector& values, Encoding::type encoding = Encoding::PLAIN) { PlainEncoder encoder(d); encoder.Encode(&values[0], values.size(), sink_); @@ -136,21 +133,13 @@ class DataPageBuilder { have_values_ = true; } - int32_t num_values() const { - return num_values_; - } + int32_t num_values() const { return num_values_; } - Encoding::type encoding() const { - return encoding_; - } + Encoding::type encoding() const { return encoding_; } - Encoding::type rep_level_encoding() const { - return repetition_level_encoding_; - } + Encoding::type rep_level_encoding() const { return repetition_level_encoding_; } - Encoding::type def_level_encoding() const { - return definition_level_encoding_; - } + Encoding::type def_level_encoding() const { return definition_level_encoding_; } private: InMemoryOutputStream* sink_; @@ -165,8 +154,8 @@ class DataPageBuilder { bool have_values_; // Used internally for both repetition and definition levels - void AppendLevels(const vector& levels, int16_t max_level, - Encoding::type encoding) { + void AppendLevels( + const vector& levels, int16_t max_level, Encoding::type encoding) { if (encoding != Encoding::RLE) { ParquetException::NYI("only rle encoding currently implemented"); } @@ -178,8 +167,8 @@ class DataPageBuilder { // RLE-encoded bytes have to be preceded in the stream by their absolute // size. 
LevelEncoder encoder; - encoder.Init(encoding, max_level, levels.size(), - encode_buffer.data(), encode_buffer.size()); + encoder.Init( + encoding, max_level, levels.size(), encode_buffer.data(), encode_buffer.size()); encoder.Encode(levels.size(), levels.data()); @@ -189,9 +178,9 @@ class DataPageBuilder { } }; -template<> -void DataPageBuilder::AppendValues(const ColumnDescriptor *d, - const vector& values, Encoding::type encoding) { +template <> +void DataPageBuilder::AppendValues( + const ColumnDescriptor* d, const vector& values, Encoding::type encoding) { if (encoding != Encoding::PLAIN) { ParquetException::NYI("only plain encoding currently implemented"); } @@ -204,40 +193,32 @@ void DataPageBuilder::AppendValues(const ColumnDescriptor *d, } template -static shared_ptr MakeDataPage(const ColumnDescriptor *d, - const vector& values, int num_vals, - Encoding::type encoding, const uint8_t* indices, int indices_size, - const vector& def_levels, int16_t max_def_level, - const vector& rep_levels, int16_t max_rep_level) { +static shared_ptr MakeDataPage(const ColumnDescriptor* d, + const vector& values, int num_vals, Encoding::type encoding, + const uint8_t* indices, int indices_size, const vector& def_levels, + int16_t max_def_level, const vector& rep_levels, int16_t max_rep_level) { int num_values = 0; InMemoryOutputStream page_stream; test::DataPageBuilder page_builder(&page_stream); - if (!rep_levels.empty()) { - page_builder.AppendRepLevels(rep_levels, max_rep_level); - } - if (!def_levels.empty()) { - page_builder.AppendDefLevels(def_levels, max_def_level); - } + if (!rep_levels.empty()) { page_builder.AppendRepLevels(rep_levels, max_rep_level); } + if (!def_levels.empty()) { page_builder.AppendDefLevels(def_levels, max_def_level); } if (encoding == Encoding::PLAIN) { page_builder.AppendValues(d, values, encoding); num_values = page_builder.num_values(); - } else {// DICTIONARY PAGES + } else { // DICTIONARY PAGES page_stream.Write(indices, indices_size); 
num_values = std::max(page_builder.num_values(), num_vals); } auto buffer = page_stream.GetBuffer(); - return std::make_shared(buffer, num_values, - encoding, - page_builder.def_level_encoding(), - page_builder.rep_level_encoding()); + return std::make_shared(buffer, num_values, encoding, + page_builder.def_level_encoding(), page_builder.rep_level_encoding()); } - template class DictionaryPageBuilder { public: @@ -245,19 +226,14 @@ class DictionaryPageBuilder { static constexpr int TN = TYPE::type_num; // This class writes data and metadata to the passed inputs - explicit DictionaryPageBuilder(const ColumnDescriptor *d) : - num_dict_values_(0), - have_values_(false) { - int type_length = 0; - if (TN == Type::FIXED_LEN_BYTE_ARRAY) { - type_length = d->type_length(); - } - encoder_.reset(new DictEncoder(&pool_, default_allocator(), type_length)); + explicit DictionaryPageBuilder(const ColumnDescriptor* d) + : num_dict_values_(0), have_values_(false) { + int type_length = 0; + if (TN == Type::FIXED_LEN_BYTE_ARRAY) { type_length = d->type_length(); } + encoder_.reset(new DictEncoder(&pool_, default_allocator(), type_length)); } - ~DictionaryPageBuilder() { - pool_.FreeAll(); - } + ~DictionaryPageBuilder() { pool_.FreeAll(); } shared_ptr AppendValues(const vector& values) { int num_values = values.size(); @@ -269,43 +245,41 @@ class DictionaryPageBuilder { have_values_ = true; shared_ptr rle_indices = std::make_shared( sizeof(int) * encoder_->EstimatedDataEncodedSize()); - int actual_bytes = encoder_->WriteIndices(rle_indices->mutable_data(), - rle_indices->size()); + int actual_bytes = + encoder_->WriteIndices(rle_indices->mutable_data(), rle_indices->size()); rle_indices->Resize(actual_bytes); encoder_->ClearIndices(); return rle_indices; } shared_ptr WriteDict() { - shared_ptr dict_buffer = std::make_shared( - encoder_->dict_encoded_size()); + shared_ptr dict_buffer = + std::make_shared(encoder_->dict_encoded_size()); encoder_->WriteDict(dict_buffer->mutable_data()); 
return dict_buffer; } - int32_t num_values() const { - return num_dict_values_; - } + int32_t num_values() const { return num_dict_values_; } private: MemPool pool_; - shared_ptr > encoder_; + shared_ptr> encoder_; int32_t num_dict_values_; bool have_values_; }; -template<> -DictionaryPageBuilder::DictionaryPageBuilder(const ColumnDescriptor *d) { +template <> +DictionaryPageBuilder::DictionaryPageBuilder(const ColumnDescriptor* d) { ParquetException::NYI("only plain encoding currently implemented for boolean"); } -template<> +template <> shared_ptr DictionaryPageBuilder::WriteDict() { ParquetException::NYI("only plain encoding currently implemented for boolean"); return nullptr; } -template<> +template <> shared_ptr DictionaryPageBuilder::AppendValues( const vector& values) { ParquetException::NYI("only plain encoding currently implemented for boolean"); @@ -313,39 +287,37 @@ shared_ptr DictionaryPageBuilder::AppendValues( } template -static shared_ptr MakeDictPage(const ColumnDescriptor *d, +static shared_ptr MakeDictPage(const ColumnDescriptor* d, const vector& values, const vector& values_per_page, - Encoding::type encoding, vector >& rle_indices) { + Encoding::type encoding, vector>& rle_indices) { InMemoryOutputStream page_stream; test::DictionaryPageBuilder page_builder(d); int num_pages = values_per_page.size(); int value_start = 0; for (int i = 0; i < num_pages; i++) { - rle_indices.push_back(page_builder.AppendValues(slice(values, value_start, - value_start + values_per_page[i]))); + rle_indices.push_back(page_builder.AppendValues( + slice(values, value_start, value_start + values_per_page[i]))); value_start += values_per_page[i]; } auto buffer = page_builder.WriteDict(); - return std::make_shared(buffer, page_builder.num_values(), - Encoding::PLAIN); + return std::make_shared( + buffer, page_builder.num_values(), Encoding::PLAIN); } // Given def/rep levels and values create multiple dict pages template -static void PaginateDict(const ColumnDescriptor *d, 
- const vector& values, - const vector& def_levels, int16_t max_def_level, - const vector& rep_levels, int16_t max_rep_level, +static void PaginateDict(const ColumnDescriptor* d, + const vector& values, const vector& def_levels, + int16_t max_def_level, const vector& rep_levels, int16_t max_rep_level, int num_levels_per_page, const vector& values_per_page, - vector >& pages, - Encoding::type encoding = Encoding::RLE_DICTIONARY) { + vector>& pages, Encoding::type encoding = Encoding::RLE_DICTIONARY) { int num_pages = values_per_page.size(); - vector > rle_indices; - shared_ptr dict_page = MakeDictPage(d, values, values_per_page, - encoding, rle_indices); + vector> rle_indices; + shared_ptr dict_page = + MakeDictPage(d, values, values_per_page, encoding, rle_indices); pages.push_back(dict_page); int def_level_start = 0; int def_level_end = 0; @@ -370,13 +342,11 @@ static void PaginateDict(const ColumnDescriptor *d, // Given def/rep levels and values create multiple plain pages template -static void PaginatePlain(const ColumnDescriptor *d, - const vector& values, - const vector& def_levels, int16_t max_def_level, - const vector& rep_levels, int16_t max_rep_level, +static void PaginatePlain(const ColumnDescriptor* d, + const vector& values, const vector& def_levels, + int16_t max_def_level, const vector& rep_levels, int16_t max_rep_level, int num_levels_per_page, const vector& values_per_page, - vector >& pages, - Encoding::type encoding = Encoding::PLAIN) { + vector>& pages, Encoding::type encoding = Encoding::PLAIN) { int num_pages = values_per_page.size(); int def_level_start = 0; int def_level_end = 0; @@ -394,9 +364,8 @@ static void PaginatePlain(const ColumnDescriptor *d, } shared_ptr page = MakeDataPage(d, slice(values, value_start, value_start + values_per_page[i]), values_per_page[i], - encoding, NULL, 0, - slice(def_levels, def_level_start, def_level_end), max_def_level, - slice(rep_levels, rep_level_start, rep_level_end), max_rep_level); + encoding, NULL, 0, 
slice(def_levels, def_level_start, def_level_end), + max_def_level, slice(rep_levels, rep_level_start, rep_level_end), max_rep_level); pages.push_back(page); value_start += values_per_page[i]; } @@ -404,11 +373,10 @@ static void PaginatePlain(const ColumnDescriptor *d, // Generates pages from randomly generated data template -static int MakePages(const ColumnDescriptor *d, int num_pages, int levels_per_page, +static int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_page, vector& def_levels, vector& rep_levels, vector& values, vector& buffer, - vector >& pages, - Encoding::type encoding = Encoding::PLAIN) { + vector>& pages, Encoding::type encoding = Encoding::PLAIN) { int num_levels = levels_per_page * num_pages; int num_values = 0; uint32_t seed = 0; @@ -442,21 +410,21 @@ static int MakePages(const ColumnDescriptor *d, int num_pages, int levels_per_pa values.resize(num_values); if (encoding == Encoding::PLAIN) { InitValues(num_values, values, buffer); - PaginatePlain(d, values, def_levels, max_def_level, - rep_levels, max_rep_level, levels_per_page, values_per_page, pages); - } else if (encoding == Encoding::RLE_DICTIONARY - || encoding == Encoding::PLAIN_DICTIONARY) { + PaginatePlain(d, values, def_levels, max_def_level, rep_levels, max_rep_level, + levels_per_page, values_per_page, pages); + } else if (encoding == Encoding::RLE_DICTIONARY || + encoding == Encoding::PLAIN_DICTIONARY) { // Calls InitValues and repeats the data InitDictValues(num_values, levels_per_page, values, buffer); - PaginateDict(d, values, def_levels, max_def_level, - rep_levels, max_rep_level, levels_per_page, values_per_page, pages); + PaginateDict(d, values, def_levels, max_def_level, rep_levels, max_rep_level, + levels_per_page, values_per_page, pages); } return num_values; } -} // namespace test +} // namespace test -} // namespace parquet +} // namespace parquet -#endif // PARQUET_COLUMN_TEST_UTIL_H +#endif // PARQUET_COLUMN_TEST_UTIL_H diff --git 
a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc index 4dcb6721..584b8a7a 100644 --- a/src/parquet/column/writer.cc +++ b/src/parquet/column/writer.cc @@ -25,12 +25,15 @@ namespace parquet { // ColumnWriter ColumnWriter::ColumnWriter(const ColumnDescriptor* descr, - std::unique_ptr pager, int64_t expected_rows, - MemoryAllocator* allocator): - descr_(descr), pager_(std::move(pager)), expected_rows_(expected_rows), - allocator_(allocator), - num_buffered_values_(0), num_buffered_encoded_values_(0), - num_rows_(0), total_bytes_written_(0) { + std::unique_ptr pager, int64_t expected_rows, MemoryAllocator* allocator) + : descr_(descr), + pager_(std::move(pager)), + expected_rows_(expected_rows), + allocator_(allocator), + num_buffered_values_(0), + num_buffered_encoded_values_(0), + num_rows_(0), + total_bytes_written_(0) { InitSinks(); } @@ -41,13 +44,13 @@ void ColumnWriter::InitSinks() { } void ColumnWriter::WriteDefinitionLevels(int64_t num_levels, int16_t* levels) { - definition_levels_sink_->Write(reinterpret_cast(levels), - sizeof(int16_t) * num_levels); + definition_levels_sink_->Write( + reinterpret_cast(levels), sizeof(int16_t) * num_levels); } void ColumnWriter::WriteRepetitionLevels(int64_t num_levels, int16_t* levels) { - repetition_levels_sink_->Write(reinterpret_cast(levels), - sizeof(int16_t) * num_levels); + repetition_levels_sink_->Write( + reinterpret_cast(levels), sizeof(int16_t) * num_levels); } std::shared_ptr ColumnWriter::RleEncodeLevels( @@ -55,11 +58,11 @@ std::shared_ptr ColumnWriter::RleEncodeLevels( // TODO: This only works with due to some RLE specifics int64_t rle_size = 2 * num_buffered_values_ + sizeof(uint32_t); auto buffer_rle = std::make_shared(rle_size, allocator_); - level_encoder_.Init(Encoding::RLE, max_level, - num_buffered_values_, buffer_rle->mutable_data() + sizeof(uint32_t), + level_encoder_.Init(Encoding::RLE, max_level, num_buffered_values_, + buffer_rle->mutable_data() + sizeof(uint32_t), buffer_rle->size() - 
sizeof(uint32_t)); - int encoded = level_encoder_.Encode(num_buffered_values_, - reinterpret_cast(buffer->data())); + int encoded = level_encoder_.Encode( + num_buffered_values_, reinterpret_cast(buffer->data())); DCHECK_EQ(encoded, num_buffered_values_); reinterpret_cast(buffer_rle->mutable_data())[0] = level_encoder_.len(); int64_t encoded_size = level_encoder_.len() + sizeof(uint32_t); @@ -75,21 +78,19 @@ void ColumnWriter::WriteNewPage() { std::shared_ptr values = values_sink_->GetBuffer(); if (descr_->max_definition_level() > 0) { - definition_levels = RleEncodeLevels(definition_levels, - descr_->max_definition_level()); + definition_levels = + RleEncodeLevels(definition_levels, descr_->max_definition_level()); } if (descr_->max_repetition_level() > 0) { - repetition_levels = RleEncodeLevels(repetition_levels, - descr_->max_repetition_level()); + repetition_levels = + RleEncodeLevels(repetition_levels, descr_->max_repetition_level()); } // TODO(PARQUET-590): Encodings are hard-coded int64_t bytes_written = pager_->WriteDataPage(num_buffered_values_, - num_buffered_encoded_values_, - definition_levels, Encoding::RLE, - repetition_levels, Encoding::RLE, - values, Encoding::PLAIN); + num_buffered_encoded_values_, definition_levels, Encoding::RLE, repetition_levels, + Encoding::RLE, values, Encoding::PLAIN); total_bytes_written_ += bytes_written; // Re-initialize the sinks as GetBuffer made them invalid. 
@@ -100,12 +101,11 @@ void ColumnWriter::WriteNewPage() { int64_t ColumnWriter::Close() { // Write all outstanding data to a new page - if (num_buffered_values_ > 0) { - WriteNewPage(); - } + if (num_buffered_values_ > 0) { WriteNewPage(); } if (num_rows_ != expected_rows_) { - throw ParquetException("Less then the number of expected rows written in" + throw ParquetException( + "Less then the number of expected rows written in" " the current column chunk"); } @@ -119,47 +119,44 @@ int64_t ColumnWriter::Close() { template TypedColumnWriter::TypedColumnWriter(const ColumnDescriptor* schema, - std::unique_ptr pager, int64_t expected_rows, - MemoryAllocator* allocator) : - ColumnWriter(schema, std::move(pager), expected_rows, allocator) { + std::unique_ptr pager, int64_t expected_rows, MemoryAllocator* allocator) + : ColumnWriter(schema, std::move(pager), expected_rows, allocator) { // TODO(PARQUET-590) Get decoder type from WriterProperties - current_encoder_ = std::unique_ptr( - new PlainEncoder(schema, allocator)); + current_encoder_ = + std::unique_ptr(new PlainEncoder(schema, allocator)); } // ---------------------------------------------------------------------- // Dynamic column writer constructor -std::shared_ptr ColumnWriter::Make( - const ColumnDescriptor* descr, - std::unique_ptr pager, - int64_t expected_rows, +std::shared_ptr ColumnWriter::Make(const ColumnDescriptor* descr, + std::unique_ptr pager, int64_t expected_rows, MemoryAllocator* allocator) { switch (descr->physical_type()) { case Type::BOOLEAN: - return std::make_shared(descr, std::move(pager), expected_rows, - allocator); + return std::make_shared( + descr, std::move(pager), expected_rows, allocator); case Type::INT32: - return std::make_shared(descr, std::move(pager), expected_rows, - allocator); + return std::make_shared( + descr, std::move(pager), expected_rows, allocator); case Type::INT64: - return std::make_shared(descr, std::move(pager), expected_rows, - allocator); + return 
std::make_shared( + descr, std::move(pager), expected_rows, allocator); case Type::INT96: - return std::make_shared(descr, std::move(pager), expected_rows, - allocator); + return std::make_shared( + descr, std::move(pager), expected_rows, allocator); case Type::FLOAT: - return std::make_shared(descr, std::move(pager), expected_rows, - allocator); + return std::make_shared( + descr, std::move(pager), expected_rows, allocator); case Type::DOUBLE: - return std::make_shared(descr, std::move(pager), expected_rows, - allocator); + return std::make_shared( + descr, std::move(pager), expected_rows, allocator); case Type::BYTE_ARRAY: - return std::make_shared(descr, std::move(pager), expected_rows, - allocator); + return std::make_shared( + descr, std::move(pager), expected_rows, allocator); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared(descr, - std::move(pager), expected_rows, allocator); + return std::make_shared( + descr, std::move(pager), expected_rows, allocator); default: ParquetException::NYI("type reader not implemented"); } @@ -179,4 +176,4 @@ template class TypedColumnWriter; template class TypedColumnWriter; template class TypedColumnWriter; -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h index 7ccfe730..32d07127 100644 --- a/src/parquet/column/writer.h +++ b/src/parquet/column/writer.h @@ -37,13 +37,9 @@ class ColumnWriter { std::unique_ptr, int64_t expected_rows, MemoryAllocator* allocator = default_allocator()); - Type::type type() const { - return descr_->physical_type(); - } + Type::type type() const { return descr_->physical_type(); } - const ColumnDescriptor* descr() const { - return descr_; - } + const ColumnDescriptor* descr() const { return descr_; } /** * Closes the ColumnWriter, commits any buffered values to pages. 
@@ -61,8 +57,8 @@ class ColumnWriter { // Write multiple repetition levels void WriteRepetitionLevels(int64_t num_levels, int16_t* levels); - std::shared_ptr RleEncodeLevels(const std::shared_ptr& buffer, - int16_t max_level); + std::shared_ptr RleEncodeLevels( + const std::shared_ptr& buffer, int16_t max_level); const ColumnDescriptor* descr_; @@ -106,14 +102,13 @@ class TypedColumnWriter : public ColumnWriter { public: typedef typename DType::c_type T; - TypedColumnWriter(const ColumnDescriptor* schema, - std::unique_ptr pager, int64_t expected_rows, - MemoryAllocator* allocator = default_allocator()); + TypedColumnWriter(const ColumnDescriptor* schema, std::unique_ptr pager, + int64_t expected_rows, MemoryAllocator* allocator = default_allocator()); // Write a batch of repetition levels, definition levels, and values to the // column. - void WriteBatch(int64_t num_values, int16_t* def_levels, int16_t* rep_levels, - T* values); + void WriteBatch( + int64_t num_values, int16_t* def_levels, int16_t* rep_levels, T* values); private: typedef Encoder EncoderType; @@ -124,7 +119,7 @@ class TypedColumnWriter : public ColumnWriter { // Map of encoding type to the respective encoder object. For example, a // column chunk's data pages may include both dictionary-encoded and // plain-encoded data. 
- std::unordered_map > encoders_; + std::unordered_map> encoders_; void ConfigureDictionary(const DictionaryPage* page); @@ -136,16 +131,14 @@ class TypedColumnWriter : public ColumnWriter { const int64_t PAGE_VALUE_COUNT = 1000; template -inline void TypedColumnWriter::WriteBatch(int64_t num_values, int16_t* def_levels, - int16_t* rep_levels, T* values) { +inline void TypedColumnWriter::WriteBatch( + int64_t num_values, int16_t* def_levels, int16_t* rep_levels, T* values) { int64_t values_to_write = 0; // If the field is required and non-repeated, there are no definition levels if (descr_->max_definition_level() > 0) { for (int64_t i = 0; i < num_values; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { - ++values_to_write; - } + if (def_levels[i] == descr_->max_definition_level()) { ++values_to_write; } } WriteDefinitionLevels(num_values, def_levels); @@ -159,9 +152,7 @@ inline void TypedColumnWriter::WriteBatch(int64_t num_values, int16_t* de // A row could include more than one value // Count the occasions where we start a new row for (int64_t i = 0; i < num_values; ++i) { - if (rep_levels[i] == 0) { - num_rows_++; - } + if (rep_levels[i] == 0) { num_rows_++; } } WriteRepetitionLevels(num_values, rep_levels); @@ -180,9 +171,7 @@ inline void TypedColumnWriter::WriteBatch(int64_t num_values, int16_t* de num_buffered_encoded_values_ += values_to_write; // TODO(PARQUET-591): Instead of rows as a boundary, do a size check - if (num_buffered_values_ >= PAGE_VALUE_COUNT) { - WriteNewPage(); - } + if (num_buffered_values_ >= PAGE_VALUE_COUNT) { WriteNewPage(); } } template @@ -199,6 +188,6 @@ typedef TypedColumnWriter DoubleWriter; typedef TypedColumnWriter ByteArrayWriter; typedef TypedColumnWriter FixedLenByteArrayWriter; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_COLUMN_READER_H +#endif // PARQUET_COLUMN_READER_H diff --git a/src/parquet/compression/codec-test.cc b/src/parquet/compression/codec-test.cc index 6326003b..d6ddbd61 
100644 --- a/src/parquet/compression/codec-test.cc +++ b/src/parquet/compression/codec-test.cc @@ -39,24 +39,22 @@ void CheckCodecRoundtrip(const vector& data) { std::vector decompressed(data.size()); // compress with c1 - int actual_size = c1.Compress(data.size(), &data[0], max_compressed_len, - &compressed[0]); + int actual_size = + c1.Compress(data.size(), &data[0], max_compressed_len, &compressed[0]); compressed.resize(actual_size); // decompress with c2 - c2.Decompress(compressed.size(), &compressed[0], - decompressed.size(), &decompressed[0]); + c2.Decompress(compressed.size(), &compressed[0], decompressed.size(), &decompressed[0]); ASSERT_TRUE(test::vector_equal(data, decompressed)); // compress with c2 - int actual_size2 = c2.Compress(data.size(), &data[0], max_compressed_len, - &compressed[0]); + int actual_size2 = + c2.Compress(data.size(), &data[0], max_compressed_len, &compressed[0]); ASSERT_EQ(actual_size2, actual_size); // decompress with c1 - c1.Decompress(compressed.size(), &compressed[0], - decompressed.size(), &decompressed[0]); + c1.Decompress(compressed.size(), &compressed[0], decompressed.size(), &decompressed[0]); ASSERT_TRUE(test::vector_equal(data, decompressed)); } @@ -83,4 +81,4 @@ TEST(TestCompressors, GZip) { CheckCodec(); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/compression/codec.cc b/src/parquet/compression/codec.cc index 2023fcdb..fed56445 100644 --- a/src/parquet/compression/codec.cc +++ b/src/parquet/compression/codec.cc @@ -44,4 +44,4 @@ std::unique_ptr Codec::Create(Compression::type codec_type) { return result; } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/compression/codec.h b/src/parquet/compression/codec.h index 95d6014e..ffbe5632 100644 --- a/src/parquet/compression/codec.h +++ b/src/parquet/compression/codec.h @@ -34,8 +34,8 @@ class Codec { static std::unique_ptr Create(Compression::type codec); - virtual void Decompress(int64_t input_len, const uint8_t* input, 
- int64_t output_len, uint8_t* output_buffer) = 0; + virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer) = 0; virtual int64_t Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, uint8_t* output_buffer) = 0; @@ -45,12 +45,11 @@ class Codec { virtual const char* name() const = 0; }; - // Snappy codec. class SnappyCodec : public Codec { public: - virtual void Decompress(int64_t input_len, const uint8_t* input, - int64_t output_len, uint8_t* output_buffer); + virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer); virtual int64_t Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, uint8_t* output_buffer); @@ -63,8 +62,8 @@ class SnappyCodec : public Codec { // Lz4 codec. class Lz4Codec : public Codec { public: - virtual void Decompress(int64_t input_len, const uint8_t* input, - int64_t output_len, uint8_t* output_buffer); + virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer); virtual int64_t Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, uint8_t* output_buffer); @@ -87,8 +86,8 @@ class GZipCodec : public Codec { explicit GZipCodec(Format format = GZIP); virtual ~GZipCodec(); - virtual void Decompress(int64_t input_len, const uint8_t* input, - int64_t output_len, uint8_t* output_buffer); + virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer); virtual int64_t Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, uint8_t* output_buffer); @@ -121,6 +120,6 @@ class GZipCodec : public Codec { bool decompressor_initialized_; }; -} // namespace parquet +} // namespace parquet #endif diff --git a/src/parquet/compression/gzip-codec.cc b/src/parquet/compression/gzip-codec.cc index b71afc3d..6c277145 100644 --- a/src/parquet/compression/gzip-codec.cc 
+++ b/src/parquet/compression/gzip-codec.cc @@ -36,11 +36,8 @@ static constexpr int GZIP_CODEC = 16; // Determine if this is libz or gzip from header. static constexpr int DETECT_CODEC = 32; -GZipCodec::GZipCodec(Format format) : - format_(format), - compressor_initialized_(false), - decompressor_initialized_(false) { -} +GZipCodec::GZipCodec(Format format) + : format_(format), compressor_initialized_(false), decompressor_initialized_(false) {} GZipCodec::~GZipCodec() { EndCompressor(); @@ -59,19 +56,16 @@ void GZipCodec::InitCompressor() { } else if (format_ == GZIP) { window_bits += GZIP_CODEC; } - if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, - window_bits, 9, Z_DEFAULT_STRATEGY)) != Z_OK) { - throw ParquetException("zlib deflateInit failed: " + - std::string(stream_.msg)); + if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, 9, + Z_DEFAULT_STRATEGY)) != Z_OK) { + throw ParquetException("zlib deflateInit failed: " + std::string(stream_.msg)); } compressor_initialized_ = true; } void GZipCodec::EndCompressor() { - if (compressor_initialized_) { - (void)deflateEnd(&stream_); - } + if (compressor_initialized_) { (void)deflateEnd(&stream_); } compressor_initialized_ = false; } @@ -83,23 +77,19 @@ void GZipCodec::InitDecompressor() { // Initialize to run either deflate or zlib/gzip format int window_bits = format_ == DEFLATE ? 
-WINDOW_BITS : WINDOW_BITS | DETECT_CODEC; if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) { - throw ParquetException("zlib inflateInit failed: " + std::string(stream_.msg)); + throw ParquetException("zlib inflateInit failed: " + std::string(stream_.msg)); } decompressor_initialized_ = true; } void GZipCodec::EndDecompressor() { - if (decompressor_initialized_) { - (void)inflateEnd(&stream_); - } + if (decompressor_initialized_) { (void)inflateEnd(&stream_); } decompressor_initialized_ = false; } -void GZipCodec::Decompress(int64_t input_length, const uint8_t* input, - int64_t output_length, uint8_t* output) { - if (!decompressor_initialized_) { - InitDecompressor(); - } +void GZipCodec::Decompress( + int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) { + if (!decompressor_initialized_) { InitDecompressor(); } if (output_length == 0) { // The zlib library does not allow *output to be NULL, even when output_length // is 0 (inflate() will return Z_STREAM_ERROR). We don't consider this an @@ -133,8 +123,8 @@ void GZipCodec::Decompress(int64_t input_length, const uint8_t* input, // Failure, buffer was too small std::stringstream ss; - ss << "Too small a buffer passed to GZipCodec. InputLength=" - << input_length << " OutputLength=" << output_length; + ss << "Too small a buffer passed to GZipCodec. 
InputLength=" << input_length + << " OutputLength=" << output_length; throw ParquetException(ss.str()); } @@ -149,18 +139,14 @@ void GZipCodec::Decompress(int64_t input_length, const uint8_t* input, int64_t GZipCodec::MaxCompressedLen(int64_t input_length, const uint8_t* input) { // Most be in compression mode - if (!compressor_initialized_) { - InitCompressor(); - } + if (!compressor_initialized_) { InitCompressor(); } // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase) return deflateBound(&stream_, static_cast(input_length)); } -int64_t GZipCodec::Compress(int64_t input_length, const uint8_t* input, - int64_t output_length, uint8_t* output) { - if (!compressor_initialized_) { - InitCompressor(); - } +int64_t GZipCodec::Compress( + int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) { + if (!compressor_initialized_) { InitCompressor(); } stream_.next_in = const_cast(reinterpret_cast(input)); stream_.avail_in = input_length; stream_.next_out = reinterpret_cast(output); @@ -179,12 +165,11 @@ int64_t GZipCodec::Compress(int64_t input_length, const uint8_t* input, } if (deflateReset(&stream_) != Z_OK) { - throw ParquetException("zlib deflateReset failed: " + - std::string(stream_.msg)); + throw ParquetException("zlib deflateReset failed: " + std::string(stream_.msg)); } // Actual output length return output_length - stream_.avail_out; } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/compression/lz4-codec.cc b/src/parquet/compression/lz4-codec.cc index cc91bf01..7acc1dee 100644 --- a/src/parquet/compression/lz4-codec.cc +++ b/src/parquet/compression/lz4-codec.cc @@ -23,13 +23,11 @@ namespace parquet { -void Lz4Codec::Decompress(int64_t input_len, const uint8_t* input, - int64_t output_len, uint8_t* output_buffer) { +void Lz4Codec::Decompress( + int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) { int64_t n = LZ4_decompress_fast(reinterpret_cast(input), 
reinterpret_cast(output_buffer), output_len); - if (n != input_len) { - throw ParquetException("Corrupt lz4 compressed data."); - } + if (n != input_len) { throw ParquetException("Corrupt lz4 compressed data."); } } int64_t Lz4Codec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { @@ -42,4 +40,4 @@ int64_t Lz4Codec::Compress(int64_t input_len, const uint8_t* input, reinterpret_cast(output_buffer), input_len); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/compression/snappy-codec.cc b/src/parquet/compression/snappy-codec.cc index 62ba00a9..ccd25b90 100644 --- a/src/parquet/compression/snappy-codec.cc +++ b/src/parquet/compression/snappy-codec.cc @@ -24,10 +24,10 @@ namespace parquet { -void SnappyCodec::Decompress(int64_t input_len, const uint8_t* input, - int64_t output_len, uint8_t* output_buffer) { +void SnappyCodec::Decompress( + int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) { if (!snappy::RawUncompress(reinterpret_cast(input), - static_cast(input_len), reinterpret_cast(output_buffer))) { + static_cast(input_len), reinterpret_cast(output_buffer))) { throw parquet::ParquetException("Corrupt snappy compressed data."); } } @@ -45,4 +45,4 @@ int64_t SnappyCodec::Compress(int64_t input_len, const uint8_t* input, return output_len; } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/encodings/decoder.h b/src/parquet/encodings/decoder.h index 36af107d..44425070 100644 --- a/src/parquet/encodings/decoder.h +++ b/src/parquet/encodings/decoder.h @@ -65,6 +65,6 @@ class Decoder { int num_values_; }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_ENCODINGS_DECODER_H +#endif // PARQUET_ENCODINGS_DECODER_H diff --git a/src/parquet/encodings/delta-bit-pack-encoding.h b/src/parquet/encodings/delta-bit-pack-encoding.h index b0a16a70..5353817f 100644 --- a/src/parquet/encodings/delta-bit-pack-encoding.h +++ b/src/parquet/encodings/delta-bit-pack-encoding.h @@ 
-33,8 +33,8 @@ class DeltaBitPackDecoder : public Decoder { public: typedef typename DType::c_type T; - explicit DeltaBitPackDecoder(const ColumnDescriptor* descr, - MemoryAllocator* allocator = default_allocator()) + explicit DeltaBitPackDecoder( + const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator()) : Decoder(descr, Encoding::DELTA_BINARY_PACKED), delta_bit_widths_(0, allocator) { if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) { @@ -60,9 +60,7 @@ class DeltaBitPackDecoder : public Decoder { int32_t block_size; if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException(); if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException(); - if (!decoder_.GetVlqInt(&values_current_block_)) { - ParquetException::EofException(); - } + if (!decoder_.GetVlqInt(&values_current_block_)) { ParquetException::EofException(); } if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException(); delta_bit_widths_.Resize(num_mini_blocks_); @@ -119,6 +117,6 @@ class DeltaBitPackDecoder : public Decoder { int32_t last_value_; }; -} // namespace parquet +} // namespace parquet #endif diff --git a/src/parquet/encodings/delta-byte-array-encoding.h b/src/parquet/encodings/delta-byte-array-encoding.h index 34867e2d..27d6065e 100644 --- a/src/parquet/encodings/delta-byte-array-encoding.h +++ b/src/parquet/encodings/delta-byte-array-encoding.h @@ -28,12 +28,11 @@ namespace parquet { class DeltaByteArrayDecoder : public Decoder { public: - explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr, - MemoryAllocator* allocator = default_allocator()) + explicit DeltaByteArrayDecoder( + const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator()) : Decoder(descr, Encoding::DELTA_BYTE_ARRAY), - prefix_len_decoder_(nullptr, allocator), - suffix_decoder_(nullptr, allocator) { - } + prefix_len_decoder_(nullptr, allocator), + suffix_decoder_(nullptr, allocator) {} virtual void SetData(int 
num_values, const uint8_t* data, int len) { num_values_ = num_values; @@ -51,7 +50,7 @@ class DeltaByteArrayDecoder : public Decoder { // new strings to store the results. virtual int Decode(ByteArray* buffer, int max_values) { max_values = std::min(max_values, num_values_); - for (int i = 0; i < max_values; ++i) { + for (int i = 0; i < max_values; ++i) { int prefix_len = 0; prefix_len_decoder_.Decode(&prefix_len, 1); ByteArray suffix; @@ -77,6 +76,6 @@ class DeltaByteArrayDecoder : public Decoder { ByteArray last_value_; }; -} // namespace parquet +} // namespace parquet #endif diff --git a/src/parquet/encodings/delta-length-byte-array-encoding.h b/src/parquet/encodings/delta-length-byte-array-encoding.h index 7a19aa3e..b1591718 100644 --- a/src/parquet/encodings/delta-length-byte-array-encoding.h +++ b/src/parquet/encodings/delta-length-byte-array-encoding.h @@ -29,11 +29,10 @@ namespace parquet { class DeltaLengthByteArrayDecoder : public Decoder { public: - explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr, - MemoryAllocator* allocator = default_allocator()) : - Decoder(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY), - len_decoder_(nullptr, allocator) { - } + explicit DeltaLengthByteArrayDecoder( + const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator()) + : Decoder(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY), + len_decoder_(nullptr, allocator) {} virtual void SetData(int num_values, const uint8_t* data, int len) { num_values_ = num_values; @@ -49,7 +48,7 @@ class DeltaLengthByteArrayDecoder : public Decoder { max_values = std::min(max_values, num_values_); int lengths[max_values]; len_decoder_.Decode(lengths, max_values); - for (int i = 0; i < max_values; ++i) { + for (int i = 0; i < max_values; ++i) { buffer[i].len = lengths[i]; buffer[i].ptr = data_; data_ += lengths[i]; @@ -66,6 +65,6 @@ class DeltaLengthByteArrayDecoder : public Decoder { int len_; }; -} // namespace parquet +} // namespace parquet #endif diff --git 
a/src/parquet/encodings/dictionary-encoding.h b/src/parquet/encodings/dictionary-encoding.h index e26ba2dc..7d6785e0 100644 --- a/src/parquet/encodings/dictionary-encoding.h +++ b/src/parquet/encodings/dictionary-encoding.h @@ -44,10 +44,11 @@ class DictionaryDecoder : public Decoder { // Initializes the dictionary with values from 'dictionary'. The data in // dictionary is not guaranteed to persist in memory after this call so the // dictionary decoder needs to copy the data out if necessary. - explicit DictionaryDecoder(const ColumnDescriptor* descr, - MemoryAllocator* allocator = default_allocator()): - Decoder(descr, Encoding::RLE_DICTIONARY), dictionary_(0, allocator), - byte_array_data_(0, allocator) {} + explicit DictionaryDecoder( + const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator()) + : Decoder(descr, Encoding::RLE_DICTIONARY), + dictionary_(0, allocator), + byte_array_data_(0, allocator) {} // Perform type-specific initiatialization void SetDict(Decoder* dictionary); @@ -97,8 +98,7 @@ inline void DictionaryDecoder::SetDict(Decoder* dictionary) { } template <> -inline void DictionaryDecoder::SetDict( - Decoder* dictionary) { +inline void DictionaryDecoder::SetDict(Decoder* dictionary) { ParquetException::NYI("Dictionary encoding is not implemented for boolean values"); } @@ -129,7 +129,7 @@ inline void DictionaryDecoder::SetDict(Decoder* dictionary) dictionary->Decode(&dictionary_[0], num_dictionary_values); int fixed_len = descr_->type_length(); - int total_size = num_dictionary_values*fixed_len; + int total_size = num_dictionary_values * fixed_len; byte_array_data_.Resize(total_size); int offset = 0; @@ -162,9 +162,7 @@ static constexpr double MAX_HASH_LOAD = 0.7; /// the encoder, including new dictionary entries. 
class DictEncoderBase { public: - virtual ~DictEncoderBase() { - DCHECK(buffered_indices_.empty()); - } + virtual ~DictEncoderBase() { DCHECK(buffered_indices_.empty()); } /// Writes out the encoded dictionary to buffer. buffer must be preallocated to /// dict_encoded_size() bytes. @@ -200,17 +198,15 @@ class DictEncoderBase { int dict_encoded_size() { return dict_encoded_size_; } protected: - explicit DictEncoderBase(MemPool* pool, MemoryAllocator* allocator) : - hash_table_size_(INITIAL_HASH_TABLE_SIZE), - mod_bitmask_(hash_table_size_ - 1), - hash_slots_(0, allocator), - allocator_(allocator), - pool_(pool), - dict_encoded_size_(0) { + explicit DictEncoderBase(MemPool* pool, MemoryAllocator* allocator) + : hash_table_size_(INITIAL_HASH_TABLE_SIZE), + mod_bitmask_(hash_table_size_ - 1), + hash_slots_(0, allocator), + allocator_(allocator), + pool_(pool), + dict_encoded_size_(0) { hash_slots_.Assign(hash_table_size_, HASH_SLOT_EMPTY); - if (!CpuInfo::initialized()) { - CpuInfo::Init(); - } + if (!CpuInfo::initialized()) { CpuInfo::Init(); } } /// Size of the table. Must be a power of 2. @@ -240,18 +236,14 @@ template class DictEncoder : public DictEncoderBase { public: explicit DictEncoder(MemPool* pool = nullptr, - MemoryAllocator* allocator = default_allocator(), int type_length = -1) : - DictEncoderBase(pool, allocator), type_length_(type_length) {} + MemoryAllocator* allocator = default_allocator(), int type_length = -1) + : DictEncoderBase(pool, allocator), type_length_(type_length) {} // TODO(wesm): think about how to address the construction semantics in // encodings/dictionary-encoding.h - void set_mem_pool(MemPool* pool) { - pool_ = pool; - } + void set_mem_pool(MemPool* pool) { pool_ = pool; } - void set_type_length(int type_length) { - type_length_ = type_length; - } + void set_type_length(int type_length) { type_length_ = type_length; } /// Encode value. 
Note that this does not actually write any data, just /// buffers the value's index to be written later. @@ -278,19 +270,18 @@ class DictEncoder : public DictEncoderBase { void AddDictKey(const T& value); }; -template +template inline int DictEncoder::Hash(const T& value) const { return HashUtil::Hash(&value, sizeof(value), 0); } -template<> +template <> inline int DictEncoder::Hash(const ByteArray& value) const { return HashUtil::Hash(value.ptr, value.len, 0); } -template<> -inline int DictEncoder::Hash( - const FixedLenByteArray& value) const { +template <> +inline int DictEncoder::Hash(const FixedLenByteArray& value) const { return HashUtil::Hash(value.ptr, type_length_, 0); } @@ -324,8 +315,7 @@ inline void DictEncoder::Put(const T& v) { hash_slots_[j] = index; AddDictKey(v); - if (UNLIKELY(static_cast(uniques_.size()) > - hash_table_size_ * MAX_HASH_LOAD)) { + if (UNLIKELY(static_cast(uniques_.size()) > hash_table_size_ * MAX_HASH_LOAD)) { DoubleTableSize(); } } @@ -343,9 +333,7 @@ inline void DictEncoder::DoubleTableSize() { for (int i = 0; i < hash_table_size_; ++i) { index = hash_slots_[i]; - if (index == HASH_SLOT_EMPTY) { - continue; - } + if (index == HASH_SLOT_EMPTY) { continue; } // Compute the hash value mod the new table size to start looking for an // empty slot @@ -370,24 +358,22 @@ inline void DictEncoder::DoubleTableSize() { hash_slots_.Swap(new_hash_slots); } -template +template inline void DictEncoder::AddDictKey(const T& v) { uniques_.push_back(v); dict_encoded_size_ += sizeof(T); } -template<> +template <> inline void DictEncoder::AddDictKey(const ByteArray& v) { uint8_t* heap = pool_->Allocate(v.len); - if (UNLIKELY(v.len > 0 && heap == nullptr)) { - throw ParquetException("out of memory"); - } + if (UNLIKELY(v.len > 0 && heap == nullptr)) { throw ParquetException("out of memory"); } memcpy(heap, v.ptr, v.len); uniques_.push_back(ByteArray(v.len, heap)); dict_encoded_size_ += v.len + sizeof(uint32_t); } -template<> +template <> inline void 
DictEncoder::AddDictKey(const FixedLenByteArray& v) { uint8_t* heap = pool_->Allocate(type_length_); if (UNLIKELY(type_length_ > 0 && heap == nullptr)) { @@ -440,6 +426,6 @@ inline int DictEncoderBase::WriteIndices(uint8_t* buffer, int buffer_len) { return 1 + encoder.len(); } -} // namespace parquet +} // namespace parquet #endif diff --git a/src/parquet/encodings/encoder.h b/src/parquet/encodings/encoder.h index 0d69111d..85559303 100644 --- a/src/parquet/encodings/encoder.h +++ b/src/parquet/encodings/encoder.h @@ -44,8 +44,8 @@ class Encoder { const Encoding::type encoding() const { return encoding_; } protected: - explicit Encoder(const ColumnDescriptor* descr, - const Encoding::type& encoding, MemoryAllocator* allocator) + explicit Encoder(const ColumnDescriptor* descr, const Encoding::type& encoding, + MemoryAllocator* allocator) : descr_(descr), encoding_(encoding), allocator_(allocator) {} // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY @@ -54,6 +54,6 @@ class Encoder { MemoryAllocator* allocator_; }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_ENCODINGS_ENCODER_H +#endif // PARQUET_ENCODINGS_ENCODER_H diff --git a/src/parquet/encodings/encoding-test.cc b/src/parquet/encodings/encoding-test.cc index d55de769..bde6f10a 100644 --- a/src/parquet/encodings/encoding-test.cc +++ b/src/parquet/encodings/encoding-test.cc @@ -74,8 +74,8 @@ TEST(VectorBooleanTest, TestEncodeDecode) { template void GenerateData(int num_values, T* out, vector* heap) { // seed the prng so failure is deterministic - random_numbers(num_values, 0, std::numeric_limits::min(), - std::numeric_limits::max(), out); + random_numbers( + num_values, 0, std::numeric_limits::min(), std::numeric_limits::max(), out); } template <> @@ -148,15 +148,11 @@ class TestEncodingBase : public ::testing::Test { void SetUp() { descr_ = ExampleDescr(); - if (descr_) { - type_length_ = descr_->type_length(); - } + if (descr_) { type_length_ = descr_->type_length(); } 
allocator_ = default_allocator(); } - void TearDown() { - pool_.FreeAll(); - } + void TearDown() { pool_.FreeAll(); } void InitData(int nvalues, int repeats) { num_values_ = nvalues * repeats; @@ -210,7 +206,6 @@ class TestEncodingBase : public ::testing::Test { using TestEncodingBase::encode_buffer_; \ using TestEncodingBase::decode_buf_; - template class TestPlainEncoding : public TestEncodingBase { public: @@ -225,8 +220,7 @@ class TestPlainEncoding : public TestEncodingBase { encode_buffer_ = dst.GetBuffer(); - decoder.SetData(num_values_, encode_buffer_->data(), - encode_buffer_->size()); + decoder.SetData(num_values_, encode_buffer_->data(), encode_buffer_->size()); int values_decoded = decoder.Decode(decode_buf_, num_values_); ASSERT_EQ(num_values_, values_decoded); VerifyResults(decode_buf_, draws_, num_values_); @@ -246,7 +240,7 @@ TYPED_TEST(TestPlainEncoding, BasicRoundTrip) { // Dictionary encoding tests typedef ::testing::Types DictEncodedTypes; + ByteArrayType, FLBAType> DictEncodedTypes; template class TestDictionaryEncoding : public TestEncodingBase { @@ -260,23 +254,21 @@ class TestDictionaryEncoding : public TestEncodingBase { dict_buffer_ = std::make_shared(); auto indices = std::make_shared(); - ASSERT_NO_THROW( - { - for (int i = 0; i < num_values_; ++i) { - encoder.Put(draws_[i]); - } - }); + ASSERT_NO_THROW({ + for (int i = 0; i < num_values_; ++i) { + encoder.Put(draws_[i]); + } + }); dict_buffer_->Resize(encoder.dict_encoded_size()); encoder.WriteDict(dict_buffer_->mutable_data()); indices->Resize(encoder.EstimatedDataEncodedSize()); - int actual_bytes = encoder.WriteIndices(indices->mutable_data(), - indices->size()); + int actual_bytes = encoder.WriteIndices(indices->mutable_data(), indices->size()); indices->Resize(actual_bytes); PlainDecoder dict_decoder(descr_.get()); - dict_decoder.SetData(encoder.num_entries(), dict_buffer_->data(), - dict_buffer_->size()); + dict_decoder.SetData( + encoder.num_entries(), dict_buffer_->data(), 
dict_buffer_->size()); DictionaryDecoder decoder(descr_.get()); decoder.SetDict(&dict_decoder); @@ -309,6 +301,6 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { ASSERT_THROW(decoder.SetDict(&dict_decoder), ParquetException); } -} // namespace test +} // namespace test -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/encodings/plain-encoding.h b/src/parquet/encodings/plain-encoding.h index 56243c80..71ae740c 100644 --- a/src/parquet/encodings/plain-encoding.h +++ b/src/parquet/encodings/plain-encoding.h @@ -39,9 +39,8 @@ class PlainDecoder : public Decoder { typedef typename DType::c_type T; using Decoder::num_values_; - explicit PlainDecoder(const ColumnDescriptor* descr) : - Decoder(descr, Encoding::PLAIN), - data_(NULL), len_(0) { + explicit PlainDecoder(const ColumnDescriptor* descr) + : Decoder(descr, Encoding::PLAIN), data_(NULL), len_(0) { if (descr_ && descr_->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { type_length_ = descr_->type_length(); } else { @@ -66,12 +65,10 @@ class PlainDecoder : public Decoder { // Decode routine templated on C++ type rather than type enum template -inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, - int type_length, T* out) { +inline int DecodePlain( + const uint8_t* data, int64_t data_size, int num_values, int type_length, T* out) { int bytes_to_decode = num_values * sizeof(T); - if (data_size < bytes_to_decode) { - ParquetException::EofException(); - } + if (data_size < bytes_to_decode) { ParquetException::EofException(); } memcpy(out, data, bytes_to_decode); return bytes_to_decode; } @@ -101,9 +98,7 @@ template <> inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, int type_length, FixedLenByteArray* out) { int bytes_to_decode = type_length * num_values; - if (data_size < bytes_to_decode) { - ParquetException::EofException(); - } + if (data_size < bytes_to_decode) { ParquetException::EofException(); } for (int i = 0; i < num_values; 
++i) { out[i].ptr = data; data += type_length; @@ -115,8 +110,7 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size template inline int PlainDecoder::Decode(T* buffer, int max_values) { max_values = std::min(max_values, num_values_); - int bytes_consumed = DecodePlain(data_, len_, max_values, - type_length_, buffer); + int bytes_consumed = DecodePlain(data_, len_, max_values, type_length_, buffer); data_ += bytes_consumed; len_ -= bytes_consumed; num_values_ -= max_values; @@ -126,8 +120,8 @@ inline int PlainDecoder::Decode(T* buffer, int max_values) { template <> class PlainDecoder : public Decoder { public: - explicit PlainDecoder(const ColumnDescriptor* descr) : - Decoder(descr, Encoding::PLAIN) {} + explicit PlainDecoder(const ColumnDescriptor* descr) + : Decoder(descr, Encoding::PLAIN) {} virtual void SetData(int num_values, const uint8_t* data, int len) { num_values_ = num_values; @@ -139,9 +133,7 @@ class PlainDecoder : public Decoder { max_values = std::min(max_values, num_values_); bool val; for (int i = 0; i < max_values; ++i) { - if (!bit_reader_.GetValue(1, &val)) { - ParquetException::EofException(); - } + if (!bit_reader_.GetValue(1, &val)) { ParquetException::EofException(); } BitUtil::SetArrayBit(buffer, i, val); } num_values_ -= max_values; @@ -152,9 +144,7 @@ class PlainDecoder : public Decoder { max_values = std::min(max_values, num_values_); bool val; for (int i = 0; i < max_values; ++i) { - if (!bit_reader_.GetValue(1, &val)) { - ParquetException::EofException(); - } + if (!bit_reader_.GetValue(1, &val)) { ParquetException::EofException(); } buffer[i] = val; } num_values_ -= max_values; @@ -173,9 +163,9 @@ class PlainEncoder : public Encoder { public: typedef typename DType::c_type T; - explicit PlainEncoder(const ColumnDescriptor* descr, - MemoryAllocator* allocator = default_allocator()) : - Encoder(descr, Encoding::PLAIN, allocator) {} + explicit PlainEncoder( + const ColumnDescriptor* descr, MemoryAllocator* allocator = 
default_allocator()) + : Encoder(descr, Encoding::PLAIN, allocator) {} void Encode(const T* src, int num_values, OutputStream* dst) override; }; @@ -183,9 +173,9 @@ class PlainEncoder : public Encoder { template <> class PlainEncoder : public Encoder { public: - explicit PlainEncoder(const ColumnDescriptor* descr, - MemoryAllocator* allocator = default_allocator()) : - Encoder(descr, Encoding::PLAIN, allocator) {} + explicit PlainEncoder( + const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator()) + : Encoder(descr, Encoding::PLAIN, allocator) {} virtual void Encode(const bool* src, int num_values, OutputStream* dst) { int bytes_required = BitUtil::Ceil(num_values, 8); @@ -222,14 +212,14 @@ class PlainEncoder : public Encoder { }; template -inline void PlainEncoder::Encode(const T* buffer, int num_values, - OutputStream* dst) { +inline void PlainEncoder::Encode( + const T* buffer, int num_values, OutputStream* dst) { dst->Write(reinterpret_cast(buffer), num_values * sizeof(T)); } template <> -inline void PlainEncoder::Encode(const ByteArray* src, - int num_values, OutputStream* dst) { +inline void PlainEncoder::Encode( + const ByteArray* src, int num_values, OutputStream* dst) { for (int i = 0; i < num_values; ++i) { // Write the result to the output stream dst->Write(reinterpret_cast(&src[i].len), sizeof(uint32_t)); @@ -245,6 +235,6 @@ inline void PlainEncoder::Encode( dst->Write(reinterpret_cast(src[i].ptr), descr_->type_length()); } } -} // namespace parquet +} // namespace parquet #endif diff --git a/src/parquet/exception.h b/src/parquet/exception.h index 608a45aa..03f23560 100644 --- a/src/parquet/exception.h +++ b/src/parquet/exception.h @@ -44,6 +44,6 @@ class ParquetException : public std::exception { std::string msg_; }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_EXCEPTION_H +#endif // PARQUET_EXCEPTION_H diff --git a/src/parquet/file/file-deserialize-test.cc b/src/parquet/file/file-deserialize-test.cc index 
be9c23c9..db99f16f 100644 --- a/src/parquet/file/file-deserialize-test.cc +++ b/src/parquet/file/file-deserialize-test.cc @@ -39,17 +39,14 @@ namespace parquet { - // Adds page statistics occupying a certain amount of bytes (for testing very // large page headers) -static inline void AddDummyStats(int stat_size, - format::DataPageHeader& data_page) { - +static inline void AddDummyStats(int stat_size, format::DataPageHeader& data_page) { std::vector stat_bytes(stat_size); // Some non-zero value std::fill(stat_bytes.begin(), stat_bytes.end(), 1); - data_page.statistics.__set_max(std::string( - reinterpret_cast(stat_bytes.data()), stat_size)); + data_page.statistics.__set_max( + std::string(reinterpret_cast(stat_bytes.data()), stat_size)); data_page.__isset.statistics = true; } @@ -63,16 +60,15 @@ class TestPageSerde : public ::testing::Test { ResetStream(); } - void InitSerializedPageReader(Compression::type codec = - Compression::UNCOMPRESSED) { + void InitSerializedPageReader(Compression::type codec = Compression::UNCOMPRESSED) { EndStream(); std::unique_ptr stream; stream.reset(new InMemoryInputStream(out_buffer_)); page_reader_.reset(new SerializedPageReader(std::move(stream), codec)); } - void WriteDataPageHeader(int max_serialized_len = 1024, - int32_t uncompressed_size = 0, int32_t compressed_size = 0) { + void WriteDataPageHeader(int max_serialized_len = 1024, int32_t uncompressed_size = 0, + int32_t compressed_size = 0) { // Simplifying writing serialized data page headers which may or may not // have meaningful data associated with them @@ -82,17 +78,13 @@ class TestPageSerde : public ::testing::Test { page_header_.compressed_page_size = compressed_size; page_header_.type = format::PageType::DATA_PAGE; - ASSERT_NO_THROW(SerializeThriftMsg(&page_header_, max_serialized_len, - out_stream_.get())); + ASSERT_NO_THROW( + SerializeThriftMsg(&page_header_, max_serialized_len, out_stream_.get())); } - void ResetStream() { - out_stream_.reset(new 
InMemoryOutputStream); - } + void ResetStream() { out_stream_.reset(new InMemoryOutputStream); } - void EndStream() { - out_buffer_ = out_stream_->GetBuffer(); - } + void EndStream() { out_buffer_ = out_stream_->GetBuffer(); } protected: std::unique_ptr out_stream_; @@ -103,25 +95,22 @@ class TestPageSerde : public ::testing::Test { format::DataPageHeader data_page_header_; }; -void CheckDataPageHeader(const format::DataPageHeader expected, - const Page* page) { +void CheckDataPageHeader(const format::DataPageHeader expected, const Page* page) { ASSERT_EQ(PageType::DATA_PAGE, page->type()); const DataPage* data_page = static_cast(page); ASSERT_EQ(expected.num_values, data_page->num_values()); ASSERT_EQ(expected.encoding, data_page->encoding()); - ASSERT_EQ(expected.definition_level_encoding, - data_page->definition_level_encoding()); - ASSERT_EQ(expected.repetition_level_encoding, - data_page->repetition_level_encoding()); + ASSERT_EQ(expected.definition_level_encoding, data_page->definition_level_encoding()); + ASSERT_EQ(expected.repetition_level_encoding, data_page->repetition_level_encoding()); if (expected.statistics.__isset.max) { - ASSERT_EQ(0, memcmp(expected.statistics.max.c_str(), - data_page->max(), expected.statistics.max.length())); + ASSERT_EQ(0, memcmp(expected.statistics.max.c_str(), data_page->max(), + expected.statistics.max.length())); } if (expected.statistics.__isset.min) { - ASSERT_EQ(0, memcmp(expected.statistics.min.c_str(), - data_page->min(), expected.statistics.min.length())); + ASSERT_EQ(0, memcmp(expected.statistics.min.c_str(), data_page->min(), + expected.statistics.min.length())); } } @@ -139,13 +128,13 @@ TEST_F(TestPageSerde, DataPage) { } TEST_F(TestPageSerde, TestLargePageHeaders) { - int stats_size = 256 * 1024; // 256 KB + int stats_size = 256 * 1024; // 256 KB AddDummyStats(stats_size, data_page_header_); // Any number to verify metadata roundtrip data_page_header_.num_values = 4141; - int max_header_size = 512 * 1024; // 512 
KB + int max_header_size = 512 * 1024; // 512 KB WriteDataPageHeader(max_header_size); ASSERT_GE(max_header_size, out_stream_->Tell()); @@ -159,11 +148,11 @@ TEST_F(TestPageSerde, TestLargePageHeaders) { } TEST_F(TestPageSerde, TestFailLargePageHeaders) { - int stats_size = 256 * 1024; // 256 KB + int stats_size = 256 * 1024; // 256 KB AddDummyStats(stats_size, data_page_header_); // Serialize the Page header - int max_header_size = 512 * 1024; // 512 KB + int max_header_size = 512 * 1024; // 512 KB WriteDataPageHeader(max_header_size); ASSERT_GE(max_header_size, out_stream_->Tell()); @@ -185,7 +174,7 @@ TEST_F(TestPageSerde, Compression) { int num_pages = 10; - std::vector > faux_data; + std::vector> faux_data; faux_data.resize(num_pages); for (int i = 0; i < num_pages; ++i) { // The pages keep getting larger @@ -203,8 +192,8 @@ TEST_F(TestPageSerde, Compression) { int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data); buffer.resize(max_compressed_size); - int64_t actual_size = codec->Compress(data_size, data, - max_compressed_size, &buffer[0]); + int64_t actual_size = + codec->Compress(data_size, data, max_compressed_size, &buffer[0]); WriteDataPageHeader(1024, data_size, actual_size); out_stream_->Write(buffer.data(), actual_size); @@ -245,8 +234,7 @@ class TestParquetFileReader : public ::testing::Test { reader_.reset(new ParquetFileReader()); ASSERT_THROW( - reader_->Open(SerializedFile::Open(std::move(reader))), - ParquetException); + reader_->Open(SerializedFile::Open(std::move(reader))), ParquetException); } protected: @@ -291,4 +279,4 @@ TEST_F(TestParquetFileReader, IncompleteMetadata) { AssertInvalidFileThrows(buffer); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/file/file-serialize-test.cc b/src/parquet/file/file-serialize-test.cc index a75d2504..194e496b 100644 --- a/src/parquet/file/file-serialize-test.cc +++ b/src/parquet/file/file-serialize-test.cc @@ -37,35 +37,32 @@ class TestSerialize : public 
::testing::Test { public: void SetUpSchemaRequired() { auto pnode = PrimitiveNode::Make("int64", Repetition::REQUIRED, Type::INT64); - node_ = GroupNode::Make("schema", Repetition::REQUIRED, - std::vector({pnode})); + node_ = + GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); schema_.Init(node_); } void SetUpSchemaOptional() { auto pnode = PrimitiveNode::Make("int64", Repetition::OPTIONAL, Type::INT64); - node_ = GroupNode::Make("schema", Repetition::REQUIRED, - std::vector({pnode})); + node_ = + GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); schema_.Init(node_); } void SetUpSchemaRepeated() { auto pnode = PrimitiveNode::Make("int64", Repetition::REPEATED, Type::INT64); - node_ = GroupNode::Make("schema", Repetition::REQUIRED, - std::vector({pnode})); + node_ = + GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); schema_.Init(node_); } - void SetUp() { - SetUpSchemaRequired(); - } + void SetUp() { SetUpSchemaRequired(); } protected: NodePtr node_; SchemaDescriptor schema_; }; - TEST_F(TestSerialize, SmallFile) { std::shared_ptr sink(new InMemoryOutputStream()); auto gnode = std::static_pointer_cast(node_); @@ -100,6 +97,6 @@ TEST_F(TestSerialize, SmallFile) { ASSERT_EQ(values, values_out); } -} // namespace test +} // namespace test -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc index 20e089ae..905c3578 100644 --- a/src/parquet/file/reader-internal.cc +++ b/src/parquet/file/reader-internal.cc @@ -42,9 +42,8 @@ namespace parquet { // assembled in a serialized stream for storing in a Parquet files SerializedPageReader::SerializedPageReader(std::unique_ptr stream, - Compression::type codec_type, MemoryAllocator* allocator) : - stream_(std::move(stream)), - decompression_buffer_(0, allocator) { + Compression::type codec_type, MemoryAllocator* allocator) + : stream_(std::move(stream)), decompression_buffer_(0, 
allocator) { max_page_header_size_ = DEFAULT_MAX_PAGE_HEADER_SIZE; decompressor_ = Codec::Create(codec_type); } @@ -65,9 +64,7 @@ std::shared_ptr SerializedPageReader::NextPage() { // until a maximum allowed header limit while (true) { buffer = stream_->Peek(allowed_page_size, &bytes_available); - if (bytes_available == 0) { - return std::shared_ptr(nullptr); - } + if (bytes_available == 0) { return std::shared_ptr(nullptr); } // This gets used, then set by DeserializeThriftMsg header_size = bytes_available; @@ -100,8 +97,8 @@ std::shared_ptr SerializedPageReader::NextPage() { if (uncompressed_len > static_cast(decompression_buffer_.size())) { decompression_buffer_.Resize(uncompressed_len); } - decompressor_->Decompress(compressed_len, buffer, uncompressed_len, - &decompression_buffer_[0]); + decompressor_->Decompress( + compressed_len, buffer, uncompressed_len, &decompression_buffer_[0]); buffer = &decompression_buffer_[0]; } @@ -109,40 +106,32 @@ std::shared_ptr SerializedPageReader::NextPage() { if (current_page_header_.type == format::PageType::DICTIONARY_PAGE) { const format::DictionaryPageHeader& dict_header = - current_page_header_.dictionary_page_header; + current_page_header_.dictionary_page_header; - bool is_sorted = dict_header.__isset.is_sorted? dict_header.is_sorted : false; + bool is_sorted = dict_header.__isset.is_sorted ? 
dict_header.is_sorted : false; - return std::make_shared(page_buffer, - dict_header.num_values, FromThrift(dict_header.encoding), - is_sorted); + return std::make_shared(page_buffer, dict_header.num_values, + FromThrift(dict_header.encoding), is_sorted); } else if (current_page_header_.type == format::PageType::DATA_PAGE) { const format::DataPageHeader& header = current_page_header_.data_page_header; - auto page = std::make_shared(page_buffer, - header.num_values, - FromThrift(header.encoding), - FromThrift(header.definition_level_encoding), + auto page = std::make_shared(page_buffer, header.num_values, + FromThrift(header.encoding), FromThrift(header.definition_level_encoding), FromThrift(header.repetition_level_encoding)); if (header.__isset.statistics) { const format::Statistics stats = header.statistics; - if (stats.__isset.max) { - page->max_ = stats.max; - } - if (stats.__isset.min) { - page->min_ = stats.min; - } + if (stats.__isset.max) { page->max_ = stats.max; } + if (stats.__isset.min) { page->min_ = stats.min; } } return page; } else if (current_page_header_.type == format::PageType::DATA_PAGE_V2) { const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2; - bool is_compressed = header.__isset.is_compressed? header.is_compressed : false; - return std::make_shared(page_buffer, - header.num_values, header.num_nulls, header.num_rows, - FromThrift(header.encoding), - header.definition_levels_byte_length, - header.repetition_levels_byte_length, is_compressed); + bool is_compressed = header.__isset.is_compressed ? header.is_compressed : false; + return std::make_shared(page_buffer, header.num_values, + header.num_nulls, header.num_rows, FromThrift(header.encoding), + header.definition_levels_byte_length, header.repetition_levels_byte_length, + is_compressed); } else { // We don't know what this page type is. We're allowed to skip non-data // pages. 
@@ -181,8 +170,8 @@ std::unique_ptr SerializedRowGroup::GetColumnPageReader(int i) { } std::unique_ptr stream(new InMemoryInputStream(buffer)); - return std::unique_ptr(new SerializedPageReader(std::move(stream), - FromThrift(col.meta_data.codec), allocator_)); + return std::unique_ptr(new SerializedPageReader( + std::move(stream), FromThrift(col.meta_data.codec), allocator_)); } RowGroupStatistics SerializedRowGroup::GetColumnStats(int i) { @@ -227,8 +216,8 @@ SerializedFile::~SerializedFile() { } std::shared_ptr SerializedFile::GetRowGroup(int i) { - std::unique_ptr contents(new SerializedRowGroup(source_.get(), - &metadata_.row_groups[i], allocator_)); + std::unique_ptr contents( + new SerializedRowGroup(source_.get(), &metadata_.row_groups[i], allocator_)); return std::make_shared(&schema_, std::move(contents), allocator_); } @@ -245,11 +234,9 @@ int SerializedFile::num_row_groups() const { return metadata_.row_groups.size(); } -SerializedFile::SerializedFile( - std::unique_ptr source, - MemoryAllocator* allocator = default_allocator()) : - source_(std::move(source)), allocator_(allocator) {} - +SerializedFile::SerializedFile(std::unique_ptr source, + MemoryAllocator* allocator = default_allocator()) + : source_(std::move(source)), allocator_(allocator) {} void SerializedFile::ParseMetaData() { int64_t filesize = source_->Size(); @@ -261,15 +248,15 @@ void SerializedFile::ParseMetaData() { uint8_t footer_buffer[FOOTER_SIZE]; source_->Seek(filesize - FOOTER_SIZE); int64_t bytes_read = source_->Read(FOOTER_SIZE, footer_buffer); - if (bytes_read != FOOTER_SIZE || - memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) { + if (bytes_read != FOOTER_SIZE || memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) { throw ParquetException("Invalid parquet file. 
Corrupt footer."); } uint32_t metadata_len = *reinterpret_cast(footer_buffer); int64_t metadata_start = filesize - FOOTER_SIZE - metadata_len; if (FOOTER_SIZE + metadata_len > filesize) { - throw ParquetException("Invalid parquet file. File is less than " + throw ParquetException( + "Invalid parquet file. File is less than " "file metadata size."); } source_->Seek(metadata_start); @@ -281,9 +268,8 @@ void SerializedFile::ParseMetaData() { } DeserializeThriftMsg(&metadata_buffer[0], &metadata_len, &metadata_); - schema::FlatSchemaConverter converter(&metadata_.schema[0], - metadata_.schema.size()); + schema::FlatSchemaConverter converter(&metadata_.schema[0], metadata_.schema.size()); schema_.Init(converter.Convert()); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/file/reader-internal.h b/src/parquet/file/reader-internal.h index 1e8256ad..797ff939 100644 --- a/src/parquet/file/reader-internal.h +++ b/src/parquet/file/reader-internal.h @@ -42,17 +42,15 @@ static constexpr uint32_t DEFAULT_PAGE_HEADER_SIZE = 16 * 1024; // and the page metadata. 
class SerializedPageReader : public PageReader { public: - SerializedPageReader(std::unique_ptr stream, - Compression::type codec, MemoryAllocator* allocator = default_allocator()); + SerializedPageReader(std::unique_ptr stream, Compression::type codec, + MemoryAllocator* allocator = default_allocator()); virtual ~SerializedPageReader() {} // Implement the PageReader interface virtual std::shared_ptr NextPage(); - void set_max_page_header_size(uint32_t size) { - max_page_header_size_ = size; - } + void set_max_page_header_size(uint32_t size) { max_page_header_size_ = size; } private: std::unique_ptr stream_; @@ -70,11 +68,9 @@ class SerializedPageReader : public PageReader { // RowGroupReader::Contents implementation for the Parquet file specification class SerializedRowGroup : public RowGroupReader::Contents { public: - SerializedRowGroup(RandomAccessSource* source, - const format::RowGroup* metadata, MemoryAllocator* allocator) : - source_(source), - metadata_(metadata), - allocator_(allocator) {} + SerializedRowGroup(RandomAccessSource* source, const format::RowGroup* metadata, + MemoryAllocator* allocator) + : source_(source), metadata_(metadata), allocator_(allocator) {} virtual int num_columns() const; virtual int64_t num_rows() const; @@ -108,8 +104,8 @@ class SerializedFile : public ParquetFileReader::Contents { private: // This class takes ownership of the provided data source - explicit SerializedFile(std::unique_ptr source, - MemoryAllocator* allocator); + explicit SerializedFile( + std::unique_ptr source, MemoryAllocator* allocator); std::unique_ptr source_; format::FileMetaData metadata_; @@ -118,6 +114,6 @@ class SerializedFile : public ParquetFileReader::Contents { void ParseMetaData(); }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_FILE_READER_INTERNAL_H +#endif // PARQUET_FILE_READER_INTERNAL_H diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc index 65237116..232dbe40 100644 --- a/src/parquet/file/reader.cc 
+++ b/src/parquet/file/reader.cc @@ -41,10 +41,8 @@ namespace parquet { // RowGroupReader public API RowGroupReader::RowGroupReader(const SchemaDescriptor* schema, - std::unique_ptr contents, MemoryAllocator* allocator) : - schema_(schema), - contents_(std::move(contents)), - allocator_(allocator) {} + std::unique_ptr contents, MemoryAllocator* allocator) + : schema_(schema), contents_(std::move(contents)), allocator_(allocator) {} int RowGroupReader::num_columns() const { return contents_->num_columns(); @@ -84,8 +82,8 @@ std::unique_ptr ParquetFileReader::Open( return result; } -std::unique_ptr ParquetFileReader::OpenFile(const std::string& path, - bool memory_map, MemoryAllocator* allocator) { +std::unique_ptr ParquetFileReader::OpenFile( + const std::string& path, bool memory_map, MemoryAllocator* allocator) { std::unique_ptr file; if (memory_map) { file.reset(new MemoryMapSource(allocator)); @@ -103,9 +101,7 @@ void ParquetFileReader::Open(std::unique_ptr conten } void ParquetFileReader::Close() { - if (contents_) { - contents_->Close(); - } + if (contents_) { contents_->Close(); } } int ParquetFileReader::num_row_groups() const { @@ -124,8 +120,7 @@ std::shared_ptr ParquetFileReader::RowGroup(int i) { if (i >= num_row_groups()) { std::stringstream ss; ss << "The file only has " << num_row_groups() - << "row groups, requested reader for: " - << i; + << "row groups, requested reader for: " << i; throw ParquetException(ss.str()); } @@ -138,8 +133,8 @@ std::shared_ptr ParquetFileReader::RowGroup(int i) { // the fixed initial size is just for an example #define COL_WIDTH "20" -void ParquetFileReader::DebugPrint(std::ostream& stream, - std::list selected_columns, bool print_values) { +void ParquetFileReader::DebugPrint( + std::ostream& stream, std::list selected_columns, bool print_values) { stream << "File statistics:\n"; stream << "Total rows: " << num_rows() << "\n"; @@ -157,11 +152,8 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, for (auto i : 
selected_columns) { const ColumnDescriptor* descr = schema_->Column(i); - stream << "Column " << i << ": " - << descr->name() - << " (" - << type_to_string(descr->physical_type()) - << ")" << std::endl; + stream << "Column " << i << ": " << descr->name() << " (" + << type_to_string(descr->physical_type()) << ")" << std::endl; } for (int r = 0; r < num_row_groups(); ++r) { @@ -173,25 +165,19 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, for (auto i : selected_columns) { RowGroupStatistics stats = group_reader->GetColumnStats(i); - stream << "Column " << i << ": " - << group_reader->num_rows() << " rows, " - << stats.num_values << " values, " - << stats.null_count << " null values, " - << stats.distinct_count << " distinct values, " - << *stats.max << " max, " - << *stats.min << " min, " - << std::endl; + stream << "Column " << i << ": " << group_reader->num_rows() << " rows, " + << stats.num_values << " values, " << stats.null_count << " null values, " + << stats.distinct_count << " distinct values, " << *stats.max << " max, " + << *stats.min << " min, " << std::endl; } - if (!print_values) { - continue; - } + if (!print_values) { continue; } static constexpr int bufsize = 25; char buffer[bufsize]; // Create readers for selected columns and print contents - vector > scanners(selected_columns.size(), NULL); + vector> scanners(selected_columns.size(), NULL); int j = 0; for (auto i : selected_columns) { std::shared_ptr col_reader = group_reader->Column(i); @@ -223,4 +209,4 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, } } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/file/reader.h b/src/parquet/file/reader.h index 58df9a63..565bb5c4 100644 --- a/src/parquet/file/reader.h +++ b/src/parquet/file/reader.h @@ -50,8 +50,8 @@ class RowGroupReader { virtual std::unique_ptr GetColumnPageReader(int i) = 0; }; - RowGroupReader(const SchemaDescriptor* schema, - std::unique_ptr contents, MemoryAllocator* allocator); + 
RowGroupReader(const SchemaDescriptor* schema, std::unique_ptr contents, + MemoryAllocator* allocator); // Construct a ColumnReader for the indicated row group-relative // column. Ownership is shared with the RowGroupReader. @@ -73,7 +73,6 @@ class RowGroupReader { MemoryAllocator* allocator_; }; - class ParquetFileReader { public: // Forward declare the PIMPL @@ -89,9 +88,7 @@ class ParquetFileReader { virtual int num_row_groups() const = 0; // Return const-poitner to make it clear that this object is not to be copied - const SchemaDescriptor* schema() const { - return &schema_; - } + const SchemaDescriptor* schema() const { return &schema_; } SchemaDescriptor schema_; }; @@ -117,16 +114,12 @@ class ParquetFileReader { int num_row_groups() const; // Returns the file schema descriptor - const SchemaDescriptor* descr() { - return schema_; - } + const SchemaDescriptor* descr() { return schema_; } - const ColumnDescriptor* column_schema(int i) const { - return schema_->Column(i); - } + const ColumnDescriptor* column_schema(int i) const { return schema_->Column(i); } - void DebugPrint(std::ostream& stream, std::list selected_columns, - bool print_values = true); + void DebugPrint( + std::ostream& stream, std::list selected_columns, bool print_values = true); private: // PIMPL idiom @@ -138,7 +131,6 @@ class ParquetFileReader { const SchemaDescriptor* schema_; }; +} // namespace parquet -} // namespace parquet - -#endif // PARQUET_FILE_READER_H +#endif // PARQUET_FILE_READER_H diff --git a/src/parquet/file/writer-internal.cc b/src/parquet/file/writer-internal.cc index f0877529..d87d5214 100644 --- a/src/parquet/file/writer-internal.cc +++ b/src/parquet/file/writer-internal.cc @@ -33,10 +33,9 @@ static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; // ---------------------------------------------------------------------- // SerializedPageWriter -SerializedPageWriter::SerializedPageWriter(OutputStream* sink, - Compression::type codec, format::ColumnChunk* 
metadata, - MemoryAllocator* allocator) : sink_(sink), metadata_(metadata), - allocator_(allocator) { +SerializedPageWriter::SerializedPageWriter(OutputStream* sink, Compression::type codec, + format::ColumnChunk* metadata, MemoryAllocator* allocator) + : sink_(sink), metadata_(metadata), allocator_(allocator) { compressor_ = Codec::Create(codec); // Currently we directly start with the data page metadata_->meta_data.__set_data_page_offset(sink_->Tell()); @@ -57,16 +56,16 @@ int64_t SerializedPageWriter::WriteDataPage(int32_t num_rows, int32_t num_values const std::shared_ptr& definition_levels, Encoding::type definition_level_encoding, const std::shared_ptr& repetition_levels, - Encoding::type repetition_level_encoding, - const std::shared_ptr& values, Encoding::type encoding) { - int64_t uncompressed_size = definition_levels->size() + repetition_levels->size() - + values->size(); + Encoding::type repetition_level_encoding, const std::shared_ptr& values, + Encoding::type encoding) { + int64_t uncompressed_size = + definition_levels->size() + repetition_levels->size() + values->size(); // Concatenate data into a single buffer // TODO: In the uncompressed case, directly write this to the sink // TODO: Reuse the (un)compressed_data buffer instead of recreating it each time. 
std::shared_ptr uncompressed_data = - std::make_shared(uncompressed_size, allocator_); + std::make_shared(uncompressed_size, allocator_); uint8_t* uncompressed_ptr = uncompressed_data->mutable_data(); memcpy(uncompressed_ptr, repetition_levels->data(), repetition_levels->size()); uncompressed_ptr += repetition_levels->size(); @@ -124,7 +123,7 @@ int64_t RowGroupSerializer::num_rows() const { return num_rows_; } -const SchemaDescriptor* RowGroupSerializer::schema() const { +const SchemaDescriptor* RowGroupSerializer::schema() const { return schema_; } @@ -134,20 +133,17 @@ ColumnWriter* RowGroupSerializer::NextColumn() { } current_column_index_++; - if (current_column_writer_) { - total_bytes_written_ += current_column_writer_->Close(); - } + if (current_column_writer_) { total_bytes_written_ += current_column_writer_->Close(); } const ColumnDescriptor* column_descr = schema_->Column(current_column_index_); format::ColumnChunk* col_meta = &metadata_->columns[current_column_index_]; col_meta->__isset.meta_data = true; col_meta->meta_data.__set_type(ToThrift(column_descr->physical_type())); col_meta->meta_data.__set_path_in_schema(column_descr->path()->ToDotVector()); - std::unique_ptr pager(new SerializedPageWriter(sink_, - Compression::UNCOMPRESSED, col_meta, - allocator_)); - current_column_writer_ = ColumnWriter::Make(column_descr, - std::move(pager), num_rows_, allocator_); + std::unique_ptr pager( + new SerializedPageWriter(sink_, Compression::UNCOMPRESSED, col_meta, allocator_)); + current_column_writer_ = + ColumnWriter::Make(column_descr, std::move(pager), num_rows_, allocator_); return current_column_writer_.get(); } @@ -177,9 +173,7 @@ std::unique_ptr FileSerializer::Open( } void FileSerializer::Close() { - if (row_group_writer_) { - row_group_writer_->Close(); - } + if (row_group_writer_) { row_group_writer_->Close(); } row_group_writer_.reset(); // Write magic bytes and metadata @@ -201,9 +195,7 @@ int64_t FileSerializer::num_rows() const { } 
RowGroupWriter* FileSerializer::AppendRowGroup(int64_t num_rows) { - if (row_group_writer_) { - row_group_writer_->Close(); - } + if (row_group_writer_) { row_group_writer_->Close(); } num_rows_ += num_rows; num_row_groups_++; @@ -224,8 +216,8 @@ void FileSerializer::WriteMetaData() { // Write MetaData uint32_t metadata_len = sink_->Tell(); - SchemaFlattener flattener(static_cast(schema_.schema().get()), - &metadata_.schema); + SchemaFlattener flattener( + static_cast(schema_.schema().get()), &metadata_.schema); flattener.Flatten(); // TODO: Currently we only write version 1 files @@ -244,12 +236,9 @@ void FileSerializer::WriteMetaData() { sink_->Write(PARQUET_MAGIC, 4); } -FileSerializer::FileSerializer( - std::shared_ptr sink, - std::shared_ptr& schema, - MemoryAllocator* allocator = default_allocator()) : - sink_(sink), allocator_(allocator), - num_row_groups_(0), num_rows_(0) { +FileSerializer::FileSerializer(std::shared_ptr sink, + std::shared_ptr& schema, MemoryAllocator* allocator = default_allocator()) + : sink_(sink), allocator_(allocator), num_row_groups_(0), num_rows_(0) { schema_.Init(schema); StartFile(); } @@ -259,4 +248,4 @@ void FileSerializer::StartFile() { sink_->Write(PARQUET_MAGIC, 4); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/file/writer-internal.h b/src/parquet/file/writer-internal.h index e88348b2..8c5db68a 100644 --- a/src/parquet/file/writer-internal.h +++ b/src/parquet/file/writer-internal.h @@ -35,9 +35,8 @@ namespace parquet { // TODO: Currently only writes DataPage pages. 
class SerializedPageWriter : public PageWriter { public: - SerializedPageWriter(OutputStream* sink, - Compression::type codec, format::ColumnChunk* metadata, - MemoryAllocator* allocator = default_allocator()); + SerializedPageWriter(OutputStream* sink, Compression::type codec, + format::ColumnChunk* metadata, MemoryAllocator* allocator = default_allocator()); virtual ~SerializedPageWriter() {} @@ -47,8 +46,8 @@ class SerializedPageWriter : public PageWriter { const std::shared_ptr& definition_levels, Encoding::type definition_level_encoding, const std::shared_ptr& repetition_levels, - Encoding::type repetition_level_encoding, - const std::shared_ptr& values, Encoding::type encoding) override; + Encoding::type repetition_level_encoding, const std::shared_ptr& values, + Encoding::type encoding) override; void Close() override; @@ -67,16 +66,15 @@ class SerializedPageWriter : public PageWriter { // RowGroupWriter::Contents implementation for the Parquet file specification class RowGroupSerializer : public RowGroupWriter::Contents { public: - RowGroupSerializer(int64_t num_rows, - const SchemaDescriptor* schema, - OutputStream* sink, - format::RowGroup* metadata, - MemoryAllocator* allocator) : - num_rows_(num_rows), schema_(schema), sink_(sink), - metadata_(metadata), - allocator_(allocator), - total_bytes_written_(0), - current_column_index_(-1) { + RowGroupSerializer(int64_t num_rows, const SchemaDescriptor* schema, OutputStream* sink, + format::RowGroup* metadata, MemoryAllocator* allocator) + : num_rows_(num_rows), + schema_(schema), + sink_(sink), + metadata_(metadata), + allocator_(allocator), + total_bytes_written_(0), + current_column_index_(-1) { metadata_->__set_num_rows(num_rows_); metadata_->columns.resize(schema->num_columns()); } @@ -109,8 +107,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { class FileSerializer : public ParquetFileWriter::Contents { public: static std::unique_ptr Open( - std::shared_ptr sink, - std::shared_ptr& 
schema, + std::shared_ptr sink, std::shared_ptr& schema, MemoryAllocator* allocator = default_allocator()); void Close() override; @@ -125,8 +122,7 @@ class FileSerializer : public ParquetFileWriter::Contents { private: explicit FileSerializer(std::shared_ptr sink, - std::shared_ptr& schema, - MemoryAllocator* allocator); + std::shared_ptr& schema, MemoryAllocator* allocator); std::shared_ptr sink_; format::FileMetaData metadata_; @@ -140,6 +136,6 @@ class FileSerializer : public ParquetFileWriter::Contents { void WriteMetaData(); }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_FILE_WRITER_INTERNAL_H +#endif // PARQUET_FILE_WRITER_INTERNAL_H diff --git a/src/parquet/file/writer.cc b/src/parquet/file/writer.cc index 6e9f4f92..c516a5e7 100644 --- a/src/parquet/file/writer.cc +++ b/src/parquet/file/writer.cc @@ -27,9 +27,9 @@ namespace parquet { // ---------------------------------------------------------------------- // RowGroupWriter public API -RowGroupWriter::RowGroupWriter(std::unique_ptr contents, - MemoryAllocator* allocator): - contents_(std::move(contents)), allocator_(allocator) { +RowGroupWriter::RowGroupWriter( + std::unique_ptr contents, MemoryAllocator* allocator) + : contents_(std::move(contents)), allocator_(allocator) { schema_ = contents_->schema(); } @@ -80,4 +80,4 @@ RowGroupWriter* ParquetFileWriter::AppendRowGroup(int64_t num_rows) { return contents_->AppendRowGroup(num_rows); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/file/writer.h b/src/parquet/file/writer.h index 9ad2539d..22ee8562 100644 --- a/src/parquet/file/writer.h +++ b/src/parquet/file/writer.h @@ -94,17 +94,14 @@ class ParquetFileWriter { virtual int num_row_groups() const = 0; // Return const-poitner to make it clear that this object is not to be copied - const SchemaDescriptor* schema() const { - return &schema_; - } + const SchemaDescriptor* schema() const { return &schema_; } SchemaDescriptor schema_; }; ParquetFileWriter(); 
~ParquetFileWriter(); - static std::unique_ptr Open( - std::shared_ptr sink, + static std::unique_ptr Open(std::shared_ptr sink, std::shared_ptr& schema, MemoryAllocator* allocator = default_allocator()); @@ -144,13 +141,9 @@ class ParquetFileWriter { /** * Returns the file schema descriptor */ - const SchemaDescriptor* descr() { - return schema_; - } + const SchemaDescriptor* descr() { return schema_; } - const ColumnDescriptor* column_schema(int i) const { - return schema_->Column(i); - } + const ColumnDescriptor* column_schema(int i) const { return schema_->Column(i); } private: // This is declared in the .cc file so that we can hide compiled Thrift @@ -161,7 +154,6 @@ class ParquetFileWriter { const SchemaDescriptor* schema_; }; -} // namespace parquet - -#endif // PARQUET_FILE_WRITER_H +} // namespace parquet +#endif // PARQUET_FILE_WRITER_H diff --git a/src/parquet/public-api-test.cc b/src/parquet/public-api-test.cc index 4edf79b2..1dc7621d 100644 --- a/src/parquet/public-api-test.cc +++ b/src/parquet/public-api-test.cc @@ -29,4 +29,4 @@ TEST(TestPublicAPI, DoesNotIncludeThrift) { #endif } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc index a09891cd..9a8fcf67 100644 --- a/src/parquet/reader-test.cc +++ b/src/parquet/reader-test.cc @@ -36,14 +36,14 @@ namespace parquet { const char* data_dir = std::getenv("PARQUET_TEST_DATA"); - class TestAllTypesPlain : public ::testing::Test { public: void SetUp() { std::string dir_string(data_dir); std::stringstream ss; - ss << dir_string << "/" << "alltypes_plain.parquet"; + ss << dir_string << "/" + << "alltypes_plain.parquet"; reader_ = ParquetFileReader::OpenFile(ss.str()); } @@ -54,15 +54,14 @@ class TestAllTypesPlain : public ::testing::Test { std::unique_ptr reader_; }; -TEST_F(TestAllTypesPlain, NoopConstructDestruct) { -} +TEST_F(TestAllTypesPlain, NoopConstructDestruct) {} TEST_F(TestAllTypesPlain, TestBatchRead) { std::shared_ptr group = 
reader_->RowGroup(0); // column 0, id std::shared_ptr col = - std::dynamic_pointer_cast(group->Column(0)); + std::dynamic_pointer_cast(group->Column(0)); int16_t def_levels[4]; int16_t rep_levels[4]; @@ -94,8 +93,7 @@ TEST_F(TestAllTypesPlain, TestFlatScannerInt32) { std::shared_ptr group = reader_->RowGroup(0); // column 0, id - std::shared_ptr scanner( - new Int32Scanner(group->Column(0))); + std::shared_ptr scanner(new Int32Scanner(group->Column(0))); int32_t val; bool is_null; for (int i = 0; i < 8; ++i) { @@ -107,20 +105,17 @@ TEST_F(TestAllTypesPlain, TestFlatScannerInt32) { ASSERT_FALSE(scanner->NextValue(&val, &is_null)); } - TEST_F(TestAllTypesPlain, TestSetScannerBatchSize) { std::shared_ptr group = reader_->RowGroup(0); // column 0, id - std::shared_ptr scanner( - new Int32Scanner(group->Column(0))); + std::shared_ptr scanner(new Int32Scanner(group->Column(0))); ASSERT_EQ(128, scanner->batch_size()); scanner->SetBatchSize(1024); ASSERT_EQ(1024, scanner->batch_size()); } - TEST_F(TestAllTypesPlain, DebugPrintWorks) { std::stringstream ss; @@ -156,14 +151,14 @@ TEST_F(TestAllTypesPlain, ColumnSelectionOutOfRange) { ASSERT_THROW(reader_->DebugPrint(ss, columns), ParquetException); } - class TestLocalFileSource : public ::testing::Test { public: void SetUp() { std::string dir_string(data_dir); std::stringstream ss; - ss << dir_string << "/" << "alltypes_plain.parquet"; + ss << dir_string << "/" + << "alltypes_plain.parquet"; file.reset(new LocalFileSource()); file->Open(ss.str()); @@ -186,5 +181,4 @@ TEST_F(TestLocalFileSource, FileClosedOnDestruction) { ASSERT_EQ(EBADF, errno); } - -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/converter.cc b/src/parquet/schema/converter.cc index bece340a..08eeb666 100644 --- a/src/parquet/schema/converter.cc +++ b/src/parquet/schema/converter.cc @@ -77,8 +77,7 @@ std::shared_ptr FromParquet(const std::vector& std::unique_ptr root = converter.Convert(); std::shared_ptr descr = 
std::make_shared(); - descr->Init(std::shared_ptr( - static_cast(root.release()))); + descr->Init(std::shared_ptr(static_cast(root.release()))); return descr; } @@ -91,7 +90,7 @@ void ToParquet(const GroupNode* schema, std::vector* out) class SchemaVisitor : public Node::ConstVisitor { public: explicit SchemaVisitor(std::vector* elements) - : elements_(elements) {} + : elements_(elements) {} virtual ~SchemaVisitor() {} void Visit(const Node* node) override { @@ -113,8 +112,8 @@ class SchemaVisitor : public Node::ConstVisitor { std::vector* elements_; }; -SchemaFlattener::SchemaFlattener(const GroupNode* schema, - std::vector* out) +SchemaFlattener::SchemaFlattener( + const GroupNode* schema, std::vector* out) : root_(schema), elements_(out) {} void SchemaFlattener::Flatten() { @@ -122,6 +121,6 @@ void SchemaFlattener::Flatten() { root_->VisitConst(&visitor); } -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/converter.h b/src/parquet/schema/converter.h index 2742b98f..617d9854 100644 --- a/src/parquet/schema/converter.h +++ b/src/parquet/schema/converter.h @@ -30,7 +30,9 @@ namespace parquet { -namespace format { class SchemaElement;} +namespace format { +class SchemaElement; +} class SchemaDescriptor; @@ -47,11 +49,8 @@ std::shared_ptr FromParquet( class FlatSchemaConverter { public: - FlatSchemaConverter(const format::SchemaElement* elements, int length) : - elements_(elements), - length_(length), - pos_(0), - current_id_(0) {} + FlatSchemaConverter(const format::SchemaElement* elements, int length) + : elements_(elements), length_(length), pos_(0), current_id_(0) {} std::unique_ptr Convert(); @@ -61,9 +60,7 @@ class FlatSchemaConverter { int pos_; int current_id_; - int next_id() { - return current_id_++; - } + int next_id() { return current_id_++; } const format::SchemaElement& Next(); @@ -87,8 +84,8 @@ class SchemaFlattener { std::vector* elements_; }; -} // namespace schema +} // 
namespace schema -} // namespace parquet +} // namespace parquet -#endif // PARQUET_SCHEMA_CONVERTER_H +#endif // PARQUET_SCHEMA_CONVERTER_H diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc index 4b3bb501..01f04212 100644 --- a/src/parquet/schema/descriptor.cc +++ b/src/parquet/schema/descriptor.cc @@ -46,8 +46,8 @@ void SchemaDescriptor::Init(const NodePtr& schema) { } } -void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, - int16_t max_rep_level) { +void SchemaDescriptor::BuildTree( + const NodePtr& node, int16_t max_def_level, int16_t max_rep_level) { if (node->is_optional()) { ++max_def_level; } else if (node->is_repeated()) { @@ -71,14 +71,12 @@ void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level, int16_t max_repetition_level, - const SchemaDescriptor* schema_descr) : - node_(node), + const SchemaDescriptor* schema_descr) + : node_(node), max_definition_level_(max_definition_level), max_repetition_level_(max_repetition_level), schema_descr_(schema_descr) { - if (!node_->is_primitive()) { - throw ParquetException("Must be a primitive type"); - } + if (!node_->is_primitive()) { throw ParquetException("Must be a primitive type"); } primitive_node_ = static_cast(node_.get()); } @@ -113,4 +111,4 @@ const std::shared_ptr ColumnDescriptor::path() const { return std::make_shared(std::move(path_)); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h index bb9cd4d2..1686b905 100644 --- a/src/parquet/schema/descriptor.h +++ b/src/parquet/schema/descriptor.h @@ -39,28 +39,17 @@ class SchemaDescriptor; class ColumnDescriptor { public: ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level, - int16_t max_repetition_level, - const SchemaDescriptor* schema_descr = nullptr); + int16_t max_repetition_level, 
const SchemaDescriptor* schema_descr = nullptr); - int16_t max_definition_level() const { - return max_definition_level_; - } + int16_t max_definition_level() const { return max_definition_level_; } - int16_t max_repetition_level() const { - return max_repetition_level_; - } + int16_t max_repetition_level() const { return max_repetition_level_; } - Type::type physical_type() const { - return primitive_node_->physical_type(); - } + Type::type physical_type() const { return primitive_node_->physical_type(); } - LogicalType::type logical_type() const { - return primitive_node_->logical_type(); - } + LogicalType::type logical_type() const { return primitive_node_->logical_type(); } - const std::string& name() const { - return primitive_node_->name(); - } + const std::string& name() const { return primitive_node_->name(); } const std::shared_ptr path() const; @@ -106,13 +95,9 @@ class SchemaDescriptor { const ColumnDescriptor* Column(int i) const; // The number of physical columns appearing in the file - int num_columns() const { - return leaves_.size(); - } + int num_columns() const { return leaves_.size(); } - const schema::NodePtr& schema() const { - return schema_; - } + const schema::NodePtr& schema() const { return schema_; } private: friend class ColumnDescriptor; @@ -120,8 +105,8 @@ class SchemaDescriptor { schema::NodePtr schema_; const schema::GroupNode* group_; - void BuildTree(const schema::NodePtr& node, int16_t max_def_level, - int16_t max_rep_level); + void BuildTree( + const schema::NodePtr& node, int16_t max_def_level, int16_t max_rep_level); // Result of leaf node / tree analysis std::vector leaves_; @@ -138,6 +123,6 @@ class SchemaDescriptor { std::unordered_map leaf_to_base_; }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_SCHEMA_DESCRIPTOR_H +#endif // PARQUET_SCHEMA_DESCRIPTOR_H diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc index 6de696b7..b1903980 100644 --- a/src/parquet/schema/printer.cc +++ 
b/src/parquet/schema/printer.cc @@ -29,10 +29,8 @@ namespace schema { class SchemaPrinter : public Node::ConstVisitor { public: - explicit SchemaPrinter(std::ostream& stream, int indent_width) : - stream_(stream), - indent_(0), - indent_width_(2) {} + explicit SchemaPrinter(std::ostream& stream, int indent_width) + : stream_(stream), indent_(0), indent_width_(2) {} void Visit(const Node* node) override; @@ -132,12 +130,11 @@ void SchemaPrinter::Visit(const Node* node) { } } -void PrintSchema(const Node* schema, std::ostream& stream, - int indent_width) { +void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) { SchemaPrinter printer(stream, indent_width); printer.Visit(schema); } -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/printer.h b/src/parquet/schema/printer.h index 18c48e13..e3e2f322 100644 --- a/src/parquet/schema/printer.h +++ b/src/parquet/schema/printer.h @@ -28,11 +28,10 @@ namespace schema { class Node; -void PrintSchema(const Node* schema, std::ostream& stream, - int indent_width = 2); +void PrintSchema(const Node* schema, std::ostream& stream, int indent_width = 2); -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet -#endif // PARQUET_SCHEMA_PRINTER_H +#endif // PARQUET_SCHEMA_PRINTER_H diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc index 55b8439e..0a03dc64 100644 --- a/src/parquet/schema/schema-converter-test.cc +++ b/src/parquet/schema/schema-converter-test.cc @@ -45,9 +45,7 @@ namespace schema { class TestSchemaConverter : public ::testing::Test { public: - void setUp() { - name_ = "parquet_schema"; - } + void setUp() { name_ = "parquet_schema"; } void Convert(const parquet::format::SchemaElement* elements, int length) { FlatSchemaConverter converter(elements, length); @@ -66,14 +64,10 @@ bool check_for_parent_consistency(const GroupNode* node) { // 
Each node should have the group as parent for (int i = 0; i < node->field_count(); i++) { const NodePtr& field = node->field(i); - if (field->parent() != node) { - return false; - } + if (field->parent() != node) { return false; } if (field->is_group()) { const GroupNode* group = static_cast(field.get()); - if (!check_for_parent_consistency(group)) { - return false; - } + if (!check_for_parent_consistency(group)) { return false; } } } return true; @@ -85,8 +79,8 @@ TEST_F(TestSchemaConverter, NestedExample) { elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); // A primitive one - elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED, - format::Type::INT32, 1)); + elements.push_back( + NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1)); // A group elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2)); @@ -95,8 +89,8 @@ TEST_F(TestSchemaConverter, NestedExample) { elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3); elt.__set_converted_type(ConvertedType::LIST); elements.push_back(elt); - elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL, - format::Type::INT64, 4)); + elements.push_back( + NewPrimitive("item", FieldRepetitionType::OPTIONAL, format::Type::INT64, 4)); Convert(&elements[0], elements.size()); @@ -126,8 +120,8 @@ TEST_F(TestSchemaConverter, InvalidRoot) { // element is not a group, it is a malformed Parquet file. SchemaElement elements[2]; - elements[0] = NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED, - format::Type::INT32, 0); + elements[0] = + NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED, format::Type::INT32, 0); ASSERT_THROW(Convert(elements, 2), ParquetException); // While the Parquet spec indicates that the root group should have REPEATED @@ -135,8 +129,7 @@ TEST_F(TestSchemaConverter, InvalidRoot) { // groups as the first element. These tests check that this is okay as a // practicality matter. 
elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1, 0); - elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED, - format::Type::INT32, 1); + elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1); Convert(elements, 2); elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1, 0); @@ -156,13 +149,9 @@ TEST_F(TestSchemaConverter, NotEnoughChildren) { class TestSchemaFlatten : public ::testing::Test { public: - void setUp() { - name_ = "parquet_schema"; - } + void setUp() { name_ = "parquet_schema"; } - void Flatten(const GroupNode* schema) { - ToParquet(schema, &elements_); - } + void Flatten(const GroupNode* schema) { ToParquet(schema, &elements_); } protected: std::string name_; @@ -175,8 +164,8 @@ TEST_F(TestSchemaFlatten, NestedExample) { elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); // A primitive one - elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED, - format::Type::INT32, 1)); + elements.push_back( + NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1)); // A group elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2)); @@ -185,8 +174,8 @@ TEST_F(TestSchemaFlatten, NestedExample) { elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3); elt.__set_converted_type(ConvertedType::LIST); elements.push_back(elt); - elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL, - format::Type::INT64, 4)); + elements.push_back( + NewPrimitive("item", FieldRepetitionType::OPTIONAL, format::Type::INT64, 4)); // Construct the schema NodeVector fields; @@ -207,6 +196,6 @@ TEST_F(TestSchemaFlatten, NestedExample) { } } -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc index 9f0c70f5..dd552be8 100644 --- a/src/parquet/schema/schema-descriptor-test.cc +++ 
b/src/parquet/schema/schema-descriptor-test.cc @@ -36,8 +36,8 @@ namespace parquet { namespace schema { TEST(TestColumnDescriptor, TestAttrs) { - NodePtr node = PrimitiveNode::Make("name", Repetition::OPTIONAL, - Type::BYTE_ARRAY, LogicalType::UTF8); + NodePtr node = PrimitiveNode::Make( + "name", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8); ColumnDescriptor descr(node, 4, 1); ASSERT_EQ("name", descr.name()); @@ -49,8 +49,8 @@ TEST(TestColumnDescriptor, TestAttrs) { ASSERT_EQ(-1, descr.type_length()); // Test FIXED_LEN_BYTE_ARRAY - node = PrimitiveNode::Make("name", Repetition::OPTIONAL, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 12, 10, 4); + node = PrimitiveNode::Make("name", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::DECIMAL, 12, 10, 4); descr = ColumnDescriptor(node, 4, 1); ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type()); @@ -59,16 +59,14 @@ TEST(TestColumnDescriptor, TestAttrs) { class TestSchemaDescriptor : public ::testing::Test { public: - void setUp() { - } + void setUp() {} protected: SchemaDescriptor descr_; }; TEST_F(TestSchemaDescriptor, InitNonGroup) { - NodePtr node = PrimitiveNode::Make("field", Repetition::OPTIONAL, - Type::INT32); + NodePtr node = PrimitiveNode::Make("field", Repetition::OPTIONAL, Type::INT32); ASSERT_THROW(descr_.Init(node), ParquetException); } @@ -85,8 +83,8 @@ TEST_F(TestSchemaDescriptor, BuildTree) { NodePtr item1 = Int64("item1", Repetition::REQUIRED); NodePtr item2 = Boolean("item2", Repetition::OPTIONAL); NodePtr item3 = Int32("item3", Repetition::REPEATED); - NodePtr list(GroupNode::Make("records", Repetition::REPEATED, - {item1, item2, item3}, LogicalType::LIST)); + NodePtr list(GroupNode::Make( + "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST)); NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); fields.push_back(bag); @@ -129,6 +127,6 @@ TEST_F(TestSchemaDescriptor, BuildTree) { ASSERT_EQ(nleaves, descr_.num_columns()); 
} -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/schema-printer-test.cc b/src/parquet/schema/schema-printer-test.cc index e41413b7..286aea91 100644 --- a/src/parquet/schema/schema-printer-test.cc +++ b/src/parquet/schema/schema-printer-test.cc @@ -46,8 +46,8 @@ TEST(TestSchemaPrinter, Examples) { // 3-level list encoding NodePtr item1 = Int64("item1"); NodePtr item2 = Boolean("item2", Repetition::REQUIRED); - NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item1, item2}, - LogicalType::LIST)); + NodePtr list( + GroupNode::Make("b", Repetition::REPEATED, {item1, item2}, LogicalType::LIST)); NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); fields.push_back(bag); @@ -67,6 +67,6 @@ TEST(TestSchemaPrinter, Examples) { ASSERT_EQ(expected, result); } -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc index 82156132..37c8b14b 100644 --- a/src/parquet/schema/schema-types-test.cc +++ b/src/parquet/schema/schema-types-test.cc @@ -76,8 +76,7 @@ class TestPrimitiveNode : public ::testing::Test { TEST_F(TestPrimitiveNode, Attrs) { PrimitiveNode node1("foo", Repetition::REPEATED, Type::INT32); - PrimitiveNode node2("bar", Repetition::OPTIONAL, Type::BYTE_ARRAY, - LogicalType::UTF8); + PrimitiveNode node2("bar", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8); ASSERT_EQ("foo", node1.name()); @@ -111,8 +110,8 @@ TEST_F(TestPrimitiveNode, Attrs) { } TEST_F(TestPrimitiveNode, FromParquet) { - SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, - format::Type::INT32, 0); + SchemaElement elt = + NewPrimitive(name_, FieldRepetitionType::OPTIONAL, format::Type::INT32, 0); Convert(&elt); ASSERT_EQ(name_, prim_node_->name()); ASSERT_EQ(id_, prim_node_->id()); @@ -130,8 +129,8 @@ TEST_F(TestPrimitiveNode, FromParquet) { 
ASSERT_EQ(LogicalType::UTF8, prim_node_->logical_type()); // FIXED_LEN_BYTE_ARRAY - elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, - format::Type::FIXED_LEN_BYTE_ARRAY, 0); + elt = NewPrimitive( + name_, FieldRepetitionType::OPTIONAL, format::Type::FIXED_LEN_BYTE_ARRAY, 0); elt.__set_type_length(16); Convert(&elt); @@ -142,8 +141,8 @@ TEST_F(TestPrimitiveNode, FromParquet) { ASSERT_EQ(16, prim_node_->type_length()); // ConvertedType::Decimal - elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, - format::Type::FIXED_LEN_BYTE_ARRAY, 0); + elt = NewPrimitive( + name_, FieldRepetitionType::OPTIONAL, format::Type::FIXED_LEN_BYTE_ARRAY, 0); elt.__set_converted_type(ConvertedType::DECIMAL); elt.__set_type_length(6); elt.__set_scale(2); @@ -194,42 +193,54 @@ TEST_F(TestPrimitiveNode, Equals) { } TEST_F(TestPrimitiveNode, PhysicalLogicalMapping) { - ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::INT32, LogicalType::INT_32)); - ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::BYTE_ARRAY, LogicalType::JSON)); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::INT32, LogicalType::JSON), ParquetException); - ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::INT64, LogicalType::TIMESTAMP_MILLIS)); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::INT32, LogicalType::INT_64), ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::BYTE_ARRAY, LogicalType::INT_8), ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::BYTE_ARRAY, LogicalType::INTERVAL), ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::ENUM), ParquetException); - ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::BYTE_ARRAY, LogicalType::ENUM)); + ASSERT_NO_THROW( + PrimitiveNode::Make("foo", Repetition::REQUIRED, 
Type::INT32, LogicalType::INT_32)); + ASSERT_NO_THROW(PrimitiveNode::Make( + "foo", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::JSON)); + ASSERT_THROW( + PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT32, LogicalType::JSON), + ParquetException); + ASSERT_NO_THROW(PrimitiveNode::Make( + "foo", Repetition::REQUIRED, Type::INT64, LogicalType::TIMESTAMP_MILLIS)); + ASSERT_THROW( + PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT32, LogicalType::INT_64), + ParquetException); + ASSERT_THROW(PrimitiveNode::Make( + "foo", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::INT_8), + ParquetException); + ASSERT_THROW(PrimitiveNode::Make( + "foo", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::INTERVAL), + ParquetException); ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 2, 4), ParquetException); + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::ENUM), + ParquetException); + ASSERT_NO_THROW(PrimitiveNode::Make( + "foo", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::ENUM)); ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FLOAT, LogicalType::DECIMAL, 0, 2, 4), ParquetException); + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 2, 4), + ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FLOAT, + LogicalType::DECIMAL, 0, 2, 4), + ParquetException); ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 4, 0), ParquetException); + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 4, 0), + ParquetException); ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 0, 4), ParquetException); + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 0, 4), + ParquetException); ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 
4, -1), ParquetException); + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 4, -1), + ParquetException); ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 2, 4), ParquetException); + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 2, 4), + ParquetException); ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 6, 4)); ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 12)); ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 10), ParquetException); + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 10), + ParquetException); } // ---------------------------------------------------------------------- @@ -295,6 +306,6 @@ TEST_F(TestGroupNode, Equals) { ASSERT_FALSE(group5.Equals(&group4)); } -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/test-util.h b/src/parquet/schema/test-util.h index 25dacb00..ee9e2256 100644 --- a/src/parquet/schema/test-util.h +++ b/src/parquet/schema/test-util.h @@ -62,8 +62,8 @@ static inline SchemaElement NewGroup(const std::string& name, return result; } -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet -#endif // PARQUET_COLUMN_TEST_UTIL_H +#endif // PARQUET_COLUMN_TEST_UTIL_H diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc index e5b7ed21..abb031da 100644 --- a/src/parquet/schema/types.cc +++ b/src/parquet/schema/types.cc @@ -54,9 +54,7 @@ std::shared_ptr ColumnPath::extend(const std::string& node_name) con std::string ColumnPath::ToDotString() const { std::stringstream ss; for (auto it = path_.cbegin(); it != path_.cend(); ++it) { - if (it != path_.cbegin()) { - ss << "."; - } + if (it != path_.cbegin()) { ss << "."; } ss << 
*it; } return ss.str(); @@ -70,24 +68,23 @@ const std::vector& ColumnPath::ToDotVector() const { // Base node bool Node::EqualsInternal(const Node* other) const { - return type_ == other->type_ && - name_ == other->name_ && - repetition_ == other->repetition_ && - logical_type_ == other->logical_type_; + return type_ == other->type_ && name_ == other->name_ && + repetition_ == other->repetition_ && logical_type_ == other->logical_type_; } void Node::SetParent(const Node* parent) { - parent_ = parent; + parent_ = parent; } // ---------------------------------------------------------------------- // Primitive node PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition, - Type::type type, LogicalType::type logical_type, - int length, int precision, int scale, int id) : - Node(Node::PRIMITIVE, name, repetition, logical_type, id), - physical_type_(type), type_length_(length) { + Type::type type, LogicalType::type logical_type, int length, int precision, int scale, + int id) + : Node(Node::PRIMITIVE, name, repetition, logical_type, id), + physical_type_(type), + type_length_(length) { std::stringstream ss; // Check if the physical and logical types match // Mapping referred from Apache parquet-mr as on 2016-02-22 @@ -108,10 +105,8 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio } break; case LogicalType::DECIMAL: - if ((type != Type::INT32) && - (type != Type::INT64) && - (type != Type::BYTE_ARRAY) && - (type != Type::FIXED_LEN_BYTE_ARRAY)) { + if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) && + (type != Type::FIXED_LEN_BYTE_ARRAY)) { ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED"; throw ParquetException(ss.str()); } @@ -188,7 +183,7 @@ bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const { } if (logical_type_ == LogicalType::DECIMAL) { is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) && - 
(decimal_metadata_.scale == other->decimal_metadata_.scale); + (decimal_metadata_.scale == other->decimal_metadata_.scale); } if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) { is_equal &= (type_length_ == other->type_length_); @@ -197,9 +192,7 @@ bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const { } bool PrimitiveNode::Equals(const Node* other) const { - if (!Node::EqualsInternal(other)) { - return false; - } + if (!Node::EqualsInternal(other)) { return false; } return EqualsInternal(static_cast(other)); } @@ -215,24 +208,16 @@ void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const { // Group node bool GroupNode::EqualsInternal(const GroupNode* other) const { - if (this == other) { - return true; - } - if (this->field_count() != other->field_count()) { - return false; - } + if (this == other) { return true; } + if (this->field_count() != other->field_count()) { return false; } for (int i = 0; i < this->field_count(); ++i) { - if (!this->field(i)->Equals(other->field(i).get())) { - return false; - } + if (!this->field(i)->Equals(other->field(i).get())) { return false; } } return true; } bool GroupNode::Equals(const Node* other) const { - if (!Node::EqualsInternal(other)) { - return false; - } + if (!Node::EqualsInternal(other)) { return false; } return EqualsInternal(static_cast(other)); } @@ -248,8 +233,7 @@ void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { // Node construction from Parquet metadata struct NodeParams { - explicit NodeParams(const std::string& name) : - name(name) {} + explicit NodeParams(const std::string& name) : name(name) {} const std::string& name; Repetition::type repetition; @@ -268,33 +252,32 @@ static inline NodeParams GetNodeParams(const format::SchemaElement* element) { return params; } -std::unique_ptr GroupNode::FromParquet(const void* opaque_element, int node_id, - const NodeVector& fields) { +std::unique_ptr GroupNode::FromParquet( + const void* opaque_element, int node_id, const 
NodeVector& fields) { const format::SchemaElement* element = - static_cast(opaque_element); + static_cast(opaque_element); NodeParams params = GetNodeParams(element); - return std::unique_ptr(new GroupNode(params.name, params.repetition, fields, - params.logical_type, node_id)); + return std::unique_ptr(new GroupNode( + params.name, params.repetition, fields, params.logical_type, node_id)); } -std::unique_ptr PrimitiveNode::FromParquet(const void* opaque_element, - int node_id) { +std::unique_ptr PrimitiveNode::FromParquet( + const void* opaque_element, int node_id) { const format::SchemaElement* element = - static_cast(opaque_element); + static_cast(opaque_element); NodeParams params = GetNodeParams(element); - std::unique_ptr result = std::unique_ptr( - new PrimitiveNode(params.name, params.repetition, - FromThrift(element->type), params.logical_type, - element->type_length, element->precision, element->scale, node_id)); + std::unique_ptr result = + std::unique_ptr(new PrimitiveNode(params.name, params.repetition, + FromThrift(element->type), params.logical_type, element->type_length, + element->precision, element->scale, node_id)); // Return as unique_ptr to the base type return std::unique_ptr(result.release()); } void GroupNode::ToParquet(void* opaque_element) const { - format::SchemaElement* element = - static_cast(opaque_element); + format::SchemaElement* element = static_cast(opaque_element); element->__set_name(name_); element->__set_num_children(field_count()); element->__set_repetition_type(ToThrift(repetition_)); @@ -305,8 +288,7 @@ void GroupNode::ToParquet(void* opaque_element) const { } void PrimitiveNode::ToParquet(void* opaque_element) const { - format::SchemaElement* element = - static_cast(opaque_element); + format::SchemaElement* element = static_cast(opaque_element); element->__set_name(name_); element->__set_num_children(0); @@ -321,6 +303,6 @@ void PrimitiveNode::ToParquet(void* opaque_element) const { 
element->__set_scale(decimal_metadata_.scale); } -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h index 452c23ad..c9df1b7d 100644 --- a/src/parquet/schema/types.h +++ b/src/parquet/schema/types.h @@ -66,11 +66,7 @@ namespace schema { // of these encodings (versus a struct containing an array). We should refuse // the temptation to guess, as they say. struct ListEncoding { - enum type { - ONE_LEVEL, - TWO_LEVEL, - THREE_LEVEL - }; + enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL }; }; struct DecimalMetadata { @@ -100,69 +96,42 @@ class GroupNode; // and optionally a logical type (ConvertedType in Parquet metadata parlance) class Node { public: - enum type { - PRIMITIVE, - GROUP - }; + enum type { PRIMITIVE, GROUP }; - Node(Node::type type, const std::string& name, - Repetition::type repetition, - LogicalType::type logical_type = LogicalType::NONE, - int id = -1) : - type_(type), - name_(name), - repetition_(repetition), - logical_type_(logical_type), - id_(id), - parent_(nullptr) {} + Node(Node::type type, const std::string& name, Repetition::type repetition, + LogicalType::type logical_type = LogicalType::NONE, int id = -1) + : type_(type), + name_(name), + repetition_(repetition), + logical_type_(logical_type), + id_(id), + parent_(nullptr) {} virtual ~Node() {} - bool is_primitive() const { - return type_ == Node::PRIMITIVE; - } + bool is_primitive() const { return type_ == Node::PRIMITIVE; } - bool is_group() const { - return type_ == Node::GROUP; - } + bool is_group() const { return type_ == Node::GROUP; } - bool is_optional() const { - return repetition_ == Repetition::OPTIONAL; - } + bool is_optional() const { return repetition_ == Repetition::OPTIONAL; } - bool is_repeated() const { - return repetition_ == Repetition::REPEATED; - } + bool is_repeated() const { return repetition_ == Repetition::REPEATED; } - bool is_required() const { - return 
repetition_ == Repetition::REQUIRED; - } + bool is_required() const { return repetition_ == Repetition::REQUIRED; } virtual bool Equals(const Node* other) const = 0; - const std::string& name() const { - return name_; - } + const std::string& name() const { return name_; } - Node::type node_type() const { - return type_; - } + Node::type node_type() const { return type_; } - Repetition::type repetition() const { - return repetition_; - } + Repetition::type repetition() const { return repetition_; } - LogicalType::type logical_type() const { - return logical_type_; - } + LogicalType::type logical_type() const { return logical_type_; } - int id() const { - return id_; - } + int id() const { return id_; } - const Node* parent() const { - return parent_; - } + const Node* parent() const { return parent_; } // ToParquet returns an opaque void* to avoid exporting // parquet::SchemaElement into the public API @@ -214,46 +183,36 @@ class PrimitiveNode : public Node { // parquet::SchemaElement into the public API static std::unique_ptr FromParquet(const void* opaque_element, int id); - static inline NodePtr Make(const std::string& name, - Repetition::type repetition, Type::type type, - LogicalType::type logical_type = LogicalType::NONE, + static inline NodePtr Make(const std::string& name, Repetition::type repetition, + Type::type type, LogicalType::type logical_type = LogicalType::NONE, int length = -1, int precision = -1, int scale = -1) { - return NodePtr(new PrimitiveNode(name, repetition, type, logical_type, - length, precision, scale)); + return NodePtr(new PrimitiveNode( + name, repetition, type, logical_type, length, precision, scale)); } virtual bool Equals(const Node* other) const; - Type::type physical_type() const { - return physical_type_; - } + Type::type physical_type() const { return physical_type_; } - int32_t type_length() const { - return type_length_; - } + int32_t type_length() const { return type_length_; } - const DecimalMetadata& decimal_metadata() 
const { - return decimal_metadata_; - } + const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; } void ToParquet(void* opaque_element) const override; virtual void Visit(Visitor* visitor); void VisitConst(ConstVisitor* visitor) const override; private: - PrimitiveNode(const std::string& name, Repetition::type repetition, - Type::type type, LogicalType::type logical_type = LogicalType::NONE, - int length = -1, int precision = -1, int scale = -1, int id = -1); + PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type, + LogicalType::type logical_type = LogicalType::NONE, int length = -1, + int precision = -1, int scale = -1, int id = -1); Type::type physical_type_; int32_t type_length_; DecimalMetadata decimal_metadata_; // For FIXED_LEN_BYTE_ARRAY - void SetTypeLength(int32_t length) { - type_length_ = length; - } - + void SetTypeLength(int32_t length) { type_length_ = length; } // For Decimal logical type: Precision and scale void SetDecimalMetadata(int32_t scale, int32_t precision) { @@ -273,24 +232,19 @@ class GroupNode : public Node { public: // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting // parquet::SchemaElement into the public API - static std::unique_ptr FromParquet(const void* opaque_element, int id, - const NodeVector& fields); + static std::unique_ptr FromParquet( + const void* opaque_element, int id, const NodeVector& fields); - static inline NodePtr Make(const std::string& name, - Repetition::type repetition, const NodeVector& fields, - LogicalType::type logical_type = LogicalType::NONE) { + static inline NodePtr Make(const std::string& name, Repetition::type repetition, + const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE) { return NodePtr(new GroupNode(name, repetition, fields, logical_type)); } virtual bool Equals(const Node* other) const; - const NodePtr& field(int i) const { - return fields_[i]; - } + const NodePtr& field(int i) 
const { return fields_[i]; } - int field_count() const { - return fields_.size(); - } + int field_count() const { return fields_.size(); } void ToParquet(void* opaque_element) const override; virtual void Visit(Visitor* visitor); @@ -298,15 +252,13 @@ class GroupNode : public Node { private: GroupNode(const std::string& name, Repetition::type repetition, - const NodeVector& fields, - LogicalType::type logical_type = LogicalType::NONE, - int id = -1) : - Node(Node::GROUP, name, repetition, logical_type, id), - fields_(fields) { - for (NodePtr& field : fields_) { - field->SetParent(this); - } + const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE, + int id = -1) + : Node(Node::GROUP, name, repetition, logical_type, id), fields_(fields) { + for (NodePtr& field : fields_) { + field->SetParent(this); } + } NodeVector fields_; bool EqualsInternal(const GroupNode* other) const; @@ -318,10 +270,10 @@ class GroupNode : public Node { // ---------------------------------------------------------------------- // Convenience primitive type factory functions -#define PRIMITIVE_FACTORY(FuncName, TYPE) \ - static inline NodePtr FuncName(const std::string& name, \ - Repetition::type repetition = Repetition::OPTIONAL) { \ - return PrimitiveNode::Make(name, repetition, Type::TYPE); \ +#define PRIMITIVE_FACTORY(FuncName, TYPE) \ + static inline NodePtr FuncName( \ + const std::string& name, Repetition::type repetition = Repetition::OPTIONAL) { \ + return PrimitiveNode::Make(name, repetition, Type::TYPE); \ } PRIMITIVE_FACTORY(Boolean, BOOLEAN); @@ -332,8 +284,8 @@ PRIMITIVE_FACTORY(Float, FLOAT); PRIMITIVE_FACTORY(Double, DOUBLE); PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY); -} // namespace schema +} // namespace schema -} // namespace parquet +} // namespace parquet -#endif // PARQUET_SCHEMA_TYPES_H +#endif // PARQUET_SCHEMA_TYPES_H diff --git a/src/parquet/thrift/util.h b/src/parquet/thrift/util.h index 287884e9..38b1a155 100644 --- a/src/parquet/thrift/util.h 
+++ b/src/parquet/thrift/util.h @@ -122,6 +122,6 @@ inline void SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) { out->Write(out_buffer, out_length); } -} // namespace parquet +} // namespace parquet -#endif // PARQUET_THRIFT_UTIL_H +#endif // PARQUET_THRIFT_UTIL_H diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc index 487e9d51..f1f829d8 100644 --- a/src/parquet/types-test.cc +++ b/src/parquet/types-test.cc @@ -31,52 +31,33 @@ TEST(TestTypeToString, PhysicalTypes) { ASSERT_STREQ("FLOAT", type_to_string(Type::FLOAT).c_str()); ASSERT_STREQ("DOUBLE", type_to_string(Type::DOUBLE).c_str()); ASSERT_STREQ("BYTE_ARRAY", type_to_string(Type::BYTE_ARRAY).c_str()); - ASSERT_STREQ("FIXED_LEN_BYTE_ARRAY", - type_to_string(Type::FIXED_LEN_BYTE_ARRAY).c_str()); + ASSERT_STREQ( + "FIXED_LEN_BYTE_ARRAY", type_to_string(Type::FIXED_LEN_BYTE_ARRAY).c_str()); } TEST(TestLogicalTypeToString, LogicalTypes) { - ASSERT_STREQ("NONE", - logical_type_to_string(LogicalType::NONE).c_str()); - ASSERT_STREQ("UTF8", - logical_type_to_string(LogicalType::UTF8).c_str()); - ASSERT_STREQ("MAP_KEY_VALUE", - logical_type_to_string(LogicalType::MAP_KEY_VALUE).c_str()); - ASSERT_STREQ("LIST", - logical_type_to_string(LogicalType::LIST).c_str()); - ASSERT_STREQ("ENUM", - logical_type_to_string(LogicalType::ENUM).c_str()); - ASSERT_STREQ("DECIMAL", - logical_type_to_string(LogicalType::DECIMAL).c_str()); - ASSERT_STREQ("DATE", - logical_type_to_string(LogicalType::DATE).c_str()); - ASSERT_STREQ("TIME_MILLIS", - logical_type_to_string(LogicalType::TIME_MILLIS).c_str()); - ASSERT_STREQ("TIMESTAMP_MILLIS", - logical_type_to_string(LogicalType::TIMESTAMP_MILLIS).c_str()); - ASSERT_STREQ("UINT_8", - logical_type_to_string(LogicalType::UINT_8).c_str()); - ASSERT_STREQ("UINT_16", - logical_type_to_string(LogicalType::UINT_16).c_str()); - ASSERT_STREQ("UINT_32", - logical_type_to_string(LogicalType::UINT_32).c_str()); - ASSERT_STREQ("UINT_64", - 
logical_type_to_string(LogicalType::UINT_64).c_str()); - ASSERT_STREQ("INT_8", - logical_type_to_string(LogicalType::INT_8).c_str()); - ASSERT_STREQ("INT_16", - logical_type_to_string(LogicalType::INT_16).c_str()); - ASSERT_STREQ("INT_32", - logical_type_to_string(LogicalType::INT_32).c_str()); - ASSERT_STREQ("INT_64", - logical_type_to_string(LogicalType::INT_64).c_str()); - ASSERT_STREQ("JSON", - logical_type_to_string(LogicalType::JSON).c_str()); - ASSERT_STREQ("BSON", - logical_type_to_string(LogicalType::BSON).c_str()); - ASSERT_STREQ("INTERVAL", - logical_type_to_string(LogicalType::INTERVAL).c_str()); + ASSERT_STREQ("NONE", logical_type_to_string(LogicalType::NONE).c_str()); + ASSERT_STREQ("UTF8", logical_type_to_string(LogicalType::UTF8).c_str()); + ASSERT_STREQ( + "MAP_KEY_VALUE", logical_type_to_string(LogicalType::MAP_KEY_VALUE).c_str()); + ASSERT_STREQ("LIST", logical_type_to_string(LogicalType::LIST).c_str()); + ASSERT_STREQ("ENUM", logical_type_to_string(LogicalType::ENUM).c_str()); + ASSERT_STREQ("DECIMAL", logical_type_to_string(LogicalType::DECIMAL).c_str()); + ASSERT_STREQ("DATE", logical_type_to_string(LogicalType::DATE).c_str()); + ASSERT_STREQ("TIME_MILLIS", logical_type_to_string(LogicalType::TIME_MILLIS).c_str()); + ASSERT_STREQ( + "TIMESTAMP_MILLIS", logical_type_to_string(LogicalType::TIMESTAMP_MILLIS).c_str()); + ASSERT_STREQ("UINT_8", logical_type_to_string(LogicalType::UINT_8).c_str()); + ASSERT_STREQ("UINT_16", logical_type_to_string(LogicalType::UINT_16).c_str()); + ASSERT_STREQ("UINT_32", logical_type_to_string(LogicalType::UINT_32).c_str()); + ASSERT_STREQ("UINT_64", logical_type_to_string(LogicalType::UINT_64).c_str()); + ASSERT_STREQ("INT_8", logical_type_to_string(LogicalType::INT_8).c_str()); + ASSERT_STREQ("INT_16", logical_type_to_string(LogicalType::INT_16).c_str()); + ASSERT_STREQ("INT_32", logical_type_to_string(LogicalType::INT_32).c_str()); + ASSERT_STREQ("INT_64", logical_type_to_string(LogicalType::INT_64).c_str()); + 
ASSERT_STREQ("JSON", logical_type_to_string(LogicalType::JSON).c_str()); + ASSERT_STREQ("BSON", logical_type_to_string(LogicalType::BSON).c_str()); + ASSERT_STREQ("INTERVAL", logical_type_to_string(LogicalType::INTERVAL).c_str()); } - -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/types.h b/src/parquet/types.h index 72017f96..a2d3622d 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -84,11 +84,7 @@ struct LogicalType { // Mirrors parquet::FieldRepetitionType struct Repetition { - enum type { - REQUIRED = 0, - OPTIONAL = 1, - REPEATED = 2 - }; + enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2 }; }; // Data encodings. Mirrors parquet::Encoding @@ -107,22 +103,12 @@ struct Encoding { // Compression, mirrors parquet::CompressionCodec struct Compression { - enum type { - UNCOMPRESSED, - SNAPPY, - GZIP, - LZO - }; + enum type { UNCOMPRESSED, SNAPPY, GZIP, LZO }; }; // parquet::PageType struct PageType { - enum type { - DATA_PAGE, - INDEX_PAGE, - DICTIONARY_PAGE, - DATA_PAGE_V2 - }; + enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; }; // ---------------------------------------------------------------------- @@ -134,8 +120,7 @@ struct ByteArray { const uint8_t* ptr; bool operator==(const ByteArray& other) const { - return this->len == other.len && - 0 == memcmp(this->ptr, other.ptr, this->len); + return this->len == other.len && 0 == memcmp(this->ptr, other.ptr, this->len); } bool operator!=(const ByteArray& other) const { @@ -158,9 +143,7 @@ MANUALLY_ALIGNED_STRUCT(1) Int96 { return 0 == memcmp(this->value, other.value, 3 * sizeof(uint32_t)); } - bool operator!=(const Int96& other) const { - return !(*this == other); - } + bool operator!=(const Int96& other) const { return !(*this == other); } }; STRUCT_END(Int96, 12); @@ -171,16 +154,16 @@ static inline std::string ByteArrayToString(const ByteArray& a) { static inline std::string Int96ToString(const Int96& a) { std::stringstream result; for (int i = 0; i < 
3; i++) { - result << a.value[i] << " "; + result << a.value[i] << " "; } return result.str(); } static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) { - const uint8_t *bytes = reinterpret_cast(a.ptr); + const uint8_t* bytes = reinterpret_cast(a.ptr); std::stringstream result; for (int i = 0; i < len; i++) { - result << (uint32_t)bytes[i] << " "; + result << (uint32_t)bytes[i] << " "; } return result.str(); } @@ -195,8 +178,7 @@ static inline int ByteCompare(const ByteArray& x1, const ByteArray& x2) { } template -struct type_traits { -}; +struct type_traits {}; template <> struct type_traits { @@ -383,6 +365,6 @@ static inline std::string logical_type_to_string(LogicalType::type t) { break; } } -} // namespace parquet +} // namespace parquet -#endif // PARQUET_TYPES_H +#endif // PARQUET_TYPES_H diff --git a/src/parquet/util/bit-stream-utils.h b/src/parquet/util/bit-stream-utils.h index aee9c931..dd0c9e28 100644 --- a/src/parquet/util/bit-stream-utils.h +++ b/src/parquet/util/bit-stream-utils.h @@ -37,9 +37,7 @@ class BitWriter { public: /// buffer: buffer to write bits to. Buffer should be preallocated with /// 'buffer_len' bytes. - BitWriter(uint8_t* buffer, int buffer_len) : - buffer_(buffer), - max_bytes_(buffer_len) { + BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) { Clear(); } @@ -62,7 +60,7 @@ class BitWriter { /// Writes v to the next aligned byte using num_bytes. If T is larger than /// num_bytes, the extra high-order bytes will be ignored. Returns false if /// there was not enough space. - template + template bool PutAligned(T v, int num_bytes); /// Write a Vlq encoded int to the buffer. Returns false if there was not enough @@ -92,8 +90,8 @@ class BitWriter { /// buffer_. This is faster than writing values byte by byte directly to buffer_. 
uint64_t buffered_values_; - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ }; /// Utility class to read bit/byte stream. This class can read bits or bytes @@ -102,11 +100,8 @@ class BitWriter { class BitReader { public: /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. - BitReader(const uint8_t* buffer, int buffer_len) : - buffer_(buffer), - max_bytes_(buffer_len), - byte_offset_(0), - bit_offset_(0) { + BitReader(const uint8_t* buffer, int buffer_len) + : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) { int num_bytes = std::min(8, max_bytes_ - byte_offset_); memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); } @@ -124,7 +119,7 @@ class BitReader { /// Gets the next value from the buffer. Returns true if 'v' could be read or false if /// there are not enough bytes left. num_bits must be <= 32. - template + template bool GetValue(int num_bits, T* v); /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T @@ -132,7 +127,7 @@ class BitReader { /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will /// be advanced to the start of the next byte before 'v' is read. Returns /// false if there are not enough bytes left. - template + template bool GetAligned(int num_bytes, T* v); /// Reads a vlq encoded int from the stream. The encoded int must start at @@ -158,10 +153,10 @@ class BitReader { /// faster than reading values byte by byte directly from buffer_. 
uint64_t buffered_values_; - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_BIT_STREAM_UTILS_H +#endif // PARQUET_UTIL_BIT_STREAM_UTILS_H diff --git a/src/parquet/util/bit-stream-utils.inline.h b/src/parquet/util/bit-stream-utils.inline.h index cd7d9849..02c0e25a 100644 --- a/src/parquet/util/bit-stream-utils.inline.h +++ b/src/parquet/util/bit-stream-utils.inline.h @@ -67,7 +67,7 @@ inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { return ptr; } -template +template inline bool BitWriter::PutAligned(T val, int num_bytes) { uint8_t* ptr = GetNextBytePtr(num_bytes); if (ptr == NULL) return false; @@ -85,7 +85,7 @@ inline bool BitWriter::PutVlqInt(uint32_t v) { return result; } -template +template inline bool BitReader::GetValue(int num_bits, T* v) { DCHECK(buffer_ != NULL); // TODO: revisit this limit if necessary @@ -116,7 +116,7 @@ inline bool BitReader::GetValue(int num_bits, T* v) { return true; } -template +template inline bool BitReader::GetAligned(int num_bytes, T* v) { DCHECK_LE(num_bytes, static_cast(sizeof(T))); int bytes_read = BitUtil::Ceil(bit_offset_, 8); @@ -165,6 +165,6 @@ inline bool BitReader::GetZigZagVlqInt(int32_t* v) { return true; } -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H +#endif // PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H diff --git a/src/parquet/util/bit-util-test.cc b/src/parquet/util/bit-util-test.cc index bf1ee9d1..f305b9c3 100644 --- a/src/parquet/util/bit-util-test.cc +++ b/src/parquet/util/bit-util-test.cc @@ -33,9 +33,7 @@ namespace parquet { static void ensure_cpu_info_initialized() { - if (!CpuInfo::initialized()) { - CpuInfo::Init(); - } + if (!CpuInfo::initialized()) { CpuInfo::Init(); } } TEST(BitUtil, Ceil) { @@ -93,9 +91,9 @@ TEST(BitUtil, TrailingBits) { 
EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0); EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1); EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64), - BOOST_BINARY(1 1 1 1 1 1 1 1)); + BOOST_BINARY(1 1 1 1 1 1 1 1)); EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100), - BOOST_BINARY(1 1 1 1 1 1 1 1)); + BOOST_BINARY(1 1 1 1 1 1 1 1)); EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0); EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0); EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0); @@ -111,12 +109,12 @@ TEST(BitUtil, ByteSwap) { EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11223344)), 0x44332211); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ(BitUtil::ByteSwap( - static_cast(0x1122334455667788)), 0x8877665544332211); + EXPECT_EQ( + BitUtil::ByteSwap(static_cast(0x1122334455667788)), 0x8877665544332211); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ(BitUtil::ByteSwap( - static_cast(0x1122334455667788)), 0x8877665544332211); + EXPECT_EQ( + BitUtil::ByteSwap(static_cast(0x1122334455667788)), 0x8877665544332211); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); @@ -189,4 +187,4 @@ TEST(BitStreamUtil, ZigZag) { TestZigZag(-std::numeric_limits::max()); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/bit-util.h b/src/parquet/util/bit-util.h index d591aefe..b75f5a1b 100644 --- a/src/parquet/util/bit-util.h +++ b/src/parquet/util/bit-util.h @@ -39,8 +39,7 @@ namespace parquet { // We add a partial stub implementation here template -struct make_unsigned { -}; +struct make_unsigned {}; template <> struct make_unsigned { @@ -111,41 +110,30 @@ class BitUtil { /// Specialized round up and down functions for frequently used factors, /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64). /// Returns the rounded up number of bytes that fit the number of bits. 
- static inline uint32_t RoundUpNumBytes(uint32_t bits) { - return (bits + 7) >> 3; - } + static inline uint32_t RoundUpNumBytes(uint32_t bits) { return (bits + 7) >> 3; } /// Returns the rounded down number of bytes that fit the number of bits. - static inline uint32_t RoundDownNumBytes(uint32_t bits) { - return bits >> 3; - } + static inline uint32_t RoundDownNumBytes(uint32_t bits) { return bits >> 3; } /// Returns the rounded up to 32 multiple. Used for conversions of bits to i32. - static inline uint32_t RoundUpNumi32(uint32_t bits) { - return (bits + 31) >> 5; - } + static inline uint32_t RoundUpNumi32(uint32_t bits) { return (bits + 31) >> 5; } /// Returns the rounded up 32 multiple. - static inline uint32_t RoundDownNumi32(uint32_t bits) { - return bits >> 5; - } + static inline uint32_t RoundDownNumi32(uint32_t bits) { return bits >> 5; } /// Returns the rounded up to 64 multiple. Used for conversions of bits to i64. - static inline uint32_t RoundUpNumi64(uint32_t bits) { - return (bits + 63) >> 6; - } + static inline uint32_t RoundUpNumi64(uint32_t bits) { return (bits + 63) >> 6; } /// Returns the rounded down to 64 multiple. - static inline uint32_t RoundDownNumi64(uint32_t bits) { - return bits >> 6; - } + static inline uint32_t RoundDownNumi64(uint32_t bits) { return bits >> 6; } /// Non hw accelerated pop count. /// TODO: we don't use this in any perf sensitive code paths currently. There /// might be a much faster way to implement this. static inline int PopcountNoHw(uint64_t x) { int count = 0; - for (; x != 0; ++count) x &= x-1; + for (; x != 0; ++count) + x &= x - 1; return count; } @@ -163,7 +151,7 @@ class BitUtil { } // Compute correct population count for various-width signed integers - template + template static inline int PopcountSigned(T v) { // Converting to same-width unsigned then extending preserves the bit pattern. 
return BitUtil::Popcount(static_cast::type>(v)); @@ -189,20 +177,17 @@ class BitUtil { // (floor(log2(n)) = MSB(n) (0-indexed)) --x; int result = 1; - while (x >>= 1) ++result; + while (x >>= 1) + ++result; return result; } /// Swaps the byte order (i.e. endianess) - static inline int64_t ByteSwap(int64_t value) { - return __builtin_bswap64(value); - } + static inline int64_t ByteSwap(int64_t value) { return __builtin_bswap64(value); } static inline uint64_t ByteSwap(uint64_t value) { return static_cast(__builtin_bswap64(value)); } - static inline int32_t ByteSwap(int32_t value) { - return __builtin_bswap32(value); - } + static inline int32_t ByteSwap(int32_t value) { return __builtin_bswap32(value); } static inline uint32_t ByteSwap(uint32_t value) { return static_cast(__builtin_bswap32(value)); } @@ -231,7 +216,8 @@ class BitUtil { *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); return; - default: break; + default: + break; } uint8_t* d = reinterpret_cast(dst); @@ -241,52 +227,52 @@ class BitUtil { } } - /// Converts to big endian format (if not already in big endian) from the - /// machine's native endian format. +/// Converts to big endian format (if not already in big endian) from the +/// machine's native endian format. 
#if __BYTE_ORDER == __LITTLE_ENDIAN - static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); } + static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); } static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); } - static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); } + static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); } static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); } - static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); } + static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); } static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); } #else - static inline int64_t ToBigEndian(int64_t val) { return val; } + static inline int64_t ToBigEndian(int64_t val) { return val; } static inline uint64_t ToBigEndian(uint64_t val) { return val; } - static inline int32_t ToBigEndian(int32_t val) { return val; } + static inline int32_t ToBigEndian(int32_t val) { return val; } static inline uint32_t ToBigEndian(uint32_t val) { return val; } - static inline int16_t ToBigEndian(int16_t val) { return val; } + static inline int16_t ToBigEndian(int16_t val) { return val; } static inline uint16_t ToBigEndian(uint16_t val) { return val; } #endif - /// Converts from big endian format to the machine's native endian format. +/// Converts from big endian format to the machine's native endian format. 
#if __BYTE_ORDER == __LITTLE_ENDIAN - static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); } + static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); } static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); } - static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); } + static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); } static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); } - static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); } + static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); } static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); } #else - static inline int64_t FromBigEndian(int64_t val) { return val; } + static inline int64_t FromBigEndian(int64_t val) { return val; } static inline uint64_t FromBigEndian(uint64_t val) { return val; } - static inline int32_t FromBigEndian(int32_t val) { return val; } + static inline int32_t FromBigEndian(int32_t val) { return val; } static inline uint32_t FromBigEndian(uint32_t val) { return val; } - static inline int16_t FromBigEndian(int16_t val) { return val; } + static inline int16_t FromBigEndian(int16_t val) { return val; } static inline uint16_t FromBigEndian(uint16_t val) { return val; } #endif // Logical right shift for signed integer types // This is needed because the C >> operator does arithmetic right shift // Negative shift amounts lead to undefined behavior - template + template static T ShiftRightLogical(T v, int shift) { // Conversion to unsigned ensures most significant bits always filled with 0's return static_cast::type>(v) >> shift; } // Get an specific bit of a numeric type - template + template static inline int8_t GetBit(T v, int bitpos) { T masked = v & (static_cast(0x1) << bitpos); return static_cast(ShiftRightLogical(masked, bitpos)); @@ -294,7 +280,7 @@ class BitUtil { // Set a 
specific bit to 1 // Behavior when bitpos is negative is undefined - template + template static T SetBit(T v, int bitpos) { return v | (static_cast(0x1) << bitpos); } @@ -309,7 +295,7 @@ class BitUtil { // Set a specific bit to 0 // Behavior when bitpos is negative is undefined - template + template static T UnsetBit(T v, int bitpos) { return v & ~(static_cast(0x1) << bitpos); } @@ -323,6 +309,6 @@ class BitUtil { } }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_BIT_UTIL_H +#endif // PARQUET_UTIL_BIT_UTIL_H diff --git a/src/parquet/util/buffer-builder.h b/src/parquet/util/buffer-builder.h index 5c388a7a..26f134ee 100644 --- a/src/parquet/util/buffer-builder.h +++ b/src/parquet/util/buffer-builder.h @@ -29,20 +29,17 @@ namespace parquet { class BufferBuilder { public: BufferBuilder(uint8_t* dst_buffer, int dst_len) - : buffer_(dst_buffer), capacity_(dst_len), size_(0) { - } + : buffer_(dst_buffer), capacity_(dst_len), size_(0) {} BufferBuilder(char* dst_buffer, int dst_len) - : buffer_(reinterpret_cast(dst_buffer)), - capacity_(dst_len), size_(0) { - } + : buffer_(reinterpret_cast(dst_buffer)), capacity_(dst_len), size_(0) {} inline void Append(const void* buffer, int len) { memcpy(buffer_ + size_, buffer, len); size_ += len; } - template + template inline void Append(const T& v) { Append(&v, sizeof(T)); } @@ -56,6 +53,6 @@ class BufferBuilder { int size_; }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_BUFFER_BUILDER_H +#endif // PARQUET_UTIL_BUFFER_BUILDER_H diff --git a/src/parquet/util/buffer-test.cc b/src/parquet/util/buffer-test.cc index c871f916..a71a1f66 100644 --- a/src/parquet/util/buffer-test.cc +++ b/src/parquet/util/buffer-test.cc @@ -30,8 +30,7 @@ using std::string; namespace parquet { -class TestBuffer : public ::testing::Test { -}; +class TestBuffer : public ::testing::Test {}; TEST_F(TestBuffer, Resize) { OwnedMutableBuffer buf; @@ -48,8 +47,8 @@ TEST_F(TestBuffer, Resize) { } TEST_F(TestBuffer, 
ResizeOOM) { - // Tests that deliberately throw Exceptions foul up valgrind and report - // red herring memory leaks +// Tests that deliberately throw Exceptions foul up valgrind and report +// red herring memory leaks #ifndef PARQUET_VALGRIND OwnedMutableBuffer buf; ASSERT_NO_THROW(buf.Resize(100)); @@ -59,10 +58,8 @@ TEST_F(TestBuffer, ResizeOOM) { FAIL() << "Exception not thrown"; } catch (const ParquetException& e) { // pass - } catch(const std::exception& e) { - FAIL() << "Different exception thrown"; - } + } catch (const std::exception& e) { FAIL() << "Different exception thrown"; } #endif } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/buffer.cc b/src/parquet/util/buffer.cc index 84c20fb9..0b7100c8 100644 --- a/src/parquet/util/buffer.cc +++ b/src/parquet/util/buffer.cc @@ -25,8 +25,7 @@ namespace parquet { -Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, - int64_t size) { +Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, int64_t size) { data_ = parent->data() + offset; size_ = size; parent_ = parent; @@ -36,15 +35,13 @@ std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } -OwnedMutableBuffer::OwnedMutableBuffer(int64_t size, MemoryAllocator* allocator) : - ResizableBuffer(nullptr, 0), allocator_(allocator) { +OwnedMutableBuffer::OwnedMutableBuffer(int64_t size, MemoryAllocator* allocator) + : ResizableBuffer(nullptr, 0), allocator_(allocator) { Resize(size); } OwnedMutableBuffer::~OwnedMutableBuffer() { - if (mutable_data_) { - allocator_->Free(mutable_data_, capacity_); - } + if (mutable_data_) { allocator_->Free(mutable_data_, capacity_); } } void OwnedMutableBuffer::Reserve(int64_t new_capacity) { @@ -72,9 +69,10 @@ uint8_t& OwnedMutableBuffer::operator[](int64_t i) { } template -Vector::Vector(int64_t size, MemoryAllocator* allocator) : - buffer_(new OwnedMutableBuffer(size * sizeof(T), allocator)), - size_(size), capacity_(size) { 
+Vector::Vector(int64_t size, MemoryAllocator* allocator) + : buffer_(new OwnedMutableBuffer(size * sizeof(T), allocator)), + size_(size), + capacity_(size) { if (size > 0) { data_ = reinterpret_cast(buffer_->mutable_data()); } else { @@ -122,4 +120,4 @@ template class Vector; template class Vector; template class Vector; -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/buffer.h b/src/parquet/util/buffer.h index 0cd973ac..c0f263fb 100644 --- a/src/parquet/util/buffer.h +++ b/src/parquet/util/buffer.h @@ -36,49 +36,35 @@ namespace parquet { // class instance class Buffer : public std::enable_shared_from_this { public: - Buffer(const uint8_t* data, int64_t size) : - data_(data), - size_(size) {} + Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} // An offset into data that is owned by another buffer, but we want to be // able to retain a valid pointer to it even after other shared_ptr's to the // parent buffer have been destroyed Buffer(const std::shared_ptr& parent, int64_t offset, int64_t size); - std::shared_ptr get_shared_ptr() { - return shared_from_this(); - } + std::shared_ptr get_shared_ptr() { return shared_from_this(); } // Return true if both buffers are the same size and contain the same bytes // up to the number of compared bytes bool Equals(const Buffer& other, int64_t nbytes) const { - return this == &other || - (size_ >= nbytes && other.size_ >= nbytes && - !memcmp(data_, other.data_, nbytes)); + return this == &other || (size_ >= nbytes && other.size_ >= nbytes && + !memcmp(data_, other.data_, nbytes)); } bool Equals(const Buffer& other) const { - return this == &other || - (size_ == other.size_ && !memcmp(data_, other.data_, size_)); + return this == &other || (size_ == other.size_ && !memcmp(data_, other.data_, size_)); } - const uint8_t* data() const { - return data_; - } + const uint8_t* data() const { return data_; } - int64_t size() const { - return size_; - } + int64_t size() const { return 
size_; } // Returns true if this Buffer is referencing memory (possibly) owned by some // other buffer - bool is_shared() const { - return static_cast(parent_); - } + bool is_shared() const { return static_cast(parent_); } - const std::shared_ptr parent() const { - return parent_; - } + const std::shared_ptr parent() const { return parent_; } protected: const uint8_t* data_; @@ -94,22 +80,17 @@ class Buffer : public std::enable_shared_from_this { // A Buffer whose contents can be mutated. May or may not own its data. class MutableBuffer : public Buffer { public: - MutableBuffer(uint8_t* data, int64_t size) : - Buffer(data, size) { + MutableBuffer(uint8_t* data, int64_t size) : Buffer(data, size) { mutable_data_ = data; } - uint8_t* mutable_data() { - return mutable_data_; - } + uint8_t* mutable_data() { return mutable_data_; } // Get a read-only view of this buffer std::shared_ptr GetImmutableView(); protected: - MutableBuffer() : - Buffer(nullptr, 0), - mutable_data_(nullptr) {} + MutableBuffer() : Buffer(nullptr, 0), mutable_data_(nullptr) {} uint8_t* mutable_data_; }; @@ -119,8 +100,8 @@ class ResizableBuffer : public MutableBuffer { virtual void Resize(int64_t new_size) = 0; protected: - ResizableBuffer(uint8_t* data, int64_t size) : - MutableBuffer(data, size), capacity_(size) {} + ResizableBuffer(uint8_t* data, int64_t size) + : MutableBuffer(data, size), capacity_(size) {} int64_t capacity_; }; @@ -129,8 +110,8 @@ class ResizableBuffer : public MutableBuffer { // garbage-collected class OwnedMutableBuffer : public ResizableBuffer { public: - explicit OwnedMutableBuffer(int64_t size = 0, - MemoryAllocator* allocator = default_allocator()); + explicit OwnedMutableBuffer( + int64_t size = 0, MemoryAllocator* allocator = default_allocator()); virtual ~OwnedMutableBuffer(); void Resize(int64_t new_size) override; void Reserve(int64_t new_capacity); @@ -151,9 +132,7 @@ class Vector { void Reserve(int64_t new_capacity); void Assign(int64_t size, const T val); void 
Swap(Vector& v); - inline T& operator[](int64_t i) { - return data_[i]; - } + inline T& operator[](int64_t i) { return data_[i]; } private: std::unique_ptr buffer_; @@ -164,6 +143,6 @@ class Vector { DISALLOW_COPY_AND_ASSIGN(Vector); }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_BUFFER_H +#endif // PARQUET_UTIL_BUFFER_H diff --git a/src/parquet/util/compiler-util.h b/src/parquet/util/compiler-util.h index 9048ba19..3f2c3730 100644 --- a/src/parquet/util/compiler-util.h +++ b/src/parquet/util/compiler-util.h @@ -36,25 +36,23 @@ #define PREFETCH(addr) __builtin_prefetch(addr) -//macros to disable padding -//these macros are portable across different compilers and platforms +// macros to disable padding +// these macros are portable across different compilers and platforms //[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355] #if defined(_MSC_VER) - #define MANUALLY_ALIGNED_STRUCT(alignment) \ - __pragma(pack(1)); \ - struct __declspec(align(alignment)) - #define STRUCT_END(name, size) \ - __pragma(pack()); \ - static_assert(sizeof(name) == size, "compiler breaks packing rules") +#define MANUALLY_ALIGNED_STRUCT(alignment) \ + __pragma(pack(1)); \ + struct __declspec(align(alignment)) +#define STRUCT_END(name, size) \ + __pragma(pack()); \ + static_assert(sizeof(name) == size, "compiler breaks packing rules") #elif defined(__GNUC__) || defined(__clang__) - #define MANUALLY_ALIGNED_STRUCT(alignment) \ - _Pragma("pack(1)") \ - struct __attribute__((aligned(alignment))) - #define STRUCT_END(name, size) \ - _Pragma("pack()") \ - static_assert(sizeof(name) == size, "compiler breaks packing rules") +#define MANUALLY_ALIGNED_STRUCT(alignment) \ + _Pragma("pack(1)") struct __attribute__((aligned(alignment))) +#define STRUCT_END(name, size) \ + _Pragma("pack()") static_assert(sizeof(name) == size, "compiler breaks packing rules") #else - #error Unknown compiler, please define structure alignment macros +#error 
Unknown compiler, please define structure alignment macros #endif -#endif // PARQUET_UTIL_COMPILER_UTIL_H +#endif // PARQUET_UTIL_COMPILER_UTIL_H diff --git a/src/parquet/util/cpu-info.cc b/src/parquet/util/cpu-info.cc index 836152f1..1b1bc003 100644 --- a/src/parquet/util/cpu-info.cc +++ b/src/parquet/util/cpu-info.cc @@ -52,16 +52,14 @@ int64_t CpuInfo::original_hardware_flags_; int64_t CpuInfo::cache_sizes_[L3_CACHE + 1]; int64_t CpuInfo::cycles_per_ms_; int CpuInfo::num_cores_ = 1; -string CpuInfo::model_name_ = "unknown"; // NOLINT +string CpuInfo::model_name_ = "unknown"; // NOLINT static struct { string name; int64_t flag; } flag_mappings[] = { - { "ssse3", CpuInfo::SSSE3 }, - { "sse4_1", CpuInfo::SSE4_1 }, - { "sse4_2", CpuInfo::SSE4_2 }, - { "popcnt", CpuInfo::POPCNT }, + {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1}, {"sse4_2", CpuInfo::SSE4_2}, + {"popcnt", CpuInfo::POPCNT}, }; static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); @@ -72,9 +70,7 @@ static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0] int64_t ParseCPUFlags(const string& values) { int64_t flags = 0; for (int i = 0; i < num_flags; ++i) { - if (contains(values, flag_mappings[i].name)) { - flags |= flag_mappings[i].flag; - } + if (contains(values, flag_mappings[i].name)) { flags |= flag_mappings[i].flag; } } return flags; } @@ -167,4 +163,4 @@ void CpuInfo::EnableFeature(int64_t flag, bool enable) { } } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/cpu-info.h b/src/parquet/util/cpu-info.h index dd951e5a..a2fd5de3 100644 --- a/src/parquet/util/cpu-info.h +++ b/src/parquet/util/cpu-info.h @@ -34,10 +34,10 @@ namespace parquet { /// /sys/devices) class CpuInfo { public: - static const int64_t SSSE3 = (1 << 1); - static const int64_t SSE4_1 = (1 << 2); - static const int64_t SSE4_2 = (1 << 3); - static const int64_t POPCNT = (1 << 4); + static const int64_t SSSE3 = (1 << 1); + static const int64_t 
SSE4_1 = (1 << 2); + static const int64_t SSE4_2 = (1 << 3); + static const int64_t POPCNT = (1 << 4); /// Cache enums for L1 (data), L2 and L3 enum CacheLevel { @@ -93,9 +93,7 @@ class CpuInfo { return model_name_; } - static bool initialized() { - return initialized_; - } + static bool initialized() { return initialized_; } private: static bool initialized_; @@ -104,9 +102,9 @@ class CpuInfo { static int64_t cache_sizes_[L3_CACHE + 1]; static int64_t cycles_per_ms_; static int num_cores_; - static std::string model_name_; // NOLINT + static std::string model_name_; // NOLINT }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_CPU_INFO_H +#endif // PARQUET_UTIL_CPU_INFO_H diff --git a/src/parquet/util/hash-util.h b/src/parquet/util/hash-util.h index e3f376f9..96701dab 100644 --- a/src/parquet/util/hash-util.h +++ b/src/parquet/util/hash-util.h @@ -104,7 +104,7 @@ class HashUtil { const uint64_t* p = reinterpret_cast(v); hash = SSE4_crc32_u64(hash, *p); ++p; - hash = SSE4_crc32_u32(hash, *reinterpret_cast(p)); + hash = SSE4_crc32_u32(hash, *reinterpret_cast(p)); hash = (hash << 16) | (hash >> 16); return hash; } @@ -141,14 +141,21 @@ class HashUtil { const uint8_t* data2 = reinterpret_cast(data); switch (len & 7) { - case 7: h ^= uint64_t(data2[6]) << 48; - case 6: h ^= uint64_t(data2[5]) << 40; - case 5: h ^= uint64_t(data2[4]) << 32; - case 4: h ^= uint64_t(data2[3]) << 24; - case 3: h ^= uint64_t(data2[2]) << 16; - case 2: h ^= uint64_t(data2[1]) << 8; - case 1: h ^= uint64_t(data2[0]); - h *= MURMUR_PRIME; + case 7: + h ^= uint64_t(data2[6]) << 48; + case 6: + h ^= uint64_t(data2[5]) << 40; + case 5: + h ^= uint64_t(data2[4]) << 32; + case 4: + h ^= uint64_t(data2[3]) << 24; + case 3: + h ^= uint64_t(data2[2]) << 16; + case 2: + h ^= uint64_t(data2[1]) << 8; + case 1: + h ^= uint64_t(data2[0]); + h *= MURMUR_PRIME; } h ^= h >> MURMUR_R; @@ -158,8 +165,8 @@ class HashUtil { } /// default values recommended by 
http://isthe.com/chongo/tech/comp/fnv/ - static const uint32_t FNV_PRIME = 0x01000193; // 16777619 - static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261 + static const uint32_t FNV_PRIME = 0x01000193; // 16777619 + static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261 static const uint64_t FNV64_PRIME = 1099511628211UL; static const uint64_t FNV64_SEED = 14695981039346656037UL; @@ -246,6 +253,6 @@ class HashUtil { } }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_HASH_UTIL_H +#endif // PARQUET_UTIL_HASH_UTIL_H diff --git a/src/parquet/util/input-output-test.cc b/src/parquet/util/input-output-test.cc index 3e151832..9db2bdd0 100644 --- a/src/parquet/util/input-output-test.cc +++ b/src/parquet/util/input-output-test.cc @@ -90,9 +90,7 @@ class TestFileReaders : public ::testing::Test { public: void SetUp() { test_path_ = "parquet-input-output-test.txt"; - if (file_exists(test_path_)) { - std::remove(test_path_.c_str()); - } + if (file_exists(test_path_)) { std::remove(test_path_.c_str()); } test_data_ = "testingdata"; std::ofstream stream; @@ -101,14 +99,10 @@ class TestFileReaders : public ::testing::Test { filesize_ = test_data_.size(); } - void TearDown() { - DeleteTestFile(); - } + void TearDown() { DeleteTestFile(); } void DeleteTestFile() { - if (file_exists(test_path_)) { - std::remove(test_path_.c_str()); - } + if (file_exists(test_path_)) { std::remove(test_path_.c_str()); } } protected: @@ -153,4 +147,4 @@ TYPED_TEST(TestFileReaders, BadSeek) { ASSERT_THROW(this->source.Seek(this->filesize_ + 1), ParquetException); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/input.cc b/src/parquet/util/input.cc index 2a0eda3e..8eb3956d 100644 --- a/src/parquet/util/input.cc +++ b/src/parquet/util/input.cc @@ -92,9 +92,7 @@ void LocalFileSource::Seek(int64_t pos) { int64_t LocalFileSource::Tell() const { int64_t position = ftell(file_); - if (position < 0) { - throw ParquetException("ftell failed, did 
the file disappear?"); - } + if (position < 0) { throw ParquetException("ftell failed, did the file disappear?"); } return position; } @@ -111,9 +109,7 @@ std::shared_ptr LocalFileSource::Read(int64_t nbytes) { result->Resize(nbytes); int64_t bytes_read = Read(nbytes, result->mutable_data()); - if (bytes_read < nbytes) { - result->Resize(bytes_read); - } + if (bytes_read < nbytes) { result->Resize(bytes_read); } return result; } // ---------------------------------------------------------------------- @@ -125,12 +121,10 @@ MemoryMapSource::~MemoryMapSource() { void MemoryMapSource::Open(const std::string& path) { LocalFileSource::Open(path); - data_ = reinterpret_cast(mmap(nullptr, size_, PROT_READ, - MAP_SHARED, fileno(file_), 0)); - if (data_ == nullptr) { - throw ParquetException("Memory mapping file failed"); - } - pos_ = 0; + data_ = reinterpret_cast( + mmap(nullptr, size_, PROT_READ, MAP_SHARED, fileno(file_), 0)); + if (data_ == nullptr) { throw ParquetException("Memory mapping file failed"); } + pos_ = 0; } void MemoryMapSource::Close() { @@ -139,9 +133,7 @@ void MemoryMapSource::Close() { } void MemoryMapSource::CloseFile() { - if (data_ != nullptr) { - munmap(data_, size_); - } + if (data_ != nullptr) { munmap(data_, size_); } LocalFileSource::CloseFile(); } @@ -177,10 +169,8 @@ std::shared_ptr MemoryMapSource::Read(int64_t nbytes) { // ---------------------------------------------------------------------- // BufferReader -BufferReader::BufferReader(const std::shared_ptr& buffer) : - buffer_(buffer), - data_(buffer->data()), - pos_(0) { +BufferReader::BufferReader(const std::shared_ptr& buffer) + : buffer_(buffer), data_(buffer->data()), pos_(0) { size_ = buffer->size(); } @@ -191,8 +181,7 @@ int64_t BufferReader::Tell() const { void BufferReader::Seek(int64_t pos) { if (pos < 0 || pos >= size_) { std::stringstream ss; - ss << "Cannot seek to " << pos - << "File is length " << size_; + ss << "Cannot seek to " << pos << "File is length " << size_; throw 
ParquetException(ss.str()); } pos_ = pos; @@ -215,8 +204,8 @@ std::shared_ptr BufferReader::Read(int64_t nbytes) { // ---------------------------------------------------------------------- // InMemoryInputStream -InMemoryInputStream::InMemoryInputStream(const std::shared_ptr& buffer) : - buffer_(buffer), offset_(0) { +InMemoryInputStream::InMemoryInputStream(const std::shared_ptr& buffer) + : buffer_(buffer), offset_(0) { len_ = buffer_->size(); } @@ -235,4 +224,4 @@ void InMemoryInputStream::Advance(int64_t num_bytes) { offset_ += num_bytes; } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/input.h b/src/parquet/util/input.h index dae3e918..04bbb34d 100644 --- a/src/parquet/util/input.h +++ b/src/parquet/util/input.h @@ -53,11 +53,10 @@ class RandomAccessSource { int64_t size_; }; - class LocalFileSource : public RandomAccessSource { public: - explicit LocalFileSource(MemoryAllocator* allocator = default_allocator()) : - file_(nullptr), is_open_(false), allocator_(allocator) {} + explicit LocalFileSource(MemoryAllocator* allocator = default_allocator()) + : file_(nullptr), is_open_(false), allocator_(allocator) {} virtual ~LocalFileSource(); @@ -72,8 +71,8 @@ class LocalFileSource : public RandomAccessSource { virtual std::shared_ptr Read(int64_t nbytes); - bool is_open() const { return is_open_;} - const std::string& path() const { return path_;} + bool is_open() const { return is_open_; } + const std::string& path() const { return path_; } // Return the integer file descriptor int file_descriptor() const; @@ -90,8 +89,8 @@ class LocalFileSource : public RandomAccessSource { class MemoryMapSource : public LocalFileSource { public: - explicit MemoryMapSource(MemoryAllocator* allocator = default_allocator()) : - LocalFileSource(allocator), data_(nullptr), pos_(0) {} + explicit MemoryMapSource(MemoryAllocator* allocator = default_allocator()) + : LocalFileSource(allocator), data_(nullptr), pos_(0) {} virtual ~MemoryMapSource(); @@ 
-130,9 +129,7 @@ class BufferReader : public RandomAccessSource { virtual std::shared_ptr Read(int64_t nbytes); protected: - const uint8_t* Head() { - return data_ + pos_; - } + const uint8_t* Head() { return data_ + pos_; } std::shared_ptr buffer_; const uint8_t* data_; @@ -183,6 +180,6 @@ class InMemoryInputStream : public InputStream { int64_t offset_; }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_INPUT_H +#endif // PARQUET_UTIL_INPUT_H diff --git a/src/parquet/util/logging.h b/src/parquet/util/logging.h index 8aa750b8..8d3e88a8 100644 --- a/src/parquet/util/logging.h +++ b/src/parquet/util/logging.h @@ -43,13 +43,27 @@ namespace parquet { #ifdef NDEBUG #define PARQUET_DFATAL PARQUET_WARNING -#define DCHECK(condition) while (false) parquet::internal::NullLog() -#define DCHECK_EQ(val1, val2) while (false) parquet::internal::NullLog() -#define DCHECK_NE(val1, val2) while (false) parquet::internal::NullLog() -#define DCHECK_LE(val1, val2) while (false) parquet::internal::NullLog() -#define DCHECK_LT(val1, val2) while (false) parquet::internal::NullLog() -#define DCHECK_GE(val1, val2) while (false) parquet::internal::NullLog() -#define DCHECK_GT(val1, val2) while (false) parquet::internal::NullLog() +#define DCHECK(condition) \ + while (false) \ + parquet::internal::NullLog() +#define DCHECK_EQ(val1, val2) \ + while (false) \ + parquet::internal::NullLog() +#define DCHECK_NE(val1, val2) \ + while (false) \ + parquet::internal::NullLog() +#define DCHECK_LE(val1, val2) \ + while (false) \ + parquet::internal::NullLog() +#define DCHECK_LT(val1, val2) \ + while (false) \ + parquet::internal::NullLog() +#define DCHECK_GE(val1, val2) \ + while (false) \ + parquet::internal::NullLog() +#define DCHECK_GT(val1, val2) \ + while (false) \ + parquet::internal::NullLog() #else #define PARQUET_DFATAL PARQUET_FATAL @@ -62,13 +76,13 @@ namespace parquet { #define DCHECK_GE(val1, val2) PARQUET_CHECK((val1) >= (val2)) #define DCHECK_GT(val1, val2) 
PARQUET_CHECK((val1) > (val2)) -#endif // NDEBUG +#endif // NDEBUG namespace internal { class NullLog { public: - template + template NullLog& operator<<(const T& t) { return *this; } @@ -76,21 +90,16 @@ class NullLog { class CerrLog { public: - CerrLog(int severity) // NOLINT(runtime/explicit) - : severity_(severity), - has_logged_(false) { - } + CerrLog(int severity) // NOLINT(runtime/explicit) + : severity_(severity), + has_logged_(false) {} ~CerrLog() { - if (has_logged_) { - std::cerr << std::endl; - } - if (severity_ == PARQUET_FATAL) { - exit(1); - } + if (has_logged_) { std::cerr << std::endl; } + if (severity_ == PARQUET_FATAL) { exit(1); } } - template + template CerrLog& operator<<(const T& t) { has_logged_ = true; std::cerr << t; @@ -102,8 +111,8 @@ class CerrLog { bool has_logged_; }; -} // namespace internal +} // namespace internal -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_LOGGING_H +#endif // PARQUET_UTIL_LOGGING_H diff --git a/src/parquet/util/macros.h b/src/parquet/util/macros.h index d2211732..8f704b22 100644 --- a/src/parquet/util/macros.h +++ b/src/parquet/util/macros.h @@ -21,8 +21,8 @@ // Useful macros from elsewhere // From Google gutil -#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ void operator=(const TypeName&) = delete // ---------------------------------------------------------------------- @@ -46,7 +46,7 @@ // // Can call MyClass::MyMethod() here. 
// } -#define FRIEND_TEST(test_case_name, test_name)\ -friend class test_case_name##_##test_name##_Test +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test -#endif // PARQUET_UTIL_MACROS_H +#endif // PARQUET_UTIL_MACROS_H diff --git a/src/parquet/util/mem-allocator-test.cc b/src/parquet/util/mem-allocator-test.cc index 2e86e19b..336d3b45 100644 --- a/src/parquet/util/mem-allocator-test.cc +++ b/src/parquet/util/mem-allocator-test.cc @@ -64,4 +64,4 @@ TEST(TestAllocator, TotalMax) { ASSERT_EQ(110, allocator.MaxMemory()); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/mem-allocator.cc b/src/parquet/util/mem-allocator.cc index dc2df2b6..4d42bc49 100644 --- a/src/parquet/util/mem-allocator.cc +++ b/src/parquet/util/mem-allocator.cc @@ -26,18 +26,12 @@ namespace parquet { MemoryAllocator::~MemoryAllocator() {} uint8_t* TrackingAllocator::Malloc(int64_t size) { - if (0 == size) { - return nullptr; - } + if (0 == size) { return nullptr; } uint8_t* p = static_cast(std::malloc(size)); - if (!p) { - throw ParquetException("OOM: memory allocation failed"); - } + if (!p) { throw ParquetException("OOM: memory allocation failed"); } total_memory_ += size; - if (total_memory_ > max_memory_) { - max_memory_ = total_memory_; - } + if (total_memory_ > max_memory_) { max_memory_ = total_memory_; } return p; } @@ -58,4 +52,4 @@ MemoryAllocator* default_allocator() { return &default_allocator; } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/mem-allocator.h b/src/parquet/util/mem-allocator.h index 612a050f..eb68f02a 100644 --- a/src/parquet/util/mem-allocator.h +++ b/src/parquet/util/mem-allocator.h @@ -34,7 +34,7 @@ class MemoryAllocator { MemoryAllocator* default_allocator(); -class TrackingAllocator: public MemoryAllocator { +class TrackingAllocator : public MemoryAllocator { public: TrackingAllocator() : total_memory_(0), max_memory_(0) {} virtual ~TrackingAllocator(); 
@@ -42,19 +42,15 @@ class TrackingAllocator: public MemoryAllocator { uint8_t* Malloc(int64_t size) override; void Free(uint8_t* p, int64_t size) override; - int64_t TotalMemory() { - return total_memory_; - } + int64_t TotalMemory() { return total_memory_; } - int64_t MaxMemory() { - return max_memory_; - } + int64_t MaxMemory() { return max_memory_; } private: int64_t total_memory_; int64_t max_memory_; }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_MEMORY_POOL_H +#endif // PARQUET_UTIL_MEMORY_POOL_H diff --git a/src/parquet/util/mem-pool-test.cc b/src/parquet/util/mem-pool-test.cc index b5151e5d..7e1443a8 100644 --- a/src/parquet/util/mem-pool-test.cc +++ b/src/parquet/util/mem-pool-test.cc @@ -137,16 +137,16 @@ TEST(MemPoolTest, Basic) { // free chunks. TEST(MemPoolTest, Keep) { MemPool p; - p.Allocate(4*1024); - p.Allocate(8*1024); - p.Allocate(16*1024); + p.Allocate(4 * 1024); + p.Allocate(8 * 1024); + p.Allocate(16 * 1024); EXPECT_EQ((4 + 8 + 16) * 1024, p.total_allocated_bytes()); EXPECT_EQ((4 + 8 + 16) * 1024, p.GetTotalChunkSizes()); p.Clear(); EXPECT_EQ(0, p.total_allocated_bytes()); EXPECT_EQ((4 + 8 + 16) * 1024, p.GetTotalChunkSizes()); - p.Allocate(1*1024); - p.Allocate(4*1024); + p.Allocate(1 * 1024); + p.Allocate(4 * 1024); EXPECT_EQ((1 + 4) * 1024, p.total_allocated_bytes()); EXPECT_EQ((4 + 8 + 16) * 1024, p.GetTotalChunkSizes()); @@ -244,4 +244,4 @@ TEST(MemPoolTest, FragmentationOverhead) { p.FreeAll(); } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/mem-pool.cc b/src/parquet/util/mem-pool.cc index 73817dae..b211beec 100644 --- a/src/parquet/util/mem-pool.cc +++ b/src/parquet/util/mem-pool.cc @@ -35,18 +35,15 @@ const int MemPool::INITIAL_CHUNK_SIZE; const int MemPool::MAX_CHUNK_SIZE; MemPool::MemPool(MemoryAllocator* allocator) - : current_chunk_idx_(-1), - next_chunk_size_(INITIAL_CHUNK_SIZE), - total_allocated_bytes_(0), - peak_allocated_bytes_(0), - total_reserved_bytes_(0), - 
allocator_(allocator) {} + : current_chunk_idx_(-1), + next_chunk_size_(INITIAL_CHUNK_SIZE), + total_allocated_bytes_(0), + peak_allocated_bytes_(0), + total_reserved_bytes_(0), + allocator_(allocator) {} MemPool::ChunkInfo::ChunkInfo(int64_t size, uint8_t* buf) - : data(buf), - size(size), - allocated_bytes(0) { -} + : data(buf), size(size), allocated_bytes(0) {} MemPool::~MemPool() { int64_t total_bytes_released = 0; @@ -86,7 +83,7 @@ bool MemPool::FindChunk(int64_t min_size) { int first_free_idx = current_chunk_idx_ + 1; // (cast size() to signed int in order to avoid everything else being cast to // unsigned long, in particular -1) - while (++current_chunk_idx_ < static_cast(chunks_.size())) { + while (++current_chunk_idx_ < static_cast(chunks_.size())) { // we found a free chunk DCHECK_EQ(chunks_[current_chunk_idx_].allocated_bytes, 0); @@ -127,8 +124,8 @@ bool MemPool::FindChunk(int64_t min_size) { total_reserved_bytes_ += chunk_size; // Don't increment the chunk size until the allocation succeeds: if an attempted // large allocation fails we don't want to increase the chunk size further. - next_chunk_size_ = static_cast(std::min( - chunk_size * 2, MAX_CHUNK_SIZE)); + next_chunk_size_ = + static_cast(std::min(chunk_size * 2, MAX_CHUNK_SIZE)); } DCHECK_LT(current_chunk_idx_, static_cast(chunks_.size())); @@ -188,16 +185,13 @@ std::string MemPool::DebugString() { char str[16]; out << "MemPool(#chunks=" << chunks_.size() << " ["; for (size_t i = 0; i < chunks_.size(); ++i) { - sprintf(str, "0x%lx=", reinterpret_cast(chunks_[i].data)); // NOLINT - out << (i > 0 ? " " : "") - << str - << chunks_[i].size - << "/" << chunks_[i].allocated_bytes; + sprintf(str, "0x%lx=", reinterpret_cast(chunks_[i].data)); // NOLINT + out << (i > 0 ? 
" " : "") << str << chunks_[i].size << "/" + << chunks_[i].allocated_bytes; } out << "] current_chunk=" << current_chunk_idx_ << " total_sizes=" << GetTotalChunkSizes() - << " total_alloc=" << total_allocated_bytes_ - << ")"; + << " total_alloc=" << total_allocated_bytes_ << ")"; return out.str(); } @@ -232,4 +226,4 @@ bool MemPool::CheckIntegrity(bool current_chunk_empty) { return true; } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/mem-pool.h b/src/parquet/util/mem-pool.h index 1022eb8b..b64ee29e 100644 --- a/src/parquet/util/mem-pool.h +++ b/src/parquet/util/mem-pool.h @@ -85,9 +85,7 @@ class MemPool { /// Allocates 8-byte aligned section of memory of 'size' bytes at the end /// of the the current chunk. Creates a new chunk if there aren't any chunks /// with enough capacity. - uint8_t* Allocate(int size) { - return Allocate(size); - } + uint8_t* Allocate(int size) { return Allocate(size); } /// Returns 'byte_size' to the current chunk back to the mem pool. This can /// only be used to return either all or part of the previous allocation returned @@ -131,18 +129,15 @@ class MemPool { static const int MAX_CHUNK_SIZE = 1024 * 1024; struct ChunkInfo { - uint8_t* data; // Owned by the ChunkInfo. - int64_t size; // in bytes + uint8_t* data; // Owned by the ChunkInfo. 
+ int64_t size; // in bytes /// bytes allocated via Allocate() in this chunk int64_t allocated_bytes; explicit ChunkInfo(int64_t size, uint8_t* buf); - ChunkInfo() - : data(NULL), - size(0), - allocated_bytes(0) {} + ChunkInfo() : data(NULL), size(0), allocated_bytes(0) {} }; /// chunk from which we served the last Allocate() call; @@ -189,9 +184,9 @@ class MemPool { if (size == 0) return NULL; int64_t num_bytes = BitUtil::RoundUp(size, 8); - if (current_chunk_idx_ == -1 - || num_bytes + chunks_[current_chunk_idx_].allocated_bytes - > chunks_[current_chunk_idx_].size) { + if (current_chunk_idx_ == -1 || + num_bytes + chunks_[current_chunk_idx_].allocated_bytes > + chunks_[current_chunk_idx_].size) { // If we couldn't allocate a new chunk, return NULL. if (UNLIKELY(!FindChunk(num_bytes))) return NULL; } @@ -206,6 +201,6 @@ class MemPool { } }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_MEM_POOL_H +#endif // PARQUET_UTIL_MEM_POOL_H diff --git a/src/parquet/util/output.cc b/src/parquet/util/output.cc index 704a13b4..1d273552 100644 --- a/src/parquet/util/output.cc +++ b/src/parquet/util/output.cc @@ -28,11 +28,10 @@ namespace parquet { // ---------------------------------------------------------------------- // In-memory output stream -InMemoryOutputStream::InMemoryOutputStream(int64_t initial_capacity, - MemoryAllocator* allocator) : size_(0), capacity_(initial_capacity) { - if (initial_capacity == 0) { - initial_capacity = IN_MEMORY_DEFAULT_CAPACITY; - } +InMemoryOutputStream::InMemoryOutputStream( + int64_t initial_capacity, MemoryAllocator* allocator) + : size_(0), capacity_(initial_capacity) { + if (initial_capacity == 0) { initial_capacity = IN_MEMORY_DEFAULT_CAPACITY; } buffer_.reset(new OwnedMutableBuffer(initial_capacity, allocator)); } @@ -64,4 +63,4 @@ std::shared_ptr InMemoryOutputStream::GetBuffer() { return result; } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/output.h 
b/src/parquet/util/output.h index 4cfa162b..472a9cce 100644 --- a/src/parquet/util/output.h +++ b/src/parquet/util/output.h @@ -74,6 +74,6 @@ class InMemoryOutputStream : public OutputStream { DISALLOW_COPY_AND_ASSIGN(InMemoryOutputStream); }; -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_OUTPUT_H +#endif // PARQUET_UTIL_OUTPUT_H diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h index 52388daf..20c96211 100644 --- a/src/parquet/util/rle-encoding.h +++ b/src/parquet/util/rle-encoding.h @@ -85,11 +85,11 @@ class RleDecoder { /// Create a decoder object. buffer/buffer_len is the decoded data. /// bit_width is the width of each value (before encoding). RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) - : bit_reader_(buffer, buffer_len), - bit_width_(bit_width), - current_value_(0), - repeat_count_(0), - literal_count_(0) { + : bit_reader_(buffer, buffer_len), + bit_width_(bit_width), + current_value_(0), + repeat_count_(0), + literal_count_(0) { DCHECK_GE(bit_width_, 0); DCHECK_LE(bit_width_, 64); } @@ -107,7 +107,7 @@ class RleDecoder { } /// Gets the next value. Returns false if there are no more. - template + template bool Get(T* val); protected: @@ -121,7 +121,7 @@ class RleDecoder { private: /// Fills literal_count_ and repeat_count_ with next values. Returns false if there /// are no more. - template + template bool NextCounts(); }; @@ -140,8 +140,7 @@ class RleEncoder { /// based on the bit_width, which can determine a storage optimal choice. /// TODO: allow 0 bit_width (and have dict encoder use it) RleEncoder(uint8_t* buffer, int buffer_len, int bit_width) - : bit_width_(bit_width), - bit_writer_(buffer, buffer_len) { + : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { DCHECK_GE(bit_width_, 0); DCHECK_LE(bit_width_, 64); max_run_byte_size_ = MinBufferSize(bit_width); @@ -154,8 +153,8 @@ class RleEncoder { /// It is not valid to pass a buffer less than this length. 
static int MinBufferSize(int bit_width) { /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. - int max_literal_run_size = 1 + - BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8); + int max_literal_run_size = + 1 + BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8); /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. int max_repeated_run_size = BitReader::MAX_VLQ_BYTE_LEN + BitUtil::Ceil(bit_width, 8); return std::max(max_literal_run_size, max_repeated_run_size); @@ -248,7 +247,7 @@ class RleEncoder { uint8_t* literal_indicator_byte_; }; -template +template inline bool RleDecoder::Get(T* val) { DCHECK_GE(bit_width_, 0); if (UNLIKELY(literal_count_ == 0 && repeat_count_ == 0)) { @@ -268,7 +267,7 @@ inline bool RleDecoder::Get(T* val) { return true; } -template +template bool RleDecoder::NextCounts() { // Read the next run's indicator int, it could be a literal or repeated run. // The int is encoded as a vlq-encoded value. @@ -399,16 +398,16 @@ inline void RleEncoder::FlushBufferedValues(bool done) { inline int RleEncoder::Flush() { if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { - bool all_repeat = literal_count_ == 0 && - (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0); + bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ || + num_buffered_values_ == 0); // There is something pending, figure out if it's a repeated or literal run if (repeat_count_ > 0 && all_repeat) { FlushRepeatedRun(); - } else { + } else { DCHECK_EQ(literal_count_ % 8, 0); // Buffer the last group of literals to 8 by padding with 0s. 
for (; num_buffered_values_ != 0 && num_buffered_values_ < 8; - ++num_buffered_values_) { + ++num_buffered_values_) { buffered_values_[num_buffered_values_] = 0; } literal_count_ += num_buffered_values_; @@ -441,6 +440,6 @@ inline void RleEncoder::Clear() { bit_writer_.Clear(); } -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_RLE_ENCODING_H +#endif // PARQUET_UTIL_RLE_ENCODING_H diff --git a/src/parquet/util/rle-test.cc b/src/parquet/util/rle-test.cc index 64fd57b9..d781137b 100644 --- a/src/parquet/util/rle-test.cc +++ b/src/parquet/util/rle-test.cc @@ -104,7 +104,7 @@ TEST(BitArray, TestBool) { // Writes 'num_vals' values with width 'bit_width' and reads them back. void TestBitArrayValues(int bit_width, int num_vals) { const int len = BitUtil::Ceil(bit_width * num_vals, 8); - const uint64_t mod = bit_width == 64? 1 : 1LL << bit_width; + const uint64_t mod = bit_width == 64 ? 1 : 1LL << bit_width; uint8_t buffer[len]; BitWriter writer(buffer, len); @@ -176,8 +176,8 @@ TEST(BitArray, TestMixed) { // expected_encoding != NULL, also validates that the encoded buffer is // exactly 'expected_encoding'. // if expected_len is not -1, it will validate the encoded size is correct. 
-void ValidateRle(const vector& values, int bit_width, - uint8_t* expected_encoding, int expected_len) { +void ValidateRle(const vector& values, int bit_width, uint8_t* expected_encoding, + int expected_len) { const int len = 64 * 1024; uint8_t buffer[len]; EXPECT_LE(expected_len, len); @@ -189,9 +189,7 @@ void ValidateRle(const vector& values, int bit_width, } int encoded_len = encoder.Flush(); - if (expected_len != -1) { - EXPECT_EQ(encoded_len, expected_len); - } + if (expected_len != -1) { EXPECT_EQ(encoded_len, expected_len); } if (expected_encoding != NULL) { EXPECT_TRUE(memcmp(buffer, expected_encoding, expected_len) == 0); } @@ -214,9 +212,7 @@ bool CheckRoundTrip(const vector& values, int bit_width) { RleEncoder encoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { bool result = encoder.Put(values[i]); - if (!result) { - return false; - } + if (!result) { return false; } } int encoded_len = encoder.Flush(); int out; @@ -224,9 +220,7 @@ bool CheckRoundTrip(const vector& values, int bit_width) { RleDecoder decoder(buffer, encoded_len, bit_width); for (size_t i = 0; i < values.size(); ++i) { EXPECT_TRUE(decoder.Get(&out)); - if (values[i] != out) { - return false; - } + if (values[i] != out) { return false; } } return true; } @@ -264,11 +258,11 @@ TEST(Rle, SpecificSequences) { } int num_groups = BitUtil::Ceil(100, 8); expected_buffer[0] = (num_groups << 1) | 1; - for (int i = 1; i <= 100/8; ++i) { + for (int i = 1; i <= 100 / 8; ++i) { expected_buffer[i] = BOOST_BINARY(1 0 1 0 1 0 1 0); } // Values for the last 4 0 and 1's. The upper 4 bits should be padded to 0. 
- expected_buffer[100/8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0); + expected_buffer[100 / 8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0); // num_groups and expected_buffer only valid for bit width = 1 ValidateRle(values, 1, expected_buffer, 1 + num_groups); @@ -301,13 +295,13 @@ TEST(Rle, TestValues) { TEST(Rle, BitWidthZeroRepeated) { uint8_t buffer[1]; const int num_values = 15; - buffer[0] = num_values << 1; // repeated indicator byte + buffer[0] = num_values << 1; // repeated indicator byte RleDecoder decoder(buffer, sizeof(buffer), 0); uint8_t val; for (int i = 0; i < num_values; ++i) { bool result = decoder.Get(&val); EXPECT_TRUE(result); - EXPECT_EQ(val, 0); // can only encode 0s with bit width 0 + EXPECT_EQ(val, 0); // can only encode 0s with bit width 0 } EXPECT_FALSE(decoder.Get(&val)); } @@ -315,14 +309,14 @@ TEST(Rle, BitWidthZeroRepeated) { TEST(Rle, BitWidthZeroLiteral) { uint8_t buffer[1]; const int num_groups = 4; - buffer[0] = num_groups << 1 | 1; // literal indicator byte + buffer[0] = num_groups << 1 | 1; // literal indicator byte RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0); const int num_values = num_groups * 8; uint8_t val; for (int i = 0; i < num_values; ++i) { bool result = decoder.Get(&val); EXPECT_TRUE(result); - EXPECT_EQ(val, 0); // can only encode 0s with bit width 0 + EXPECT_EQ(val, 0); // can only encode 0s with bit width 0 } EXPECT_FALSE(decoder.Get(&val)); } @@ -331,7 +325,8 @@ TEST(Rle, BitWidthZeroLiteral) { // group but flush before finishing. 
TEST(BitRle, Flush) { vector values; - for (int i = 0; i < 16; ++i) values.push_back(1); + for (int i = 0; i < 16; ++i) + values.push_back(1); values.push_back(0); ValidateRle(values, 1, NULL, -1); values.push_back(1); @@ -363,9 +358,7 @@ TEST(BitRle, Random) { for (int i = 0; i < ngroups; ++i) { int group_size = dist(gen); - if (group_size > max_group_size) { - group_size = 1; - } + if (group_size > max_group_size) { group_size = 1; } for (int i = 0; i < group_size; ++i) { values.push_back(parity); } @@ -437,4 +430,4 @@ TEST(BitRle, Overflow) { } } -} // namespace parquet +} // namespace parquet diff --git a/src/parquet/util/sse-util.h b/src/parquet/util/sse-util.h index 6288d1d2..653c1714 100644 --- a/src/parquet/util/sse-util.h +++ b/src/parquet/util/sse-util.h @@ -27,51 +27,36 @@ namespace parquet { - /// This class contains constants useful for text processing with SSE4.2 intrinsics. namespace SSEUtil { - /// Number of characters that fit in 64/128 bit register. SSE provides instructions - /// for loading 64 or 128 bits into a register at a time. - static const int CHARS_PER_64_BIT_REGISTER = 8; - static const int CHARS_PER_128_BIT_REGISTER = 16; - - /// SSE4.2 adds instructions for text processing. The instructions have a control - /// byte that determines some of functionality of the instruction. (Equivalent to - /// GCC's _SIDD_CMP_EQUAL_ANY, etc). - static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr - static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp - static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16) - static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4. - - /// In this mode, SSE text processing functions will return a mask of all the - /// characters that matched. - static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS; - - /// In this mode, SSE text processing functions will return the number of - /// bytes that match consecutively from the beginning. 
- static const int STRCMP_MODE = PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | - PCMPSTR_NEG_POLARITY; - - /// Precomputed mask values up to 16 bits. - static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { - 1 << 0, - 1 << 1, - 1 << 2, - 1 << 3, - 1 << 4, - 1 << 5, - 1 << 6, - 1 << 7, - 1 << 8, - 1 << 9, - 1 << 10, - 1 << 11, - 1 << 12, - 1 << 13, - 1 << 14, - 1 << 15, - }; -} // namespace SSEUtil +/// Number of characters that fit in 64/128 bit register. SSE provides instructions +/// for loading 64 or 128 bits into a register at a time. +static const int CHARS_PER_64_BIT_REGISTER = 8; +static const int CHARS_PER_128_BIT_REGISTER = 16; + +/// SSE4.2 adds instructions for text processing. The instructions have a control +/// byte that determines some of functionality of the instruction. (Equivalent to +/// GCC's _SIDD_CMP_EQUAL_ANY, etc). +static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr +static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp +static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16) +static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4. + +/// In this mode, SSE text processing functions will return a mask of all the +/// characters that matched. +static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS; + +/// In this mode, SSE text processing functions will return the number of +/// bytes that match consecutively from the beginning. +static const int STRCMP_MODE = + PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | PCMPSTR_NEG_POLARITY; + +/// Precomputed mask values up to 16 bits. +static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { + 1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, + 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, +}; +} // namespace SSEUtil #ifdef PARQUET_USE_SSE @@ -88,29 +73,35 @@ namespace SSEUtil { /// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an /// immediate. 
So, those need to be always inlined in order to always propagate the /// mode constant into the inline asm. -#define SSE_ALWAYS_INLINE inline __attribute__ ((__always_inline__)) +#define SSE_ALWAYS_INLINE inline __attribute__((__always_inline__)) -template +template static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { #ifdef __clang__ /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 - /// clang doesn't support Y-prefixed asm constraints. register volatile __m128i result asm("xmm0"); - __asm__ volatile ("pcmpestrm %5, %2, %1" - : "=x"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc"); + __asm__ volatile("pcmpestrm %5, %2, %1" + : "=x"(result) + : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) + : "cc"); #else __m128i result; - __asm__ volatile ("pcmpestrm %5, %2, %1" - : "=Yz"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc"); + __asm__ volatile("pcmpestrm %5, %2, %1" + : "=Yz"(result) + : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) + : "cc"); #endif return result; } -template +template static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { int result; __asm__("pcmpestri %5, %2, %1" - : "=c"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc"); + : "=c"(result) + : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) + : "cc"); return result; } @@ -143,7 +134,7 @@ static inline int64_t POPCNT_popcnt_u64(uint64_t a) { #undef SSE_ALWAYS_INLINE -#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2. +#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2. /// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not /// support it. However, the cross-compiled IR is compiled twice: with and without /// -msse4.2. 
When -msse4.2 is enabled in the cross-compile, we can just use the @@ -151,15 +142,13 @@ static inline int64_t POPCNT_popcnt_u64(uint64_t a) { #include -template -static inline __m128i SSE4_cmpestrm( - __m128i str1, int len1, __m128i str2, int len2) { +template +static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { return _mm_cmpestrm(str1, len1, str2, len2, MODE); } -template -static inline int SSE4_cmpestri( - __m128i str1, int len1, __m128i str2, int len2) { +template +static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { return _mm_cmpestri(str1, len1, str2, len2, MODE); } @@ -175,13 +164,13 @@ static inline int SSE4_cmpestri( /// support SSE 4.2. However, because the caller isn't allowed to call these routines /// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case. -template +template static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { DCHECK(false) << "CPU doesn't support SSE 4.2"; - return (__m128i) { 0 }; // NOLINT + return (__m128i){0}; // NOLINT } -template +template static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { DCHECK(false) << "CPU doesn't support SSE 4.2"; return 0; @@ -212,7 +201,7 @@ static inline int64_t POPCNT_popcnt_u64(uint64_t a) { return 0; } -#endif // IR_COMPILE +#endif // IR_COMPILE #else @@ -241,8 +230,8 @@ static inline int64_t POPCNT_popcnt_u64(uint64_t a) { return 0; } -#endif // PARQUET_USE_SSE +#endif // PARQUET_USE_SSE -} // namespace parquet +} // namespace parquet -#endif // PARQUET_UTIL_SSE_UTIL_H +#endif // PARQUET_UTIL_SSE_UTIL_H diff --git a/src/parquet/util/stopwatch.h b/src/parquet/util/stopwatch.h index 612a00c0..b940d8cd 100644 --- a/src/parquet/util/stopwatch.h +++ b/src/parquet/util/stopwatch.h @@ -28,26 +28,23 @@ namespace parquet { class StopWatch { public: - StopWatch() { - } + StopWatch() {} - void Start() { - gettimeofday(&start_time, 0); - } + void Start() { 
gettimeofday(&start_time, 0); } // Returns time in nanoseconds. uint64_t Stop() { struct timeval t_time; gettimeofday(&t_time, 0); - return (1000L * 1000L * 1000L * (t_time.tv_sec - start_time.tv_sec) - + (t_time.tv_usec - start_time.tv_usec)); + return (1000L * 1000L * 1000L * (t_time.tv_sec - start_time.tv_sec) + + (t_time.tv_usec - start_time.tv_usec)); } private: - struct timeval start_time; + struct timeval start_time; }; -} // namespace parquet +} // namespace parquet #endif diff --git a/src/parquet/util/test-common.h b/src/parquet/util/test-common.h index 019af8ec..edadb533 100644 --- a/src/parquet/util/test-common.h +++ b/src/parquet/util/test-common.h @@ -31,13 +31,11 @@ namespace parquet { namespace test { -typedef ::testing::Types ParquetTypes; +typedef ::testing::Types ParquetTypes; template -static inline void assert_vector_equal(const vector& left, - const vector& right) { +static inline void assert_vector_equal(const vector& left, const vector& right) { ASSERT_EQ(left.size(), right.size()); for (size_t i = 0; i < left.size(); ++i) { @@ -47,15 +45,11 @@ static inline void assert_vector_equal(const vector& left, template static inline bool vector_equal(const vector& left, const vector& right) { - if (left.size() != right.size()) { - return false; - } + if (left.size() != right.size()) { return false; } for (size_t i = 0; i < left.size(); ++i) { if (left[i] != right[i]) { - std::cerr << "index " << i - << " left was " << left[i] - << " right was " << right[i] + std::cerr << "index " << i << " left was " << left[i] << " right was " << right[i] << std::endl; return false; } @@ -66,9 +60,7 @@ static inline bool vector_equal(const vector& left, const vector& right) { template static vector slice(const vector& values, int start, int end) { - if (end < start) { - return vector(0); - } + if (end < start) { return vector(0); } vector out(end - start); for (int i = start; i < end; ++i) { @@ -137,8 +129,8 @@ void random_numbers(int n, uint32_t seed, float 
min_value, float max_value, floa } template <> -void random_numbers(int n, uint32_t seed, double min_value, double max_value, - double* out) { +void random_numbers( + int n, uint32_t seed, double min_value, double max_value, double* out) { std::mt19937 gen(seed); std::uniform_real_distribution d(min_value, max_value); for (int i = 0; i < n; ++i) { @@ -146,8 +138,8 @@ void random_numbers(int n, uint32_t seed, double min_value, double max_value, } } -void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value, - Int96* out) { +void random_Int96_numbers( + int n, uint32_t seed, int32_t min_value, int32_t max_value, Int96* out) { std::mt19937 gen(seed); std::uniform_int_distribution d(min_value, max_value); for (int i = 0; i < n; ++i) { @@ -157,8 +149,7 @@ void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_v } } -void random_fixed_byte_array(int n, uint32_t seed, uint8_t *buf, int len, - FLBA* out) { +void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out) { std::mt19937 gen(seed); std::uniform_int_distribution d(0, 255); for (int i = 0; i < n; ++i) { @@ -170,8 +161,8 @@ void random_fixed_byte_array(int n, uint32_t seed, uint8_t *buf, int len, } } -void random_byte_array(int n, uint32_t seed, uint8_t *buf, - ByteArray* out, int min_size, int max_size) { +void random_byte_array( + int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size, int max_size) { std::mt19937 gen(seed); std::uniform_int_distribution d1(min_size, max_size); std::uniform_int_distribution d2(0, 255); @@ -186,12 +177,11 @@ void random_byte_array(int n, uint32_t seed, uint8_t *buf, } } -void random_byte_array(int n, uint32_t seed, uint8_t *buf, - ByteArray* out, int max_size) { +void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size) { random_byte_array(n, seed, buf, out, 0, max_size); } -} // namespace test -} // namespace parquet +} // namespace test +} // namespace parquet 
-#endif // PARQUET_UTIL_TEST_COMMON_H +#endif // PARQUET_UTIL_TEST_COMMON_H diff --git a/src/parquet/util/test_main.cc b/src/parquet/util/test_main.cc index 00139f36..6fb7c053 100644 --- a/src/parquet/util/test_main.cc +++ b/src/parquet/util/test_main.cc @@ -17,7 +17,7 @@ #include -int main(int argc, char **argv) { +int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); int ret = RUN_ALL_TESTS();