From b91a9e177e773193e45ffdca933c6378e7f41732 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 30 Jun 2019 16:30:08 -0700 Subject: [PATCH 1/6] ARROW-5791: [C++] Fix infinite loop with more the 32768 columns. Cap max columns --- cpp/src/arrow/csv/parser-test.cc | 33 ++++++++++++++++++++++++++++++++ cpp/src/arrow/csv/parser.cc | 11 +++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc index 36552309b27..abeef7bfe6d 100644 --- a/cpp/src/arrow/csv/parser-test.cc +++ b/cpp/src/arrow/csv/parser-test.cc @@ -439,6 +439,39 @@ TEST(BlockParser, Escaping) { } } +// Generate test data with the given number of columns. +std::string MakeLotsOfCsvColumns(int32_t num_columns) { + std::string values, header; + header.reserve(num_columns * 10); + values.reserve(num_columns * 10); + for (int x = 0; x < num_columns; x++) { + if (x != 0) { + header += ","; + values += ","; + } + header += "c" + std::to_string(x); + values += std::to_string(x); + } + + header += "\n"; + values += "\n"; + return MakeCSVData({header, values}); +} + +TEST(BlockParser, MaxAllowedColumns) { + auto options = ParseOptions::Defaults(); + BlockParser parser(options); + AssertParseOk(parser, MakeLotsOfCsvColumns(1000 * 1024)); +} + +TEST(BlockParser, MoreThanMaxAllowedColumns) { + auto options = ParseOptions::Defaults(); + BlockParser parser(options); + uint32_t parsed_size = static_cast(-1); + ASSERT_RAISES(Invalid, + Parse(parser, MakeLotsOfCsvColumns((1024 * 1000) + 1), &parsed_size)); +} + TEST(BlockParser, QuotedEscape) { auto options = ParseOptions::Defaults(); options.escaping = true; diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index a7ca71c9fd7..8e554973390 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -397,16 +397,23 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is return ParseError("Empty CSV file or block: cannot infer number of columns"); } } + constexpr int32_t kMaxCols = 1000 * 1024; + if (num_cols_ > kMaxCols) { + return Status::Invalid("CSV parsing only supports upto ", kMaxCols, + " in CSV Files. Found: ", num_cols_); + } while (!finished_parsing && data < data_end && num_rows_ < max_num_rows_) { // We know the number of columns, so can presize a values array for // a given number of rows DCHECK_GE(num_cols_, 0); int32_t rows_in_chunk; + constexpr int32_t kTargetChunkSize = 32768; if (num_cols_ > 0) { - rows_in_chunk = std::min(32768 / num_cols_, max_num_rows_ - num_rows_); + rows_in_chunk = + std::max(std::min(kTargetChunkSize / num_cols_, max_num_rows_ - num_rows_), 1); } else { - rows_in_chunk = std::min(32768, max_num_rows_ - num_rows_); + rows_in_chunk = std::max(std::min(kTargetChunkSize, max_num_rows_ - num_rows_), 1); } PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_); From 211472a128454a15ee5ed9a4816c4645814a6fd2 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 30 Jun 2019 17:07:06 -0700 Subject: [PATCH 2/6] powers of 2 are better --- cpp/src/arrow/csv/parser-test.cc | 4 ++-- cpp/src/arrow/csv/parser.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc index abeef7bfe6d..ef90f36e14f 100644 --- a/cpp/src/arrow/csv/parser-test.cc +++ b/cpp/src/arrow/csv/parser-test.cc @@ -461,7 +461,7 @@ std::string MakeLotsOfCsvColumns(int32_t num_columns) { TEST(BlockParser, MaxAllowedColumns) { auto options = ParseOptions::Defaults(); BlockParser parser(options); - AssertParseOk(parser, MakeLotsOfCsvColumns(1000 * 1024)); + AssertParseOk(parser, MakeLotsOfCsvColumns(1024 * 1024)); } TEST(BlockParser, MoreThanMaxAllowedColumns) { @@ -469,7 +469,7 @@ TEST(BlockParser, MoreThanMaxAllowedColumns) { BlockParser parser(options); uint32_t parsed_size = static_cast(-1); ASSERT_RAISES(Invalid, - Parse(parser, MakeLotsOfCsvColumns((1024 * 1000) + 1), &parsed_size)); + Parse(parser, MakeLotsOfCsvColumns((1024 * 1024) + 1), &parsed_size)); } TEST(BlockParser, QuotedEscape) { diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index 8e554973390..fea783e0543 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -397,7 +397,7 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is return ParseError("Empty CSV file or block: cannot infer number of columns"); } } - constexpr int32_t kMaxCols = 1000 * 1024; + constexpr int32_t kMaxCols = 1024 * 1024; if (num_cols_ > kMaxCols) { return Status::Invalid("CSV parsing only supports upto ", kMaxCols, " in CSV Files. Found: ", num_cols_); From 08ddc2238f542dfc0d9e77ff386c27f84b6f5175 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 30 Jun 2019 18:14:08 -0700 Subject: [PATCH 3/6] remove floor duplication --- cpp/src/arrow/csv/parser.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index fea783e0543..5f5cb9594c1 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -410,11 +410,11 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is int32_t rows_in_chunk; constexpr int32_t kTargetChunkSize = 32768; if (num_cols_ > 0) { - rows_in_chunk = - std::max(std::min(kTargetChunkSize / num_cols_, max_num_rows_ - num_rows_), 1); + rows_in_chunk = std::min(kTargetChunkSize / num_cols_, max_num_rows_ - num_rows_); } else { - rows_in_chunk = std::max(std::min(kTargetChunkSize, max_num_rows_ - num_rows_), 1); + rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - num_rows_); } + rows_in_chunk = std::max(1, rows_in_chunk); PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_); values_writer.Start(parsed_writer); From acfe2d894651be274da7c058f4d8ffdbbce42676 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 1 Jul 2019 04:55:56 -0700 Subject: [PATCH 4/6] remove cap, make min rows_in_chunk 512 --- cpp/src/arrow/csv/parser.cc | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index 5f5cb9594c1..89c3f4cb168 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -397,11 +397,7 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is return ParseError("Empty CSV file or block: cannot infer number of columns"); } } - constexpr int32_t kMaxCols = 1024 * 1024; - if (num_cols_ > kMaxCols) { - return Status::Invalid("CSV parsing only supports upto ", kMaxCols, - " in CSV Files. Found: ", num_cols_); - } + while (!finished_parsing && data < data_end && num_rows_ < max_num_rows_) { // We know the number of columns, so can presize a values array for // a given number of rows @@ -410,11 +406,11 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is int32_t rows_in_chunk; constexpr int32_t kTargetChunkSize = 32768; if (num_cols_ > 0) { - rows_in_chunk = std::min(kTargetChunkSize / num_cols_, max_num_rows_ - num_rows_); + rows_in_chunk = std::min(std::max(kTargetChunkSize / num_cols_, 512), + max_num_rows_ - num_rows_); } else { rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - num_rows_); } - rows_in_chunk = std::max(1, rows_in_chunk); PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_); values_writer.Start(parsed_writer); From 8f53a8a58caca022c283d1237085bd21b6e148bf Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 1 Jul 2019 04:58:53 -0700 Subject: [PATCH 5/6] remove test --- cpp/src/arrow/csv/parser-test.cc | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc index ef90f36e14f..757e6ac3f07 100644 --- a/cpp/src/arrow/csv/parser-test.cc +++ b/cpp/src/arrow/csv/parser-test.cc @@ -458,20 +458,12 @@ std::string MakeLotsOfCsvColumns(int32_t num_columns) { return MakeCSVData({header, values}); } -TEST(BlockParser, MaxAllowedColumns) { +TEST(BlockParser, LotsOfColumns) { auto options = ParseOptions::Defaults(); BlockParser parser(options); AssertParseOk(parser, MakeLotsOfCsvColumns(1024 * 1024)); } -TEST(BlockParser, MoreThanMaxAllowedColumns) { - auto options = ParseOptions::Defaults(); - BlockParser parser(options); - uint32_t parsed_size = static_cast(-1); - ASSERT_RAISES(Invalid, - Parse(parser, MakeLotsOfCsvColumns((1024 * 1024) + 1), &parsed_size)); -} - TEST(BlockParser, QuotedEscape) { auto options = ParseOptions::Defaults(); options.escaping = true; From ab0504c16e0a698c538837f2ff8605bdc5e622ac Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 1 Jul 2019 08:26:29 -0700 Subject: [PATCH 6/6] lower number of columns in test to satisfy ming --- cpp/src/arrow/csv/parser-test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc index 757e6ac3f07..d1790b23da1 100644 --- a/cpp/src/arrow/csv/parser-test.cc +++ b/cpp/src/arrow/csv/parser-test.cc @@ -461,7 +461,7 @@ std::string MakeLotsOfCsvColumns(int32_t num_columns) { TEST(BlockParser, LotsOfColumns) { auto options = ParseOptions::Defaults(); BlockParser parser(options); - AssertParseOk(parser, MakeLotsOfCsvColumns(1024 * 1024)); + AssertParseOk(parser, MakeLotsOfCsvColumns(1024 * 100)); } TEST(BlockParser, QuotedEscape) {