diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc
index 36552309b27..d1790b23da1 100644
--- a/cpp/src/arrow/csv/parser-test.cc
+++ b/cpp/src/arrow/csv/parser-test.cc
@@ -439,6 +439,44 @@ TEST(BlockParser, Escaping) {
   }
 }
 
+// Generate CSV test data with the given number of columns: one line of column
+// names ("c0", "c1", ...) and one line of values ("0", "1", ...), both handed
+// to MakeCSVData.  A non-positive `num_columns` yields two lines holding only "\n".
+std::string MakeLotsOfCsvColumns(int32_t num_columns) {
+  std::string header;
+  std::string values;
+  if (num_columns > 0) {
+    // Rough size estimate (~10 chars per cell), computed in size_t so that the
+    // multiplication cannot overflow a signed 32-bit integer.
+    const auto estimated_size = static_cast<size_t>(num_columns) * 10;
+    header.reserve(estimated_size);
+    values.reserve(estimated_size);
+  }
+  for (int32_t x = 0; x < num_columns; x++) {
+    if (x != 0) {
+      header += ',';
+      values += ',';
+    }
+    const std::string index = std::to_string(x);
+    header += 'c';
+    header += index;
+    values += index;
+  }
+
+  header += '\n';
+  values += '\n';
+  return MakeCSVData({header, values});
+}
+
+TEST(BlockParser, LotsOfColumns) {
+  // Use more columns than the parser's target chunk size (32768), a case where
+  // `32768 / num_cols` rounds down to zero rows per chunk.
+  constexpr int32_t kNumColumns = 1024 * 100;
+  auto options = ParseOptions::Defaults();
+  BlockParser parser(options);
+  AssertParseOk(parser, MakeLotsOfCsvColumns(kNumColumns));
+}
+
 TEST(BlockParser, QuotedEscape) {
   auto options = ParseOptions::Defaults();
   options.escaping = true;
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index a7ca71c9fd7..89c3f4cb168 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -397,16 +397,22 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is
       return ParseError("Empty CSV file or block: cannot infer number of columns");
     }
   }
+
   while (!finished_parsing && data < data_end && num_rows_ < max_num_rows_) {
     // We know the number of columns, so can presize a values array for
     // a given number of rows
     DCHECK_GE(num_cols_, 0);
 
     int32_t rows_in_chunk;
+    constexpr int32_t kTargetChunkSize = 32768;
+    // Always presize for at least this many rows: with more than kTargetChunkSize
+    // columns the integer division below would otherwise yield 0 rows per chunk.
+    constexpr int32_t kMinRowsInChunk = 512;
     if (num_cols_ > 0) {
-      rows_in_chunk = std::min(32768 / num_cols_, max_num_rows_ - num_rows_);
+      rows_in_chunk = std::min(std::max(kTargetChunkSize / num_cols_, kMinRowsInChunk),
+                               max_num_rows_ - num_rows_);
     } else {
-      rows_in_chunk = std::min(32768, max_num_rows_ - num_rows_);
+      rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - num_rows_);
     }
 
     PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_);