From 8c4a5c34e055d84fa92bdc7dc8b1e91bf0c6279d Mon Sep 17 00:00:00 2001 From: Kyle Date: Sat, 12 Feb 2022 17:25:34 +0800 Subject: [PATCH 1/3] ORC-1116: [C++] Fix csv-import tool when exporting long bytes Give each string-type column its own data buffer in fillStringValues(), so that when one column calls buffer.resize(), the invalidation of the previous buffer.data() pointer does not affect the other columns. --- tools/src/CSVFileImport.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc index 7bbc9c64ae..67a4169bb9 100644 --- a/tools/src/CSVFileImport.cc +++ b/tools/src/CSVFileImport.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -77,8 +78,8 @@ void fillStringValues(const std::vector& data, orc::ColumnVectorBatch* batch, uint64_t numValues, uint64_t colIndex, - orc::DataBuffer& buffer, - uint64_t& offset) { + orc::DataBuffer& buffer) { + uint64_t offset = 0; orc::StringVectorBatch* stringBatch = dynamic_cast(batch); bool hasNull = false; @@ -365,7 +366,8 @@ int main(int argc, char* argv[]) { double totalElapsedTime = 0.0; clock_t totalCPUTime = 0; - orc::DataBuffer buffer(*orc::getDefaultPool(), 4 * 1024 * 1024); + typedef std::list> DataBufferList; + DataBufferList bufferList; orc::WriterOptions options; options.setStripeSize(stripeSize); @@ -385,7 +387,6 @@ int main(int argc, char* argv[]) { std::ifstream finput(input.c_str()); while (!eof) { uint64_t numValues = 0; // num of lines read in a batch - uint64_t bufferOffset = 0; // current offset in the string buffer data.clear(); memset(rowBatch->notNull.data(), 1, batchSize); @@ -420,13 +421,13 @@ int main(int argc, char* argv[]) { case orc::STRING: case orc::CHAR: case orc::VARCHAR: - case orc::BINARY: + case orc::BINARY: + bufferList.emplace_back(*orc::getDefaultPool(), 4 * 1024 * 1024); fillStringValues(data, structBatch->fields[i], numValues, i, - buffer, - bufferOffset); + 
bufferList.back()); break; case orc::FLOAT: case orc::DOUBLE: From 2ab2cae5371a391ae4cbe71d1a55b1f8a1bd0cd6 Mon Sep 17 00:00:00 2001 From: Kyle Date: Wed, 16 Feb 2022 10:49:18 +0800 Subject: [PATCH 2/3] Reduce initial buffer size to 1MB per column --- tools/src/CSVFileImport.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc index 67a4169bb9..393367babe 100644 --- a/tools/src/CSVFileImport.cc +++ b/tools/src/CSVFileImport.cc @@ -422,7 +422,7 @@ int main(int argc, char* argv[]) { case orc::CHAR: case orc::VARCHAR: case orc::BINARY: - bufferList.emplace_back(*orc::getDefaultPool(), 4 * 1024 * 1024); + bufferList.emplace_back(*orc::getDefaultPool(), 1 * 1024 * 1024); fillStringValues(data, structBatch->fields[i], numValues, From 3534497891f173f8031bcdaee1bb9ab853277aa0 Mon Sep 17 00:00:00 2001 From: Kyle Date: Wed, 16 Feb 2022 16:49:17 +0800 Subject: [PATCH 3/3] Add test coverage for importing long string data --- tools/test/TestCSVFileImport.cc | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/test/TestCSVFileImport.cc b/tools/test/TestCSVFileImport.cc index b4cd9679f1..0ae7f2cf1f 100644 --- a/tools/test/TestCSVFileImport.cc +++ b/tools/test/TestCSVFileImport.cc @@ -24,6 +24,8 @@ #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" +#include + TEST (TestCSVFileImport, test10rows) { // create an ORC file from importing the CSV file const std::string pgm1 = findProgram("tools/src/csv-import"); @@ -90,3 +92,36 @@ TEST (TestCSVFileImport, testTimezoneOption) { EXPECT_EQ("", error); } } + +TEST (TestCSVFileImport, testLongString) { + // create an ORC file from importing the CSV file + const std::string pgm1 = findProgram("tools/src/csv-import"); + const std::string csvFile = "/tmp/test_csv_import_test_long_string.csv"; + const std::string orcFile = "/tmp/test_csv_import_test_long_string.orc"; + const std::string schema = 
"'struct<_a:string,b_:binary,_c:varchar(10)>'"; + std::string output; + std::string error; + + std::ofstream csvFileStream(csvFile, std::ios::binary | std::ios::out | std::ios::trunc); + if(csvFileStream.is_open()) + { + std::string longStr; + longStr.resize(4 * 1024 * 1024 + 1, 'x'); + csvFileStream << "str1," << longStr << ",var1\n"; + csvFileStream << "str2," << longStr << ",var2\n"; + csvFileStream.close(); + } + + EXPECT_EQ(0, runProgram({pgm1, schema, csvFile, orcFile}, output, error)); + EXPECT_EQ("", error); + + // verify the ORC file content + const std::string pgm2 = findProgram("tools/src/orc-contents"); + std::string option = "--columns=0,2"; + const std::string expected = + "{\"_a\": \"str1\", \"_c\": \"var1\"}\n" + "{\"_a\": \"str2\", \"_c\": \"var2\"}\n"; + EXPECT_EQ(0, runProgram({pgm2, option, orcFile}, output, error)); + EXPECT_EQ(expected, output); + EXPECT_EQ("", error); +}