diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc index 7bbc9c64ae..393367babe 100644 --- a/tools/src/CSVFileImport.cc +++ b/tools/src/CSVFileImport.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -77,8 +78,8 @@ void fillStringValues(const std::vector& data, orc::ColumnVectorBatch* batch, uint64_t numValues, uint64_t colIndex, - orc::DataBuffer& buffer, - uint64_t& offset) { + orc::DataBuffer& buffer) { + uint64_t offset = 0; orc::StringVectorBatch* stringBatch = dynamic_cast(batch); bool hasNull = false; @@ -365,7 +366,8 @@ int main(int argc, char* argv[]) { double totalElapsedTime = 0.0; clock_t totalCPUTime = 0; - orc::DataBuffer buffer(*orc::getDefaultPool(), 4 * 1024 * 1024); + typedef std::list> DataBufferList; + DataBufferList bufferList; orc::WriterOptions options; options.setStripeSize(stripeSize); @@ -385,7 +387,6 @@ int main(int argc, char* argv[]) { std::ifstream finput(input.c_str()); while (!eof) { uint64_t numValues = 0; // num of lines read in a batch - uint64_t bufferOffset = 0; // current offset in the string buffer data.clear(); memset(rowBatch->notNull.data(), 1, batchSize); @@ -420,13 +421,13 @@ int main(int argc, char* argv[]) { case orc::STRING: case orc::CHAR: case orc::VARCHAR: - case orc::BINARY: + case orc::BINARY: + bufferList.emplace_back(*orc::getDefaultPool(), 1 * 1024 * 1024); fillStringValues(data, structBatch->fields[i], numValues, i, - buffer, - bufferOffset); + bufferList.back()); break; case orc::FLOAT: case orc::DOUBLE: diff --git a/tools/test/TestCSVFileImport.cc b/tools/test/TestCSVFileImport.cc index b4cd9679f1..0ae7f2cf1f 100644 --- a/tools/test/TestCSVFileImport.cc +++ b/tools/test/TestCSVFileImport.cc @@ -24,6 +24,8 @@ #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" +#include + TEST (TestCSVFileImport, test10rows) { // create an ORC file from importing the CSV file const std::string pgm1 = findProgram("tools/src/csv-import"); @@ -90,3 +92,36 @@ TEST 
(TestCSVFileImport, testTimezoneOption) {
     EXPECT_EQ("", error);
   }
 }
+
+TEST (TestCSVFileImport, testLongString) {
+  // Regression test: import a CSV whose binary column is longer than 4 MiB.
+  const std::string pgm1 = findProgram("tools/src/csv-import");
+  const std::string csvFile = "/tmp/test_csv_import_test_long_string.csv";
+  const std::string orcFile = "/tmp/test_csv_import_test_long_string.orc";
+  const std::string schema = "'struct<_a:string,b_:binary,_c:varchar(10)>'";
+  std::string output;
+  std::string error;
+
+  // write the CSV input: two rows whose second column is 4 MiB + 1 bytes long
+  std::ofstream csvFileStream(csvFile, std::ios::binary | std::ios::out | std::ios::trunc);
+  ASSERT_TRUE(csvFileStream.is_open()) << "cannot create " << csvFile;
+  const std::string longStr(4 * 1024 * 1024 + 1, 'x');
+  csvFileStream << "str1," << longStr << ",var1\n";
+  csvFileStream << "str2," << longStr << ",var2\n";
+  csvFileStream.close();
+  // stop here if the input was not written completely, instead of importing it
+  ASSERT_FALSE(csvFileStream.fail()) << "cannot write " << csvFile;
+
+  EXPECT_EQ(0, runProgram({pgm1, schema, csvFile, orcFile}, output, error));
+  EXPECT_EQ("", error);
+
+  // verify the ORC file content; only columns 0 and 2 are requested
+  const std::string pgm2 = findProgram("tools/src/orc-contents");
+  const std::string option = "--columns=0,2";
+  const std::string expected =
+    "{\"_a\": \"str1\", \"_c\": \"var1\"}\n"
+    "{\"_a\": \"str2\", \"_c\": \"var2\"}\n";
+  EXPECT_EQ(0, runProgram({pgm2, option, orcFile}, output, error));
+  EXPECT_EQ(expected, output);
+  EXPECT_EQ("", error);
+}