From b7b54a5745d82f29738c7006de0a7833efc5a4fb Mon Sep 17 00:00:00 2001 From: Pasha Stetsenko Date: Fri, 27 Apr 2018 10:09:30 -0700 Subject: [PATCH] Fix reading of files where fields may contain many newlines --- NEWS.md | 2 +- inst/tests/tests.Rraw | 6 ++++++ src/fread.c | 3 --- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index bf1fdf9a0e..45e34bb3cd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -68,7 +68,7 @@ These options are meant for temporary use to aid your migration, [#2652](https:/ Warning message: In fread(txt) : Found and resolved improper quoting ``` - * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney, Ananda Mahto for testing dev and reporting these regressions before release to CRAN: #2070, #2073, #2087, #2091, #2107, #2118, #2092, #1888, #2123, #2167, #2194, #2238, #2228, #1464, #2201, #2287, #2299, #2285, #2251, #2347, #2222, #2352, #2246, #2370, #2371, #2404, #2196, #2322, #2453, #2446, #2464, #2457, #1895, #2481, #2499, #2516, #2520, #2512, #2523, #2542, #2526, #2518, #2515, #1671, #2267, #2561, #2625, #2265, #2548, #2535, #2744, #2735, #2697, #2666 + * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney, Ananda Mahto, @memoryfull, @brandenkmurray for testing dev and reporting these regressions before release to CRAN: #1464, #1671, #1888, #1895, #2070, #2073, #2087, #2091, #2092, #2107, #2118, #2123, #2167, #2194, #2196, #2201, #2222, #2228, #2238, #2246, #2251, #2265, #2267, #2285, #2287, #2299, #2322, #2347, #2352, #2370, #2371, #2395, #2404, #2446, #2453, #2457, #2464, #2481, #2499, #2512, #2515, #2516, #2518, #2520, #2523, #2526, #2535, #2542, #2548, #2561, #2600, #2625, #2666, #2697, #2735, #2744. 2. `fwrite()`: * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c87a498d43..1dd22a3754 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11713,6 +11713,12 @@ test(1906.2, fread(txt, quote=""), data.table(A=1:3, B=c('"hello", said Joe', '""howdy""', '"as"'), C=c('14" pizza', 'easy"', '"""pie"'))) +# Issue #2395 : text field containing too many newlines +example <- data.table(column1 = 1:3, column2 = c("text", "text\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nmany new lines\n\n\n\n\n\n", "text")) +fwrite(example, file = (f<-tempfile())) +test(1907, fread(f), example) + + ################################### # Add new tests above this line # ################################### diff --git a/src/fread.c b/src/fread.c index e4f344be94..ac31e2f5fb 100644 --- a/src/fread.c +++ b/src/fread.c @@ -503,12 +503,10 @@ static void Field(FieldParseContext *ctx) // the field is quoted and quotes are correctly escaped (quoteRule 0 and 1) // or the field is quoted but quotes are not escaped (quoteRule 2) // or the field is not quoted but the data contains a quote at the start (quoteRule 2 too) - int eolCount = 0; fieldStart++; // step over opening quote switch(quoteRule) { case 0: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol while (*++ch) { - if (*ch=='\n' && ++eolCount==100) return; // TODO: expose this 100 to user to allow them to control limiting runaway fields if (*ch==quote) { if (ch[1]==quote) { ch++; continue; } break; // found undoubled closing quote @@ -517,7 +515,6 @@ static void Field(FieldParseContext *ctx) break; case 1: // quoted with embedded quotes escaped; the final unescaped " must be followed by sep|eol while (*++ch) { - if (*ch=='\n' && ++eolCount==100) return; if (*ch=='\\' && (ch[1]==quote || ch[1]=='\\')) { ch++; continue; } if (*ch==quote) break; }