From 781a167e4a73f9ddb67154307eb325b62d7046f6 Mon Sep 17 00:00:00 2001 From: DexGroves Date: Tue, 27 Oct 2015 15:17:28 +0000 Subject: [PATCH] Don't allow NA as a factor level Tidying up PR #1408 Fix referencing the wrong issue --- R/fread.R | 6 +++--- README.md | 2 ++ inst/tests/tests.Rraw | 8 ++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/R/fread.R b/R/fread.R index 7bc71e6bce..4625a0f21f 100644 --- a/R/fread.R +++ b/R/fread.R @@ -35,7 +35,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str } if (toupper(tt)!=toupper(i)) { warning(cmd, " returned '",tt,"' != '",i,"' (not NULL not '' and allowing for case differences). This may not be a problem but please report.") - } + } if (Sys.localeconv()["decimal_point"] == dec) break if (verbose) cat("Successfully changed locale but it provides dec='",Sys.localeconv()["decimal_point"],"' not the desired dec", sep="") } @@ -98,7 +98,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str setattr(ans, 'names', make.unique(names(ans))) } as_factor <- function(x) { - lev = forderv(x, retGrp = TRUE) + lev = forderv(x, retGrp = TRUE, na.last = NA) # get levels, also take care of all sorted condition if (length(lev)) lev = x[lev[attributes(lev)$starts]] else lev = x[attributes(lev)$starts] @@ -110,7 +110,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str cols = which(vapply(ans, is.character, TRUE)) if (length(cols)) { if (verbose) cat("Converting column(s) [", paste(names(ans)[cols], collapse = ", "), "] from 'char' to 'factor'\n", sep = "") - for (j in cols) + for (j in cols) set(ans, j = j, value = as_factor(.subset2(ans, j))) } } diff --git a/README.md b/README.md index 8bef27f266..6d01af6ec1 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,8 @@ 13. `fread` reads text input with empty newline but with just spaces properly, for e.g., fread('a,b\n1,2\n '), [#1384](https://github.com/Rdatatable/data.table/issues/1384). Thanks to @ladida771. + 14. `fread` with `stringsAsFactors = TRUE` no longer produces factors with NA as a factor level, [#1408](https://github.com/Rdatatable/data.table/pull/1408). Thanks to @DexGroves. + #### NOTES 1. Updated error message on invalid joins to reflect the new `on=` syntax, [#1368](https://github.com/Rdatatable/data.table/issues/1368). Thanks @MichaelChirico. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e6329e142e..a31c78e226 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7075,6 +7075,14 @@ setattr(attr(X, 'index'), 'x', 5:1) # auto indexed attribute as created from v1. ans = capture.output(X[, z := 1:5, verbose=TRUE]) test(1576, ans[4], "Dropping index 'x' as it doesn't have '__' at the beginning of index name. It is very likely created using v1.9.4 of data.table.") +# fix for #1408 +X = fread("a|b|c|d + this|is|row|1 + this|is|row|2 + this|NA|NA|3 + this|is|row|4", stringsAsFactors = TRUE) +test(201.1, is.na(X[3, b]), TRUE) + ##########################