diff --git a/CRAN_Release.cmd b/CRAN_Release.cmd index bd13780bb6..817dadd2a2 100644 --- a/CRAN_Release.cmd +++ b/CRAN_Release.cmd @@ -42,6 +42,9 @@ grep "class *=" data.table/src/*.c # quite a few but none global # Failed clang 3.9.1 -O3 due to this, I think. grep "&REAL" data.table/src/*.c +# No use of long long, instead use int64_t. TODO +# grep "long long" data.table/src/*.c + # seal leak potential where two unprotected API calls are passed to the same # function call, usually involving install() or mkChar() # Greppable thanks to single lines and wide screens @@ -71,7 +74,7 @@ grep ScalarString *.c cd R -cc(clean=TRUE) # to compile with -pedandic +cc(clean=TRUE) # to compile with -pedandic. Also use very latest gcc (currently gcc-7) as CRAN does q("no") R CMD build data.table R CMD check data.table_1.10.1.tar.gz --as-cran @@ -146,7 +149,7 @@ tar xvf R-devel.tar.gz cd R-devel # Following R-exts#4.3.3 # (clang 3.6.0 works but gcc 4.9.2 fails in R's distance.c:256 error: ‘*.Lubsan_data0’ not specified in enclosing parallel) -./configure CC="clang -std=gnu99 -fsanitize=undefined,address" CFLAGS="-fno-omit-frame-pointer -O0 -g -Wall -pedantic -mtune=native" --without-recommended-packages --disable-byte-compiled-packages +./configure CC="clang -std=gnu99 -fsanitize=undefined,address" CFLAGS="-fno-omit-frame-pointer -O0 -g -Wall -pedantic -mtune=native" --without-recommended-packages --disable-byte-compiled-packages make alias Rdevel='~/build/R-devel/bin/R --vanilla' Rdevel @@ -244,7 +247,7 @@ shutdown now # doesn't return you to host prompt properly so just kill the win sudo apt-get update sudo apt-get -y install htop sudo apt-get -y install r-base r-base-dev -sudo apt-get -y build-dep r-base-dev +sudo apt-get -y build-dep r-base-dev sudo apt-get -y build-dep qpdf sudo apt-get -y build-dep r-cran-rgl sudo apt-get -y build-dep r-cran-rmpi @@ -302,7 +305,7 @@ old = 0 new = 0 for (p in deps) { fn = paste0(p, "_", avail[p,"Version"], ".tar.gz") - if (!file.exists(fn) || + if (!file.exists(fn) || identical(tryCatch(packageVersion(p), error=function(e)FALSE), FALSE) || packageVersion(p) != avail[p,"Version"]) { system(paste0("rm -f ", p, "*.tar.gz")) # Remove previous *.tar.gz. -f to be silent if not there (i.e. first time seeing this package) @@ -388,7 +391,7 @@ run = function(all=FALSE) { x = deps[!x] if (!length(x)) { cat("All package checks have already run. To rerun all: run(all=TRUE).\n"); return(); } cat("Running checks for",length(x),"packages\n") - cmd = paste0("ls -1 *.tar.gz | grep -E '", paste0(x,collapse="|"),"' | parallel R CMD check") + cmd = paste0("ls -1 *.tar.gz | grep -E '", paste0(x,collapse="|"),"' | parallel R CMD check") } else { cmd = "rm -rf *.Rcheck ; ls -1 *.tar.gz | parallel R CMD check" # apx 2.5 hrs for 313 packages on my 4 cpu laptop with 8 threads diff --git a/NEWS.md b/NEWS.md index e1bd2d672a..26a953931f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -54,62 +54,70 @@ #### BUG FIXES -1. The type pun fix (using union) in 1.10.4 resolved some CRAN flavors but still failed the new `fwrite` nanotime test with R-devel on MacOS using latest clang from latest Xcode 8.2. It seems that clang optimizations in Xcode 8 are particularly aggressive and require even stricter adherence to C standards. The type pun was already centralized and now uses `memcpy` which is ok by C standards and compilers apparently know to optimize to avoid call overhead. - -2. The new quote rules handles this single field `"Our Stock Screen Delivers an Israeli Software Company (MNDO, CTCH)<\/a> SmallCapInvestor.com - Thu, May 19, 2011 10:02 AM EDT<\/cite><\/div>Yesterday in \""Google, But for Finding +1. The new quote rules handles this single field `"Our Stock Screen Delivers an Israeli Software Company (MNDO, CTCH)<\/a> SmallCapInvestor.com - Thu, May 19, 2011 10:02 AM EDT<\/cite><\/div>Yesterday in \""Google, But for Finding Great Stocks\"", I discussed the value of stock screeners as a powerful tool"`, [#2051](https://github.com/Rdatatable/data.table/issues/2051). Thanks to @scarrascoso for reporting. Example file added to test suite. -3. `fwrite()` creates a file with permissions that now play correctly with `Sys.umask()`, [#2049](https://github.com/Rdatatable/data.table/issues/2049). Thanks to @gnguy for reporting. - -4. `fread()` no longer holds an open lock on the file when a line outside the large sample has too many fields and generates an error, [#2044](https://github.com/Rdatatable/data.table/issues/2044). Thanks to Hugh Parsonage for reporting. +2. `fwrite()` creates a file with permissions that now play correctly with `Sys.umask()`, [#2049](https://github.com/Rdatatable/data.table/issues/2049). Thanks to @gnguy for reporting. -5. When `fread()` and `print()` see `integer64` columns are present but package `bit64` is not installed, the warning is now displayed as intended. Thanks to a question by Santosh on r-help and forwarded by Bill Dunlap. +3. `fread()` no longer holds an open lock on the file when a line outside the large sample has too many fields and generates an error, [#2044](https://github.com/Rdatatable/data.table/issues/2044). Thanks to Hugh Parsonage for reporting. -6. Setting `j = {}` no longer results in an error, [#2142](https://github.com/Rdatatable/data.table/issues/2142). Thanks Michael Chirico for the pull request. +4. Setting `j = {}` no longer results in an error, [#2142](https://github.com/Rdatatable/data.table/issues/2142). Thanks Michael Chirico for the pull request. -7. Seg fault in `rbindlist()` when one or more items are empty, [#2019](https://github.com/Rdatatable/data.table/issues/2019). Thanks Michael Lang for the pull request. +5. Seg fault in `rbindlist()` when one or more items are empty, [#2019](https://github.com/Rdatatable/data.table/issues/2019). Thanks Michael Lang for the pull request. -8. Error printing 0-length `ITime` and `NA` objects, [#2032](https://github.com/Rdatatable/data.table/issues/2032) and [#2171](https://github.com/Rdatatable/data.table/issues/2171). Thanks Michael Chirico for the pull requests and @franknarf1 for pointing out a shortcoming of the initial fix. +6. Error printing 0-length `ITime` and `NA` objects, [#2032](https://github.com/Rdatatable/data.table/issues/2032) and [#2171](https://github.com/Rdatatable/data.table/issues/2171). Thanks Michael Chirico for the pull requests and @franknarf1 for pointing out a shortcoming of the initial fix. -9. `as.IDate.POSIXct` error with `NULL` timezone, [#1973](https://github.com/Rdatatable/data.table/issues/1973). Thanks @lbilli for reporting and Michael Chirico for the pull request. +7. `as.IDate.POSIXct` error with `NULL` timezone, [#1973](https://github.com/Rdatatable/data.table/issues/1973). Thanks @lbilli for reporting and Michael Chirico for the pull request. -10. Printing a null `data.table` with `print` no longer visibly outputs `NULL`, [#1852](https://github.com/Rdatatable/data.table/issues/1852). Thanks @aaronmcdaid for spotting and @MichaelChirico for the PR. +8. Printing a null `data.table` with `print` no longer visibly outputs `NULL`, [#1852](https://github.com/Rdatatable/data.table/issues/1852). Thanks @aaronmcdaid for spotting and @MichaelChirico for the PR. -11. `data.table` now works with Shiny Reactivity / Flexdashboard. The error was typically something like `col not found` in `DT[col==val]`. Thanks to Dirk Eddelbuettel leading Matt through reproducible steps and @sergeganakou and Richard White for reporting. Closes [#2001](https://github.com/Rdatatable/data.table/issues/2001) and [shiny/#1696](https://github.com/rstudio/shiny/issues/1696). +9. `data.table` now works with Shiny Reactivity / Flexdashboard. The error was typically something like `col not found` in `DT[col==val]`. Thanks to Dirk Eddelbuettel leading Matt through reproducible steps and @sergeganakou and Richard White for reporting. Closes [#2001](https://github.com/Rdatatable/data.table/issues/2001) and [shiny/#1696](https://github.com/rstudio/shiny/issues/1696). -12. The `as.IDate.POSIXct` method passed `tzone` along but was not exported. So `tzone` is now taken into account by `as.IDate` too as well as `IDateTime`, [#977](https://github.com/Rdatatable/data.table/issues/977) and [#1498](https://github.com/Rdatatable/data.table/issues/1498). Tests added. +10. The `as.IDate.POSIXct` method passed `tzone` along but was not exported. So `tzone` is now taken into account by `as.IDate` too as well as `IDateTime`, [#977](https://github.com/Rdatatable/data.table/issues/977) and [#1498](https://github.com/Rdatatable/data.table/issues/1498). Tests added. -13. Named logical vector now select rows as expected from single row data.table. Thanks to @skranz for reporting. Closes [#2152](https://github.com/Rdatatable/data.table/issues/2152). +11. Named logical vector now select rows as expected from single row data.table. Thanks to @skranz for reporting. Closes [#2152](https://github.com/Rdatatable/data.table/issues/2152). -14. `fread()`'s rare `Internal error: Sampling jump point 10 is before the last jump ended` has been fixed, [#2157](https://github.com/Rdatatable/data.table/issues/2157). Thanks to Frank Erickson and Artem Klevtsov for reporting with example files which are now added to the test suite. +12. `fread()`'s rare `Internal error: Sampling jump point 10 is before the last jump ended` has been fixed, [#2157](https://github.com/Rdatatable/data.table/issues/2157). Thanks to Frank Erickson and Artem Klevtsov for reporting with example files which are now added to the test suite. -15. `CJ()` no longer loses attribute information, [#2029](https://github.com/Rdatatable/data.table/issues/2029). Thanks to @MarkusBonsch and @royalts for the pull request. +13. `CJ()` no longer loses attribute information, [#2029](https://github.com/Rdatatable/data.table/issues/2029). Thanks to @MarkusBonsch and @royalts for the pull request. -16. `split.data.table` respects `factor` ordering in `by` argument, [#2082](https://github.com/Rdatatable/data.table/issues/2082). Thanks to @MichaelChirico for identifying and fixing the issue. +14. `split.data.table` respects `factor` ordering in `by` argument, [#2082](https://github.com/Rdatatable/data.table/issues/2082). Thanks to @MichaelChirico for identifying and fixing the issue. -17. `.SD` would incorrectly include symbol on lhs of `:=` when `.SDcols` is specified and `get()` appears in `j`. Thanks @renkun-ken for reporting and the PR, and @ProfFancyPants for reporing a regression introduced in the PR. Closes [#2326](https://github.com/Rdatatable/data.table/issues/2326) and [#2338](https://github.com/Rdatatable/data.table/issues/2338). +15. `.SD` would incorrectly include symbol on lhs of `:=` when `.SDcols` is specified and `get()` appears in `j`. Thanks @renkun-ken for reporting and the PR, and @ProfFancyPants for reporing a regression introduced in the PR. Closes [#2326](https://github.com/Rdatatable/data.table/issues/2326) and [#2338](https://github.com/Rdatatable/data.table/issues/2338). -18. Integer values that are too large to fit in `int64` will now be read as strings [#2250](https://github.com/Rdatatable/data.table/issues/2250). +16. Integer values that are too large to fit in `int64` will now be read as strings [#2250](https://github.com/Rdatatable/data.table/issues/2250). -19. Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This much advances the journey towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323). +17. Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This much advances the journey towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323). #### NOTES 1. `?data.table` makes explicit the option of using a `logical` vector in `j` to select columns, [#1978](https://github.com/Rdatatable/data.table/issues/1978). Thanks @Henrik-P for the note and @MichaelChirico for filing. -2. The `nanotime` v0.2.0 update on CRAN 22 June 2017 changed from `integer64` to `S4` and broke `fwrite` of `nanotime` columns. The onus is on package maintainers to check downstream packages before release to CRAN. `fwrite` updated to work with `nanotime` both before and after v0.2.0. +2. Test 1675.1 updated to cope with a change in R-devel in June 2017 related to `factor()` and `NA` levels. + +3. Package `ezknitr` has been added to the whitelist of packages that run user code and should be consider data.table-aware, [#2266](https://github.com/Rdatatable/data.table/issues/2266). Thanks to Matt Mills for testing and reporting. + +4. Printing with `quote = TRUE` now quotes column names as well, [#1319](https://github.com/Rdatatable/data.table/issues/1319). Thanks @jan-glx for the suggestion and @MichaelChirico for the PR. -3. Test 1675.1 updated to cope with a change in R-devel in June 2017 related to `factor()` and `NA` levels. +5. Added a blurb to `?melt.data.table` explicating the subtle difference in behavior of the `id.vars` argument vis-a-vis its analog in `reshape2::melt`, [#1699](https://github.com/Rdatatable/data.table/issues/1699). Thanks @MichaelChirico for uncovering and filing. -4. Package `ezknitr` has been added to the whitelist of packages that run user code and should be consider data.table-aware, [#2266](https://github.com/Rdatatable/data.table/issues/2266). Thanks to Matt Mills for testing and reporting. +6. Added some clarification about the usage of `on` to `?data.table`, [#2383](https://github.com/Rdatatable/data.table/issues/2383). Thanks to @peterlittlejohn for volunteering his confusion and @MichaelChirico for brushing things up. -5. Printing with `quote = TRUE` now quotes column names as well, [#1319](https://github.com/Rdatatable/data.table/issues/1319). Thanks @jan-glx for the suggestion and @MichaelChirico for the PR. +7. Clarified that "data.table always sorts in `C-locale`" means that upper-case letters are sorted before lower-case letters by ordering in data.table (e.g. `setorder`, `setkey`, `DT[order(...)]`). Thanks to @hughparsonage for the pull request editing the documentation. Note this makes no difference in most cases of data; e.g. ids where only uppercase or lowercase letters are used (`"AB123"<"AC234"` is always true, regardless), or country names and words which are consistently capitalized. For example, `"America" < "Brazil"` is not affected (it's always true), and neither is `"america" < "brazil"` (always true too); since the first letter is consistently capitalized. But, whether `"america" < "Brazil"` (the words are not consistently capitalized) is true or false in base R depends on the locale of your R session. In America it is true by default and false if you i) type `Sys.setlocale(locale="C")`, ii) the R session has been started in a C locale for you which can happen on servers/services (the locale comes from the environment the R session is started in). However, `"america" < "Brazil"` is always, consistently false in data.table which can be a surprise because it differs to base R by default in most regions. It is false because `"B"<"a"` is true because all upper-case letters come first, followed by all lower case letters (the ascii number of each letter determines the order, which is what is meant by `C-locale`). -6. Added a blurb to `?melt.data.table` explicating the subtle difference in behavior of the `id.vars` argument vis-a-vis its analog in `reshape2::melt`, [#1699](https://github.com/Rdatatable/data.table/issues/1699). Thanks @MichaelChirico for uncovering and filing. -7. Added some clarification about the usage of `on` to `?data.table`, [#2383](https://github.com/Rdatatable/data.table/issues/2383). Thanks to @peterlittlejohn for volunteering his confusion and @MichaelChirico for brushing things up. +### Changes in v1.10.4-1 (on CRAN 09 Oct 2017) +#### (Minimal patch release to pass CRAN checks) -8. Clarified that "data.table always sorts in `C-locale`" means that upper-case letters are sorted before lower-case letters by ordering in data.table (e.g. `setorder`, `setkey`, `DT[order(...)]`). Thanks to @hughparsonage for the pull request editing the documentation. Note this makes no difference in most cases of data; e.g. ids where only uppercase or lowercase letters are used (`"AB123"<"AC234"` is always true, regardless), or country names and words which are consistently capitalized. For example, `"America" < "Brazil"` is not affected (it's always true), and neither is `"america" < "brazil"` (always true too); since the first letter is consistently capitalized. But, whether `"america" < "Brazil"` (the words are not consistently capitalized) is true or false in base R depends on the locale of your R session. In America it is true by default and false if you i) type `Sys.setlocale(locale="C")`, ii) the R session has been started in a C locale for you which can happen on servers/services (the locale comes from the environment the R session is started in). However, `"america" < "Brazil"` is always, consistently false in data.table which can be a surprise because it differs to base R by default in most regions. It is false because `"B"<"a"` is true because all upper-case letters come first, followed by all lower case letters (the ascii number of each letter determines the order, which is what is meant by `C-locale`). +1. The `nanotime` v0.2.0 update on CRAN 22 June 2017 changed from `integer64` to `S4` and broke `fwrite` of `nanotime` columns. Fixed to work with `nanotime` both before and after v0.2.0. + +2. Pass R-devel changes related to `deparse(,backtick=)` and `factor()`. + +3. Internal `NAMED()==2` now `MAYBE_SHARED()`, [#2330](https://github.com/Rdatatable/data.table/issues/2330). Back-ported to pass under the stated dependency, R 3.0.0. + +4. Attempted improvement on Mac-only when the `parallel` package is used too (which forks), [#2137](https://github.com/Rdatatable/data.table/issues/2137). Intel's OpenMP implementation appears to leave threads running after the OpenMP parallel region (inside data.table) has finished unlike GNU libgomp. So, if and when `parallel`'s `fork` is invoked by the user after data.table has run in parallel already, instability occurs. The problem only occurs with Mac package binaries from CRAN because they are built by CRAN with Intel's OpenMP library. No known problems on Windows or Linux and no known problems on any platform when `parallel` is not used. If this Mac-only fix still doesn't work, call `setDTthreads(1)` immediately after `library(data.table)` which has been reported to fix the problem by putting `data.table` into single threaded mode earlier. + +5. When `fread()` and `print()` see `integer64` columns are present but package `bit64` is not installed, the warning is now displayed as intended. Thanks to a question by Santosh on r-help and forwarded by Bill Dunlap. ### Changes in v1.10.4 (on CRAN 01 Feb 2017) diff --git a/R/data.table.R b/R/data.table.R index 073313913a..bd89964202 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2411,7 +2411,7 @@ alloc.col <- function(DT, n=getOption("datatable.alloccol"), verbose=getOption(" name = as.character(name) assign(name,ans,parent.frame(),inherits=TRUE) } - .Call(Csetnamed,ans,0L) + .Call(Csetmutable,ans) } selfrefok <- function(DT,verbose=getOption("datatable.verbose")) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3e112e807d..25198dd014 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -46,7 +46,7 @@ if (!.devtesting) { # tests can be used but in dev they test the package in .GlobalEnv. If we used ::: throughout tests, that # would pick up the installed version and in dev you'd have to reinstall every time which slows down dev. # NB: The string "data.table:::" should exist nowhere else in this file other than here inside this branch. - + test = data.table:::test INT = data.table:::INT compactprint = data.table:::compactprint @@ -78,8 +78,8 @@ if (!.devtesting) { .shallow = data.table:::.shallow getdots = data.table:::getdots binary = data.table:::binary - - # Also, for functions that are masked by other packages, we need to map the data.table one. Or else, + + # Also, for functions that are masked by other packages, we need to map the data.table one. Or else, # the other package's function would be picked up. As above, we only need to do this because we desire # to develop in .GlobalEnv with cc(). # NB: The string "data.table::" should exist nowhere else in this file other than here inside this branch. @@ -5785,7 +5785,7 @@ test(1418.2, DT[2,c("foo","baz"):=10L,verbose=TRUE], output=".*Dropping index 'b DT <- data.table(x1 = c(1,1,1,1,1,2,2,2,2,2), x2 = c(1,1,2,2,2,1,1,2,2,2), x3 = c(1,2,1,1,2,1,1,1,1,2), - y = rnorm(10), + y = rnorm(10), key = c("x1", "x2", "x3")) thisDT <- copy(DT)[2, x1 := 3] test(1419.1, key(thisDT), NULL) @@ -7041,8 +7041,8 @@ test(1545.137, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), "a1") test(1545.138, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL) # tests for #2336. .shallow now retains indices as well -x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), - a2 = c(1L, 1L, 1L, 2L, 2L, 2L), +x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), + a2 = c(1L, 1L, 1L, 2L, 2L, 2L), a3 = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE)) setindex(x1, a1, a2, a3) setindex(x1, a1, a3) @@ -9505,12 +9505,12 @@ if (.Platform$OS.type=="unix") { # as used by package popEpi in its tests test(1704, all.equal(data.table( a=1:3, b=4:6 ), data.table( A=1:3, B=4:6 ), check.attributes=FALSE)) -if (.Platform$OS.type!="windows") { +# setDTthreads(2) was at the top of this file (tests.Rraw). Since CRAN's policy is max two. +# However, under UBSAN and ASAN, threads are limited to 1 thread, so only run this test when we have 2 threads. +if (.Platform$OS.type!="windows" && getDTthreads()==2) { # On Windows: "'mc.cores' > 1 is not supported on Windows". # parallel package isn't parallel on Windows, but data.table is. if ("package:parallel" %in% search()) { #1745 and #1727 - test(1705, getDTthreads()<=2) # this was set at the top of tests.Rraw. CRAN's policy is max two. - # not 1, to pass on Rdevel clag UBSAN and ASAN without OpenMP lx <- replicate(4, runif(1e5), simplify=FALSE) f <- function(mc.cores = 2, threads = 2) { setDTthreads(threads) @@ -9519,9 +9519,11 @@ if (.Platform$OS.type!="windows") { f(1, 1) # was always ok f(2, 1) # was always ok f(1, 2) # was always ok - f(2, 2) # used to hang. Now should not because data.table auto switches to single threaded - # commenting out avoid_openmp_hang_within_fork() confirms this test catches catches the hang - test(1706, getDTthreads()<=2) # Tests that it reverts to previous state after use of mclapply + f(2, 2) # Used to hang. Now should not because data.table auto switches to single threaded + # Commenting out avoid_openmp_hang_within_fork() confirms this test catches catches the hang + test(1705, getDTthreads()==1) # Stays in single-threaded mode after returning from mclapply's fork + setDTthreads(2) + test(1706, getDTthreads()==2) # User returned to multi-threaded after fork. } else { cat("Tests 1705 and 1706 not run. If required call library(parallel) first.\n") } @@ -10362,11 +10364,15 @@ test(1759, fread("alluniquechar.csv")[c(1,2,4999,5000)], # fread should use multiple threads on single column input. # tests 2 threads; the very reasonable limit on CRAN # file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) -N = if (TRUE) 2e6 else 1e9 # offline speed check -fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) -test(1760.1, file.info(f)$size > 4*1024*1024) -test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") -unlink(f) +if (getDTthreads() != 2) { + cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n") +} else { + N = if (TRUE) 2e6 else 1e9 # offline speed check + fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) + test(1760.1, file.info(f)$size > 4*1024*1024) + test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") + unlink(f) +} # fread single column with superfluous fill=TRUE, #2118 test(1761.1, fread("1\n2\n3", fill=TRUE), data.table(V1=1:3)) diff --git a/src/assign.c b/src/assign.c index e9c63bc85d..f8e87cdf0f 100644 --- a/src/assign.c +++ b/src/assign.c @@ -185,7 +185,6 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) SETLENGTH(newdt,l); SET_TRUELENGTH(newdt,n); setselfref(newdt); - // SET_NAMED(dt,1); // for some reason, R seems to set NAMED=2 via setAttrib? Need NAMED to be 1 for passing to assign via a .C dance before .Call (which sets NAMED to 2), and we can't use .C with DUP=FALSE on lists. UNPROTECT(protecti); return(newdt); } @@ -477,11 +476,16 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v } vlen = length(thisvalue); if (length(rows)==0 && targetlen==vlen && (vlen>0 || nrow==0)) { - if ( NAMED(thisvalue)==2 || // set() protects the NAMED of atomic vectors from .Call setting arguments to 2 by wrapping with list + if ( MAYBE_SHARED(thisvalue) || // set() protects the NAMED of atomic vectors from .Call setting arguments to 2 by wrapping with list (TYPEOF(values)==VECSXP && i>LENGTH(values)-1)) { // recycled RHS would have columns pointing to others, #185. if (verbose) { - if (NAMED(thisvalue)==2) Rprintf("RHS for item %d has been duplicated because NAMED is %d, but then is being plonked.\n",i+1, NAMED(thisvalue)); - else Rprintf("RHS for item %d has been duplicated because the list of RHS values (length %d) is being recycled, but then is being plonked.\n", i+1, length(values)); + if (length(values)==length(cols)) { + // usual branch + Rprintf("RHS for item %d has been duplicated because NAMED is %d, but then is being plonked.\n", i+1, NAMED(thisvalue)); + } else { + // rare branch where the lhs of := is longer than the items on the rhs of := + Rprintf("RHS for item %d has been duplicated because the list of RHS values (length %d) is being recycled, but then is being plonked.\n", i+1, length(values)); + } } thisvalue = duplicate(thisvalue); // PROTECT not needed as assigned as element to protected list below. } else { @@ -778,7 +782,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v } static Rboolean anyNamed(SEXP x) { - if (NAMED(x)) return TRUE; + if (MAYBE_REFERENCED(x)) return TRUE; if (isNewList(x)) for (int i=0; i(b))?(a):(b)) +// Backport macros added to R in 2017 so we don't need to update dependency from R 3.0.0 +#ifndef MAYBE_SHARED +# define MAYBE_SHARED(x) (NAMED(x) > 1) +#endif +#ifndef MAYBE_REFERENCED +# define MAYBE_REFERENCED(x) ( NAMED(x) > 0 ) +#endif + // init.c void setSizes(); SEXP char_integer64; diff --git a/src/frank.c b/src/frank.c index 150be382eb..5ba651030e 100644 --- a/src/frank.c +++ b/src/frank.c @@ -5,11 +5,6 @@ extern SEXP char_integer64; -static union { - double d; - unsigned long long ull; -} u; - SEXP dt_na(SEXP x, SEXP cols) { int i, j, n=0, this; double *dv; @@ -45,8 +40,7 @@ SEXP dt_na(SEXP x, SEXP cols) { if (isString(class) && STRING_ELT(class, 0) == char_integer64) { dv = (double *)REAL(v); for (j=0; j= 0. \ Default 0 is recommended to use all CPU."); } - // do not call omp_set_num_threads() here as that affects other openMP - // packages and base R as well potentially. int old = DTthreads; DTthreads = INTEGER(threads)[0]; + if (omp_get_max_threads() < omp_get_thread_limit()) { + if (DTthreads==0) { + // for example after test 1705 has auto switched to single-threaded for parallel's fork, + // we want to return to multi-threaded. + // omp_set_num_threads() sets the value returned by omp_get_max_threads() + omp_set_num_threads(omp_get_thread_limit()); + } else if (DTthreads > omp_get_max_threads()) { + omp_set_num_threads( MIN(DTthreads, omp_get_thread_limit()) ); + } + } return ScalarInteger(old); } // auto avoid deadlock when data.table called from parallel::mclapply -static int preFork_DTthreads = 0; void when_fork() { - preFork_DTthreads = DTthreads; + + // attempted workaround for Intel's OpenMP implementation which leaves threads running after + // parallel region; these crash when forked. + omp_set_num_threads(1); + + // GNU OpenMP seems ok with just setting DTthreads to 1 which limits the next parallel region + // if data.table is used within the fork'd proceess. DTthreads = 1; + + // We used to have an after_fork() callback too, to return to multi-threaded mode after parallel's + // fork completes. But now in an attempt to alleviate problems propogating (likely Intel's OpenMP only) + // we now leave data.table in single-threaded mode after parallel's fork. User can call setDTthreads(0) + // to return to multi-threaded as we do in tests on Linux. } -void when_fork_end() { - DTthreads = preFork_DTthreads; -} + void avoid_openmp_hang_within_fork() { // Called once on loading data.table from init.c #ifdef _OPENMP - pthread_atfork(&when_fork, &when_fork_end, NULL); + pthread_atfork(&when_fork, NULL, NULL); #endif } - diff --git a/src/wrappers.c b/src/wrappers.c index f1dada5eaf..65f5dc3c53 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -16,12 +16,12 @@ SEXP setattrib(SEXP x, SEXP name, SEXP value) error("Internal structure doesn't seem to be a list. Can't set class to be 'data.table' or 'data.frame'. Use 'as.data.table()' or 'as.data.frame()' methods instead."); if (isLogical(x) && x == ScalarLogical(TRUE)) { // ok not to protect this ScalarLogical() as not assigned or passed x = PROTECT(duplicate(x)); - setAttrib(x, name, NAMED(value) ? duplicate(value) : value); + setAttrib(x, name, MAYBE_REFERENCED(value) ? duplicate(value) : value); UNPROTECT(1); return(x); } setAttrib(x, name, - NAMED(value) ? duplicate(value) : value); + MAYBE_REFERENCED(value) ? duplicate(value) : value); // duplicate is temp fix to restore R behaviour prior to R-devel change on 10 Jan 2014 (r64724). // TO DO: revisit. Enough to reproduce is: DT=data.table(a=1:3); DT[2]; DT[,b:=2] // ... Error: selfrefnames is ok but tl names [1] != tl [100] @@ -67,10 +67,11 @@ SEXP setlistelt(SEXP l, SEXP i, SEXP value) return(R_NilValue); } -SEXP setnamed(SEXP x, SEXP value) +SEXP setmutable(SEXP x) { - if (!isInteger(value) || LENGTH(value)!=1) error("Second argument to setnamed must a length 1 integer vector"); - SET_NAMED(x,INTEGER(value)[0]); + // called from one single place at R level. TODO: avoid somehow, but fails tests without + // At least the SET_NAMED() direct call is passed 0 and makes no assumptions about >0. Good enough for now as patch for CRAN is needed. + SET_NAMED(x,0); return(x); } @@ -95,7 +96,7 @@ SEXP copyNamedInList(SEXP x) if (TYPEOF(x) != VECSXP) error("x isn't a VECSXP"); for (int i=0; i