diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 902d66efd1..ce73aa84fe 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -308,7 +308,7 @@ make # use latest available `apt-cache search gcc-` or `clang-` cd ~/build/R-devel-strict-clang -./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="clang-10 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" +./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make cd ~/build/R-devel-strict-gcc diff --git a/NEWS.md b/NEWS.md index afde983b46..94e1fee81c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ 1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. +2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, and if so `fsort` will check that and work fine. If not, the compiler might accept `-fopenmp-version=45`, otherwise you will need to upgrade compiler. https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. + ## NOTES 1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). diff --git a/configure b/configure index c0745e527e..e29156d61a 100755 --- a/configure +++ b/configure @@ -85,7 +85,11 @@ EOF if [ "$R_NO_OPENMP" = "1" ]; then # Compilation failed -- try forcing -fopenmp instead. + # TODO: doesn't R_NO_OPENMP need to be set to 0 before next line? ${CC} ${CFLAGS} -fopenmp test-omp.c || R_NO_OPENMP=1 + # TODO: and then nothing seems to be done with this outcome +else + echo "R CMD SHLIB supports OpenMP without any extra hint" fi # Clean up. @@ -100,7 +104,6 @@ if [ "$R_NO_OPENMP" = "1" ]; then echo "*** Continuing installation without OpenMP support..." sed -e "s|@openmp_cflags@||" src/Makevars.in > src/Makevars else - echo "OpenMP supported" sed -e "s|@openmp_cflags@|\$(SHLIB_OPENMP_CFLAGS)|" src/Makevars.in > src/Makevars fi # retain user supplied PKG_ env variables, #4664. See comments in Makevars.in too. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9d207f8866..4dd2809f7d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12512,7 +12512,7 @@ x <- as.integer(x) test(1888.5, fsort(x), base::sort(x, na.last = FALSE), warning = "Input is not a vector of type double. New parallel sort has only been done for double vectors so far.*Using one thread") x = runif(1e6) -test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 5 MSB counts") +test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 20 MSB counts") test(1888.7, !base::is.unsorted(y)) test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE") rm(x) diff --git a/src/Makevars.in b/src/Makevars.in index 76218cb65a..7750c1e8ac 100644 --- a/src/Makevars.in +++ b/src/Makevars.in @@ -7,5 +7,7 @@ PKG_LIBS = @PKG_LIBS@ @openmp_cflags@ -lz # Hence the onerous @...@ substitution. Is it still appropriate in 2020 that we can't use +=? all: $(SHLIB) + @echo PKG_CFLAGS = $(PKG_CFLAGS) + @echo PKG_LIBS = $(PKG_LIBS) if [ "$(SHLIB)" != "datatable$(SHLIB_EXT)" ]; then mv $(SHLIB) datatable$(SHLIB_EXT); fi if [ "$(OS)" != "Windows_NT" ] && [ `uname -s` = 'Darwin' ]; then install_name_tool -id datatable$(SHLIB_EXT) datatable$(SHLIB_EXT); fi diff --git a/src/fsort.c b/src/fsort.c index 00c7e5c10b..748c2ad85d 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -2,43 +2,39 @@ #define INSERT_THRESH 200 // TODO: expose via api and test -static void dinsert(double *x, int n) { // TODO: if and when twiddled, double => ull +static void dinsert(double *x, const int n) { // TODO: if and when twiddled, double => ull if (n<2) return; - for (int i=1; i=0 && xtmp=0 && xtmp> fromBit & mask]++; + for (uint64_t i=0; i> fromBit & mask]++; tmp++; } - int last = (*(unsigned long long *)--tmp - minULL) >> fromBit & mask; + int last = (*(uint64_t *)--tmp - minULL) >> fromBit & mask; if (counts[last] == n) { // Single value for these bits here. All counted in one bucket which must be the bucket for the last item. counts[last] = 0; // clear ready for reuse. All other counts must be zero already so save time by not setting to 0. @@ -47,9 +43,9 @@ static void dradix_r( // single-threaded recursive worker return; } - R_xlen_t cumSum=0; - for (R_xlen_t i=0; cumSum> fromBit & mask; + for (uint64_t i=0; i> fromBit & mask; working[ counts[thisx]++ ] = *tmp; tmp++; } @@ -71,14 +67,14 @@ static void dradix_r( // single-threaded recursive worker // Also this way, we don't need to know how big thisCounts is and therefore no possibility of getting that wrong. // wasteful thisCounts[i]=0 even when already 0 is better than a branch. We are highly recursive at this point // so avoiding memset() is known to be worth it. - for (int i=0; counts[i]0 if the element a goes after the element b // doesn't master if stable or not - R_xlen_t x = qsort_data[*(int *)a]; - R_xlen_t y = qsort_data[*(int *)b]; + uint64_t x = qsort_data[*(int *)a]; + uint64_t y = qsort_data[*(int *)b]; // return x-y; would like this, but this is long and the cast to int return may not preserve sign // We have long vectors in mind (1e10(74GB), 1e11(740GB)) where extreme skew may feasibly mean the largest count // is greater than 2^32. The first split is (currently) 16 bits so should be very rare but to be safe keep 64bit counts. @@ -132,12 +128,12 @@ SEXP fsort(SEXP x, SEXP verboseArg) { double mins[nBatch], maxs[nBatch]; const double *restrict xp = REAL(x); #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(nBatch, false)) - for (int batch=0; batchmyMax) myMax=*d; @@ -148,7 +144,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { } t[2] = wallclock(); double min=mins[0], max=maxs[0]; - for (int i=1; imax) max=maxs[i]; @@ -158,10 +154,9 @@ SEXP fsort(SEXP x, SEXP verboseArg) { // TODO: -0ULL should allow negatives // avoid twiddle function call as expensive in recent tests (0.34 vs 2.7) // possibly twiddle once to *ans, then untwiddle at the end in a fast parallel sweep - u.d = max; - unsigned long long maxULL = u.ull; - u.d = min; - minULL = u.ull; // set static global for use by dradix_r + + uint64_t maxULL = *(uint64_t *)&max; + minULL = *(uint64_t *)&min; // set static global for use by dradix_r int maxBit = floor(log(maxULL-minULL) / log(2)); // 0 is the least significant bit int MSBNbits = maxBit > 15 ? 16 : maxBit+1; // how many bits make up the MSB @@ -169,33 +164,32 @@ SEXP fsort(SEXP x, SEXP verboseArg) { size_t MSBsize = 1LL< 65,536) if (verbose) Rprintf(_("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%d\n"), maxBit, MSBNbits, shift, MSBsize); - R_xlen_t *counts = calloc(nBatch*MSBsize, sizeof(R_xlen_t)); - if (counts==NULL) error(_("Unable to allocate working memory")); + uint64_t *counts = (uint64_t *)R_alloc(nBatch*MSBsize, sizeof(uint64_t)); + memset(counts, 0, nBatch*MSBsize*sizeof(uint64_t)); // provided MSBsize>=9, each batch is a multiple of at least one 4k page, so no page overlap - // TODO: change all calloc, malloc and free to Calloc and Free to be robust to error() and catch ooms. if (verbose) Rprintf(_("counts is %dMB (%d pages per nBatch=%d, batchSize=%"PRIu64", lastBatchSize=%"PRIu64")\n"), - (int)(nBatch*MSBsize*sizeof(R_xlen_t)/(1024*1024)), - (int)(nBatch*MSBsize*sizeof(R_xlen_t)/(4*1024*nBatch)), + (int)(nBatch*MSBsize*sizeof(uint64_t)/(1024*1024)), + (int)(nBatch*MSBsize*sizeof(uint64_t)/(4*1024*nBatch)), nBatch, (uint64_t)batchSize, (uint64_t)lastBatchSize); t[3] = wallclock(); #pragma omp parallel for num_threads(nth) - for (int batch=0; batch> shift]++; tmp++; } } // cumulate columnwise; parallel histogram; small so no need to parallelize - R_xlen_t rollSum=0; - for (int msb=0; msb> shift]++ ] = *source; // This assignment to ans is not random access as it may seem, but cache efficient by // design since target pages are written to contiguously. MSBsize * 4k < cache. @@ -226,13 +220,13 @@ SEXP fsort(SEXP x, SEXP verboseArg) { int fromBit = toBit>7 ? toBit-7 : 0; // sort bins by size, largest first to minimise last-man-home - R_xlen_t *msbCounts = counts + (nBatch-1)*MSBsize; + uint64_t *msbCounts = counts + (nBatch-1)*MSBsize; // msbCounts currently contains the ending position of each MSB (the starting location of the next) even across empty if (msbCounts[MSBsize-1] != xlength(x)) error(_("Internal error: counts[nBatch-1][MSBsize-1] != length(x)")); // # nocov - R_xlen_t *msbFrom = malloc(MSBsize*sizeof(R_xlen_t)); - int *order = malloc(MSBsize*sizeof(int)); - R_xlen_t cumSum = 0; - for (int i=0; i0 && msbCounts[order[MSBsize-1]] < 2) MSBsize--; @@ -252,63 +246,83 @@ SEXP fsort(SEXP x, SEXP verboseArg) { Rprintf(_("%d by excluding 0 and 1 counts\n"), MSBsize); } + bool failed=false, alloc_fail=false, non_monotonic=false; // shared bools only ever assigned true; no need for atomic or critical assign t[6] = wallclock(); #pragma omp parallel num_threads(getDTthreads(MSBsize, false)) { - R_xlen_t *counts = calloc((toBit/8 + 1)*256, sizeof(R_xlen_t)); - // each thread has its own (small) stack of counts + // each thread has its own small stack of counts // don't use VLAs here: perhaps too big for stack yes but more that VLAs apparently fail with schedule(dynamic) - - double *working=NULL; - // the working memory (for the largest groups) is allocated the first time the thread is assigned to - // an iteration. - - #pragma omp for schedule(dynamic,1) - // All we assume here is that a thread can never be assigned to an earlier iteration; i.e. threads 0:(nth-1) - // get iterations 0:(nth-1) possibly out of order, then first-come-first-served in order after that. - // If a thread deals with an msb lower than the first one it dealt with, then its *working will be too small. - for (int msb=0; msb 65,536) that the largest MSB should be // relatively small anyway (n/65,536 if uniformly distributed). - // For msb>=nth, that thread's *working will already be big - // enough because the smallest *working (for thread nth-1) is big enough for all iterations following. + // For msb>=nth, that thread's *myworking will already be big enough because + // the smallest *myworking (for thread nth-1) is big enough for all iterations following. // Progressively, less and less of the working will be needed by the thread (just the first thisN will be - // used) and the unused pages will simply not be cached. - // TODO: Calloc isn't thread-safe. But this deep malloc should be ok here as no possible error() points - // before free. Just need to add the check and exit thread safely somehow. + // used) and the unused lines will simply not be cached. if (thisN <= INSERT_THRESH) { dinsert(ans+from, thisN); } else { - dradix_r(ans+from, working, thisN, fromBit, toBit, counts); + dradix_r(ans+from, myworking, thisN, fromBit, toBit, mycounts); } } - free(counts); - free(working); + free(mycounts); + free(myworking); } - free(msbFrom); - free(order); + if (non_monotonic) + error("OpenMP %d did not assign threads to iterations monotonically. Please search Stack Overflow for this message.", MY_OPENMP); // # nocov; #4786 in v1.13.4 + if (alloc_fail) + error(_("Unable to allocate working memory")); // # nocov } t[7] = wallclock(); - free(counts); - + // TODO: parallel sweep to check sorted using <= on original input. Feasible that twiddling messed up. // After a few years of heavy use remove this check for speed, and move into unit tests. // It's a perfectly contiguous and cache efficient parallel scan so should be relatively negligible. double tot = t[7]-t[0]; - if (verbose) for (int i=1; i<=7; i++) { + if (verbose) for (int i=1; i<=7; ++i) { Rprintf(_("%d: %.3f (%4.1f%%)\n"), i, t[i]-t[i-1], 100.*(t[i]-t[i-1])/tot); } - UNPROTECT(nprotect); return(ansVec); } diff --git a/src/init.c b/src/init.c index 1247c585b6..6a5377ca2a 100644 --- a/src/init.c +++ b/src/init.c @@ -393,11 +393,11 @@ SEXP hasOpenMP() { // Just for use by onAttach (hence nocov) to avoid an RPRINTF from C level which isn't suppressable by CRAN // There is now a 'grep' in CRAN_Release.cmd to detect any use of RPRINTF in init.c, which is // why RPRINTF is capitalized in this comment to avoid that grep. - // TODO: perhaps .Platform or .Machine in R itself could contain whether OpenMP is available. + // .Platform or .Machine in R itself does not contain whether OpenMP is available because compiler and flags are per-package. #ifdef _OPENMP - return ScalarLogical(TRUE); + return ScalarInteger(_OPENMP); // return the version; e.g. 201511 (i.e. 4.5) #else - return ScalarLogical(FALSE); + return ScalarInteger(0); // 0 rather than NA so that if() can be used on the result #endif } // # nocov end diff --git a/src/myomp.h b/src/myomp.h index 58a5703f00..57d8b58734 100644 --- a/src/myomp.h +++ b/src/myomp.h @@ -1,5 +1,13 @@ #ifdef _OPENMP #include + #if _OPENMP >= 201511 + #define monotonic_dynamic monotonic:dynamic // #4786 + #else + #define monotonic_dynamic dynamic + #endif + #define MY_OPENMP _OPENMP + // for use in error messages (e.g. fsort.c; #4786) to save an #ifdef each time + // initially chose OMP_VERSION but figured OpenMP might define that in future, so picked MY_ prefix #else // for machines with compilers void of openmp support #define omp_get_num_threads() 1 @@ -9,5 +17,6 @@ #define omp_get_num_procs() 1 #define omp_set_nested(a) // empty statement to remove the call #define omp_get_wtime() 0 + #define MY_OPENMP 0 #endif diff --git a/src/openmp-utils.c b/src/openmp-utils.c index 51393f3b7c..b65a661eaf 100644 --- a/src/openmp-utils.c +++ b/src/openmp-utils.c @@ -79,6 +79,8 @@ SEXP getDTthreads_R(SEXP verbose) { if (LOGICAL(verbose)[0]) { #ifndef _OPENMP Rprintf(_("This installation of data.table has not been compiled with OpenMP support.\n")); + #else + Rprintf(_(" OpenMP version (_OPENMP) %d\n"), _OPENMP); // user can use Google to map 201511 to 4.5; it's odd that OpenMP API does not provide 4.5 #endif // this output is captured, paste0(collapse="; ")'d, and placed at the end of test.data.table() for display in the last 13 lines of CRAN check logs // it is also printed at the start of test.data.table() so that we can trace any Killed events on CRAN before the end is reached