From a36d32eb4bc9cb242b9af3c7ed5b66a518a0045e Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Wed, 11 Nov 2020 20:29:48 -0700
Subject: [PATCH 01/10] added @echo to Makevars, comments to configure, and
 clang-11 with ASAN and OpenMP too now works for CRAN_Release.cmd

---
 .dev/CRAN_Release.cmd | 2 +-
 configure             | 4 ++++
 src/Makevars.in       | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd
index 902d66efd1..ce73aa84fe 100644
--- a/.dev/CRAN_Release.cmd
+++ b/.dev/CRAN_Release.cmd
@@ -308,7 +308,7 @@ make
 
 # use latest available `apt-cache search gcc-` or `clang-`
 cd ~/build/R-devel-strict-clang
-./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="clang-10 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer"
+./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer"
 make
 
 cd ~/build/R-devel-strict-gcc
diff --git a/configure b/configure
index c0745e527e..c5c1338a82 100755
--- a/configure
+++ b/configure
@@ -85,7 +85,11 @@ EOF
 
 if [ "$R_NO_OPENMP" = "1" ]; then
   # Compilation failed -- try forcing -fopenmp instead.
+  # TODO: doesn't R_NO_OPENMP need to be set to 0 before next line?
   ${CC} ${CFLAGS} -fopenmp test-omp.c || R_NO_OPENMP=1
+  # TODO: and then nothing seems to be done with this outcome
+else
+  echo "R CMD SHLIB supports OpenMP without any exra hint"
 fi
 
 # Clean up.
diff --git a/src/Makevars.in b/src/Makevars.in
index 76218cb65a..7750c1e8ac 100644
--- a/src/Makevars.in
+++ b/src/Makevars.in
@@ -7,5 +7,7 @@ PKG_LIBS = @PKG_LIBS@ @openmp_cflags@ -lz
 # Hence the onerous @...@ substitution. Is it still appropriate in 2020 that we can't use +=?
 
 all: $(SHLIB)
+	@echo PKG_CFLAGS = $(PKG_CFLAGS)
+	@echo PKG_LIBS = $(PKG_LIBS)
 	if [ "$(SHLIB)" != "datatable$(SHLIB_EXT)" ]; then mv $(SHLIB) datatable$(SHLIB_EXT); fi
 	if [ "$(OS)" != "Windows_NT" ] && [ `uname -s` = 'Darwin' ]; then install_name_tool -id datatable$(SHLIB_EXT) datatable$(SHLIB_EXT); fi

From 091ec3944c98f7128417cdbffd71886adae0ca27 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Wed, 11 Nov 2020 20:30:52 -0700
Subject: [PATCH 02/10] aside: j++ to ++j in loops

---
 src/fsort.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/fsort.c b/src/fsort.c
index 00c7e5c10b..57410d4e65 100644
--- a/src/fsort.c
+++ b/src/fsort.c
@@ -2,15 +2,15 @@
 
 #define INSERT_THRESH 200  // TODO: expose via api and test
 
-static void dinsert(double *x, int n) {   // TODO: if and when twiddled, double => ull
+static void dinsert(double *x, const int n) {   // TODO: if and when twiddled, double => ull
   if (n<2) return;
-  for (int i=1; i<n; i++) {
+  for (int i=1; i<n; ++i) {
     double xtmp = x[i];
     int j = i-1;
     if (xtmp<x[j]) {
       x[j+1] = x[j];
       j--;
-      while (j>=0 && xtmp<x[j]) { x[j+1] = x[j]; j--; }
+      while (j>=0 && xtmp<x[j]) { x[j+1] = x[j]; --j; }
       x[j+1] = xtmp;
     }
   }
@@ -34,7 +34,7 @@ static void dradix_r(  // single-threaded recursive worker
   unsigned long long mask = width-1;
 
   const double *tmp=in;
-  for (R_xlen_t i=0; i<n; i++) {
+  for (R_xlen_t i=0; i<n; ++i) {
     counts[(*(unsigned long long *)tmp - minULL) >> fromBit & mask]++;
     tmp++;
   }
@@ -48,7 +48,7 @@ static void dradix_r(  // single-threaded recursive worker
   }
 
   R_xlen_t cumSum=0;
-  for (R_xlen_t i=0; cumSum<n; i++) { // cumSum<n better than i<width as may return early
+  for (R_xlen_t i=0; cumSum<n; ++i) { // cumSum<n better than i<width as may return early
     unsigned long long tmp;
     if ((tmp=counts[i])) {  // don't cumulate through 0s, important below to save a wasteful memset to zero
       counts[i] = cumSum;
@@ -57,7 +57,7 @@ static void dradix_r(  // single-threaded recursive worker
   } // leaves cumSum==n && 0<i && i<=width
 
   tmp=in;
-  for (R_xlen_t i=0; i<n; i++) {  // go forwards not backwards to give cpu pipeline better chance
+  for (R_xlen_t i=0; i<n; ++i) {  // go forwards not backwards to give cpu pipeline better chance
     int thisx = (*(unsigned long long *)tmp - minULL) >> fromBit & mask;
     working[ counts[thisx]++ ] = *tmp;
     tmp++;
@@ -71,12 +71,12 @@ static void dradix_r(  // single-threaded recursive worker
     // Also this way, we don't need to know how big thisCounts is and therefore no possibility of getting that wrong.
     // wasteful thisCounts[i]=0 even when already 0 is better than a branch. We are highly recursive at this point
     // so avoiding memset() is known to be worth it.
-    for (int i=0; counts[i]<n; i++) counts[i]=0;
+    for (int i=0; counts[i]<n; ++i) counts[i]=0;
     return;
   }
 
   cumSum=0;
-  for (int i=0; cumSum<n; i++) {   // again, cumSum<n better than i<width as it can return early
+  for (int i=0; cumSum<n; ++i) {   // again, cumSum<n better than i<width as it can return early
     if (counts[i] == 0) continue;
     R_xlen_t thisN = counts[i] - cumSum;  // undo cummulate; i.e. diff
     if (thisN <= INSERT_THRESH) {
@@ -132,12 +132,12 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   double mins[nBatch], maxs[nBatch];
   const double *restrict xp = REAL(x);
   #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(nBatch, false))
-  for (int batch=0; batch<nBatch; batch++) {
+  for (int batch=0; batch<nBatch; ++batch) {
     R_xlen_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
     const double *restrict d = xp + batchSize*batch;
     double myMin=*d, myMax=*d;
     d++;
-    for (R_xlen_t j=1; j<thisLen; j++) {
+    for (R_xlen_t j=1; j<thisLen; ++j) {
       // TODO: test for sortedness here as well.
       if (*d<myMin) myMin=*d;
       else if (*d>myMax) myMax=*d;
@@ -148,7 +148,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   }
   t[2] = wallclock();
   double min=mins[0], max=maxs[0];
-  for (int i=1; i<nBatch; i++) {
+  for (int i=1; i<nBatch; ++i) {
     // TODO: if boundaries are sorted then we only need sort the unsorted batches known above
     if (mins[i]<min) min=mins[i];
     if (maxs[i]>max) max=maxs[i];
@@ -180,11 +180,11 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
                        nBatch, (uint64_t)batchSize, (uint64_t)lastBatchSize);
   t[3] = wallclock();
   #pragma omp parallel for num_threads(nth)
-  for (int batch=0; batch<nBatch; batch++) {
+  for (int batch=0; batch<nBatch; ++batch) {
     R_xlen_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
     const uint64_t *restrict tmp = (uint64_t *)(xp + batchSize*batch);
     R_xlen_t *restrict thisCounts = counts + batch*MSBsize;
-    for (R_xlen_t j=0; j<thisLen; j++) {
+    for (R_xlen_t j=0; j<thisLen; ++j) {
       thisCounts[(*tmp - minULL) >> shift]++;
       tmp++;
     }
@@ -192,9 +192,9 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
 
   // cumulate columnwise; parallel histogram; small so no need to parallelize
   R_xlen_t rollSum=0;
-  for (int msb=0; msb<MSBsize; msb++) {
+  for (int msb=0; msb<MSBsize; ++msb) {
     int j = msb;
-    for (int batch=0; batch<nBatch; batch++) {
+    for (int batch=0; batch<nBatch; ++batch) {
       R_xlen_t tmp = counts[j];
       counts[j] = rollSum;
       rollSum += tmp;
@@ -205,11 +205,11 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   t[4] = wallclock();
   uint64_t *restrict ansi64 = (uint64_t *)ans;
   #pragma omp parallel for num_threads(nth)
-  for (int batch=0; batch<nBatch; batch++) {
+  for (int batch=0; batch<nBatch; ++batch) {
     R_xlen_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
     const uint64_t *restrict source = (uint64_t *)(xp + batchSize*batch);
     R_xlen_t *restrict thisCounts = counts + batch*MSBsize;
-    for (R_xlen_t j=0; j<thisLen; j++) {
+    for (R_xlen_t j=0; j<thisLen; ++j) {
       ansi64[ thisCounts[(*source - minULL) >> shift]++ ] = *source;
       // This assignment to ans is not random access as it may seem, but cache efficient by
       // design since target pages are written to contiguously. MSBsize * 4k < cache.
@@ -232,7 +232,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
     R_xlen_t *msbFrom = malloc(MSBsize*sizeof(R_xlen_t));
     int *order = malloc(MSBsize*sizeof(int));
     R_xlen_t cumSum = 0;
-    for (int i=0; i<MSBsize; i++) {
+    for (int i=0; i<MSBsize; ++i) {
       msbFrom[i] = cumSum;
       msbCounts[i] = msbCounts[i] - cumSum;
       cumSum += msbCounts[i];
@@ -267,7 +267,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
       // All we assume here is that a thread can never be assigned to an earlier iteration; i.e. threads 0:(nth-1)
       // get iterations 0:(nth-1) possibly out of order, then first-come-first-served in order after that.
       // If a thread deals with an msb lower than the first one it dealt with, then its *working will be too small.
-      for (int msb=0; msb<MSBsize; msb++) {
+      for (int msb=0; msb<MSBsize; ++msb) {
 
         R_xlen_t from= msbFrom[order[msb]];
         R_xlen_t thisN = msbCounts[order[msb]];
@@ -305,7 +305,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   //       It's a perfectly contiguous and cache efficient parallel scan so should be relatively negligible.
 
   double tot = t[7]-t[0];
-  if (verbose) for (int i=1; i<=7; i++) {
+  if (verbose) for (int i=1; i<=7; ++i) {
     Rprintf(_("%d: %.3f (%4.1f%%)\n"), i, t[i]-t[i-1], 100.*(t[i]-t[i-1])/tot);
   }
 

From f8ea5945b6d63f9e43fe251ca99dc6c052210739 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Wed, 11 Nov 2020 20:49:55 -0700
Subject: [PATCH 03/10] unsigned long long, and R_xlen_t, to uint64_t

---
 src/fsort.c | 76 ++++++++++++++++++++++++++---------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/fsort.c b/src/fsort.c
index 57410d4e65..934d54e23e 100644
--- a/src/fsort.c
+++ b/src/fsort.c
@@ -18,27 +18,27 @@ static void dinsert(double *x, const int n) {   // TODO: if and when twiddled, d
 
 static union {
   double d;
-  unsigned long long ull;
+  uint64_t ull;
 } u;
-static unsigned long long minULL;
+static uint64_t minULL;
 
 static void dradix_r(  // single-threaded recursive worker
   double *in,          // n doubles to be sorted
   double *working,     // working memory to put the sorted items before copying over *in; must not overlap *in
-  R_xlen_t n,          // number of items to sort.  *in and *working must be at least n long
+  uint64_t n,          // number of items to sort.  *in and *working must be at least n long
   int fromBit,         // After twiddle to ordered ull, the bits [fromBit,toBit] are used to count
   int toBit,           //   fromBit<toBit; bit 0 is the least significant; fromBit is right shift amount too
-  R_xlen_t *counts     // already zero'd counts vector, 2^(toBit-fromBit+1) long. A stack of these is reused.
+  uint64_t *counts     // already zero'd counts vector, 2^(toBit-fromBit+1) long. A stack of these is reused.
 ) {
-  unsigned long long width = 1ULL<<(toBit-fromBit+1);
-  unsigned long long mask = width-1;
+  uint64_t width = 1ULL<<(toBit-fromBit+1);
+  uint64_t mask = width-1;
 
   const double *tmp=in;
-  for (R_xlen_t i=0; i<n; ++i) {
-    counts[(*(unsigned long long *)tmp - minULL) >> fromBit & mask]++;
+  for (uint64_t i=0; i<n; ++i) {
+    counts[(*(uint64_t *)tmp - minULL) >> fromBit & mask]++;
     tmp++;
   }
-  int last = (*(unsigned long long *)--tmp - minULL) >> fromBit & mask;
+  int last = (*(uint64_t *)--tmp - minULL) >> fromBit & mask;
   if (counts[last] == n) {
     // Single value for these bits here. All counted in one bucket which must be the bucket for the last item.
     counts[last] = 0;  // clear ready for reuse. All other counts must be zero already so save time by not setting to 0.
@@ -47,9 +47,9 @@ static void dradix_r(  // single-threaded recursive worker
     return;
   }
 
-  R_xlen_t cumSum=0;
-  for (R_xlen_t i=0; cumSum<n; ++i) { // cumSum<n better than i<width as may return early
-    unsigned long long tmp;
+  uint64_t cumSum=0;
+  for (uint64_t i=0; cumSum<n; ++i) { // cumSum<n better than i<width as may return early
+    uint64_t tmp;
     if ((tmp=counts[i])) {  // don't cumulate through 0s, important below to save a wasteful memset to zero
       counts[i] = cumSum;
       cumSum += tmp;
@@ -57,8 +57,8 @@ static void dradix_r(  // single-threaded recursive worker
   } // leaves cumSum==n && 0<i && i<=width
 
   tmp=in;
-  for (R_xlen_t i=0; i<n; ++i) {  // go forwards not backwards to give cpu pipeline better chance
-    int thisx = (*(unsigned long long *)tmp - minULL) >> fromBit & mask;
+  for (uint64_t i=0; i<n; ++i) {  // go forwards not backwards to give cpu pipeline better chance
+    int thisx = (*(uint64_t *)tmp - minULL) >> fromBit & mask;
     working[ counts[thisx]++ ] = *tmp;
     tmp++;
   }
@@ -78,7 +78,7 @@ static void dradix_r(  // single-threaded recursive worker
   cumSum=0;
   for (int i=0; cumSum<n; ++i) {   // again, cumSum<n better than i<width as it can return early
     if (counts[i] == 0) continue;
-    R_xlen_t thisN = counts[i] - cumSum;  // undo cummulate; i.e. diff
+    uint64_t thisN = counts[i] - cumSum;  // undo cummulate; i.e. diff
     if (thisN <= INSERT_THRESH) {
       dinsert(in+cumSum, thisN);  // for thisN==1 this'll return instantly. Probably better than several branches here.
     } else {
@@ -89,13 +89,13 @@ static void dradix_r(  // single-threaded recursive worker
   }
 }
 
-R_xlen_t *qsort_data;
+uint64_t *qsort_data;
 // would have liked to define cmp inside fsort where qsort is called but wasn't sure that's portable
 int qsort_cmp(const void *a, const void *b) {
   // return >0 if the element a goes after the element b
   // doesn't master if stable or not
-  R_xlen_t x = qsort_data[*(int *)a];
-  R_xlen_t y = qsort_data[*(int *)b];
+  uint64_t x = qsort_data[*(int *)a];
+  uint64_t y = qsort_data[*(int *)b];
   // return x-y;  would like this, but this is long and the cast to int return may not preserve sign
   // We have long vectors in mind (1e10(74GB), 1e11(740GB)) where extreme skew may feasibly mean the largest count
   // is greater than 2^32. The first split is (currently) 16 bits so should be very rare but to be safe keep 64bit counts.
@@ -133,11 +133,11 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   const double *restrict xp = REAL(x);
   #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(nBatch, false))
   for (int batch=0; batch<nBatch; ++batch) {
-    R_xlen_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
+    uint64_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
     const double *restrict d = xp + batchSize*batch;
     double myMin=*d, myMax=*d;
     d++;
-    for (R_xlen_t j=1; j<thisLen; ++j) {
+    for (uint64_t j=1; j<thisLen; ++j) {
       // TODO: test for sortedness here as well.
       if (*d<myMin) myMin=*d;
       else if (*d>myMax) myMax=*d;
@@ -159,7 +159,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   //       avoid twiddle function call as expensive in recent tests (0.34 vs 2.7)
   //       possibly twiddle once to *ans, then untwiddle at the end in a fast parallel sweep
   u.d = max;
-  unsigned long long maxULL = u.ull;
+  uint64_t maxULL = u.ull;
   u.d = min;
   minULL = u.ull;  // set static global for use by dradix_r
 
@@ -169,33 +169,33 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   size_t MSBsize = 1LL<<MSBNbits;                   // the number of possible MSB values (16 bits => 65,536)
   if (verbose) Rprintf(_("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%d\n"), maxBit, MSBNbits, shift, MSBsize);
 
-  R_xlen_t *counts = calloc(nBatch*MSBsize, sizeof(R_xlen_t));
+  uint64_t *counts = calloc(nBatch*MSBsize, sizeof(uint64_t));
   if (counts==NULL) error(_("Unable to allocate working memory"));
   // provided MSBsize>=9, each batch is a multiple of at least one 4k page, so no page overlap
   // TODO: change all calloc, malloc and free to Calloc and Free to be robust to error() and catch ooms.
 
   if (verbose) Rprintf(_("counts is %dMB (%d pages per nBatch=%d, batchSize=%"PRIu64", lastBatchSize=%"PRIu64")\n"),
-                       (int)(nBatch*MSBsize*sizeof(R_xlen_t)/(1024*1024)),
-                       (int)(nBatch*MSBsize*sizeof(R_xlen_t)/(4*1024*nBatch)),
+                       (int)(nBatch*MSBsize*sizeof(uint64_t)/(1024*1024)),
+                       (int)(nBatch*MSBsize*sizeof(uint64_t)/(4*1024*nBatch)),
                        nBatch, (uint64_t)batchSize, (uint64_t)lastBatchSize);
   t[3] = wallclock();
   #pragma omp parallel for num_threads(nth)
   for (int batch=0; batch<nBatch; ++batch) {
-    R_xlen_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
+    uint64_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
     const uint64_t *restrict tmp = (uint64_t *)(xp + batchSize*batch);
-    R_xlen_t *restrict thisCounts = counts + batch*MSBsize;
-    for (R_xlen_t j=0; j<thisLen; ++j) {
+    uint64_t *restrict thisCounts = counts + batch*MSBsize;
+    for (uint64_t j=0; j<thisLen; ++j) {
       thisCounts[(*tmp - minULL) >> shift]++;
       tmp++;
     }
   }
 
   // cumulate columnwise; parallel histogram; small so no need to parallelize
-  R_xlen_t rollSum=0;
+  uint64_t rollSum=0;
   for (int msb=0; msb<MSBsize; ++msb) {
     int j = msb;
     for (int batch=0; batch<nBatch; ++batch) {
-      R_xlen_t tmp = counts[j];
+      uint64_t tmp = counts[j];
       counts[j] = rollSum;
       rollSum += tmp;
       j += MSBsize;  // deliberately non-contiguous here
@@ -206,10 +206,10 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   uint64_t *restrict ansi64 = (uint64_t *)ans;
   #pragma omp parallel for num_threads(nth)
   for (int batch=0; batch<nBatch; ++batch) {
-    R_xlen_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
+    uint64_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
     const uint64_t *restrict source = (uint64_t *)(xp + batchSize*batch);
-    R_xlen_t *restrict thisCounts = counts + batch*MSBsize;
-    for (R_xlen_t j=0; j<thisLen; ++j) {
+    uint64_t *restrict thisCounts = counts + batch*MSBsize;
+    for (uint64_t j=0; j<thisLen; ++j) {
       ansi64[ thisCounts[(*source - minULL) >> shift]++ ] = *source;
       // This assignment to ans is not random access as it may seem, but cache efficient by
       // design since target pages are written to contiguously. MSBsize * 4k < cache.
@@ -226,12 +226,12 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
     int fromBit = toBit>7 ? toBit-7 : 0;
 
     // sort bins by size, largest first to minimise last-man-home
-    R_xlen_t *msbCounts = counts + (nBatch-1)*MSBsize;
+    uint64_t *msbCounts = counts + (nBatch-1)*MSBsize;
     // msbCounts currently contains the ending position of each MSB (the starting location of the next) even across empty
     if (msbCounts[MSBsize-1] != xlength(x)) error(_("Internal error: counts[nBatch-1][MSBsize-1] != length(x)")); // # nocov
-    R_xlen_t *msbFrom = malloc(MSBsize*sizeof(R_xlen_t));
+    uint64_t *msbFrom = malloc(MSBsize*sizeof(uint64_t));
     int *order = malloc(MSBsize*sizeof(int));
-    R_xlen_t cumSum = 0;
+    uint64_t cumSum = 0;
     for (int i=0; i<MSBsize; ++i) {
       msbFrom[i] = cumSum;
       msbCounts[i] = msbCounts[i] - cumSum;
@@ -255,7 +255,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
     t[6] = wallclock();
     #pragma omp parallel num_threads(getDTthreads(MSBsize, false))
     {
-      R_xlen_t *counts = calloc((toBit/8 + 1)*256, sizeof(R_xlen_t));
+      uint64_t *counts = calloc((toBit/8 + 1)*256, sizeof(uint64_t));
       // each thread has its own (small) stack of counts
       // don't use VLAs here: perhaps too big for stack yes but more that VLAs apparently fail with schedule(dynamic)
 
@@ -269,8 +269,8 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
       // If a thread deals with an msb lower than the first one it dealt with, then its *working will be too small.
       for (int msb=0; msb<MSBsize; ++msb) {
 
-        R_xlen_t from= msbFrom[order[msb]];
-        R_xlen_t thisN = msbCounts[order[msb]];
+        uint64_t from= msbFrom[order[msb]];
+        uint64_t thisN = msbCounts[order[msb]];
 
         if (working==NULL) working = malloc(thisN * sizeof(double)); // TODO: check succeeded otherwise exit gracefully
         // Depends on msbCounts being sorted largest first before this parallel loop

From 544a674da8afc9c532b85acf2fa12bdc5f945d18 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Wed, 11 Nov 2020 21:08:40 -0700
Subject: [PATCH 04/10] remove union

---
 src/fsort.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/fsort.c b/src/fsort.c
index 934d54e23e..6c776127e7 100644
--- a/src/fsort.c
+++ b/src/fsort.c
@@ -16,10 +16,6 @@ static void dinsert(double *x, const int n) {   // TODO: if and when twiddled, d
   }
 }
 
-static union {
-  double d;
-  uint64_t ull;
-} u;
 static uint64_t minULL;
 
 static void dradix_r(  // single-threaded recursive worker
@@ -158,10 +154,9 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   // TODO: -0ULL should allow negatives
   //       avoid twiddle function call as expensive in recent tests (0.34 vs 2.7)
   //       possibly twiddle once to *ans, then untwiddle at the end in a fast parallel sweep
-  u.d = max;
-  uint64_t maxULL = u.ull;
-  u.d = min;
-  minULL = u.ull;  // set static global for use by dradix_r
+ 
+  uint64_t maxULL = *(uint64_t *)&max;
+  minULL = *(uint64_t *)&min;  // set static global for use by dradix_r
 
   int maxBit = floor(log(maxULL-minULL) / log(2));  // 0 is the least significant bit
   int MSBNbits = maxBit > 15 ? 16 : maxBit+1;       // how many bits make up the MSB

From a15495ceb2380a8010e188a7f4f8bcdffaf87de0 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Thu, 12 Nov 2020 20:20:05 -0700
Subject: [PATCH 05/10] monotonic:dynamic schedule, and alloc fail todos done

---
 NEWS.md            |  2 ++
 src/fsort.c        | 83 ++++++++++++++++++++++++++++------------------
 src/init.c         |  6 ++--
 src/myomp.h        |  9 +++++
 src/openmp-utils.c |  2 ++
 5 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index afde983b46..49fb886d1b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -18,6 +18,8 @@
 
 3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example wich has also been added to the test suite.
 
+4. `getDTthreads(verbose=TRUE)` now reports the version of OpenMP being used. It is the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API as far as we know.
+
 
 # data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1)  (19 Oct 2020)
 
diff --git a/src/fsort.c b/src/fsort.c
index 6c776127e7..748c2ad85d 100644
--- a/src/fsort.c
+++ b/src/fsort.c
@@ -164,10 +164,9 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   size_t MSBsize = 1LL<<MSBNbits;                   // the number of possible MSB values (16 bits => 65,536)
   if (verbose) Rprintf(_("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%d\n"), maxBit, MSBNbits, shift, MSBsize);
 
-  uint64_t *counts = calloc(nBatch*MSBsize, sizeof(uint64_t));
-  if (counts==NULL) error(_("Unable to allocate working memory"));
+  uint64_t *counts = (uint64_t *)R_alloc(nBatch*MSBsize, sizeof(uint64_t));
+  memset(counts, 0, nBatch*MSBsize*sizeof(uint64_t));
   // provided MSBsize>=9, each batch is a multiple of at least one 4k page, so no page overlap
-  // TODO: change all calloc, malloc and free to Calloc and Free to be robust to error() and catch ooms.
 
   if (verbose) Rprintf(_("counts is %dMB (%d pages per nBatch=%d, batchSize=%"PRIu64", lastBatchSize=%"PRIu64")\n"),
                        (int)(nBatch*MSBsize*sizeof(uint64_t)/(1024*1024)),
@@ -224,8 +223,8 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
     uint64_t *msbCounts = counts + (nBatch-1)*MSBsize;
     // msbCounts currently contains the ending position of each MSB (the starting location of the next) even across empty
     if (msbCounts[MSBsize-1] != xlength(x)) error(_("Internal error: counts[nBatch-1][MSBsize-1] != length(x)")); // # nocov
-    uint64_t *msbFrom = malloc(MSBsize*sizeof(uint64_t));
-    int *order = malloc(MSBsize*sizeof(int));
+    uint64_t *msbFrom = (uint64_t *)R_alloc(MSBsize, sizeof(uint64_t));
+    int *order = (int *)R_alloc(MSBsize, sizeof(int));
     uint64_t cumSum = 0;
     for (int i=0; i<MSBsize; ++i) {
       msbFrom[i] = cumSum;
@@ -239,7 +238,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
     // TODO: time this qsort but likely insignificant.
 
     if (verbose) {
-      Rprintf(_("Top 5 MSB counts: ")); for(int i=0; i<5; i++) Rprintf(_("%"PRId64" "), (int64_t)msbCounts[order[i]]); Rprintf(_("\n"));
+      Rprintf(_("Top 20 MSB counts: ")); for(int i=0; i<MIN(MSBsize,20); i++) Rprintf(_("%"PRId64" "), (int64_t)msbCounts[order[i]]); Rprintf(_("\n"));
       Rprintf(_("Reduced MSBsize from %d to "), MSBsize);
     }
     while (MSBsize>0 && msbCounts[order[MSBsize-1]] < 2) MSBsize--;
@@ -247,54 +246,75 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
       Rprintf(_("%d by excluding 0 and 1 counts\n"), MSBsize);
     }
 
+    bool failed=false, alloc_fail=false, non_monotonic=false; // shared bools only ever assigned true; no need for atomic or critical assign
     t[6] = wallclock();
     #pragma omp parallel num_threads(getDTthreads(MSBsize, false))
     {
-      uint64_t *counts = calloc((toBit/8 + 1)*256, sizeof(uint64_t));
-      // each thread has its own (small) stack of counts
+      // each thread has its own small stack of counts
       // don't use VLAs here: perhaps too big for stack yes but more that VLAs apparently fail with schedule(dynamic)
-
-      double *working=NULL;
-      // the working memory (for the largest groups) is allocated the first time the thread is assigned to
-      // an iteration.
-
-      #pragma omp for schedule(dynamic,1)
-      // All we assume here is that a thread can never be assigned to an earlier iteration; i.e. threads 0:(nth-1)
-      // get iterations 0:(nth-1) possibly out of order, then first-come-first-served in order after that.
-      // If a thread deals with an msb lower than the first one it dealt with, then its *working will be too small.
+      uint64_t *restrict mycounts = calloc((toBit/8 + 1)*256, sizeof(uint64_t));
+      if (mycounts==NULL) {
+        failed=true; alloc_fail=true;  // # nocov
+      }
+      double *restrict myworking = NULL;
+      // the working memory for the largest group per thread is allocated when the thread receives its first iteration
+      int myfirstmsb = -1;  // for the monotonicity check
+      
+      #pragma omp for schedule(monotonic_dynamic,1)
+      // We require here that a thread can never be assigned to an earlier iteration; e.g. threads 0:(nth-1)
+      // get iterations 0:(nth-1), possibly out of order, then first-come-first-served in order after that.
+      // If a thread deals with an msb earlier than the first one it dealt with, then its *working will be too small.
+      // This is not true in clang-11 using OpenMP 5.0 (_OPENMP==201811); #4786. But the monotonic: modifier (OpenMP 4.5+)
+      // makes it true. To continue support of OpenMP<4.5:
+      //      i) myomp.h defines monotonic_dynamic since macro name cannot contain colon
+      // and ii) we now check monotonicity is true, otherwise halt with helpful error. It's likely true anyway in older implementations
+      //         since that's the simplest implementation and what we thought was the case anyway. I guess that clang-11 is doing
+      //         some new advanced dynamic optimizations; e.g. allocating threads to iterations based on knowledge of data locality rather
+      //         than simple monotonic first-come-first-served. However, we have arranged for the iterations to be in size order, so
+      //         monotonic:dynamic is the right schedule here.
       for (int msb=0; msb<MSBsize; ++msb) {
+        if (failed) continue;
 
-        uint64_t from= msbFrom[order[msb]];
+        uint64_t from = msbFrom[order[msb]];
         uint64_t thisN = msbCounts[order[msb]];
 
-        if (working==NULL) working = malloc(thisN * sizeof(double)); // TODO: check succeeded otherwise exit gracefully
+        if (myworking==NULL) {
+          myworking = malloc(thisN * sizeof(double));
+          if (myworking==NULL) {
+            failed=true; alloc_fail=true; continue;  // # nocov
+          }
+          myfirstmsb = msb;
+        }
+        if (myfirstmsb==-1 || msb<myfirstmsb) {
+          failed=true; non_monotonic=true; continue;  // # nocov
+        }
+        
         // Depends on msbCounts being sorted largest first before this parallel loop
         // Could be significant RAM saving if the largest msb is
         // a lot larger than the 2nd largest msb, especially as nth grows to perhaps 128 on X1.
         // However, the initial split is so large (16bits => 65,536) that the largest MSB should be
         // relatively small anyway (n/65,536 if uniformly distributed).
-        // For msb>=nth, that thread's *working will already be big
-        // enough because the smallest *working (for thread nth-1) is big enough for all iterations following.
+        // For msb>=nth, that thread's *myworking will already be big enough because
+        // the smallest *myworking (for thread nth-1) is big enough for all iterations following.
         // Progressively, less and less of the working will be needed by the thread (just the first thisN will be
-        // used) and the unused pages will simply not be cached.
-        // TODO: Calloc isn't thread-safe. But this deep malloc should be ok here as no possible error() points
-        //       before free. Just need to add the check and exit thread safely somehow.
+        // used) and the unused lines will simply not be cached.
 
         if (thisN <= INSERT_THRESH) {
           dinsert(ans+from, thisN);
         } else {
-          dradix_r(ans+from, working, thisN, fromBit, toBit, counts);
+          dradix_r(ans+from, myworking, thisN, fromBit, toBit, mycounts);
         }
       }
-      free(counts);
-      free(working);
+      free(mycounts);
+      free(myworking);
     }
-    free(msbFrom);
-    free(order);
+    if (non_monotonic)
+      error("OpenMP %d did not assign threads to iterations monotonically. Please search Stack Overflow for this message.", MY_OPENMP); // # nocov; #4786 in v1.13.4
+    if (alloc_fail)
+      error(_("Unable to allocate working memory")); // # nocov
   }
   t[7] = wallclock();
-  free(counts);
-
+  
   // TODO: parallel sweep to check sorted using <= on original input. Feasible that twiddling messed up.
   //       After a few years of heavy use remove this check for speed, and move into unit tests.
   //       It's a perfectly contiguous and cache efficient parallel scan so should be relatively negligible.
@@ -303,7 +323,6 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   if (verbose) for (int i=1; i<=7; ++i) {
     Rprintf(_("%d: %.3f (%4.1f%%)\n"), i, t[i]-t[i-1], 100.*(t[i]-t[i-1])/tot);
   }
-
   UNPROTECT(nprotect);
   return(ansVec);
 }
diff --git a/src/init.c b/src/init.c
index 1247c585b6..6a5377ca2a 100644
--- a/src/init.c
+++ b/src/init.c
@@ -393,11 +393,11 @@ SEXP hasOpenMP() {
   // Just for use by onAttach (hence nocov) to avoid an RPRINTF from C level which isn't suppressable by CRAN
   // There is now a 'grep' in CRAN_Release.cmd to detect any use of RPRINTF in init.c, which is
   // why RPRINTF is capitalized in this comment to avoid that grep.
-  // TODO: perhaps .Platform or .Machine in R itself could contain whether OpenMP is available.
+  // .Platform or .Machine in R itself does not contain whether OpenMP is available because compiler and flags are per-package.
   #ifdef _OPENMP
-  return ScalarLogical(TRUE);
+  return ScalarInteger(_OPENMP); // return the version; e.g. 201511 (i.e. 4.5)
   #else
-  return ScalarLogical(FALSE);
+  return ScalarInteger(0);       // 0 rather than NA so that if() can be used on the result
   #endif
 }
 // # nocov end
diff --git a/src/myomp.h b/src/myomp.h
index 58a5703f00..57d8b58734 100644
--- a/src/myomp.h
+++ b/src/myomp.h
@@ -1,5 +1,13 @@
 #ifdef _OPENMP
   #include <omp.h>
+  #if _OPENMP >= 201511
+    #define monotonic_dynamic monotonic:dynamic // #4786
+  #else
+    #define monotonic_dynamic dynamic
+  #endif
+  #define MY_OPENMP              _OPENMP
+  // for use in error messages (e.g. fsort.c; #4786) to save an #ifdef each time
+  // initially chose OMP_VERSION but figured OpenMP might define that in future, so picked MY_ prefix
 #else
   // for machines with compilers void of openmp support
   #define omp_get_num_threads()  1
@@ -9,5 +17,6 @@
   #define omp_get_num_procs()    1
   #define omp_set_nested(a)   // empty statement to remove the call
   #define omp_get_wtime()        0
+  #define MY_OPENMP              0
 #endif
 
diff --git a/src/openmp-utils.c b/src/openmp-utils.c
index 51393f3b7c..b65a661eaf 100644
--- a/src/openmp-utils.c
+++ b/src/openmp-utils.c
@@ -79,6 +79,8 @@ SEXP getDTthreads_R(SEXP verbose) {
   if (LOGICAL(verbose)[0]) {
     #ifndef _OPENMP
       Rprintf(_("This installation of data.table has not been compiled with OpenMP support.\n"));
+    #else
+      Rprintf(_("  OpenMP version (_OPENMP)       %d\n"), _OPENMP); // user can use Google to map 201511 to 4.5; it's odd that OpenMP API does not provide 4.5
     #endif
     // this output is captured, paste0(collapse="; ")'d, and placed at the end of test.data.table() for display in the last 13 lines of CRAN check logs
     // it is also printed at the start of test.data.table() so that we can trace any Killed events on CRAN before the end is reached

From 980ebdfb16726d46b9b2d8be28a45596bc72930b Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Thu, 12 Nov 2020 21:04:56 -0700
Subject: [PATCH 06/10] news item, and moved configure echo

---
 NEWS.md   | 4 ++--
 configure | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 49fb886d1b..f6c5a6995f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,6 +10,8 @@
 
 1. `as.matrix(<empty DT>)` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting.
 
+2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API.
+
 ## NOTES
 
 1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/).
@@ -18,8 +20,6 @@
 
 3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example wich has also been added to the test suite.
 
-4. `getDTthreads(verbose=TRUE)` now reports the version of OpenMP being used. It is the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API as far as we know.
-
 
 # data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1)  (19 Oct 2020)
 
diff --git a/configure b/configure
index c5c1338a82..e29156d61a 100755
--- a/configure
+++ b/configure
@@ -89,7 +89,7 @@ if [ "$R_NO_OPENMP" = "1" ]; then
   ${CC} ${CFLAGS} -fopenmp test-omp.c || R_NO_OPENMP=1
   # TODO: and then nothing seems to be done with this outcome
 else
-  echo "R CMD SHLIB supports OpenMP without any exra hint"
+  echo "R CMD SHLIB supports OpenMP without any extra hint"
 fi
 
 # Clean up.
@@ -104,7 +104,6 @@ if [ "$R_NO_OPENMP" = "1" ]; then
   echo "*** Continuing installation without OpenMP support..."
   sed -e "s|@openmp_cflags@||" src/Makevars.in > src/Makevars
 else
-  echo "OpenMP supported"
   sed -e "s|@openmp_cflags@|\$(SHLIB_OPENMP_CFLAGS)|" src/Makevars.in > src/Makevars
 fi
 # retain user supplied PKG_ env variables, #4664. See comments in Makevars.in too.

From 68350abe3abbb91a4d52f4f136003bf406f112d0 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Thu, 12 Nov 2020 21:18:59 -0700
Subject: [PATCH 07/10] news item tweak

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index f6c5a6995f..e47cf50edc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,7 +10,7 @@
 
 1. `as.matrix(<empty DT>)` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting.
 
-2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API.
+2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an old compiler which does not support OpenMP 4.5, it may accept `-fopenmp-version=45`. https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful.
 
 ## NOTES
 

From b1a650ef68cdf2ced86ef3da6e250b5aa2f643b9 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Thu, 12 Nov 2020 21:30:43 -0700
Subject: [PATCH 08/10] news item tweak

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index e47cf50edc..a7456b9c7c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,7 +10,7 @@
 
 1. `as.matrix(<empty DT>)` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting.
 
-2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an old compiler which does not support OpenMP 4.5, it may accept `-fopenmp-version=45`. https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful.
+2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway and `fsort` will check that. If not, the compiler might accept `-fopenmp-version=45`, otherwise you will need to upgrade compiler. https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful.
 
 ## NOTES
 

From f5c62bf8d6aac51fdbccb4e7795c3a86f9c75ebb Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Thu, 12 Nov 2020 21:52:30 -0700
Subject: [PATCH 09/10] test output= increased from top 5 to top 20

---
 inst/tests/tests.Rraw | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 9d207f8866..4dd2809f7d 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -12512,7 +12512,7 @@ x <- as.integer(x)
 test(1888.5, fsort(x), base::sort(x, na.last = FALSE),
              warning = "Input is not a vector of type double. New parallel sort has only been done for double vectors so far.*Using one thread")
 x = runif(1e6)
-test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 5 MSB counts")
+test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 20 MSB counts")
 test(1888.7, !base::is.unsorted(y))
 test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE")
 rm(x)

From 16622dac4ed37f550d93c6ca8691ec12351db53c Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Thu, 12 Nov 2020 22:08:13 -0700
Subject: [PATCH 10/10] news item tweak

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index a7456b9c7c..94e1fee81c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,7 +10,7 @@
 
 1. `as.matrix(<empty DT>)` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting.
 
-2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway and `fsort` will check that. If not, the compiler might accept `-fopenmp-version=45`, otherwise you will need to upgrade compiler. https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful.
+2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, and if so `fsort` will check that and work fine. If not, the compiler might accept `-fopenmp-version=45`, otherwise you will need to upgrade compiler. https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful.
 
 ## NOTES