-
Notifications
You must be signed in to change notification settings - Fork 1k
Expand file tree
/
Copy pathforder.c
More file actions
1813 lines (1723 loc) · 83.7 KB
/
forder.c
File metadata and controls
1813 lines (1723 loc) · 83.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "data.table.h"
/*
Inspired by :
icount in do_radixsort in src/main/sort.c @ rev 51389.
base::sort.list(method="radix") turns out not to be a radix sort, but a counting sort :
http://r.789695.n4.nabble.com/method-radix-in-sort-list-isn-t-actually-a-radix-sort-tp3309470p3309470.html
Wish granted for R to allow negatives (simple change) : https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=15644
Terdiman and Herf (LSB radix for floating point and other efficiency tricks) :
http://codercorner.com/RadixSortRevisited.htm
http://stereopsis.com/radix.html
Previous version of this file was promoted into base R, see ?base::sort.
Denmark useR! presentation https://github.com/Rdatatable/data.table/wiki/talks/useR2015_Matt.pdf
Stanford DSC presentation https://github.com/Rdatatable/data.table/wiki/talks/DSC2016_ParallelSort.pdf
JSM presentation https://github.com/Rdatatable/data.table/wiki/talks/JSM2018_Matt.pdf
Techniques used :
skewed groups are split in parallel
finds unique bytes to save 256 sweeping
skips already-grouped yet unsorted
recursive group gathering for cache efficiency
reuses R's global character cache (truelength on CHARSXP)
compressed column can cross byte boundaries to use spare bits (e.g. 2 16-level columns in one byte)
just the remaining part of key is reordered as the radix progresses
columnar byte-key for within-radix MT cache efficiency
Only forder() and dtwiddle() functions are meant for use by other C code in data.table, hence all other functions here are static.
The coding techniques deployed here are for efficiency. The static functions are recursive or called repetitively and we wish to minimise
overhead. They reach outside themselves to place results in the end result directly rather than returning many small pieces of memory.
*/
// #define TIMING_ON
static int nth = 1; // number of threads to use, throttled by default; used by cleanup() to ensure no mismatch in getDTthreads() calls
static bool retgrp = true; // return group sizes as well as the ordering vector? If so then use gs, gsalloc and gsn :
static bool retstats = true; // return extra flags for any NA, NaN, -Inf, +Inf, non-ASCII, non-UTF8
static int nrow = 0; // used as group size stack allocation limit (when all groups are 1 row)
static int *gs = NULL; // gs = final groupsizes e.g. 23,12,87,2,1,34,...
static int gs_alloc = 0; // allocated size of gs
static int gs_n = 0; // the number of groups found so far (how much of the allocated gs is used)
static int **gs_thread=NULL; // each thread has a private buffer which gets flushed to the final gs appropriately
static int *gs_thread_alloc=NULL;
static int *gs_thread_n=NULL;
static int *TMP=NULL; // UINT16_MAX*sizeof(int) for each thread; used by counting sort in radix_r()
static uint8_t *UGRP=NULL; // 256 bytes for each thread; used by counting sort in radix_r() when sortType==0 (byte appearance order)
static int *cradix_counts = NULL;
static SEXP *cradix_xtmp = NULL;
static SEXP *ustr = NULL;
static int ustr_alloc = 0;
static int ustr_n = 0;
static int ustr_maxlen = 0;
static int sortType = 0; // 0 just group; -1 descending, +1 ascending
static int nalast = 0; // 1 (true i.e. last), 0 (false i.e. first), -1 (na i.e. remove)
static int nradix = 0;
static uint8_t **key = NULL;
static int *anso = NULL;
static bool notFirst=false;
static char msg[1001];
// use STOP in this file (not error()) to ensure cleanup() is called first
// snprintf to msg first in case nrow (just as an example) is provided in the message because cleanup() sets nrow to 0
#define STOP(...) do {snprintf(msg, 1000, __VA_ARGS__); cleanup(); error("%s", msg);} while(0) // # notranslate. http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon
#undef warning
#define warning(...) Do not use warning in this file // since it can be turned to error via warn=2
/* Using OS realloc() in this file to benefit from (often) in-place realloc() to save copy
* NB: R_alloc() would be more convenient (fails within) and robust (auto free) but there is no R_realloc(). Implementing R_realloc() would be an alloc and copy, iiuc.
* R_Calloc/R_Realloc needs to be R_Free'd, even before error() [R-exts$6.1.2]. An oom within R_Calloc causes a previous R_Calloc to leak so R_Calloc would still needs to be trapped anyway.
* Therefore, using <<if (!malloc()) STOP(_("helpful context msg"))>> approach to cleanup() on error.
*/
static void free_ustr(void) {
  // Release the unique-string working buffer and reset every piece of its bookkeeping,
  // so a later range_str() starts from a verified-empty state.
  free(ustr);
  ustr = NULL;
  ustr_n = 0;
  ustr_alloc = 0;
  ustr_maxlen = 0;
}
static void cleanup(void) {
  // Free all file-static working memory and reset the bookkeeping so a subsequent
  // forder() call starts from a clean slate. Called on normal exit and by STOP()
  // before error(), so it must tolerate partially-initialized state (NULL pointers,
  // free(NULL) being a no-op).
  free(gs); gs=NULL;
  gs_alloc = 0;
  gs_n = 0;
  // nth holds the thread count used when gs_thread was allocated, so the bound matches
  if (gs_thread!=NULL) for (int i=0; i<nth; i++) free(gs_thread[i]);
  free(gs_thread); gs_thread=NULL;
  free(gs_thread_alloc); gs_thread_alloc=NULL;
  free(gs_thread_n); gs_thread_n=NULL;
  free(TMP); TMP=NULL;
  free(UGRP); UGRP=NULL;
  nrow = 0; // reset because STOP() may interpolate nrow into its message after cleanup
  free(cradix_counts); cradix_counts=NULL;
  free(cradix_xtmp); cradix_xtmp=NULL;
  free_ustr();
  // key[] is NULL-terminated (allocated with +1 slot, calloc'd); number of non-NULL entries
  // ==nradix, other than rare cases e.g. tests 1844.5-6 (#3940), and if a calloc fails
  if (key!=NULL) { int i=0; while (key[i]!=NULL) free(key[i++]); }
  free(key); key=NULL; nradix=0;
}
// # nocov start
void internal_error_with_cleanup(const char *call_name, const char *format, ...) {
  // Format an internal-error message, release this file's working memory via cleanup(),
  // then raise an R error. error() performs a longjmp, so cleanup() must run first or
  // the file-static allocations above would leak.
  // call_name: name of the reporting function (usually __func__)
  // format/...: printf-style message detail
  char buff[1024];
  va_list args;
  va_start(args, format);
  vsnprintf(buff, sizeof(buff), format, args);
  va_end(args);
  cleanup();
  error("%s %s: %s. %s", _("Internal error in"), call_name, buff, _("Please report to the data.table issues tracker."));
}
// # nocov end
static void push(const int *x, const int n)
// Append n group sizes from x onto this thread's private buffer; buffers are merged into
// the shared gs by flush() so threads don't contend on the final result.
{
  if (!retgrp) return; // clearer to have the switch here rather than before each call
  int me = omp_get_thread_num();
  int newn = gs_thread_n[me] + n;
  if (gs_thread_alloc[me] < newn) {
    gs_thread_alloc[me] = (newn < nrow/3) ? (1+(newn*2)/4096)*4096 : nrow; // [2|3] to not overflow and 3 not 2 to avoid allocating close to nrow (nrow groups occurs when all size 1 groups)
    // realloc into a temporary so the old buffer isn't lost on failure; it then remains
    // owned by gs_thread[me] and is freed by cleanup() inside STOP (previously the
    // pointer was overwritten with NULL first, leaking the old allocation)
    int *tmp = realloc(gs_thread[me], sizeof(*gs_thread[me])*gs_thread_alloc[me]);
    if (tmp==NULL) STOP(_("Failed to realloc thread private group size buffer to %d*4bytes"), (int)gs_thread_alloc[me]);
    gs_thread[me] = tmp;
  }
  memcpy(gs_thread[me]+gs_thread_n[me], x, n*sizeof(*gs_thread[me]));
  gs_thread_n[me] += n;
}
static void flush(void) {
  // Move this thread's private group sizes into the final shared gs and reset the
  // private count. Caller is responsible for serializing calls (one thread at a time).
  if (!retgrp) return;
  int me = omp_get_thread_num();
  int n = gs_thread_n[me];
  // normally doesn't happen, can be encountered under heavy load, #7051
  if (!n) return; // # nocov
  int newn = gs_n + n;
  if (gs_alloc < newn) {
    gs_alloc = (newn < nrow/3) ? (1+(newn*2)/4096)*4096 : nrow;
    // realloc into a temporary so the old gs isn't lost on failure; cleanup() inside STOP
    // then frees the still-valid gs (previously gs was overwritten with NULL, leaking it)
    int *tmp = realloc(gs, sizeof(*gs)*gs_alloc);
    if (tmp==NULL) STOP(_("Failed to realloc group size result to %d*4bytes"), (int)gs_alloc);
    gs = tmp;
  }
  memcpy(gs+gs_n, gs_thread[me], sizeof(*gs)*n);
  gs_n += n;
  gs_thread_n[me] = 0;
}
#ifdef TIMING_ON
#define NBLOCK 64
#define MAX_NTH 256
static double tblock[MAX_NTH*NBLOCK];
static int nblock[MAX_NTH*NBLOCK];
static uint64_t stat[257];
#define TBEG() double tstart = wallclock(); // tstart declared locally for thread safety
#define TEND(i) { \
double now = wallclock(); \
int w = omp_get_thread_num()*NBLOCK + i; \
tblock[w] += now-tstart; \
nblock[w]++; \
tstart = now; \
}
#else
#define TBEG()
#define TEND(i)
#endif
// range_* functions return [min,max] of the non-NAs as common uint64_t type
// TODO parallelize these; not a priority according to TIMING_ON though (contiguous read with prefetch)
static void range_i32(const int32_t *x, const int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
// One sweep over x returning [min,max] of the non-NA values and a count of NAs.
// NA_INTEGER is INT32_MIN (smaller than every valid value), so once min is seeded with a
// non-NA value, any NA can only ever take the tmp<min branch and never contaminates min.
{
  int32_t min = NA_INTEGER;
  int32_t max = NA_INTEGER;
  int i=0;
  while(i<n && x[i]==NA_INTEGER) i++; // skip (and count) the leading run of NAs
  int na_count = i;
  if (i<n) max = min = x[i++];        // first non-NA seeds both bounds
  for(; i<n; i++) {
    int tmp = x[i];
    if (tmp>max) max=tmp;
    else if (tmp<min) {
      if (tmp==NA_INTEGER) na_count++; // NA is below any real minimum, see note above
      else min=tmp;
    }
  }
  *out_na_count = na_count;
  *out_min = min ^ 0x80000000u; // map [-2147483648(INT32_MIN), 2147483647(INT32_MAX)] => [0, 4294967295(UINT32_MAX)]
  *out_max = max ^ 0x80000000u;
}
static void range_i64(int64_t *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
{
int64_t min = INT64_MIN;
int64_t max = INT64_MIN;
int i=0;
while(i<n && x[i]==INT64_MIN) i++;
int na_count = i;
if (i<n) max = min = x[i++];
for(; i<n; i++) {
int64_t tmp = x[i];
if (tmp>max) max=tmp;
else if (tmp<min) {
if (tmp==INT64_MIN) na_count++;
else min=tmp;
}
}
*out_na_count = na_count;
// map [INT64_MIN, INT64_MAX] => [0, UINT64_MAX]
*out_min = min ^ 0x8000000000000000u;
*out_max = max ^ 0x8000000000000000u;
}
static void range_d(double *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, int *out_infnan_count)
// return range of finite numbers (excluding NA, NaN, -Inf, +Inf), a count of NA and a count of Inf|-Inf|NaN
// The returned min/max are already dtwiddle()'d, i.e. mapped to uint64_t keys whose unsigned
// order matches the doubles' numeric order, so callers compare them directly.
{
  uint64_t min=0, max=0;
  int na_count=0, infnan_count=0;
  int i=0;
  // skip the leading run of non-finite values, classifying each as NA or Inf/NaN
  while(i<n && !R_FINITE(x[i])) { ISNA(x[i++]) ? na_count++ : infnan_count++; }
  if (i<n) { max = min = dtwiddle(x[i++]); } // first finite value seeds both bounds
  for(; i<n; i++) {
    if (!R_FINITE(x[i])) { ISNA(x[i]) ? na_count++ : infnan_count++; continue; }
    uint64_t tmp = dtwiddle(x[i]); // order-preserving for finite doubles, so compare in unsigned space
    if (tmp>max) max=tmp;
    else if (tmp<min) min=tmp;
  }
  *out_na_count = na_count;
  *out_infnan_count = infnan_count;
  *out_min = min;
  *out_max = max;
}
// used by bmerge only; chmatch uses coerceUtf8IfNeeded
int StrCmp(SEXP x, SEXP y)
// Three-way comparison of two CHARSXP: <0, 0, >0. NA_STRING compares before any string.
// bmerge calls ENC2UTF8 on x and y before passing here, so a byte-wise strcmp suffices.
{
  if (x == y) return 0;          // same cached pointer; covers NA_STRING vs NA_STRING too
  if (x == NA_STRING) return -1; // NA sorts first
  return (y == NA_STRING) ? 1 : strcmp(CHAR(x), CHAR(y));
}
static void cradix_r(SEXP *xsub, int n, int radix)
// xsub is a unique set of CHARSXP, to be ordered by reference
// First time, radix==0, and xsub==x. Then recursively moves SEXP together for cache efficiency.
// Quite different to iradix because
// 1) x is known to be unique so fits in cache (wide random access not an issue)
// 2) they're variable length character strings
// 3) no need to maintain o. Just simply reorder x. No grps or push.
// Counting sort on the byte at position 'radix': histogram into cradix_counts (one 256-slot
// row per radix depth), cumulate, scatter stably into cradix_xtmp, copy back, then recurse
// into each byte-bucket at radix+1. The counts row is always returned to all-zero before
// exit so the memory can be reused by sibling calls without a memset.
{
  if (n<=1) return;
  while (TRUE) { // loop rather than tail-recurse when all n strings share this byte
    int *thiscounts = cradix_counts + radix*256;
    uint8_t lastx = 0; // the last x is used to test its bin
    for (int i=0; i<n; i++) {
      lastx = radix<LENGTH(xsub[i]) ? (uint8_t)(CHAR(xsub[i])[radix]) : 1; // no NA_STRING present, 1 for "" (could use 0 too maybe since NA_STRING not present)
      thiscounts[ lastx ]++;
    }
    if (thiscounts[lastx]==n && radix<ustr_maxlen-1) {
      thiscounts[lastx] = 0; // all x same value, the rest must be 0 already, save the memset
      radix++; // tail recursion eliminated to avoid segfault when prefixes are long
      continue;
    }
    // cumulate the histogram so each slot holds the end position of its bucket
    int itmp = thiscounts[0];
    for (int i=1; i<256; i++) {
      if (thiscounts[i]) thiscounts[i] = (itmp += thiscounts[i]); // don't cummulate through 0s, important below
    }
    // stable scatter: walk backwards, decrementing each bucket's end position
    for (int i=n-1; i>=0; i--) {
      uint8_t thisx = radix<LENGTH(xsub[i]) ? (uint8_t)(CHAR(xsub[i])[radix]) : 1;
      cradix_xtmp[--thiscounts[thisx]] = xsub[i];
    }
    memcpy(xsub, cradix_xtmp, n*sizeof(SEXP));
    if (radix == ustr_maxlen-1) {
      memset(thiscounts, 0, 256*sizeof(*thiscounts)); // deepest byte reached; zero the row for reuse and stop
      return;
    }
    if (thiscounts[0] != 0) STOP(_("Logical error. counts[0]=%d in cradix but should have been decremented to 0. radix=%d"), thiscounts[0], radix);
    // recurse into each non-empty bucket; after the scatter, thiscounts[i] holds bucket i's start
    itmp = 0;
    for (int i=1; i<256; i++) {
      if (thiscounts[i] == 0) continue;
      int thisgrpn = thiscounts[i] - itmp; // undo cumulate; i.e. diff
      cradix_r(xsub+itmp, thisgrpn, radix+1);
      itmp = thiscounts[i];
      thiscounts[i] = 0; // set to 0 now since we're here, saves memset afterwards. Important to do this ready for this memory's reuse
    }
    if (itmp<n-1) cradix_r(xsub+itmp, n-itmp, radix+1); // final group
    break;
  }
}
static void cradix(SEXP *x, int n)
// Entry point for the byte-wise counting sort of the n unique CHARSXP in x (in place, by
// reference). Allocates the per-depth count rows and the scratch array used by cradix_r,
// then frees them on success.
{
  cradix_counts = calloc(ustr_maxlen*256, sizeof(*cradix_counts)); // counts for the letters of left-aligned strings
  cradix_xtmp = malloc(sizeof(*cradix_xtmp) * ustr_n);
  if (!cradix_counts || !cradix_xtmp) {
    // NULL the file-statics after freeing: STOP() calls cleanup() which frees them too,
    // so leaving a dangling pointer here would double-free whichever allocation succeeded
    free(cradix_counts); cradix_counts=NULL; // # nocov
    free(cradix_xtmp); cradix_xtmp=NULL; // # nocov
    STOP(_("Failed to alloc cradix_counts and/or cradix_tmp")); // # nocov
  }
  cradix_r(x, n, 0);
  free(cradix_counts); cradix_counts=NULL;
  free(cradix_xtmp); cradix_xtmp=NULL;
}
static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8, hashtab **marks_)
// group numbers are left in truelength to be fetched by WRITE_KEY
// Gathers the unique strings of x into the file-static ustr (in parallel, first thread to
// claim a string wins), counts NAs, and detects any non-ASCII / non-UTF8 strings. Each
// unique string gets a negative group number in the 'marks' hash table; on exit
// [out_min,out_max] spans the 1-based group numbers (both 0 when all values are NA).
{
  int na_count=0;
  bool anynotascii=false, anynotutf8=false;
  if (ustr_n!=0) internal_error_with_cleanup(__func__, "ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d", ustr_n, ustr_alloc); // # nocov
  if (ustr_maxlen!=0) internal_error_with_cleanup(__func__, "ustr_maxlen isn't 0 when starting range_str"); // # nocov
  bool fail = false;
  hashtab *marks = *marks_;
  #pragma omp parallel for num_threads(getDTthreads(n, true)) shared(marks, fail)
  for(int i=0; i<n; i++) {
    SEXP s = x[i];
    if (s==NA_STRING) {
      #pragma omp atomic update
      na_count++;
      continue;
    }
    if (hash_lookup(marks,s,0)<0) continue; // seen this group before
    #pragma omp critical(range_str_write)
    if (hash_lookup(marks,s,0)>=0) { // another thread may have set it while I was waiting, so check it again
      // now save unique SEXP in ustr so we can loop sort uniques when sorting too
      if (ustr_alloc<=ustr_n) {
        ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2; // small initial guess, negligible time to alloc 128KiB (32 pages)
        if (ustr_alloc>n) ustr_alloc = n; // clamp at n. Reaches n when fully unique (no dups)
        ustr = realloc(ustr, sizeof(SEXP) * ustr_alloc);
        if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP)); // # nocov
      }
      ustr[ustr_n++] = s;
      hashtab *new_marks = hash_set_shared(marks, s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group
      if (new_marks != marks) {
        if (new_marks) {
          // Under the OpenMP memory model, if the hash table is expanded, we have to explicitly update the pointer.
          #pragma omp atomic write
          marks = new_marks;
        } else { // longjmp() from a non-main thread not allowed
          fail = true; // remember the failure; the error is raised after the parallel region
        }
      }
      if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
      if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already
          !IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one
        if (!anynotascii)
          anynotascii=true;
        if (!IS_UTF8(s))
          anynotutf8=true;
      }
    }
  }
  // if the hash table grew, propagate the changes to the caller
  *marks_ = marks;
  if (fail) internal_error_with_cleanup(__func__, "failed to grow the 'marks' hash table");
  *out_na_count = na_count;
  *out_anynotascii = anynotascii;
  *out_anynotutf8 = anynotutf8;
  if (ustr_n==0) { // all na
    *out_min = 0;
    *out_max = 0;
    return;
  }
  if (anynotutf8) {
    // translate every unique to UTF-8 first: two different native-encoding strings may map
    // to the same UTF-8 string, so group numbers are assigned on the translated set
    void *vmax = vmaxget();
    SEXP ustr2 = PROTECT(allocVector(STRSXP, ustr_n));
    for (int i=0; i<ustr_n; i++) SET_STRING_ELT(ustr2, i, ENC2UTF8(ustr[i]));
    SEXP *ustr3 = (SEXP *)R_alloc(sizeof(*ustr3), ustr_n);
    memcpy(ustr3, STRING_PTR_RO(ustr2), sizeof(*ustr3) * ustr_n);
    // need to reset ustr_maxlen because we need ustr_maxlen for utf8 strings
    ustr_maxlen = 0;
    for (int i=0; i<ustr_n; i++) {
      SEXP s = ustr3[i];
      if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
    }
    cradix(ustr3, ustr_n); // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8
    hash_set(marks, ustr3[0], -1);
    int o = -1;
    for (int i=1; i<ustr_n; i++) {
      if (ustr3[i] == ustr3[i-1]) continue; // use the same o for duplicates
      hash_set(marks, ustr3[i], --o);
    }
    // now use the 1-1 mapping from ustr to ustr2 to get the ordering back into original ustr, being careful to reset tl to 0
    int *tl = (int *)R_alloc(sizeof(*tl), ustr_n);
    const SEXP *tt = STRING_PTR_RO(ustr2);
    for (int i=0; i<ustr_n; i++) tl[i] = hash_lookup(marks, tt[i], 0); // fetches the o in ustr3 into tl which is ordered by ustr
    for (int i=0; i<ustr_n; i++) hash_set(marks, ustr3[i], 0); // reset to 0 tl of the UTF8 (and possibly non-UTF in ustr too)
    for (int i=0; i<ustr_n; i++) hash_set(marks, ustr[i], tl[i]); // put back the o into ustr's tl
    UNPROTECT(1);
    vmaxset(vmax);
    *out_min = 1;
    *out_max = -o; // could be less than ustr_n if there are duplicates in the utf8s
  } else {
    *out_min = 1;
    *out_max = ustr_n;
    if (sortType) {
      // note that this is always ascending; descending is done in WRITE_KEY using max-this
      cradix(ustr, ustr_n); // sorts ustr in-place by reference. assumes NA_STRING not present.
      for(int i=0; i<ustr_n; i++) // save ordering in the CHARSXP. negative so as to distinguish with R's own usage.
        hash_set(marks, ustr[i], -i-1);
    }
    // else group appearance order was already saved to tl in the first pass
  }
}
static int dround=0; // No rounding by default, for now. Handles #1642, #1728, #1463, #485
static uint64_t dmask=0;
SEXP setNumericRounding(SEXP droundArg)
// init.c has initial call with default of 2
// Sets how many trailing bytes (0, 1 or 2) of a double's bit pattern dtwiddle rounds off
// before comparing. Returns the previous setting as a scalar integer.
{
  // fix: error message previously read "Must an integer ..." (missing "be")
  if (!isInteger(droundArg) || LENGTH(droundArg)!=1) error(_("Must be an integer or numeric vector length 1"));
  if (INTEGER(droundArg)[0] < 0 || INTEGER(droundArg)[0] > 2) error(_("Must be 2, 1 or 0"));
  int oldRound = dround;
  dround = INTEGER(droundArg)[0];
  // top bit of the to-be-chopped bytes; dtwiddle adds it (shifted) to round-to-nearest before chopping
  dmask = dround ? 1 << (8*dround-1) : 0;
  return ScalarInteger(oldRound);
}
SEXP getNumericRounding(void)
// R-level accessor: the current number of bytes rounded off doubles (see setNumericRounding)
{
  return ScalarInteger(dround);
}
int getNumericRounding_C(void)
// for use in uniqlist.c
// C-level accessor for the same dround value returned by getNumericRounding()
{
  return dround;
}
// for signed integers it's easy: flip sign bit to swap positives and negatives; the resulting unsigned is in the right order with INT_MIN ending up as 0
// for floating point finite you have to flip the other bits too if it was signed: http://stereopsis.com/radix.html
uint64_t dtwiddle(double x) //const void *p, int i)
// Map a double to a uint64_t whose unsigned order matches the numeric order, with specials
// placed first: NA=0, NaN=1, -Inf=2, then finite values, then +Inf at the top of the range.
// The last dround bytes (see setNumericRounding) are rounded then chopped off.
{
  union {
    double d;
    uint64_t u64;
  } u; // local for thread safety
  u.d = x; //((double *)p)[i];
  if (R_FINITE(u.d)) {
    if (u.d==0) u.d=0; // changes -0.0 to 0.0, issue #743
    u.u64 ^= (u.u64 & 0x8000000000000000) ? 0xffffffffffffffff : 0x8000000000000000; // always flip sign bit and if negative (sign bit was set) flip other bits too
    u.u64 += (u.u64 & dmask) << 1/*is this shift really correct. No need to shift*/ ; // when dround==1|2, if 8th|16th bit is set, round up before chopping last 1|2 bytes
    return u.u64 >> (dround*8);
  }
  // non-finite: return the fixed slots reserved below the finite range (or at its very top for +Inf)
  if (ISNAN(u.d)) return ISNA(u.d) ? 0 /*NA*/ : 1 /*NaN*/; // also normalises a difference between NA on 32bit R (bit 13 set) and 64bit R (bit 13 not set)
  if (isinf(u.d)) return signbit(u.d) ? 2 /*-Inf*/ : (0xffffffffffffffff>>(dround*8)) /*+Inf*/;
  STOP(_("Unknown non-finite value; not NA, NaN, -Inf or +Inf")); // # nocov
}
void radix_r(const int from, const int to, int radix);
/*
OpenMP is used here to parallelize multiple operations that come together to
sort a data.table using the Radix algorithm. These include:
- The counting of unique values and recursively sorting subsets of data
across different threads
- The process of finding the range and distribution of data for efficient
grouping and sorting
- Creation of histograms which are used to sort data based on significant
bits (each thread processes a separate batch of the data, computes the
MSB of each element, and then increments the corresponding bins), with
the distribution and merging of buckets
*/
SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg)
// sortGroups TRUE from setkey and regular forder, FALSE from by= for efficiency so strings don't have to be sorted and can be left in appearance order
// when sortGroups is TRUE, ascArg contains +1/-1 for ascending/descending of each by column; when FALSE ascArg is ignored
{
#ifdef TIMING_ON
memset(tblock, 0, MAX_NTH*NBLOCK*sizeof(*tblock));
memset(nblock, 0, MAX_NTH*NBLOCK*sizeof(*nblock));
memset(stat, 0, 257*sizeof(*stat));
TBEG()
#endif
int n_protect = 0;
const bool verbose = GetVerbose();
if (!isNewList(DT)) {
if (!isVectorAtomic(DT))
internal_error_with_cleanup(__func__, "input is not either a list of columns, or an atomic vector."); // # nocov; caught by colnamesInt at R level, test 1962.0472
if (!isNull(by))
internal_error_with_cleanup(__func__, "input is an atomic vector (not a list of columns) but by= is not NULL"); // # nocov; caught at R level, test 1962.043
if (!isInteger(ascArg) || LENGTH(ascArg)!=1)
STOP(_("Input is an atomic vector (not a list of columns) but order= is not a length 1 integer"));
if (verbose)
Rprintf(_("forder.c received a vector type '%s' length %d\n"), type2char(TYPEOF(DT)), length(DT));
SEXP tt = PROTECT(allocVector(VECSXP, 1)); n_protect++;
SET_VECTOR_ELT(tt, 0, DT);
DT = tt;
by = PROTECT(allocVector(INTSXP, 1)); n_protect++;
INTEGER(by)[0] = 1;
} else {
if (verbose)
Rprintf(_("forder.c received %d rows and %d columns\n"), length(VECTOR_ELT(DT,0)), length(DT));
}
if (!length(DT))
internal_error_with_cleanup(__func__, "DT is an empty list() of 0 columns"); // # nocov # caught in reuseSorting forder
if (!isInteger(by) || !LENGTH(by))
internal_error_with_cleanup(__func__, "DT has %d columns but 'by' is either not integer or is length 0", length(DT)); // # nocov colnamesInt catches, 2099.2
if (!isInteger(ascArg))
internal_error_with_cleanup(__func__, "'order' must be integer"); // # nocov
if (LENGTH(ascArg) != LENGTH(by)) {
if (LENGTH(ascArg)!=1)
STOP(_("'order' length (%d) is different to by='s length (%d)"), LENGTH(ascArg), LENGTH(by));
SEXP recycleAscArg = PROTECT(allocVector(INTSXP, LENGTH(by))); n_protect++;
for (int j=0; j<LENGTH(recycleAscArg); j++)
INTEGER(recycleAscArg)[j] = INTEGER(ascArg)[0];
ascArg = recycleAscArg;
}
nrow = length(VECTOR_ELT(DT,0));
int n_cplx = 0;
for (int i=0; i<LENGTH(by); i++) {
int by_i = INTEGER(by)[i];
if (by_i < 1 || by_i > length(DT))
internal_error_with_cleanup(__func__, "'by' value %d out of range [1,%d]", by_i, length(DT)); // # nocov # R forderv already catch that using C colnamesInt
if ( nrow != length(VECTOR_ELT(DT, by_i-1)) )
STOP(_("Column %d is length %d which differs from length of column 1 (%d), are you attempting to order by a list column?\n"), INTEGER(by)[i], length(VECTOR_ELT(DT, INTEGER(by)[i]-1)), nrow);
if (TYPEOF(VECTOR_ELT(DT, by_i-1)) == CPLXSXP) n_cplx++;
}
if (!IS_TRUE_OR_FALSE(retGrpArg))
STOP(_("retGrp must be TRUE or FALSE")); // # nocov # covered in reuseSorting forder
retgrp = LOGICAL(retGrpArg)[0]==TRUE;
if (!IS_TRUE_OR_FALSE(retStatsArg))
STOP(_("retStats must be TRUE or FALSE")); // # nocov # covered in reuseSorting forder
retstats = LOGICAL(retStatsArg)[0]==TRUE;
if (!retstats && retgrp)
error(_("retStats must be TRUE whenever retGrp is TRUE")); // # nocov # covered in reuseSorting forder
if (!IS_TRUE_OR_FALSE(sortGroupsArg))
STOP(_("sort must be TRUE or FALSE")); // # nocov # covered in reuseSorting forder
sortType = LOGICAL(sortGroupsArg)[0]==TRUE; // if sortType is 1, it is later flipped between +1/-1 according to ascArg. Otherwise ascArg is ignored when sortType==0
if (!retgrp && !sortType)
STOP(_("At least one of retGrp= or sort= must be TRUE"));
if (!isLogical(naArg) || LENGTH(naArg) != 1)
STOP(_("na.last must be logical TRUE, FALSE or NA of length 1")); // # nocov # covered in reuseSorting forder
nalast = (LOGICAL(naArg)[0] == NA_LOGICAL) ? -1 : LOGICAL(naArg)[0]; // 1=na last, 0=na first (default), -1=remove na
if (nrow==0) {
// empty vector or 0-row DT is always sorted
SEXP ans = PROTECT(allocVector(INTSXP, 0)); n_protect++;
if (retgrp) {
setAttrib(ans, sym_starts, allocVector(INTSXP, 0));
setAttrib(ans, sym_maxgrpn, ScalarInteger(0));
}
if (retstats) {
setAttrib(ans, sym_anyna, ScalarInteger(0));
setAttrib(ans, sym_anyinfnan, ScalarInteger(0));
setAttrib(ans, sym_anynotascii, ScalarInteger(0));
setAttrib(ans, sym_anynotutf8, ScalarInteger(0));
}
UNPROTECT(n_protect);
return ans;
}
// if n==1, the code is left to proceed below in case one or more of the 1-row by= columns are NA and na.last=NA. Otherwise it would be easy to return now.
notFirst = false;
SEXP ans = PROTECT(allocVector(INTSXP, nrow)); n_protect++;
anso = INTEGER(ans);
TEND(0)
#pragma omp parallel for num_threads(getDTthreads(nrow, true))
for (int i=0; i<nrow; i++) anso[i]=i+1; // gdb 8.1.0.20180409-git very slow here, oddly
TEND(1)
int ncol=length(by);
int keyAlloc = (ncol+n_cplx)*8 + 1; // +1 for NULL to mark end; calloc to initialize with NULLs
key = calloc(keyAlloc, sizeof(*key)); // needs to be before loop because part II relies on part I, column-by-column.
if (!key)
STOP(_("Unable to allocate %"PRIu64" bytes of working memory"), (uint64_t)keyAlloc*sizeof(*key)); // # nocov
nradix=0; // the current byte we're writing this column to; might be squashing into it (spare>0)
int spare=0; // the amount of bits remaining on the right of the current nradix byte
bool isReal=false;
bool complexRerun = false; // see comments below in CPLXSXP case
SEXP CplxPart = R_NilValue;
if (n_cplx) { CplxPart=PROTECT(allocVector(REALSXP, nrow)); n_protect++; } // one alloc is reused for each part
int any_na=0, any_infnan=0, any_notascii=0, any_notutf8=0; // collect more statistics about the data #2879, allow optimize of order(na.last=TRUE) as well #3023
TEND(2);
for (int col=0; col<ncol; col++) {
// Rprintf(_("Finding range of column %d ...\n"), col);
SEXP x = VECTOR_ELT(DT,INTEGER(by)[col]-1);
uint64_t min=0, max=0; // min and max of non-NA finite values
int na_count=0, infnan_count=0;
bool anynotascii=false, anynotutf8=false;
if (sortType) {
sortType=INTEGER(ascArg)[col]; // if sortType!=0 (not first-appearance) then +1/-1 comes from ascArg.
if (sortType!=1 && sortType!=-1)
STOP(_("Item %d of order (ascending/descending) is %d. Must be +1 or -1."), col+1, sortType);
}
//Rprintf(_("sortType = %d\n"), sortType);
hashtab * marks = NULL; // only used for STRSXP below
switch(TYPEOF(x)) {
case INTSXP : case LGLSXP : // TODO skip LGL and assume range [0,1]
range_i32(INTEGER(x), nrow, &min, &max, &na_count);
break;
case CPLXSXP : {
// treat as if two separate columns of double
const Rcomplex *xd = COMPLEX_RO(x);
double *tmp = REAL(CplxPart);
if (!complexRerun) {
for (int i=0; i<nrow; ++i) tmp[i] = xd[i].r; // extract the real part on the first time
complexRerun = true;
col--; // cause this loop iteration to rerun; decrement now in case of early continue below
} else {
for (int i=0; i<nrow; ++i) tmp[i] = xd[i].i;
complexRerun = false;
}
x = CplxPart;
} // !no break! so as to fall through to REAL case
case REALSXP :
if (INHERITS(x, char_integer64)) {
range_i64((int64_t *)REAL(x), nrow, &min, &max, &na_count);
} else {
if (verbose && INHERITS(x, char_Date) && fitsInInt32(x)) {
// Note the (slightly expensive) fitsInInt32 will only run when verbose is true. Prefix '***' just to make it stand out in verbose output
// In future this could be upgraded to option warning. But I figured that's what we use verbose to do (to trace problems and look for efficiencies).
// If an automatic coerce is desired (see discussion in #1738) then this is the point to do that in this file. Move the INTSXP case above to be
// next, do the coerce of Date to integer now to a tmp, and then let this case fall through to INTSXP in the same way as CPLXSXP falls through to REALSXP.
Rprintf(_("\n*** Column %d passed to forder is a date stored as an 8 byte double but no fractions are present. Please consider a 4 byte integer date such as IDate to save space and time.\n"), col+1);
}
range_d(REAL(x), nrow, &min, &max, &na_count, &infnan_count);
if (min==0 && na_count<nrow) { min=3; max=4; } // column contains no finite numbers and is not-all NA; create dummies to yield positive min-2 later
isReal = true;
}
break;
case STRSXP :
// need2utf8 now happens inside range_str on the uniques
marks = hash_create(4096); // relatively small to allocate, can grow exponentially later
PROTECT(marks->prot); n_protect++;
range_str(STRING_PTR_RO(x), nrow, &min, &max, &na_count, &anynotascii, &anynotutf8, &marks);
break;
default:
STOP(_("Column %d passed to [f]order is type '%s', not yet supported."), col+1, type2char(TYPEOF(x)));
}
TEND(3);
if (na_count>0)
any_na = 1; // may be written multiple times, for each column that has NA, but thats fine
if (infnan_count>0)
any_infnan = 1;
if (anynotascii)
any_notascii = 1;
if (anynotutf8)
any_notutf8 = 1;
if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) {
// all same value; skip column as nothing to do; [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA)
if (na_count==nrow && nalast==-1) { for (int i=0; i<nrow; i++) anso[i]=0; }
if (TYPEOF(x)==STRSXP) free_ustr();
continue;
}
uint64_t range = max-min+1 +1/*NA*/ +isReal*3/*NaN, -Inf, +Inf*/;
// Rprintf(_("range=%"PRIu64" min=%"PRIu64" max=%"PRIu64" na_count==%d\n"), range, min, max, na_count);
int maxBit=0;
while (range) { maxBit++; range>>=1; }
int nbyte = 1+(maxBit-1)/8; // the number of bytes spanned by the value
int firstBits = maxBit - (nbyte-1)*8; // how many bits used in most significant byte
if (spare==0) {
spare = 8-firstBits; // left align to byte boundary to get better first split.
} else {
if (sortType==0) {
// can't squash into previous spare as that will change first-appearance order within that radix-byte
// found thanks to test 1246.55 with seed 1540402216L and added new direct test 1954
spare = 8-firstBits;
nradix++;
}
else if (spare >= firstBits) {
// easiest case. just left shift a few bits within these nbyte to fill the spare gap
spare -= firstBits; // new spare is also how many bits to now left shift
}
else {
if (nbyte<8) {
spare += 8-firstBits;
nbyte++; // after shift, will need an extra byte
} else {
// range takes 8 bytes so can't shift into 9th to use spare of last byte of previous column; start with a new left-aligned byte
nradix++;
spare = 8-firstBits;
}
}
}
for (int b=0; b<nbyte; b++) {
if (key[nradix+b]==NULL) {
uint8_t *tt = calloc(nrow, sizeof(*tt)); // 0 initialize so that NA's can just skip (NA is always the 0 offset)
if (!tt) {
free(key); // # nocov
STOP(_("Unable to allocate %"PRIu64" bytes of working memory"), (uint64_t)nrow*sizeof(*tt)); // # nocov
}
key[nradix+b] = tt;
}
}
const bool asc = (sortType>=0);
uint64_t min2=min, max2=max;
if (nalast<1 && asc) {
min2--; // so that x-min results in 1 for the minimum; NA coded by 0. Always provide for the NA spot even if NAs aren't present for code brevity and robustness
if (isReal) min2--; // Nan first
} else {
max2++; // NA is max+1 so max2-elem should result in 0
if (isReal) max2++; // Nan last
}
if (isReal) { min2--; max2++; } // -Inf and +Inf in min-1 and max+1 spots respectively
const uint64_t naval = ((nalast==1) == asc) ? max+1+isReal*2 : min-1-isReal*2;
const uint64_t nanval = ((nalast==1) == asc) ? max+2 : min-2; // only used when isReal
// Rprintf(_("asc=%d min2=%"PRIu64" max2=%"PRIu64" naval==%"PRIu64" nanval==%"PRIu64"\n"), asc, min2, max2, naval, nanval);
// several columns could squash into 1 byte. due to this bit squashing is why we deal
// with asc|desc here, otherwise it could be done in the ugrp sorting by reversing the ugrp insert sort
#define WRITE_KEY \
elem = asc ? elem-min2 : max2-elem; \
elem <<= spare; \
for (int b=nbyte-1; b>0; b--) { \
key[nradix+b][i] = (uint8_t)(elem & 0xff); \
elem >>= 8; \
} \
key[nradix][i] |= (uint8_t)(elem & 0xff);
// this last |= squashes the most significant bits of this column into the the spare of the previous
// NB: retaining group order does not retain the appearance-order in the sense that DT[,,by=] does. Because we go column-by-column, the first column values
// are grouped together. Whether we do byte-column-within-column doesn't alter this and neither does bit packing across column boundaries.
// if input data is grouped-by-column (not grouped-by-row) then this grouping strategy may help a lot.
// if input data is random ordering, then a consequence of grouping-by-column (whether sorted or not) is that the output ordering will not be by
  // row-group-appearance-order built in; the result will still need to be resorted by row number of first group item.
// this was the case with the previous forder as well, which went by whole-column
// So, we may as well bit-pack as done above.
  // Retaining appearance-order within key-byte-column is still advantageous for when we do encounter column-grouped data (to avoid data movement); and
// the ordering will be appearance-order preserving in that case (just at the byte level).
// TODO: in future we could provide an option to return 'any' group order for efficiency, which would be the byte-appearance order. But for now, the
// the forder afterwards on o__[f__] (in [.data.table) is not significant.
switch(TYPEOF(x)) {
case INTSXP : case LGLSXP : {
const int32_t *xd = INTEGER_RO(x);
#pragma omp parallel for num_threads(getDTthreads(nrow, true))
for (int i=0; i<nrow; i++) {
uint64_t elem=0;
if (xd[i]==NA_INTEGER) { // TODO: go branchless if na_count==0
if (nalast==-1) anso[i]=0;
elem = naval;
} else {
elem = xd[i] ^ 0x80000000u;
}
WRITE_KEY
}}
break;
case REALSXP :
if (inherits(x, "integer64")) {
const int64_t *xd = (const int64_t *)REAL_RO(x);
#pragma omp parallel for num_threads(getDTthreads(nrow, true))
for (int i=0; i<nrow; i++) {
uint64_t elem=0;
if (xd[i]==INT64_MIN) {
if (nalast==-1) anso[i]=0;
elem = naval;
} else {
elem = xd[i] ^ 0x8000000000000000u;
}
WRITE_KEY
}
} else {
const double *xd = REAL_RO(x); // TODO: revisit double compression (skip bytes/mult by 10,100 etc) as currently it's often 6-8 bytes even for 3.14,3.15
#pragma omp parallel for num_threads(getDTthreads(nrow, true))
for (int i=0; i<nrow; i++) {
uint64_t elem=0;
if (!R_FINITE(xd[i])) {
if (isinf(xd[i])) elem = signbit(xd[i]) ? min-1 : max+1;
else {
if (nalast==-1) anso[i]=0; // for both NA and NaN
elem = ISNA(xd[i]) ? naval : nanval;
}
} else {
elem = dtwiddle(xd[i]); // TODO: could avoid twiddle() if all positive finite which could be known from range_d.
// also R_FINITE is repeated within dtwiddle() currently, wastefully given the if() above
}
WRITE_KEY
}
}
break;
case STRSXP : {
const SEXP *xd = STRING_PTR_RO(x);
#pragma omp parallel for num_threads(getDTthreads(nrow, true))
for (int i=0; i<nrow; i++) {
uint64_t elem=0;
if (xd[i]==NA_STRING) {
if (nalast==-1) anso[i]=0;
elem = naval;
} else {
elem = -hash_lookup(marks, xd[i], 0);
}
WRITE_KEY
}}
free_ustr(); // ustr could be left allocated and reused, but free now in case large and we're tight on ram
break;
default: // # nocov
internal_error_with_cleanup(__func__, "column not supported, not caught earlier"); // # nocov
}
nradix += nbyte-1+(spare==0);
TEND(4)
}
if (key[nradix]!=NULL) nradix++; // nradix now number of bytes in key
#ifdef TIMING_ON
Rprintf("nradix=%d\n", nradix); // # notranslate
#endif
// global nth, TMP & UGRP
nth = getDTthreads(nrow, true); // this nth is relied on in cleanup(); throttle=true/false debated for #5077
TMP = malloc(sizeof(*TMP)*nth*UINT16_MAX); // used by counting sort (my_n<=65536) in radix_r()
UGRP = malloc(sizeof(*UGRP)*nth*256); // TODO: align TMP and UGRP to cache lines (and do the same for stack allocations too)
if (!TMP || !UGRP /*|| TMP%64 || UGRP%64*/) {
free(TMP); free(UGRP); // # nocov
STOP(_("Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d"), nth); // # nocov
}
if (retgrp) {
gs_thread = calloc(nth, sizeof(*gs_thread)); // thread private group size buffers
gs_thread_alloc = calloc(nth, sizeof(*gs_thread_alloc));
gs_thread_n = calloc(nth, sizeof(*gs_thread_n));
if (!gs_thread || !gs_thread_alloc || !gs_thread_n) {
free(gs_thread); free(gs_thread_alloc); free(gs_thread_n); // # nocov
STOP(_("Could not allocate (very tiny) group size thread buffers")); // # nocov
}
}
if (nradix) {
radix_r(0, nrow-1, 0); // top level recursive call: (from, to, radix)
} else {
push(&nrow, 1);
}
TEND(30)
if (anso[0]==1 && anso[nrow-1]==nrow && (nrow<3 || anso[nrow/2]==nrow/2+1)) {
// There used to be all_skipped shared bool. But even though it was safe to update this bool to false naked (without atomic protection) :
// i) there were a lot of updates from deeply iterated insert, so there were a lot of writes to it and that bool likely sat on a shared cache line
// ii) there were a lot of places in the code which needed to remember to set all_skipped properly. It's simpler code just to test now almost instantly.
// Alternatively, we could try and avoid creating anso[] until it's needed, but that has similar complexity issues as (ii)
// Note that if nalast==-1 (remove NA) anso will contain 0's for the NAs and will be considered not-sorted.
bool stop = false;
#pragma omp parallel for num_threads(getDTthreads(nrow, true))
for (int i=0; i<nrow; i++) {
if (stop) continue;
if (anso[i]!=i+1) stop=true;
}
if (!stop) {
// data is already grouped or sorted, integer() returned with group sizes attached
ans = PROTECT(allocVector(INTSXP, 0)); // can't attach attributes to NULL, hence an empty integer()
n_protect++;
}
}
TEND(31)
if (retgrp) {
SEXP tt;
int final_gs_n = (gs_n==0) ? gs_thread_n[0] : gs_n; // TODO: find a neater way to do this
int *final_gs = (gs_n==0) ? gs_thread[0] : gs;
setAttrib(ans, sym_starts, tt = allocVector(INTSXP, final_gs_n));
int *ss = INTEGER(tt);
int maxgrpn = 0;
for (int i=0, tmp=1; i<final_gs_n; i++) {
int elem = final_gs[i];
if (elem>maxgrpn) maxgrpn=elem;
ss[i]=tmp;
tmp+=elem;
}
setAttrib(ans, sym_maxgrpn, ScalarInteger(maxgrpn));
}
if (retstats) {
setAttrib(ans, sym_anyna, ScalarInteger(any_na));
setAttrib(ans, sym_anyinfnan, ScalarInteger(any_infnan));
setAttrib(ans, sym_anynotascii, ScalarInteger(any_notascii));
setAttrib(ans, sym_anynotutf8, ScalarInteger(any_notutf8));
}
cleanup();
UNPROTECT(n_protect);
TEND(32);
#ifdef TIMING_ON
{
// first sum across threads
for (int i=0; i<NBLOCK; i++) {
for (int j=1; j<MAX_NTH; j++) {
tblock[i] += tblock[j*NBLOCK + i];
nblock[i] += nblock[j*NBLOCK + i];
}
}
int last=NBLOCK-1;
while (last>=0 && nblock[last]==0) last--; // remove unused timing slots
for (int i=0; i<=last; i++) {
Rprintf(_("Timing block %2d%s = %8.3f %8d\n"), i, (i>=17&&i<=19)?"(*)":" ", tblock[i], nblock[i]);
}
for (int i=0; i<=256; i++) if (stat[i])
Rprintf("stat[%03d]==%20"PRIu64"\n", i, (uint64_t)stat[i]); // # notranslate
}
#endif
return ans;
}
static bool sort_ugrp(uint8_t *x, const int n)
// x contains n unique bytes; sort them in-place ascending with an insertion sort.
// desc and nalast were already handled in WRITE_KEY because columns may cross byte boundaries, so ascending is all that's needed here.
// n can reach 256, which overflows uint8_t, hence int n.
{
  bool already_sorted = true;  // caller can skip any reordering when the input was sorted on arrival
  for (int i=1; i<n; i++) {
    const uint8_t cur = x[i];
    if (cur > x[i-1]) continue;  // strictly greater suffices: equal bytes can't occur since x holds uniques
    already_sorted = false;
    // shift the larger prefix entries right until cur's slot is found
    int pos = i;
    while (pos > 0 && x[pos-1] > cur) {
      x[pos] = x[pos-1];
      pos--;
    }
    x[pos] = cur;
  }
  return already_sorted;
}
void radix_r(const int from, const int to, int radix) {
for (;;) {
TBEG();
const int my_n = to-from+1;
if (my_n==1) { // minor TODO: batch up the 1's instead in caller (and that's only needed when retgrp anyway)
push(&my_n, 1);
TEND(5);
return;
}
else if (my_n<=256) {
// if nth==1
// Rprintf(_("insert clause: radix=%d, my_n=%d, from=%d, to=%d\n"), radix, my_n, from, to);
// insert sort with some twists:
// i) detects if grouped; if sortType==0 can then skip
// ii) keeps group appearance order at byte level to minimize movement
#ifdef TIMING_ON
// #pragma omp atomic // turn on manually as the atomic affects timings
// stat[my_n]++;
#endif
uint8_t *restrict my_key = key[radix]+from; // safe to write as we don't use this radix again
uint8_t *o = malloc(sizeof(*o) * my_n);
if (!o)
STOP(_("Failed to allocate %d bytes for '%s'."), (int)(my_n * sizeof(uint8_t)), "o"); // # nocov
// if last key (i.e. radix+1==nradix) there are no more keys to reorder so we could reorder osub by reference directly and save allocating and populating o just
// to use it once. However, o's type is uint8_t so many moves within this max-256 vector should be faster than many moves in osub (4 byte or 8 byte ints) [1 byte
// type is always aligned]
bool skip = true;
if (sortType!=0) {
// always ascending as desc (sortType==-1) was dealt with in WRITE_KEY
int start = 1;
while (start<my_n && my_key[start]>=my_key[start-1]) start++;
if (start<my_n) {
skip = false; // finding start is really just to take skip out of the loop below
for (int i=0; i<start; i++) o[i]=i; // always at least sets o[0]=0
for (int i=start; i<my_n; i++) {
uint8_t ktmp = my_key[i];
int j=i-1;
while (j>=0 && ktmp<my_key[j]) {
my_key[j+1] = my_key[j];
o[j+1] = o[j];
j--;
}
my_key[j+1] = ktmp; // redundant write when while() did nothing, but that's unlikely given the common case of pre-ordered is handled by skip==true
o[j+1] = i; // important to initialize o[] even when while() did nothing.
}
}
TEND(6)
} else {
// sortType==0; retain group appearance order to hopefully benefit from skip, but there is a still a sort afterwards to get back to appearance order
// because columns are split into multiple bytes: retaining the byte appearance order within key-byte is not the same as retaining the order of
// unique values within each by= column. In other words, retaining group order at byte level here does not affect correctness, just efficiency.
int second = 1;
while (second<my_n && my_key[second]==my_key[0]) second++; // look for the second different byte value
int third = second+1;
while (third<my_n && my_key[third]==my_key[second]) third++; // look for the last of the second value (which might be a repeat of the first)
if (third<my_n) {
// it's not either i) all one value (xxxxx), or ii) two values (xxyyy). It could be xxyyyx.. or xxyyyz.. It's likely worth allocating seen[] now.
bool seen[256]={false};
seen[my_key[0]] = seen[my_key[second]] = true; // first two different bytes have been seen
for(; third<my_n; third++) {
uint8_t ktmp = my_key[third];
if (ktmp==my_key[third-1]) continue;
if (!seen[ktmp]) { seen[ktmp]=true; continue; }
// else different byte but we've seen it before, so this my_n is not grouped (e.g. xxyyyx)
skip = false;
break;
}
if (!skip) {
// and now it's worth populating o[]
for (int i=0; i<third; i++) o[i] = i;
for (int i=third; i<my_n; i++) {
uint8_t ktmp = my_key[i];
if (!seen[ktmp]) { seen[ktmp]=true; o[i]=i; continue; }
// move this byte that we've seen before back to just after the last one to group them
int j = i-1;
while (j>=0 && ktmp!=my_key[j]) {
my_key[j+1] = my_key[j];
o[j+1] = o[j];
j--;
}
my_key[j+1] = ktmp; // redundant write if my_key[i]==my_key[i-1] and while() did nothing (but that's ok as that's at least some writing since skip==false)
o[j+1] = i; // important to initialize o[] even if while() did nothing
}
}
}
TEND(7)
}
if (!skip) {
// reorder osub and each remaining ksub
int *TMP = malloc(sizeof(*TMP) * my_n);
if (!TMP) {