diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 96a85eddaa..08858e7e5d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4538,7 +4538,7 @@ ll <- list(data.table(x=1, y=-1, x=-2), data.table(y=10, y=20, y=30, x=-10, a="a test(1288.1, rbindlist(ll, use.names=TRUE, fill=FALSE), error = "Item 2 has 7 columns, inconsistent with item 1 which has 3 columns") # modified after fixing #725 test(1288.2, rbindlist(ll, use.names=TRUE, fill=TRUE), - data.table(x=c(1,-10), y=c(-1,10), x=c(-2, NA), y=c(NA,20), y=c(NA,30), a=c(NA, "a"), b=c(NA, Inf), c=factor(c(NA, 1)))) + data.table(x=c(1,-10), x=c(-2, NA), y=c(-1,10), y=c(NA,20), y=c(NA,30), a=c(NA, "a"), b=c(NA, Inf), c=factor(c(NA, 1)))) # check the name of output are consistent when binding two empty dts with one empy and other non-empty dt dt1 <- data.table(x=1:5, y=6:10) @@ -12350,6 +12350,11 @@ setDF(DT) test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'), error = "must be a data.table") +# appearance order of two low-cardinality columns that were squashed in pr#3124 +DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's) +test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L)) + + ################################### # Add new tests above this line # ################################### diff --git a/src/forder.c b/src/forder.c index 33af8deb3f..b0c0d22dbe 100644 --- a/src/forder.c +++ b/src/forder.c @@ -394,9 +394,8 @@ static void count_group(const uint8_t *x, const int n, bool *out_skip, uint16_t for (int i=0; i 2) error("Must be 2, 1 or 0"); dround = INTEGER(droundArg)[0]; - dmask1 = dround ? 1 << (8*dround-1) : 0; - dmask2 = 0xffffffffffffffff << dround*8; + dmask = dround ? 1 << (8*dround-1) : 0; return R_NilValue; } @@ -424,29 +422,20 @@ int getNumericRounding_C() // for floating point finite you have to flip the other bits too if it was signed: http://stereopsis.com/radix.html uint64_t dtwiddle(void *p, int i) { - union { // inside for thread-safety + union { double d; - uint64_t ull; - } u; + uint64_t u64; + } u; // local for thread safety u.d = ((double *)p)[i]; if (R_FINITE(u.d)) { - u.ull = (u.d) ? u.ull + ((u.ull & dmask1) << 1) : 0; // handle 0, -0 case. Fix for issues/743. - // tested on vector length 100e6. was the fastest fix (see results at the bottom of page) - } else if (ISNAN(u.d)) { - /* 1. NA twiddled to all bits 0, sorts first. R's value 1954 cleared. - 2. NaN twiddled to set just bit 13, sorts immediately after NA. 13th bit to be - consistent with "quiet" na bit but any bit outside last 2 bytes would do. - (ref r-devel post: http://r.789695.n4.nabble.com/Question-re-NA-NaNs-in-R-td4685014.html) - 3. This also normalises a difference between NA on 32bit R (bit 13 set) and 64bit R (bit 13 not set) - 4. -Inf twiddled to : 0 sign, exponent all 0, mantissa all 1, sorts after NaN - 5. +Inf twiddled to : 1 sign, exponent all 1, mantissa all 0, sorts last since finite - numbers are defined by not-all-1 in exponent */ - u.ull = (ISNA(u.d) ? 0 : (1ULL << 51)); - return u.ull; - //return (nalast == 1 ? ~u.ull : u.ull); + if (u.d==0) u.d=0; // changes -0.0 to 0.0, issue #743 + u.u64 ^= (u.u64 & 0x8000000000000000) ? 0xffffffffffffffff : 0x8000000000000000; // always flip sign bit and if negative (sign bit was set) flip other bits too + u.u64 += (u.u64 & dmask) << 1; // when dround==1|2, if 8th|16th bit is set, round up before chopping last 1|2 bytes + return u.u64 >> (dround*8); } - uint64_t mask = (u.ull & 0x8000000000000000) ? 0xffffffffffffffff : 0x8000000000000000; // always flip sign bit and if negative (sign bit was set) flip other bits too - return (u.ull ^ mask) & dmask2; // TODO: why not shift right to reduce range. + if (ISNAN(u.d)) return ISNA(u.d) ? 0 /*NA*/ : 1 /*NaN*/; // also normalises a difference between NA on 32bit R (bit 13 set) and 64bit R (bit 13 not set) + if (isinf(u.d)) return signbit(u.d) ? 2 /*-Inf*/ : (0xffffffffffffffff>>(dround*8)) /*+Inf*/; + Error("Unknown non-finite value; not NA, NaN, -Inf or +Inf"); } SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg) @@ -521,7 +510,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP SEXP x = VECTOR_ELT(DT,INTEGER(by)[col]-1); uint64_t min, max; // min and max of non-NA values int na_count=0; - if (sort) sort=INTEGER(orderArg)[col]; // +1 or -1 + if (sort) sort=INTEGER(orderArg)[col]; // +1(asc) -1(desc) 0(first-appearance) //Rprintf("sort = %d\n", sort); switch(TYPEOF(x)) { case INTSXP : case LGLSXP : // TODO skip LGL and assume range [0,1] @@ -562,9 +551,17 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP if (spare==0) { spare = 8-firstBits; // left align to byte boundary to get better first split. } else { - if (spare >= firstBits) { - spare -= firstBits; // new spare is also how many bits to left shift - } else { + if (sort==0) { + // can't squash into previous spare as that will change first-appearance order within that radix-byte + // found thanks to test 1246.55 with seed 1540402216L and added new direct test 1954 + spare = 8-firstBits; + nradix++; + } + else if (spare >= firstBits) { + // easiest case. just left shift a few bits within these nbyte to fill the spare gap + spare -= firstBits; // new spare is also how many bits to now left shift + } + else { if (nbyte<8) { spare += 8-firstBits; nbyte++; // after shift, will need an extra byte @@ -592,8 +589,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP } if (isReal) { min2--; max2++; } // -Inf and +Inf in min-1 and max+1 spots respectively - const uint64_t naval = ((nalast==1) == (sort==1)) ? max+1+isReal*2 : min-1-isReal*2; - const uint64_t nanval = ((nalast==1) == (sort==1)) ? max+2 : min-2; // only used when isReal + const uint64_t naval = ((nalast==1) == (sort>=0)) ? max+1+isReal*2 : min-1-isReal*2; + const uint64_t nanval = ((nalast==1) == (sort>=0)) ? max+2 : min-2; // only used when isReal // several columns could squash into 1 byte. due to this bit squashing is why we deal // with asc|desc here, otherwise it could be done in the ugrp sorting by reversing the ugrp insert sort @@ -695,6 +692,13 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP char *TMP = malloc(n*sizeof(int)); for (int radix=0; radix