From e14f791d689239f1fcc1db207235991ce115c322 Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sat, 31 Jan 2026 21:41:02 +0100 Subject: [PATCH 01/17] add factor --- NAMESPACE | 4 ++ NEWS.md | 1 + R/integer64.R | 77 ++++++++++++++++++++++++++++++++- man/as.character.integer64.Rd | 6 +++ man/factor.Rd | 47 ++++++++++++++++++++ tests/testthat/test-integer64.R | 34 +++++++++++++++ 6 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 man/factor.Rd diff --git a/NAMESPACE b/NAMESPACE index 04bd7bd8..aad1dc30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -207,6 +207,7 @@ export(as.bitstring.integer64) export(as.character.integer64) export(as.data.frame.integer64) export(as.double.integer64) +export(as.factor) export(as.integer.integer64) export(as.integer64) export(as.integer64.NULL) @@ -219,6 +220,7 @@ export(as.integer64.integer64) export(as.integer64.logical) export(as.list.integer64) export(as.logical.integer64) +export(as.ordered) export(benchmark64) export(binattr) export(c.integer64) @@ -227,6 +229,7 @@ export(cbind.integer64) export(colSums) export(diff.integer64) export(duplicated.integer64) +export(factor) export(format.integer64) export(getcache) export(hashcache) @@ -297,6 +300,7 @@ export(order.integer64) export(ordercache) export(orderdup) export(orderdup.integer64) +export(ordered) export(orderfin) export(orderfin.integer64) export(orderkey) diff --git a/NEWS.md b/NEWS.md index 570fb462..5c910dd5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -61,6 +61,7 @@ - Ignores leading/trailing whitespace (as does `as.integer()`; #232). 1. `sortcache`, `sortordercache` and `ordercache` get a new argument `na.last`. 1. `matrix`, `array`, `%*%` and `as.matrix` get an `integer64` method (#45). Thanks @hcirellu. +1. `factor`, `as.factor`, `ordered`, and `as.ordered` support `integer64` input correctly, i.e. the levels are sorted according to `integer64` values. Thanks @hcirellu. ## BUG FIXES diff --git a/R/integer64.R b/R/integer64.R index 6faf0c42..26a1d346 100644 --- a/R/integer64.R +++ b/R/integer64.R @@ -2,7 +2,7 @@ # R-Code # S3 atomic 64bit integers for R # (c) 2011-2024 Jens Oehlschägel -# (c) 2025 Michael Chirico +# (c) 2025-2026 Michael Chirico # Licence: GPL2 # Provided 'as is', use at your own risk # Created: 2011-12-11 @@ -392,6 +392,27 @@ NULL #' @name all.equal.integer64 NULL +#' Factors +#' +#' The function [factor] is used to encode a vector as a factor. +#' +#' @param x a vector of data, usually taking a small number of distinct values. +#' @param levels an optional vector of the unique values (as character strings) that x might have taken. The default is the unique set of values taken by as.character(x), sorted into increasing order of x. Note that this set can be specified as smaller than sort(unique(x)). +#' @param labels either an optional character vector of labels for the levels (in the same order as levels after removing those in exclude), or a character string of length 1. Duplicated values in labels can be used to map different values of x to the same factor level. +#' @param exclude a vector of values to be excluded when forming the set of levels. This may be factor with the same level set as x or should be a character. +#' @param ordered logical flag to determine if the levels should be regarded as ordered (in the order given). +#' @param nmax an upper bound on the number of levels. +#' @param ... (in ordered(.)): any of the above, apart from ordered itself. +#' +#' @return An object of class "factor" or "ordered". +#' @seealso [factor][base::factor] +#' @examples +#' x <- as.integer64(c(132724613L, -2143220989L, -1L, NA, 1L)) +#' factor(x) +#' ordered(x) +#' @name factor +NULL + methods::setOldClass("integer64") # contributed by Leonardo Silvestri with modifications of JO @@ -792,6 +813,60 @@ as.POSIXct.integer64 = function(x, tz="", origin, ...) as.POSIXlt.integer64 = function(x, tz="", origin, ...) as.POSIXlt(as.double(x, ...), tz=tz, origin=origin, ...) +#' @rdname as.character.integer64 +#' @export as.factor +as.factor = function(x) factor(x) + +#' @rdname as.character.integer64 +#' @export as.ordered +as.ordered = function(x) ordered(x) + +#' @rdname factor +#' @export factor +factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.ordered(x), nmax=NA) { + if (!is.integer64(x)) { + sys_call = sys.call() + sys_call[[1L]] = base::factor + pf = parent.frame() + return(withCallingHandlers_and_choose_call(eval(sys_call, envir=pf), c("factor"))) + } + + nx = names(x) + if (missing(levels)) { + levels = sort(unique(x)) + } else { + levels = as.integer64(levels) + } + # basically copied from base::factor + force(ordered) + levels = levels[is.na(match(levels, exclude))] + ret = match(x, levels) + if (!is.null(nx)) + names(ret) = nx + if (missing(labels)) { + levels(ret) = as.character(levels) + } else { + nlab = length(labels) + if (nlab == length(levels)) { + nlevs = unique(xlevs <- as.character(labels)) + at = attributes(ret) + at$levels = nlevs + ret = match(xlevs, nlevs)[ret] + attributes(ret) = at + } else if (nlab == 1L) { + levels(ret) = paste0(labels, seq_along(levels)) + } else { + stop(gettextf("invalid 'labels'; length %d should be 1 or %d", nlab, length(levels), domain="R-base"), domain=NA) + } + } + class(ret) <- c(if (ordered) "ordered", "factor") + ret +} + +#' @rdname factor +#' @export ordered +ordered = function(x=character(), ...) factor(x, ..., ordered=TRUE) + #' @rdname as.character.integer64 #' @export print.bitstring = function(x, ...) { diff --git a/man/as.character.integer64.Rd b/man/as.character.integer64.Rd index 367fcc43..769ce956 100644 --- a/man/as.character.integer64.Rd +++ b/man/as.character.integer64.Rd @@ -13,6 +13,8 @@ \alias{as.Date.integer64} \alias{as.POSIXct.integer64} \alias{as.POSIXlt.integer64} +\alias{as.factor} +\alias{as.ordered} \alias{print.bitstring} \alias{as.list.integer64} \title{Coerce from integer64} @@ -41,6 +43,10 @@ as.bitstring(x, ...) \method{as.POSIXlt}{integer64}(x, tz = "", origin, ...) +as.factor(x) + +as.ordered(x) + \method{print}{bitstring}(x, ...) \method{as.list}{integer64}(x, ...) diff --git a/man/factor.Rd b/man/factor.Rd new file mode 100644 index 00000000..057fd81d --- /dev/null +++ b/man/factor.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integer64.R +\name{factor} +\alias{factor} +\alias{ordered} +\title{Factors} +\usage{ +factor( + x = character(), + levels, + labels = levels, + exclude = NA, + ordered = is.ordered(x), + nmax = NA +) + +ordered(x = character(), ...) +} +\arguments{ +\item{x}{a vector of data, usually taking a small number of distinct values.} + +\item{levels}{an optional vector of the unique values (as character strings) that x might have taken. The default is the unique set of values taken by as.character(x), sorted into increasing order of x. Note that this set can be specified as smaller than sort(unique(x)).} + +\item{labels}{either an optional character vector of labels for the levels (in the same order as levels after removing those in exclude), or a character string of length 1. Duplicated values in labels can be used to map different values of x to the same factor level.} + +\item{exclude}{a vector of values to be excluded when forming the set of levels. This may be factor with the same level set as x or should be a character.} + +\item{ordered}{logical flag to determine if the levels should be regarded as ordered (in the order given).} + +\item{nmax}{an upper bound on the number of levels.} + +\item{...}{(in ordered(.)): any of the above, apart from ordered itself.} +} +\value{ +An object of class "factor" or "ordered". +} +\description{ +The function \link{factor} is used to encode a vector as a factor. +} +\examples{ + x <- as.integer64(c(132724613L, -2143220989L, -1L, NA, 1L)) + factor(x) + ordered(x) +} +\seealso{ +\link[base:factor]{factor} +} diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index d483ae3a..6eca321f 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -826,3 +826,37 @@ test_that("match works with zero length input", { expect_identical(match(x64, integer(), nomatch=10L), match(x32, integer(), nomatch=10L)) expect_identical(match(integer(), x64), match(integer(), x32)) }) + +test_that("factor and order for integer64 are still necessary", { + # make sure that factor and order for integer64 are still necessary + x = c(132724613L, -2143220989L, -1L, NA, 1L) + expect_failure(expect_identical(base::factor(as.integer64(x)), base::factor(x))) + expect_failure(expect_identical(factor(as.integer64(x)), base::factor(as.integer64(x)))) + expect_identical(factor(as.integer64(x)), base::factor(x)) + expect_failure(expect_identical(base::ordered(as.integer64(x)), base::ordered(x))) + expect_failure(expect_identical(ordered(as.integer64(x)), base::ordered(as.integer64(x)))) + expect_identical(ordered(as.integer64(x)), base::ordered(x)) +}) + +with_parameters_test_that("factor and order work analogously to integer:", { + x = c(132724613L, -2143220989L, -1L, NA, 1L) + + expect_identical(factor(as.integer64(x)), factor(x)) + + expect_identical( + tryCatch(factor(as.integer64(x), levels=levels, labels=labels, exclude=exclude, ordered=ordered), error=conditionMessage), + tryCatch(factor(x, levels=levels, labels=labels, exclude=exclude, ordered=ordered), error=conditionMessage) + ) + if (isTRUE(ordered)) + expect_identical( + tryCatch(ordered(as.integer64(x), levels=levels, labels=labels, exclude=exclude), error=conditionMessage), + tryCatch(ordered(x, levels=levels, labels=labels, exclude=exclude), error=conditionMessage) + ) + }, + .cases = expand.grid( + levels=I(list(NULL, NA, 1L, c(-1L, 1L), "1")), + labels=I(list(levels, NULL, letters[1L], letters[1:2])), + exclude=I(list(NULL, NA, 1L, c(-1L, 1L))), + ordered=c(TRUE, FALSE) + ) +) From 1437f94bbe73db7165f5a66db9cfee8d1d89c3fa Mon Sep 17 00:00:00 2001 From: hcirellu Date: Mon, 2 Feb 2026 09:22:36 +0100 Subject: [PATCH 02/17] skip test in ancient could not find function "expect_failure" in test-ancient --- tests/testthat/test-integer64.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index 6eca321f..bce2d33a 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -828,6 +828,7 @@ test_that("match works with zero length input", { }) test_that("factor and order for integer64 are still necessary", { + skip_unless_r(">= 4.0.0") # could not find function "expect_failure" in test-ancient # make sure that factor and order for integer64 are still necessary x = c(132724613L, -2143220989L, -1L, NA, 1L) expect_failure(expect_identical(base::factor(as.integer64(x)), base::factor(x))) From 6a60f3f37b2b636f69ff603a1cd6f63cdf632a3f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 20 Feb 2026 00:19:00 +0800 Subject: [PATCH 03/17] style: drop c() for length-1 literal --- R/integer64.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/integer64.R b/R/integer64.R index 26a1d346..8b9c4b19 100644 --- a/R/integer64.R +++ b/R/integer64.R @@ -828,7 +828,7 @@ factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.o sys_call = sys.call() sys_call[[1L]] = base::factor pf = parent.frame() - return(withCallingHandlers_and_choose_call(eval(sys_call, envir=pf), c("factor"))) + return(withCallingHandlers_and_choose_call(eval(sys_call, envir=pf), "factor")) } nx = names(x) From ae2c6f878afca4530888345157d47fd3577f546d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 20 Feb 2026 00:22:11 +0800 Subject: [PATCH 04/17] style: pull out implicit assignment --- R/integer64.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/integer64.R b/R/integer64.R index 8b9c4b19..8ab4d1a5 100644 --- a/R/integer64.R +++ b/R/integer64.R @@ -848,7 +848,8 @@ factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.o } else { nlab = length(labels) if (nlab == length(levels)) { - nlevs = unique(xlevs <- as.character(labels)) + xlevs = as.character(labels) + nlevs = unique(xlevs) at = attributes(ret) at$levels = nlevs ret = match(xlevs, nlevs)[ret] From afb98eea370b8232664013dce99fcdf22c0bd125 Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sat, 21 Feb 2026 10:16:51 +0100 Subject: [PATCH 05/17] inheritParams and skip_on_cran --- R/integer64.R | 13 ++++--------- man/factor.Rd | 25 +++++++++++++++++++------ tests/testthat/test-integer64.R | 1 + 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/R/integer64.R b/R/integer64.R index 8ab4d1a5..423bbc1c 100644 --- a/R/integer64.R +++ b/R/integer64.R @@ -395,14 +395,9 @@ NULL #' Factors #' #' The function [factor] is used to encode a vector as a factor. -#' -#' @param x a vector of data, usually taking a small number of distinct values. -#' @param levels an optional vector of the unique values (as character strings) that x might have taken. The default is the unique set of values taken by as.character(x), sorted into increasing order of x. Note that this set can be specified as smaller than sort(unique(x)). -#' @param labels either an optional character vector of labels for the levels (in the same order as levels after removing those in exclude), or a character string of length 1. Duplicated values in labels can be used to map different values of x to the same factor level. -#' @param exclude a vector of values to be excluded when forming the set of levels. This may be factor with the same level set as x or should be a character. -#' @param ordered logical flag to determine if the levels should be regarded as ordered (in the order given). +#' +#' @inheritParams base::factor #' @param nmax an upper bound on the number of levels. -#' @param ... (in ordered(.)): any of the above, apart from ordered itself. #' #' @return An object of class "factor" or "ordered". #' @seealso [factor][base::factor] @@ -822,7 +817,7 @@ as.factor = function(x) factor(x) as.ordered = function(x) ordered(x) #' @rdname factor -#' @export factor +#' @export factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.ordered(x), nmax=NA) { if (!is.integer64(x)) { sys_call = sys.call() @@ -865,7 +860,7 @@ factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.o } #' @rdname factor -#' @export ordered +#' @export ordered = function(x=character(), ...) factor(x, ..., ordered=TRUE) #' @rdname as.character.integer64 diff --git a/man/factor.Rd b/man/factor.Rd index 057fd81d..dfcae56f 100644 --- a/man/factor.Rd +++ b/man/factor.Rd @@ -17,19 +17,32 @@ factor( ordered(x = character(), ...) } \arguments{ -\item{x}{a vector of data, usually taking a small number of distinct values.} +\item{x}{a vector of data, usually taking a small number of distinct + values.} -\item{levels}{an optional vector of the unique values (as character strings) that x might have taken. The default is the unique set of values taken by as.character(x), sorted into increasing order of x. Note that this set can be specified as smaller than sort(unique(x)).} +\item{levels}{an optional vector of the unique values (as character strings) + that \code{x} might have taken. The default is the unique set of + values taken by \code{\link[base]{as.character}(x)}, sorted into + increasing order \emph{of \code{x}}. Note that this set can be + specified as smaller than \code{sort(unique(x))}.} -\item{labels}{either an optional character vector of labels for the levels (in the same order as levels after removing those in exclude), or a character string of length 1. Duplicated values in labels can be used to map different values of x to the same factor level.} +\item{labels}{\emph{either} an optional character vector of + labels for the levels (in the same order as \code{levels} after + removing those in \code{exclude}), \emph{or} a character string of + length 1. Duplicated values in \code{labels} can be used to map + different values of \code{x} to the same factor level.} -\item{exclude}{a vector of values to be excluded when forming the set of levels. This may be factor with the same level set as x or should be a character.} +\item{exclude}{a vector of values to be excluded when forming the + set of levels. This may be factor with the same level set as \code{x} + or should be a \code{character}.} -\item{ordered}{logical flag to determine if the levels should be regarded as ordered (in the order given).} +\item{ordered}{logical flag to determine if the levels should be regarded + as ordered (in the order given).} \item{nmax}{an upper bound on the number of levels.} -\item{...}{(in ordered(.)): any of the above, apart from ordered itself.} +\item{...}{(in \code{ordered(.)}): any of the above, apart from + \code{ordered} itself.} } \value{ An object of class "factor" or "ordered". diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index bce2d33a..0f23c79b 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -828,6 +828,7 @@ test_that("match works with zero length input", { }) test_that("factor and order for integer64 are still necessary", { + skip_on_cran() skip_unless_r(">= 4.0.0") # could not find function "expect_failure" in test-ancient # make sure that factor and order for integer64 are still necessary x = c(132724613L, -2143220989L, -1L, NA, 1L) From 16864596861d6b57312f7e6c8dd77f17fe04ae84 Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sat, 21 Feb 2026 10:51:39 +0100 Subject: [PATCH 06/17] move skip_on_cran a line lower --- tests/testthat/test-integer64.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index 0f23c79b..b8d859e4 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -828,8 +828,8 @@ test_that("match works with zero length input", { }) test_that("factor and order for integer64 are still necessary", { - skip_on_cran() skip_unless_r(">= 4.0.0") # could not find function "expect_failure" in test-ancient + skip_on_cran() # make sure that factor and order for integer64 are still necessary x = c(132724613L, -2143220989L, -1L, NA, 1L) expect_failure(expect_identical(base::factor(as.integer64(x)), base::factor(x))) From d86d0445f9dbac36f5beb8f4fa170c8cceb0d8da Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sat, 21 Feb 2026 20:14:25 +0100 Subject: [PATCH 07/17] add factor in optimizer64 --- R/highlevel64.R | 1672 ++++++++++++++++++++++++----------------------- 1 file changed, 862 insertions(+), 810 deletions(-) diff --git a/R/highlevel64.R b/R/highlevel64.R index 1f5845be..0ea75108 100644 --- a/R/highlevel64.R +++ b/R/highlevel64.R @@ -400,841 +400,893 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { #' without caching #' @export optimizer64 = function(nsmall=2L^16L, - nbig=2L^25L, - timefun=repeat.time, - what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile"), - uniorder=c("original", "values", "any"), - taborder=c("values", "counts"), - plot=TRUE) { - uniorder = match.arg(uniorder) - taborder = match.arg(taborder) - ret = vector("list", 2L*length(what)) - dim(ret) <- c(length(what), 2L) - dimnames(ret) <- list(what, c(nsmall, nbig)) - - if (plot) { - oldpar = par(no.readonly = TRUE) - on.exit(par(oldpar)) - par(mfrow=c(2L, 1L)) - } + nbig=2L^25L, + timefun=repeat.time, + what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile"), + uniorder=c("original", "values", "any"), + taborder=c("values", "counts"), + plot=TRUE) { + uniorder = match.arg(uniorder) + taborder = match.arg(taborder) + ret = vector("list", 2L*length(what)) + dim(ret) <- c(length(what), 2L) + dimnames(ret) <- list(what, c(nsmall, nbig)) + + if (plot) { + oldpar = par(no.readonly = TRUE) + on.exit(par(oldpar)) + par(mfrow=c(2L, 1L)) + } - if ("match" %in% what) { - message("match: timings of different methods") - N1 = c(nsmall, nbig) - N2 = c(nbig, nsmall) - for (i in seq_along(N1)) { - n1 = N1[i] - n2 = N2[i] - x1 = c(sample(n2, n1-1L, TRUE), NA) - x2 = c(sample(n2, n2-1L, TRUE), NA) - tim = matrix(0.0, 9L, 3L) - dimnames(tim) <- list( - c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. - c("prep", "both", "use") - ) + if ("match" %in% what) { + message("match: timings of different methods") + N1 = c(nsmall, nbig) + N2 = c(nbig, nsmall) + for (i in seq_along(N1)) { + n1 = N1[i] + n2 = N2[i] + x1 = c(sample(n2, n1 - 1L, TRUE), NA) + x2 = c(sample(n2, n2 - 1L, TRUE), NA) + tim = matrix(0.0, 9L, 3L) + dimnames(tim) <- list( + c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("prep", "both", "use") + ) - tim["match", "both"] <- timefun({ - p = match(x1, x2) - })[3L] - x1 = as.integer64(x1) - x2 = as.integer64(x2) - - tim["match.64", "both"] <- timefun({ - p2 = match.integer64(x1, x2) - })[3L] - stopifnot(identical(p2, p)) - - tim["hashpos", "prep"] <- timefun({ - h2 = hashmap(x2) - })[3L] - tim["hashpos", "use"] <- timefun({ - p2 = hashpos(h2, x1) - })[3L] - stopifnot(identical(p2, p)) - - tim["hashrev", "prep"] <- timefun({ - h1 = hashmap(x1) - })[3L] - tim["hashrev", "use"] <- timefun({ - p1 = hashrev(h1, x2) - })[3L] - stopifnot(identical(p1, p)) - - tim["sortorderpos", "prep"] <- system.time({ - s2 = clone(x2) - o2 = seq_along(x2) - ramsortorder(s2, o2, na.last=FALSE) - })[3L] - tim["sortorderpos", "use"] <- timefun({ - p2 = sortorderpos(s2, o2, x1) - })[3L] - stopifnot(identical(p2, p)) - - tim["orderpos", "prep"] <- timefun({ - o2 = seq_along(x2) - ramorder(x2, o2, na.last=FALSE) - })[3L] - tim["orderpos", "use"] <- timefun({ - p2 = orderpos(x2, o2, x1, method=2L) - })[3L] - stopifnot(identical(p2, p)) - - hashcache(x2) - tim["hashcache", "use"] <- timefun({ - p2 = match.integer64(x1, x2) - })[3L] - stopifnot(identical(p2, p)) - remcache(x2) - - sortordercache(x2) - tim["sortorder.cache", "use"] <- timefun({ - p2 = match.integer64(x1, x2) - })[3L] - stopifnot(identical(p2, p)) - remcache(x2) - - ordercache(x2) - tim["order.cache", "use"] <- timefun({ - p2 = match.integer64(x1, x2) - })[3L] - stopifnot(identical(p2, p)) - remcache(x2) - - if (plot) { - barplot(t(tim)) - n = format(c(n1, n2)) - title(paste("match", n[1L], "in", n[2L])) - } + tim["match", "both"] <- timefun({ + p = match(x1, x2) + })[3L] + x1 = as.integer64(x1) + x2 = as.integer64(x2) - ret[["match", as.character(n1)]] <- tim - } - } + tim["match.64", "both"] <- timefun({ + p2 = match.integer64(x1, x2) + })[3L] + stopifnot(identical(p2, p)) - if ("%in%" %in% what) { - message("%in%: timings of different methods") - N1 = c(nsmall, nbig) - N2 = c(nbig, nsmall) - for (i in seq_along(N1)) { - n1 = N1[i] - n2 = N2[i] - x1 = c(sample(n2, n1-1L, TRUE), NA) - x2 = c(sample(n2, n2-1L, TRUE), NA) - tim = matrix(0.0, 10L, 3L) - dimnames(tim) <- list( - c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. - c("prep", "both", "use") - ) + tim["hashpos", "prep"] <- timefun({ + h2 = hashmap(x2) + })[3L] + tim["hashpos", "use"] <- timefun({ + p2 = hashpos(h2, x1) + })[3L] + stopifnot(identical(p2, p)) - tim["%in%", "both"] <- timefun({ - p = x1 %in% x2 - })[3L] - x1 = as.integer64(x1) - x2 = as.integer64(x2) - - tim["match.64", "both"] <- timefun({ - p2 = match.integer64(x1, x2, nomatch = 0L) > 0L - })[3L] - stopifnot(identical(p2, p)) - - tim["%in%.64", "both"] <- timefun({ - p2 = "%in%.integer64"(x1, x2) # this is using the custom version - })[3L] - stopifnot(identical(p2, p)) - - tim["hashfin", "prep"] <- timefun({ - h2 = hashmap(x2) - })[3L] - tim["hashfin", "use"] <- timefun({ - p2 = hashfin(h2, x1) - })[3L] - stopifnot(identical(p2, p)) - - tim["hashrin", "prep"] <- timefun({ - h1 = hashmap(x1) - })[3L] - tim["hashrin", "use"] <- timefun({ - p1 = hashrin(h1, x2) - })[3L] - stopifnot(identical(p2, p)) - - tim["sortfin", "prep"] <- timefun({ - s2 = clone(x2) - ramsort(s2, na.last=FALSE) - })[3L] - tim["sortfin", "use"] <- timefun({ - p2 = sortfin(s2, x1) - })[3L] - stopifnot(identical(p2, p)) - - tim["orderfin", "prep"] <- timefun({ - o2 = seq_along(x2) - ramorder(x2, o2, na.last=FALSE) - })[3L] - tim["orderfin", "use"] <- timefun({ - p2 = orderfin(x2, o2, x1) - })[3L] - stopifnot(identical(p2, p)) - - hashcache(x2) - tim["hash.cache", "use"] <- timefun({ - p2 = "%in%.integer64"(x1, x2) - })[3L] - stopifnot(identical(p2, p)) - remcache(x2) - - sortordercache(x2) - tim["sortorder.cache", "use"] <- timefun({ - p2 = "%in%.integer64"(x1, x2) - })[3L] - stopifnot(identical(p2, p)) - remcache(x2) - - ordercache(x2) - tim["order.cache", "use"] <- timefun({ - p2 = "%in%.integer64"(x1, x2) - })[3L] - stopifnot(identical(p2, p)) - remcache(x2) - - if (plot) { - barplot(t(tim)) - n = format(c(n1, n2)) - title(paste(n[1L], "%in%", n[2L])) - } + tim["hashrev", "prep"] <- timefun({ + h1 = hashmap(x1) + })[3L] + tim["hashrev", "use"] <- timefun({ + p1 = hashrev(h1, x2) + })[3L] + stopifnot(identical(p1, p)) + + tim["sortorderpos", "prep"] <- system.time({ + s2 = clone(x2) + o2 = seq_along(x2) + ramsortorder(s2, o2, na.last=FALSE) + })[3L] + tim["sortorderpos", "use"] <- timefun({ + p2 = sortorderpos(s2, o2, x1) + })[3L] + stopifnot(identical(p2, p)) + + tim["orderpos", "prep"] <- timefun({ + o2 = seq_along(x2) + ramorder(x2, o2, na.last=FALSE) + })[3L] + tim["orderpos", "use"] <- timefun({ + p2 = orderpos(x2, o2, x1, method=2L) + })[3L] + stopifnot(identical(p2, p)) - ret[["%in%", as.character(n1)]] <- tim + hashcache(x2) + tim["hashcache", "use"] <- timefun({ + p2 = match.integer64(x1, x2) + })[3L] + stopifnot(identical(p2, p)) + remcache(x2) + + sortordercache(x2) + tim["sortorder.cache", "use"] <- timefun({ + p2 = match.integer64(x1, x2) + })[3L] + stopifnot(identical(p2, p)) + remcache(x2) + + ordercache(x2) + tim["order.cache", "use"] <- timefun({ + p2 = match.integer64(x1, x2) + })[3L] + stopifnot(identical(p2, p)) + remcache(x2) + + if (plot) { + barplot(t(tim)) + n = format(c(n1, n2)) + title(paste("match", n[1L], "in", n[2L])) + } + + ret[["match", as.character(n1)]] <- tim + } } - } - if ("duplicated" %in% what) { - message("duplicated: timings of different methods") - N = c(nsmall, nbig) - for (i in seq_along(N)) { - n = N[i] - x = c(sample(n, n-1L, TRUE), NA) - tim = matrix(0.0, 10L, 3L) - dimnames(tim) <- list( - c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. - c("prep", "both", "use") - ) - tim["duplicated", "both"] <- timefun({ - p = duplicated(x) - })[3L] - x = as.integer64(x) - - tim["duplicated.64", "both"] <- timefun({ - p2 = duplicated(x) - })[3L] - stopifnot(identical(p2, p)) - - tim["hashdup", "prep"] <- timefun({ - h = hashmap(x) - })[3L] - tim["hashdup", "use"] <- timefun({ - p2 = hashdup(h) - })[3L] - stopifnot(identical(p2, p)) - - tim["sortorderdup1", "prep"] <- timefun({ - s = clone(x) - o = seq_along(x) - ramsortorder(s, o, na.last=FALSE) - nunique = sortnut(s)[1L] - })[3L] - tim["sortorderdup1", "use"] <- timefun({ - p2 = sortorderdup(s, o, method=1L) - })[3L] - stopifnot(identical(p2, p)) - - tim["sortorderdup2", "prep"] <- tim["sortorderdup1", "prep"] - tim["sortorderdup2", "use"] <- timefun({ - p2 = sortorderdup(s, o, method=2L) - })[3L] - stopifnot(identical(p2, p)) - - tim["orderdup1", "prep"] <- timefun({ - o = seq_along(x) - ramorder(x, o, na.last=FALSE) - nunique = ordernut(x, o)[1L] - })[3L] - tim["orderdup1", "use"] <- timefun({ - p2 = orderdup(x, o, method=1L) - })[3L] - stopifnot(identical(p2, p)) - - tim["orderdup2", "prep"] <- tim["orderdup1", "prep"] - tim["orderdup2", "use"] <- timefun({ - p2 = orderdup(x, o, method=2L) - })[3L] - stopifnot(identical(p2, p)) - - hashcache(x) - tim["hash.cache", "use"] <- timefun({ - p2 = duplicated(x) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - sortordercache(x) - tim["sortorder.cache", "use"] <- timefun({ - p2 = duplicated(x) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - ordercache(x) - tim["order.cache", "use"] <- timefun({ - p2 = duplicated(x) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - if (plot) { - barplot(t(tim), cex.names=0.7) - title(paste0("duplicated(", n, ")")) + if ("%in%" %in% what) { + message("%in%: timings of different methods") + N1 = c(nsmall, nbig) + N2 = c(nbig, nsmall) + for (i in seq_along(N1)) { + n1 = N1[i] + n2 = N2[i] + x1 = c(sample(n2, n1 - 1L, TRUE), NA) + x2 = c(sample(n2, n2 - 1L, TRUE), NA) + tim = matrix(0.0, 10L, 3L) + dimnames(tim) <- list( + c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("prep", "both", "use") + ) + + tim["%in%", "both"] <- timefun({ + p = x1 %in% x2 + })[3L] + x1 = as.integer64(x1) + x2 = as.integer64(x2) + + tim["match.64", "both"] <- timefun({ + p2 = match.integer64(x1, x2, nomatch = 0L) > 0L + })[3L] + stopifnot(identical(p2, p)) + + tim["%in%.64", "both"] <- timefun({ + p2 = "%in%.integer64"(x1, x2) # this is using the custom version + })[3L] + stopifnot(identical(p2, p)) + + tim["hashfin", "prep"] <- timefun({ + h2 = hashmap(x2) + })[3L] + tim["hashfin", "use"] <- timefun({ + p2 = hashfin(h2, x1) + })[3L] + stopifnot(identical(p2, p)) + + tim["hashrin", "prep"] <- timefun({ + h1 = hashmap(x1) + })[3L] + tim["hashrin", "use"] <- timefun({ + p1 = hashrin(h1, x2) + })[3L] + stopifnot(identical(p2, p)) + + tim["sortfin", "prep"] <- timefun({ + s2 = clone(x2) + ramsort(s2, na.last=FALSE) + })[3L] + tim["sortfin", "use"] <- timefun({ + p2 = sortfin(s2, x1) + })[3L] + stopifnot(identical(p2, p)) + + tim["orderfin", "prep"] <- timefun({ + o2 = seq_along(x2) + ramorder(x2, o2, na.last=FALSE) + })[3L] + tim["orderfin", "use"] <- timefun({ + p2 = orderfin(x2, o2, x1) + })[3L] + stopifnot(identical(p2, p)) + + hashcache(x2) + tim["hash.cache", "use"] <- timefun({ + p2 = "%in%.integer64"(x1, x2) + })[3L] + stopifnot(identical(p2, p)) + remcache(x2) + + sortordercache(x2) + tim["sortorder.cache", "use"] <- timefun({ + p2 = "%in%.integer64"(x1, x2) + })[3L] + stopifnot(identical(p2, p)) + remcache(x2) + + ordercache(x2) + tim["order.cache", "use"] <- timefun({ + p2 = "%in%.integer64"(x1, x2) + })[3L] + stopifnot(identical(p2, p)) + remcache(x2) + + if (plot) { + barplot(t(tim)) + n = format(c(n1, n2)) + title(paste(n[1L], "%in%", n[2L])) + } + + ret[["%in%", as.character(n1)]] <- tim } + } + if ("duplicated" %in% what) { + message("duplicated: timings of different methods") + N = c(nsmall, nbig) + for (i in seq_along(N)) { + n = N[i] + x = c(sample(n, n - 1L, TRUE), NA) + tim = matrix(0.0, 10L, 3L) + dimnames(tim) <- list( + c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("prep", "both", "use") + ) - ret[["duplicated", as.character(n)]] <- tim + tim["duplicated", "both"] <- timefun({ + p = duplicated(x) + })[3L] + x = as.integer64(x) + + tim["duplicated.64", "both"] <- timefun({ + p2 = duplicated(x) + })[3L] + stopifnot(identical(p2, p)) + + tim["hashdup", "prep"] <- timefun({ + h = hashmap(x) + })[3L] + tim["hashdup", "use"] <- timefun({ + p2 = hashdup(h) + })[3L] + stopifnot(identical(p2, p)) + + tim["sortorderdup1", "prep"] <- timefun({ + s = clone(x) + o = seq_along(x) + ramsortorder(s, o, na.last=FALSE) + nunique = sortnut(s)[1L] + })[3L] + tim["sortorderdup1", "use"] <- timefun({ + p2 = sortorderdup(s, o, method=1L) + })[3L] + stopifnot(identical(p2, p)) + + tim["sortorderdup2", "prep"] <- tim["sortorderdup1", "prep"] + tim["sortorderdup2", "use"] <- timefun({ + p2 = sortorderdup(s, o, method=2L) + })[3L] + stopifnot(identical(p2, p)) + + tim["orderdup1", "prep"] <- timefun({ + o = seq_along(x) + ramorder(x, o, na.last=FALSE) + nunique = ordernut(x, o)[1L] + })[3L] + tim["orderdup1", "use"] <- timefun({ + p2 = orderdup(x, o, method=1L) + })[3L] + stopifnot(identical(p2, p)) + + tim["orderdup2", "prep"] <- tim["orderdup1", "prep"] + tim["orderdup2", "use"] <- timefun({ + p2 = orderdup(x, o, method=2L) + })[3L] + stopifnot(identical(p2, p)) + + hashcache(x) + tim["hash.cache", "use"] <- timefun({ + p2 = duplicated(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + sortordercache(x) + tim["sortorder.cache", "use"] <- timefun({ + p2 = duplicated(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + ordercache(x) + tim["order.cache", "use"] <- timefun({ + p2 = duplicated(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + if (plot) { + barplot(t(tim), cex.names=0.7) + title(paste0("duplicated(", n, ")")) + } + + ret[["duplicated", as.character(n)]] <- tim + } } - } - if ("unique" %in% what) { - message("unique: timings of different methods") - N = c(nsmall, nbig) - for (i in seq_along(N)) { - n = N[i] - x = c(sample(n, n-1L, TRUE), NA) - tim = matrix(0.0, 15L, 3L) - dimnames(tim) <- list( - c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. - c("prep", "both", "use") - ) + if ("unique" %in% what) { + message("unique: timings of different methods") + N = c(nsmall, nbig) + for (i in seq_along(N)) { + n = N[i] + x = c(sample(n, n - 1L, TRUE), NA) + tim = matrix(0.0, 15L, 3L) + dimnames(tim) <- list( + c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("prep", "both", "use") + ) - tim["unique", "both"] <- timefun({ - p = unique(x) - })[3L] - x = as.integer64(x) - p = as.integer64(p) - if (uniorder=="values") - ramsort(p, na.last=FALSE) - - tim["unique.64", "both"] <- timefun({ - p2 = unique(x, order=uniorder) - })[3L] - if (uniorder!="any") - stopifnot(identical.integer64(p2, p)) - - tim["hashmapuni", "both"] <- timefun({ - p2 = hashmapuni(x) - })[3L] - if (uniorder=="original") - stopifnot(identical.integer64(p2, p)) - - tim["hashuni", "prep"] <- timefun({ - h = hashmap(x) - # for(r in 1:r)h <- hashmap(x, nunique=h$nunique) - })[3L] - tim["hashuni", "use"] <- timefun({ - p2 = hashuni(h) - })[3L] - if (uniorder=="values") - stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) - - tim["hashunikeep", "prep"] <- tim["hashuni", "prep"] - tim["hashunikeep", "use"] <- timefun({ - p2 = hashuni(h, keep.order=TRUE) - })[3L] - if (uniorder=="original") - stopifnot(identical.integer64(p2, p)) - - tim["sortuni", "prep"] <- timefun({ - s = clone(x) - ramsort(s, na.last=FALSE) - nunique = sortnut(s)[1L] - })[3L] - tim["sortuni", "use"] <- timefun({ - p2 = sortuni(s, nunique) - })[3L] - if (uniorder=="values") - stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) - - tim["sortunikeep", "prep"] <- timefun({ - s = clone(x) - o = seq_along(x) - ramsortorder(s, o, na.last=FALSE) - nunique = sortnut(s)[1L] - })[3L] - tim["sortunikeep", "use"] <- timefun({ - p2 = sortorderuni(x, s, o, nunique) - })[3L] - if (uniorder=="original") - stopifnot(identical.integer64(p2, p)) - - tim["orderuni", "prep"] <- timefun({ - o = seq_along(x) - ramorder(x, o, na.last=FALSE) - nunique = ordernut(x, o)[1L] - })[3L] - tim["orderuni", "use"] <- timefun({ - p2 = orderuni(x, o, nunique) - })[3L] - if (uniorder=="values") - stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) - - tim["orderunikeep", "prep"] <- tim["orderuni", "prep"] - tim["orderunikeep", "use"] <- timefun({ - p2 = orderuni(x, o, nunique, keep.order=TRUE) - nunique = ordernut(x, o)[1L] - })[3L] - if (uniorder=="original") - stopifnot(identical.integer64(p2, p)) - - tim["hashdup", "prep"] <- tim["hashuni", "prep"] - tim["hashdup", "use"] <- timefun({ - p2 = x[!hashdup(h)] - })[3L] - if (uniorder=="original") - stopifnot(identical.integer64(p2, p)) - - tim["sortorderdup", "prep"] <- tim["sortunikeep", "prep"] - tim["sortorderdup", "use"] <- timefun({ - p2 = x[!sortorderdup(s, o)] - })[3L] - if (uniorder=="original") - stopifnot(identical.integer64(p2, p)) - - - hashcache(x) - tim["hash.cache", "use"] <- timefun({ - p2 = unique(x, order=uniorder) - })[3L] - if (uniorder!="any") - stopifnot(identical.integer64(p2, p)) - remcache(x) - - sortcache(x) - tim["sort.cache", "use"] <- timefun({ - p2 = unique(x, order=uniorder) - })[3L] - if (uniorder!="any") - stopifnot(identical.integer64(p2, p)) - remcache(x) - - sortordercache(x) - tim["sortorder.cache", "use"] <- timefun({ - p2 = unique(x, order=uniorder) - })[3L] - if (uniorder!="any") - stopifnot(identical.integer64(p2, p)) - remcache(x) - - ordercache(x) - tim["order.cache", "use"] <- timefun({ - p2 = unique(x, order=uniorder) - })[3L] - if (uniorder!="any") - stopifnot(identical.integer64(p2, p)) - remcache(x) - - if (plot) { - barplot(t(tim), cex.names=0.7) - title(paste0("unique(", n, ", order=", uniorder, ")")) - } + tim["unique", "both"] <- timefun({ + p = unique(x) + })[3L] + x = as.integer64(x) + p = as.integer64(p) + if (uniorder == "values") + ramsort(p, na.last=FALSE) - ret[["unique", as.character(n)]] <- tim + tim["unique.64", "both"] <- timefun({ + p2 = unique(x, order=uniorder) + })[3L] + if (uniorder != "any") + stopifnot(identical.integer64(p2, p)) + + tim["hashmapuni", "both"] <- timefun({ + p2 = hashmapuni(x) + })[3L] + if (uniorder == "original") + stopifnot(identical.integer64(p2, p)) + + tim["hashuni", "prep"] <- timefun({ + h = hashmap(x) + # for(r in 1:r)h <- hashmap(x, nunique=h$nunique) + })[3L] + tim["hashuni", "use"] <- timefun({ + p2 = hashuni(h) + })[3L] + if (uniorder == "values") + stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) + + tim["hashunikeep", "prep"] <- tim["hashuni", "prep"] + tim["hashunikeep", "use"] <- timefun({ + p2 = hashuni(h, keep.order=TRUE) + })[3L] + if (uniorder == "original") + stopifnot(identical.integer64(p2, p)) + + tim["sortuni", "prep"] <- timefun({ + s = clone(x) + ramsort(s, na.last=FALSE) + nunique = sortnut(s)[1L] + })[3L] + tim["sortuni", "use"] <- timefun({ + p2 = sortuni(s, nunique) + })[3L] + if (uniorder == "values") + stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) + + tim["sortunikeep", "prep"] <- timefun({ + s = clone(x) + o = seq_along(x) + ramsortorder(s, o, na.last=FALSE) + nunique = sortnut(s)[1L] + })[3L] + tim["sortunikeep", "use"] <- timefun({ + p2 = sortorderuni(x, s, o, nunique) + })[3L] + if (uniorder == "original") + stopifnot(identical.integer64(p2, p)) + + tim["orderuni", "prep"] <- timefun({ + o = seq_along(x) + ramorder(x, o, na.last=FALSE) + nunique = ordernut(x, o)[1L] + })[3L] + tim["orderuni", "use"] <- timefun({ + p2 = orderuni(x, o, nunique) + })[3L] + if (uniorder == "values") + stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) + + tim["orderunikeep", "prep"] <- tim["orderuni", "prep"] + tim["orderunikeep", "use"] <- timefun({ + p2 = orderuni(x, o, nunique, keep.order=TRUE) + nunique = ordernut(x, o)[1L] + })[3L] + if (uniorder == "original") + stopifnot(identical.integer64(p2, p)) + + tim["hashdup", "prep"] <- tim["hashuni", "prep"] + tim["hashdup", "use"] <- timefun({ + p2 = x[!hashdup(h)] + })[3L] + if (uniorder == "original") + stopifnot(identical.integer64(p2, p)) + + tim["sortorderdup", "prep"] <- tim["sortunikeep", "prep"] + tim["sortorderdup", "use"] <- timefun({ + p2 = x[!sortorderdup(s, o)] + })[3L] + if (uniorder == "original") + stopifnot(identical.integer64(p2, p)) + + hashcache(x) + tim["hash.cache", "use"] <- timefun({ + p2 = unique(x, order=uniorder) + })[3L] + if (uniorder != "any") + stopifnot(identical.integer64(p2, p)) + remcache(x) + + sortcache(x) + tim["sort.cache", "use"] <- timefun({ + p2 = unique(x, order=uniorder) + })[3L] + if (uniorder != "any") + stopifnot(identical.integer64(p2, p)) + remcache(x) + + sortordercache(x) + tim["sortorder.cache", "use"] <- timefun({ + p2 = unique(x, order=uniorder) + })[3L] + if (uniorder != "any") + stopifnot(identical.integer64(p2, p)) + remcache(x) + + ordercache(x) + tim["order.cache", "use"] <- timefun({ + p2 = unique(x, order=uniorder) + })[3L] + if (uniorder != "any") + stopifnot(identical.integer64(p2, p)) + remcache(x) + + if (plot) { + barplot(t(tim), cex.names=0.7) + title(paste0("unique(", n, ", order=", uniorder, ")")) + } + + ret[["unique", as.character(n)]] <- tim + } } - } - if ("unipos" %in% what) { - message("unipos: timings of different methods") - N = c(nsmall, nbig) - for (i in seq_along(N)) { - n = N[i] - x = c(sample(n, n-1L, TRUE), NA) - tim = matrix(0.0, 14L, 3L) - dimnames(tim) <- list( - c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. - c("prep", "both", "use") - ) + if ("unipos" %in% what) { + message("unipos: timings of different methods") + N = c(nsmall, nbig) + for (i in seq_along(N)) { + n = N[i] + x = c(sample(n, n - 1L, TRUE), NA) + tim = matrix(0.0, 14L, 3L) + dimnames(tim) <- list( + c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("prep", "both", "use") + ) - tim["unique", "both"] <- timefun({ - unique(x) - })[3L] - x = as.integer64(x) - - tim["unipos.64", "both"] <- timefun({ - p = unipos(x, order=uniorder) - })[3L] - - tim["hashmapupo", "both"] <- timefun({ - p2 = hashmapupo(x) - })[3L] - if (uniorder=="original") - stopifnot(identical(p2, p)) - - tim["hashupo", "prep"] <- timefun({ - h = hashmap(x) - # if nunique is small we could re-build the hashmap at a smaller size - # h <- hashmap(x, nunique=h$nunique) - })[3L] - tim["hashupo", "use"] <- timefun({ - p2 = hashupo(h) - })[3L] - if (uniorder=="values") - stopifnot(identical(sort(p2, na.last=FALSE), sort(p, na.last=FALSE))) - - tim["hashupokeep", "prep"] <- tim["hashupo", "prep"] - tim["hashupokeep", "use"] <- timefun({ - p2 = hashupo(h, keep.order=TRUE) - })[3L] - if (uniorder=="original") - stopifnot(identical(p2, p)) - - - tim["sortorderupo", "prep"] <- timefun({ - s = clone(x) - o = seq_along(x) - ramsortorder(s, o, na.last=FALSE) - nunique = sortnut(s)[1L] - })[3L] - tim["sortorderupo", "use"] <- timefun({ - p2 = sortorderupo(s, o, nunique) - })[3L] - if (uniorder=="values") - stopifnot(identical(p2, p)) - - tim["sortorderupokeep", "prep"] <- timefun({ - s = clone(x) - o = seq_along(x) - ramsortorder(s, o, na.last=FALSE) - nunique = sortnut(s)[1L] - })[3L] - tim["sortorderupokeep", "use"] <- timefun({ - p2 = sortorderupo(s, o, nunique, keep.order=TRUE) - })[3L] - if (uniorder=="original") - stopifnot(identical(p2, p)) - - tim["orderupo", "prep"] <- timefun({ - o = seq_along(x) - ramorder(x, o, na.last=FALSE) - nunique = ordernut(x, o)[1L] - })[3L] - tim["orderupo", "use"] <- timefun({ - p2 = orderupo(x, o, nunique) - })[3L] - if (uniorder=="values") - stopifnot(identical(p2, p)) - - tim["orderupokeep", "prep"] <- tim["orderupo", "prep"] - tim["orderupokeep", "use"] <- timefun({ - p2 = orderupo(x, o, nunique, keep.order=TRUE) - nunique = ordernut(x, o)[1L] - })[3L] - if (uniorder=="original") - stopifnot(identical(p2, p)) - - tim["hashdup", "prep"] <- tim["hashupo", "prep"] - tim["hashdup", "use"] <- timefun({ - p2 = (1:n)[!hashdup(h)] - })[3L] - if (uniorder=="original") - stopifnot(identical(p2, p)) - - tim["sortorderdup", "prep"] <- tim["sortorderupokeep", "prep"] - tim["sortorderdup", "use"] <- timefun({ - p2 = (1:n)[!sortorderdup(s, o)] - })[3L] - if (uniorder=="original") - stopifnot(identical(p2, p)) - - hashcache(x) - tim["hash.cache", "use"] <- timefun({ - p2 = unipos(x, order=uniorder) - })[3L] - if (uniorder!="any") - stopifnot(identical(p2, p)) - remcache(x) - - sortordercache(x) - tim["sortorder.cache", "use"] <- timefun({ - p2 = unipos(x, order=uniorder) - })[3L] - if (uniorder!="any") - stopifnot(identical(p2, p)) - remcache(x) - - ordercache(x) - tim["order.cache", "use"] <- timefun({ - p2 = unipos(x, order=uniorder) - })[3L] - if (uniorder!="any") - stopifnot(identical(p2, p)) - remcache(x) - - if (plot) { - barplot(t(tim), cex.names=0.7) - title(paste0("unipos(", n, ", order=", uniorder, ")")) - } + tim["unique", "both"] <- timefun({ + unique(x) + })[3L] + x = as.integer64(x) + + tim["unipos.64", "both"] <- timefun({ + p = unipos(x, order=uniorder) + })[3L] + + tim["hashmapupo", "both"] <- timefun({ + p2 = hashmapupo(x) + })[3L] + if (uniorder == "original") + stopifnot(identical(p2, p)) - ret[["unipos", as.character(n)]] <- tim + tim["hashupo", "prep"] <- timefun({ + h = hashmap(x) + # if nunique is small we could re-build the hashmap at a smaller size + # h <- hashmap(x, nunique=h$nunique) + })[3L] + tim["hashupo", "use"] <- timefun({ + p2 = hashupo(h) + })[3L] + if (uniorder == "values") + stopifnot(identical(sort(p2, na.last=FALSE), sort(p, na.last=FALSE))) + + tim["hashupokeep", "prep"] <- tim["hashupo", "prep"] + tim["hashupokeep", "use"] <- timefun({ + p2 = hashupo(h, keep.order=TRUE) + })[3L] + if (uniorder == "original") + stopifnot(identical(p2, p)) + + tim["sortorderupo", "prep"] <- timefun({ + s = clone(x) + o = seq_along(x) + ramsortorder(s, o, na.last=FALSE) + nunique = sortnut(s)[1L] + })[3L] + tim["sortorderupo", "use"] <- timefun({ + p2 = sortorderupo(s, o, nunique) + })[3L] + if (uniorder == "values") + stopifnot(identical(p2, p)) + + tim["sortorderupokeep", "prep"] <- timefun({ + s = clone(x) + o = seq_along(x) + ramsortorder(s, o, na.last=FALSE) + nunique = sortnut(s)[1L] + })[3L] + tim["sortorderupokeep", "use"] <- timefun({ + p2 = sortorderupo(s, o, nunique, keep.order=TRUE) + })[3L] + if (uniorder == "original") + stopifnot(identical(p2, p)) + + tim["orderupo", "prep"] <- timefun({ + o = seq_along(x) + ramorder(x, o, na.last=FALSE) + nunique = ordernut(x, o)[1L] + })[3L] + tim["orderupo", "use"] <- timefun({ + p2 = orderupo(x, o, nunique) + })[3L] + if (uniorder == "values") + stopifnot(identical(p2, p)) + + tim["orderupokeep", "prep"] <- tim["orderupo", "prep"] + tim["orderupokeep", "use"] <- timefun({ + p2 = orderupo(x, o, nunique, keep.order=TRUE) + nunique = ordernut(x, o)[1L] + })[3L] + if (uniorder == "original") + stopifnot(identical(p2, p)) + + tim["hashdup", "prep"] <- tim["hashupo", "prep"] + tim["hashdup", "use"] <- timefun({ + p2 = (1:n)[!hashdup(h)] + })[3L] + if (uniorder == "original") + stopifnot(identical(p2, p)) + + tim["sortorderdup", "prep"] <- tim["sortorderupokeep", "prep"] + tim["sortorderdup", "use"] <- timefun({ + p2 = (1:n)[!sortorderdup(s, o)] + })[3L] + if (uniorder == "original") + stopifnot(identical(p2, p)) + + hashcache(x) + tim["hash.cache", "use"] <- timefun({ + p2 = unipos(x, order=uniorder) + })[3L] + if (uniorder != "any") + stopifnot(identical(p2, p)) + remcache(x) + + sortordercache(x) + tim["sortorder.cache", "use"] <- timefun({ + p2 = unipos(x, order=uniorder) + })[3L] + if (uniorder != "any") + stopifnot(identical(p2, p)) + remcache(x) + + ordercache(x) + tim["order.cache", "use"] <- timefun({ + p2 = unipos(x, order=uniorder) + })[3L] + if (uniorder != "any") + stopifnot(identical(p2, p)) + remcache(x) + + if (plot) { + barplot(t(tim), cex.names=0.7) + title(paste0("unipos(", n, ", order=", uniorder, ")")) + } + + ret[["unipos", as.character(n)]] <- tim + } } - } - if ("table" %in% what) { - message("table: timings of different methods") - N = c(nsmall, nbig) - for (i in seq_along(N)) { - n = N[i] - x = c(sample.int(1024L, n-1L, replace=TRUE), NA) - tim = matrix(0.0, 13L, 3L) - dimnames(tim) <- list(c("tabulate", "table", "table.64", "hashmaptab", "hashtab", "hashtab2", "sorttab", "sortordertab", "ordertab", "ordertabkeep" - , "hash.cache", "sort.cache", "order.cache") - , c("prep", "both", "use")) - - tim["tabulate", "both"] <- timefun({ - tabulate(x) - })[3L] - - tim["table", "both"] <- timefun({ - p = table(x, exclude=NULL) - })[3L] - - x = as.integer64(x) - - tim["table.64", "both"] <- timefun({ - p2 = table(x, exclude=NULL, order=taborder) - })[3L] - stopifnot(identical(p2, p)) - - tim["hashmaptab", "both"] <- timefun({ - p = hashmaptab(x) - })[3L] - - tim["hashtab", "prep"] <- timefun({ - h = hashmap(x) - })[3L] - tim["hashtab", "use"] <- timefun({ - p2 = hashtab(h) - })[3L] - stopifnot(identical(p2, p)) - - tim["hashtab2", "prep"] <- tim["hashtab", "prep"] + timefun({ - h = hashmap(x, nunique=h$nunique) - })[3L] - tim["hashtab2", "use"] <- timefun({ - p2 = hashtab(h) - })[3L] - - sortp = function(p) { - s = p$values - o = seq_along(s) - ramsortorder(s, o, na.last=FALSE) - list(values=s, counts=p$counts[o]) - } - p = sortp(p) - p2 = sortp(p2) - stopifnot(identical(p2, p)) - - tim["sorttab", "prep"] <- timefun({ - s = clone(x) - ramsort(s, na.last=FALSE) - nunique = sortnut(s)[1L] - })[3L] - tim["sorttab", "use"] <- timefun({ - p2 = list(values=sortuni(s, nunique), counts=sorttab(s, nunique)) - })[3L] - stopifnot(identical(p2, p)) - - tim["sortordertab", "prep"] <- timefun({ - s = clone(x) - o = seq_along(x) - ramsortorder(s, o, na.last=FALSE) - nunique = sortnut(s)[1L] - })[3L] - tim["sortordertab", "use"] <- timefun({ - p2 <- list(values=sortorderuni(x, s, o, nunique), counts=sortordertab(s, o)) - })[3L] - p2 <- sortp(p2) - stopifnot(identical(p2, p)) - - tim["ordertab", "prep"] <- timefun({ - o <- seq_along(x) - ramorder(x, o, na.last=FALSE) - nunique <- ordernut(x, o)[1L] - })[3L] - tim["ordertab", "use"] <- timefun({ - p2 <- list(values=orderuni(x, o, nunique), counts=ordertab(x, o, nunique)) - })[3L] - stopifnot(identical(p2, p)) - - tim["ordertabkeep", "prep"] <- tim["ordertab", "prep"] - tim["ordertabkeep", "use"] <- timefun({ - p2 <- list(values=orderuni(x, o, nunique, keep.order=TRUE), counts=ordertab(x, o, nunique, keep.order=TRUE)) - })[3L] - p2 <- sortp(p2) - stopifnot(identical(p2, p)) - - hashcache(x) - tim["hash.cache", "use"] <- timefun({ - p <- table(x, exclude=NULL, order=taborder) - })[3L] - remcache(x) - - sortordercache(x, na.last=TRUE) - tim["sort.cache", "use"] <- timefun({ - p2 <- table(x, exclude=NULL, order=taborder) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - ordercache(x, na.last=TRUE) - tim["order.cache", "use"] <- timefun({ - p2 <- table(x, exclude=NULL, order=taborder) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - if (plot) { - barplot(t(tim), cex.names=0.7) - title(paste0("table(", n, ", order=", taborder, ")")) - } + if ("table" %in% what) { + message("table: timings of different methods") + N = c(nsmall, nbig) + for (i in seq_along(N)) { + n = N[i] + x = c(sample.int(1024L, n - 1L, replace=TRUE), NA) + tim = matrix(0.0, 13L, 3L) + dimnames(tim) <- list( + c("tabulate", "table", "table.64", "hashmaptab", "hashtab", "hashtab2", "sorttab", "sortordertab", "ordertab", "ordertabkeep", "hash.cache", "sort.cache", "order.cache"), + c("prep", "both", "use") + ) - ret[["table", as.character(n)]] <- tim - } + tim["tabulate", "both"] <- timefun({ + tabulate(x) + })[3L] + + tim["table", "both"] <- timefun({ + p = table(x, exclude=NULL) + })[3L] + + x = as.integer64(x) + + tim["table.64", "both"] <- timefun({ + p2 = table(x, exclude=NULL, order=taborder) + })[3L] + stopifnot(identical(p2, p)) + + tim["hashmaptab", "both"] <- timefun({ + p = hashmaptab(x) + })[3L] + + tim["hashtab", "prep"] <- timefun({ + h = hashmap(x) + })[3L] + tim["hashtab", "use"] <- timefun({ + p2 = hashtab(h) + })[3L] + stopifnot(identical(p2, p)) + + tim["hashtab2", "prep"] <- tim["hashtab", "prep"] + timefun({ + h = hashmap(x, nunique=h$nunique) + })[3L] + tim["hashtab2", "use"] <- timefun({ + p2 = hashtab(h) + })[3L] + + sortp = function(p) { + s = p$values + o = seq_along(s) + ramsortorder(s, o, na.last=FALSE) + list(values=s, counts=p$counts[o]) + } + p = sortp(p) + p2 = sortp(p2) + stopifnot(identical(p2, p)) + + tim["sorttab", "prep"] <- timefun({ + s = clone(x) + ramsort(s, na.last=FALSE) + nunique = sortnut(s)[1L] + })[3L] + tim["sorttab", "use"] <- timefun({ + p2 = list(values=sortuni(s, nunique), counts=sorttab(s, nunique)) + })[3L] + stopifnot(identical(p2, p)) + + tim["sortordertab", "prep"] <- timefun({ + s = clone(x) + o = seq_along(x) + ramsortorder(s, o, na.last=FALSE) + nunique = sortnut(s)[1L] + })[3L] + tim["sortordertab", "use"] <- timefun({ + p2 <- list(values=sortorderuni(x, s, o, nunique), counts=sortordertab(s, o)) + })[3L] + p2 <- sortp(p2) + stopifnot(identical(p2, p)) + + tim["ordertab", "prep"] <- timefun({ + o <- seq_along(x) + ramorder(x, o, na.last=FALSE) + nunique <- ordernut(x, o)[1L] + })[3L] + tim["ordertab", "use"] <- timefun({ + p2 <- list(values=orderuni(x, o, nunique), counts=ordertab(x, o, nunique)) + })[3L] + stopifnot(identical(p2, p)) + + tim["ordertabkeep", "prep"] <- tim["ordertab", "prep"] + tim["ordertabkeep", "use"] <- timefun({ + p2 <- list(values=orderuni(x, o, nunique, keep.order=TRUE), counts=ordertab(x, o, nunique, keep.order=TRUE)) + })[3L] + p2 <- sortp(p2) + stopifnot(identical(p2, p)) + + hashcache(x) + tim["hash.cache", "use"] <- timefun({ + p <- table(x, exclude=NULL, order=taborder) + })[3L] + remcache(x) + + sortordercache(x, na.last=TRUE) + tim["sort.cache", "use"] <- timefun({ + p2 <- table(x, exclude=NULL, order=taborder) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + ordercache(x, na.last=TRUE) + tim["order.cache", "use"] <- timefun({ + p2 <- table(x, exclude=NULL, order=taborder) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + if (plot) { + barplot(t(tim), cex.names=0.7) + title(paste0("table(", n, ", order=", taborder, ")")) + } + + ret[["table", as.character(n)]] <- tim } - if ("rank" %in% what) { - message("rank: timings of different methods") - N <- c(nsmall, nbig) - for (i in seq_along(N)) { - n <- N[i] - x <- c(sample(n, n-1L, TRUE), NA) - tim <- matrix(0.0, 7L, 3L) - dimnames(tim) <- list(c("rank", "rank.keep", "rank.64", "sortorderrnk", "orderrnk" - , "sort.cache", "order.cache") - , c("prep", "both", "use")) - - tim["rank", "both"] <- timefun({ - rank(x) - })[3L] - - tim["rank.keep", "both"] <- timefun({ - p <- rank(x, na.last="keep") - })[3L] - - x <- as.integer64(x) - - tim["rank.64", "both"] <- timefun({ - p2 <- rank.integer64(x) - })[3L] - stopifnot(identical(p2, p)) - - tim["sortorderrnk", "prep"] <- timefun({ - s <- clone(x) - o <- seq_along(x) - na.count <- ramsortorder(s, o, na.last=FALSE) - })[3L] - tim["sortorderrnk", "use"] <- timefun({ - p2 <- sortorderrnk(s, o, na.count) - })[3L] - stopifnot(identical(p2, p)) - - tim["orderrnk", "prep"] <- timefun({ - o <- seq_along(x) - na.count <- ramorder(x, o, na.last=FALSE) - })[3L] - tim["orderrnk", "use"] <- timefun({ - p2 <- orderrnk(x, o, na.count) - })[3L] - stopifnot(identical(p2, p)) - - sortordercache(x) - tim["sort.cache", "use"] <- timefun({ - p2 <- rank.integer64(x) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - ordercache(x) - tim["order.cache", "use"] <- timefun({ - p2 <- rank.integer64(x) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - if (plot) { - barplot(t(tim), cex.names=0.7) - title(paste0("rank.integer64(", n, ")")) - } + } + if ("rank" %in% what) { + message("rank: timings of different methods") + N <- c(nsmall, nbig) + for (i in seq_along(N)) { + n <- N[i] + x <- c(sample(n, n - 1L, TRUE), NA) + tim <- matrix(0.0, 7L, 3L) + dimnames(tim) <- list( + c("rank", "rank.keep", "rank.64", "sortorderrnk", "orderrnk", "sort.cache", "order.cache"), + c("prep", "both", "use") + ) - ret[["rank", as.character(n)]] <- tim - } + tim["rank", "both"] <- timefun({ + rank(x) + })[3L] + + tim["rank.keep", "both"] <- timefun({ + p <- rank(x, na.last="keep") + })[3L] + + x <- as.integer64(x) + + tim["rank.64", "both"] <- timefun({ + p2 <- rank.integer64(x) + })[3L] + stopifnot(identical(p2, p)) + + tim["sortorderrnk", "prep"] <- timefun({ + s <- clone(x) + o <- seq_along(x) + na.count <- ramsortorder(s, o, na.last=FALSE) + })[3L] + tim["sortorderrnk", "use"] <- timefun({ + p2 <- sortorderrnk(s, o, na.count) + })[3L] + stopifnot(identical(p2, p)) + + tim["orderrnk", "prep"] <- timefun({ + o <- seq_along(x) + na.count <- ramorder(x, o, na.last=FALSE) + })[3L] + tim["orderrnk", "use"] <- timefun({ + p2 <- orderrnk(x, o, na.count) + })[3L] + stopifnot(identical(p2, p)) + + sortordercache(x) + tim["sort.cache", "use"] <- timefun({ + p2 <- rank.integer64(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + ordercache(x) + tim["order.cache", "use"] <- timefun({ + p2 <- rank.integer64(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + if (plot) { + barplot(t(tim), cex.names=0.7) + title(paste0("rank.integer64(", n, ")")) + } + + ret[["rank", as.character(n)]] <- tim } - if ("quantile" %in% what) { - message("quantile: timings of different methods") - N <- c(nsmall, nbig) - for (i in seq_along(N)) { - n <- N[i] - x <- c(sample(n, n-1L, TRUE), NA) - tim <- matrix(0.0, 6L, 3L) - dimnames(tim) <- list(c("quantile", "quantile.64", "sortqtl", "orderqtl" - , "sort.cache", "order.cache") - , c("prep", "both", "use")) - - tim["quantile", "both"] <- timefun({ - p <- quantile(x, type=1L, na.rm=TRUE) - })[3L] - p2 <- p - p <- as.integer64(p2) - names(p) <- names(p2) - - x <- as.integer64(x) - - tim["quantile.64", "both"] <- timefun({ - p2 <- quantile(x, na.rm=TRUE) - })[3L] - stopifnot(identical(p2, p)) - - tim["sortqtl", "prep"] <- timefun({ - s <- clone(x) - na.count <- ramsort(s, na.last=FALSE) - })[3L] - tim["sortqtl", "use"] <- timefun({ - p2 <- sortqtl(s, na.count, seq(0.0, 1.0, 0.25)) - })[3L] - stopifnot(identical(unname(p2), unname(p))) - - tim["orderqtl", "prep"] <- timefun({ - o <- seq_along(x) - na.count <- ramorder(x, o, na.last=FALSE) - })[3L] - tim["orderqtl", "use"] <- timefun({ - p2 <- orderqtl(x, o, na.count, seq(0.0, 1.0, 0.25)) - })[3L] - stopifnot(identical(unname(p2), unname(p))) - - sortordercache(x) - tim["sort.cache", "use"] <- timefun({ - p2 <- quantile(x, na.rm=TRUE) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - ordercache(x) - tim["order.cache", "use"] <- timefun({ - p2 <- quantile(x, na.rm=TRUE) - })[3L] - stopifnot(identical(p2, p)) - remcache(x) - - if (plot) { - barplot(t(tim), cex.names=0.7) - title(paste0("quantile(", n, ")")) - } + } + if ("quantile" %in% what) { + message("quantile: timings of different methods") + N <- c(nsmall, nbig) + for (i in seq_along(N)) { + n <- N[i] + x <- c(sample(n, n - 1L, TRUE), NA) + tim <- matrix(0.0, 6L, 3L) + dimnames(tim) <- list( + c("quantile", "quantile.64", "sortqtl", "orderqtl", "sort.cache", "order.cache"), + c("prep", "both", "use") + ) - ret[["quantile", as.character(n)]] <- tim - } + tim["quantile", "both"] <- timefun({ + p <- quantile(x, type=1L, na.rm=TRUE) + })[3L] + p2 <- p + p <- as.integer64(p2) + names(p) <- names(p2) + + x <- as.integer64(x) + + tim["quantile.64", "both"] <- timefun({ + p2 <- quantile(x, na.rm=TRUE) + })[3L] + stopifnot(identical(p2, p)) + + tim["sortqtl", "prep"] <- timefun({ + s <- clone(x) + na.count <- ramsort(s, na.last=FALSE) + })[3L] + tim["sortqtl", "use"] <- timefun({ + p2 <- sortqtl(s, na.count, seq(0.0, 1.0, 0.25)) + })[3L] + stopifnot(identical(unname(p2), unname(p))) + + tim["orderqtl", "prep"] <- timefun({ + o <- seq_along(x) + na.count <- ramorder(x, o, na.last=FALSE) + })[3L] + tim["orderqtl", "use"] <- timefun({ + p2 <- orderqtl(x, o, na.count, seq(0.0, 1.0, 0.25)) + })[3L] + stopifnot(identical(unname(p2), unname(p))) + + sortordercache(x) + tim["sort.cache", "use"] <- timefun({ + p2 <- quantile(x, na.rm=TRUE) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + ordercache(x) + tim["order.cache", "use"] <- timefun({ + p2 <- quantile(x, na.rm=TRUE) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + if (plot) { + barplot(t(tim), cex.names=0.7) + title(paste0("quantile(", n, ")")) + } + + ret[["quantile", as.character(n)]] <- tim } + } - ret + if ("factor" %in% what) { + message("factor: timings of different methods") + N = c(nsmall, nbig) + for (i in seq_along(N)) { + n = N[i] + x = c(sample(n, n - 1L, TRUE), NA) + tim = matrix(0.0, 5L, 3L) + dimnames(tim) <- list( + c("factor", "factor.64", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("prep", "both", "use") + ) + + tim["factor", "both"] <- timefun({ + p = base::factor(x) + })[3L] + x = as.integer64(x) + + tim["factor.64", "both"] <- timefun({ + p2 = factor(x) + })[3L] + stopifnot(identical(p2, p)) + + hashcache(x) + tim["hashcache", "use"] <- timefun({ + p2 = factor(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + sortordercache(x) + tim["sortorder.cache", "use"] <- timefun({ + p2 = factor(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + ordercache(x) + tim["order.cache", "use"] <- timefun({ + p2 = factor(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + if (plot) { + barplot(t(tim), cex.names=0.7) + title(paste0("factor(", n, ")")) + } + + ret[["factor", as.character(n)]] <- tim + } + } + ret } # nolint end: brace_linter, line_length_linter. # nocov end From 19c41f3dc7a965b226d29e16629bab70cc5de537 Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sat, 21 Feb 2026 20:14:59 +0100 Subject: [PATCH 08/17] use base::factor for short vectors --- R/integer64.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/R/integer64.R b/R/integer64.R index 423bbc1c..8c341a0b 100644 --- a/R/integer64.R +++ b/R/integer64.R @@ -829,11 +829,19 @@ factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.o nx = names(x) if (missing(levels)) { levels = sort(unique(x)) - } else { + } else if (length(x) >= 4000) { levels = as.integer64(levels) } - # basically copied from base::factor - force(ordered) + # use base::factor for short vectors because it is faster + if (length(x) < 4000) { + force(ordered) + if (missing(labels)) + return(withCallingHandlers_and_choose_call(base::factor(as.character(x), levels=levels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) + else + return(withCallingHandlers_and_choose_call(base::factor(as.character(x), levels=levels, labels=labels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) + } + + # basically copied from base::factor, but using the benefit from caching levels = levels[is.na(match(levels, exclude))] ret = match(x, levels) if (!is.null(nx)) From f9dff7951ffea761e8fc82be5c3139356e61afdf Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sat, 21 Feb 2026 20:27:58 +0100 Subject: [PATCH 09/17] trying to fix ancient test --- R/integer64.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/integer64.R b/R/integer64.R index 8c341a0b..a8749ce1 100644 --- a/R/integer64.R +++ b/R/integer64.R @@ -810,11 +810,11 @@ as.POSIXlt.integer64 = function(x, tz="", origin, ...) #' @rdname as.character.integer64 #' @export as.factor -as.factor = function(x) factor(x) +as.factor = function(x) factor(x=x) #' @rdname as.character.integer64 #' @export as.ordered -as.ordered = function(x) ordered(x) +as.ordered = function(x) ordered(x=x) #' @rdname factor #' @export @@ -835,10 +835,11 @@ factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.o # use base::factor for short vectors because it is faster if (length(x) < 4000) { force(ordered) + x = as.character(x) if (missing(labels)) - return(withCallingHandlers_and_choose_call(base::factor(as.character(x), levels=levels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) + return(withCallingHandlers_and_choose_call(base::factor(x=x, levels=levels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) else - return(withCallingHandlers_and_choose_call(base::factor(as.character(x), levels=levels, labels=labels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) + return(withCallingHandlers_and_choose_call(base::factor(x=x, levels=levels, labels=labels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) } # basically copied from base::factor, but using the benefit from caching From 3ef5414ee4c9b50c58f6794667deec38d9b050ab Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sat, 21 Feb 2026 20:33:58 +0100 Subject: [PATCH 10/17] fix ancient test --- R/integer64.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/integer64.R b/R/integer64.R index a8749ce1..d74c2f22 100644 --- a/R/integer64.R +++ b/R/integer64.R @@ -836,6 +836,7 @@ factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.o if (length(x) < 4000) { force(ordered) x = as.character(x) + levels = as.character(levels) if (missing(labels)) return(withCallingHandlers_and_choose_call(base::factor(x=x, levels=levels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) else From 4261a4e9a59c0cc328bf01f441417b7a99cfd900 Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sun, 22 Feb 2026 10:03:14 +0100 Subject: [PATCH 11/17] add factor to benchmark64 --- R/highlevel64.R | 897 ++++++++++++++++++++++++------------------------ 1 file changed, 453 insertions(+), 444 deletions(-) diff --git a/R/highlevel64.R b/R/highlevel64.R index 0ea75108..b2be2631 100644 --- a/R/highlevel64.R +++ b/R/highlevel64.R @@ -47,6 +47,7 @@ #' | rank(b) | ranking of big vector | #' | quantile(b) | quantiles of big vector | #' | summary(b) | summary of of big vector | +#' | factor(b) | coercion to factor of big vector | #' | SESSION | exemplary session involving multiple calls (including cache filling costs) | #' #' Note that the timings for the cached variants do _not_ contain the @@ -129,268 +130,275 @@ NULL # nolint start: brace_linter, line_length_linter. benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { - message('\ncompare performance for a complete sessions of calls') - s = sample(nbig, nsmall, TRUE) - b = sample(nbig, nbig, TRUE) - b2 = sample(nbig, nbig, TRUE) + message('\ncompare performance for a complete sessions of calls') + s = sample(nbig, nsmall, TRUE) + b = sample(nbig, nbig, TRUE) + b2 = sample(nbig, nbig, TRUE) - tim1 = double(6L) - names(tim1) <- c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache") + tim1 = double(6L) + names(tim1) = c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache") - s = as.integer(s) - b = as.integer(b) - b2 = as.integer(b2) + s = as.integer(s) + b = as.integer(b) + b2 = as.integer(b2) - for (i in 1:6) { - message("\n=== ", names(tim1)[i], " ===") + for (i in 1:6) { + message("\n=== ", names(tim1)[i], " ===") - if (i==2L) { - s = as.integer64(s) - b = as.integer64(b) - b2 = as.integer64(b2) + if (i == 2L) { + s = as.integer64(s) + b = as.integer64(b) + b2 = as.integer64(b2) + } + + tim1[i] = 0L + + tim1[i] = tim1[i] + timefun({ + switch(i, + NULL, # i=1 + NULL, # i=2 + { hashcache(s); hashcache(b); hashcache(b2) }, + { sortordercache(s); sortordercache(b); sortordercache(b2) }, + { ordercache(s); ordercache(b); ordercache(b2) }, + { hashcache(s); hashcache(b); hashcache(b2); sortordercache(s); sortordercache(b); sortordercache(b2) } + ) + })[3L] + + message('check data range, mean etc.') + tim1[i] = tim1[i] + timefun({ + summary(b) + })[3L] + message('get all percentiles for plotting distribution shape') + tim1[i] = tim1[i] + timefun({ + quantile(b, probs=seq(0.0, 1.0, 0.01)) + })[3L] + message('list the upper and lower permille of values') + tim1[i] = tim1[i] + timefun({ + quantile(b, probs=c(0.001, 0.999)) + sort(b, na.last=NA) + })[3L] + message('OK, for some of these values I want to see the complete ROW, so I need their positions in the data.frame') + tim1[i] = tim1[i] + timefun({ + if (i == 1L) order(b) else order.integer64(b) + })[3L] + message('check if any values are duplicated') + tim1[i] = tim1[i] + timefun({ + anyDuplicated(b) + })[3L] + message('since not unique, then check distribution of frequencies') + tim1[i] = tim1[i] + timefun({ + if (i == 1L) tabulate(table(b, exclude=NULL)) else tabulate(table(b, return='list')$counts) + })[3L] + message("OK, let's plot the percentiles of unique values versus the percentiles allowing for duplicates") + tim1[i] = tim1[i] + timefun({ + quantile(b, probs=seq(0.0, 1.0, 0.01)) + quantile(unique(b), probs=seq(0.0, 1.0, 0.01)) + })[3L] + message('check whether we find a match for each fact in the dimension table') + tim1[i] = tim1[i] + timefun({ + all(if (i == 1L) b %in% s else "%in%.integer64"(b, s)) + })[3L] + message('check whether there are any dimension table entries not in the fact table') + tim1[i] = tim1[i] + timefun({ + all(if (i == 1L) s %in% b else "%in%.integer64"(s, b)) + })[3L] + message('check whether we find a match for each fact in a parallel fact table') + tim1[i] = tim1[i] + timefun({ + all(if (i == 1L) b %in% b2 else "%in%.integer64"(b, b2)) + })[3L] + message('find positions of facts in dimension table for joining') + tim1[i] = tim1[i] + timefun({ + if (i == 1L) match(b, s) else match.integer64(b, s) + })[3L] + message('find positions of facts in parallel fact table for joining') + tim1[i] = tim1[i] + timefun({ + if (i == 1L) match(b, b2) else match.integer64(b, b2) + })[3L] + message('out of curiosity: how well rank-correlated are fact and parallel fact table?') + tim1[i] = tim1[i] + timefun({ + if (i == 1L) + cor(rank(b, na.last="keep"), rank(b2, na.last="keep"), use="na.or.complete") + else + cor(rank.integer64(b), rank.integer64(b2), use="na.or.complete") + })[3L] + message('convert to factor') + tim1[i] = tim1[i] + timefun({ + factor(b) + })[3L] + + remcache(s) + remcache(b) + remcache(b2) + + print(round(rbind(seconds=tim1, factor=tim1[1L]/tim1), 3L)) } - tim1[i] <- 0L - - tim1[i] <- tim1[i] + timefun({ - switch(i, - NULL, # i=1 - NULL, # i=2 - { hashcache(s); hashcache(b); hashcache(b2) }, - { sortordercache(s); sortordercache(b); sortordercache(b2) }, - { ordercache(s); ordercache(b); ordercache(b2) }, - { hashcache(s); hashcache(b); hashcache(b2);sortordercache(s); sortordercache(b); sortordercache(b2) } - ) - })[3L] - - message('check data range, mean etc.') - tim1[i] <- tim1[i] + timefun({ - summary(b) - })[3L] - message('get all percentiles for plotting distribution shape') - tim1[i] <- tim1[i] + timefun({ - quantile(b, probs=seq(0.0, 1.0, 0.01)) - })[3L] - message('list the upper and lower permille of values') - tim1[i] <- tim1[i] + timefun({ - quantile(b, probs=c(0.001, 0.999)) - sort(b, na.last=NA) - })[3L] - message('OK, for some of these values I want to see the complete ROW, so I need their positions in the data.frame') - tim1[i] <- tim1[i] + timefun({ - if (i==1L) order(b) else order.integer64(b) - })[3L] - message('check if any values are duplicated') - tim1[i] <- tim1[i] + timefun({ - anyDuplicated(b) - })[3L] - message('since not unique, then check distribution of frequencies') - tim1[i] <- tim1[i] + timefun({ - if (i==1L) tabulate(table(b, exclude=NULL)) else tabulate(table(b, return='list')$counts) - })[3L] - message("OK, let's plot the percentiles of unique values versus the percentiles allowing for duplicates") - tim1[i] <- tim1[i] + timefun({ - quantile(b, probs=seq(0.0, 1.0, 0.01)) - quantile(unique(b), probs=seq(0.0, 1.0, 0.01)) - })[3L] - message('check whether we find a match for each fact in the dimension table') - tim1[i] <- tim1[i] + timefun({ - all(if (i==1L) b %in% s else "%in%.integer64"(b, s)) - })[3L] - message('check whether there are any dimension table entries not in the fact table') - tim1[i] <- tim1[i] + timefun({ - all(if (i==1L) s %in% b else "%in%.integer64"(s, b)) - })[3L] - message('check whether we find a match for each fact in a parallel fact table') - tim1[i] <- tim1[i] + timefun({ - all(if (i==1L) b %in% b2 else "%in%.integer64"(b, b2)) - })[3L] - message('find positions of facts in dimension table for joining') - tim1[i] <- tim1[i] + timefun({ - if (i==1L) match(b, s) else match.integer64(b, s) - })[3L] - message('find positions of facts in parallel fact table for joining') - tim1[i] <- tim1[i] + timefun({ - if (i==1L) match(b, b2) else match.integer64(b, b2) - })[3L] - message('out of curiosity: how well rank-correlated are fact and parallel fact table?') - tim1[i] <- tim1[i] + timefun({ - if (i==1L) { - cor(rank(b, na.last="keep"), rank(b2, na.last="keep"), use="na.or.complete") - } else { - cor(rank.integer64(b), rank.integer64(b2), use="na.or.complete") - } - })[3L] - - remcache(s) - remcache(b) - remcache(b2) - - print(round(rbind(seconds=tim1, factor=tim1[1L]/tim1), 3L)) - - } - - # 32-bit 64-bit hashcache sortordercache ordercache allcache - # 196.510 8.963 8.242 5.183 12.325 6.043 - # 32-bit 64-bit hashcache sortordercache ordercache allcache - # 1.000 21.924 23.842 37.913 15.944 32.519 - - - message("\nnow let's look more systematically at the components involved") - s = sample(nbig, nsmall, TRUE) - b = sample(nbig, nbig, TRUE) - b2 = sample(nbig, nbig, TRUE) + # 32-bit 64-bit hashcache sortordercache ordercache allcache + # 196.510 8.963 8.242 5.183 12.325 6.043 + # 32-bit 64-bit hashcache sortordercache ordercache allcache + # 1.000 21.924 23.842 37.913 15.944 32.519 + + message("\nnow let's look more systematically at the components involved") + s = sample(nbig, nsmall, TRUE) + b = sample(nbig, nbig, TRUE) + b2 = sample(nbig, nbig, TRUE) tim2 = matrix(0.0, 15L, 6L) - dimnames(tim2) <- list( - c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)"), # nolint: line_length_linter. + dimnames(tim2) = list( + c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)", "factor(b)"), # nolint: line_length_linter. c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache") ) - s = as.integer(s) - b = as.integer(b) - b2 = as.integer(b2) + s = as.integer(s) + b = as.integer(b) + b2 = as.integer(b2) - i = 1L - for (i in 1:6) { - if (i==2L) { - s = as.integer64(s) - b = as.integer64(b) - b2 = as.integer64(b2) + i = 1L + for (i in 1:6) { + if (i == 2L) { + s = as.integer64(s) + b = as.integer64(b) + b2 = as.integer64(b2) + } + + if (i > 2L) message(colnames(tim2)[i], " cache") + tim2["cache", i] = timefun({ + switch(i, + NULL, # i=1 + NULL, # i=2 + { hashcache(s); hashcache(b); hashcache(b2) }, + { sortordercache(s); sortordercache(b); sortordercache(b2) }, + { ordercache(s); ordercache(b); ordercache(b2) }, + { hashcache(s); hashcache(b); hashcache(b2); sortordercache(s); sortordercache(b); sortordercache(b2) } + ) + })[3L] + + message(colnames(tim2)[i], " match(s, b)") + tim2["match(s, b)", i] = timefun({ + if (i == 1L) match(s, b) else match.integer64(s, b) + })[3L] + + message(colnames(tim2)[i], " s %in% b") + tim2["s %in% b", i] = timefun({ + if (i == 1L) s %in% b else "%in%.integer64"(s, b) + })[3L] + + message(colnames(tim2)[i], " match(b, s)") + tim2["match(b, s)", i] = timefun({ + if (i == 1L) match(b, s) else match.integer64(b, s) + })[3L] + + message(colnames(tim2)[i], " b %in% s") + tim2["b %in% s", i] = timefun({ + if (i == 1L) b %in% s else "%in%.integer64"(b, s) + })[3L] + + message(colnames(tim2)[i], " match(b, b)") + tim2["match(b, b)", i] = timefun({ + if (i == 1L) match(b, b2) else match.integer64(b, b2) + })[3L] + + message(colnames(tim2)[i], " b %in% b") + tim2["b %in% b", i] = timefun({ + if (i == 1L) b %in% b2 else "%in%.integer64"(b, b2) + })[3L] + + message(colnames(tim2)[i], " duplicated(b)") + tim2["duplicated(b)", i] = timefun({ + duplicated(b) + })[3L] + + message(colnames(tim2)[i], " unique(b)") + tim2["unique(b)", i] = timefun({ + unique(b) + })[3L] + + message(colnames(tim2)[i], " table(b)") + tim2["table(b)", i] = timefun({ + if (i == 1L) table(b) else table(b, return='list') + })[3L] + + message(colnames(tim2)[i], " sort(b)") + tim2["sort(b)", i] = timefun({ + sort(b) + })[3L] + + message(colnames(tim2)[i], " order(b)") + tim2["order(b)", i] = timefun({ + if (i == 1L) order(b) else order.integer64(b) + })[3L] + + message(colnames(tim2)[i], " rank(b)") + tim2["rank(b)", i] = timefun({ + if (i == 1L) rank(b) else rank.integer64(b) + })[3L] + + message(colnames(tim2)[i], " quantile(b)") + tim2["quantile(b)", i] = timefun({ + quantile(b) + })[3L] + + message(colnames(tim2)[i], " summary(b)") + tim2["summary(b)", i] = timefun({ + summary(b) + })[3L] + + message(colnames(tim2)[i], " factor(b)") + tim2["factor(b)", i] = timefun({ + factor(b) + })[3L] + + remcache(s) + remcache(b) + remcache(b2) + + tim3 = rbind(tim2, SESSION=tim1) + #tim2 = tim2[, 1]/tim2 + + cat("seconds") + print(round(tim3, 3L)) + cat("factor") + print(round(tim3[, 1L]/tim3, 3L)) } - if (i>2L) message(colnames(tim2)[i], " cache") - tim2["cache", i] <- timefun({ - switch(i, - NULL, # i=1 - NULL, # i=2 - { hashcache(s); hashcache(b); hashcache(b2) }, - { sortordercache(s); sortordercache(b); sortordercache(b2) }, - { ordercache(s); ordercache(b); ordercache(b2) }, - { hashcache(s); hashcache(b); hashcache(b2);sortordercache(s); sortordercache(b); sortordercache(b2) } - ) - })[3L] - - message(colnames(tim2)[i], " match(s, b)") - tim2["match(s, b)", i] <- timefun({ - if (i==1L) match(s, b) else match.integer64(s, b) - })[3L] - - message(colnames(tim2)[i], " s %in% b") - tim2["s %in% b", i] <- timefun({ - if (i==1L) s %in% b else "%in%.integer64"(s, b) - })[3L] - - message(colnames(tim2)[i], " match(b, s)") - tim2["match(b, s)", i] <- timefun({ - if (i==1L) match(b, s) else match.integer64(b, s) - })[3L] - - message(colnames(tim2)[i], " b %in% s") - tim2["b %in% s", i] <- timefun({ - if (i==1L) b %in% s else "%in%.integer64"(b, s) - })[3L] - - message(colnames(tim2)[i], " match(b, b)") - tim2["match(b, b)", i] <- timefun({ - if (i==1L) match(b, b2) else match.integer64(b, b2) - })[3L] - - message(colnames(tim2)[i], " b %in% b") - tim2["b %in% b", i] <- timefun({ - if (i==1L) b %in% b2 else "%in%.integer64"(b, b2) - })[3L] - - message(colnames(tim2)[i], " duplicated(b)") - tim2["duplicated(b)", i] <- timefun({ - duplicated(b) - })[3L] - - message(colnames(tim2)[i], " unique(b)") - tim2["unique(b)", i] <- timefun({ - unique(b) - })[3L] - - message(colnames(tim2)[i], " table(b)") - tim2["table(b)", i] <- timefun({ - if (i==1L) table(b) else table(b, return='list') - })[3L] - - message(colnames(tim2)[i], " sort(b)") - tim2["sort(b)", i] <- timefun({ - sort(b) - })[3L] - - message(colnames(tim2)[i], " order(b)") - tim2["order(b)", i] <- timefun({ - if (i==1L) order(b) else order.integer64(b) - })[3L] - - message(colnames(tim2)[i], " rank(b)") - tim2["rank(b)", i] <- timefun({ - if (i==1L) rank(b) else rank.integer64(b) - })[3L] - - message(colnames(tim2)[i], " quantile(b)") - tim2["quantile(b)", i] <- timefun({ - quantile(b) - })[3L] - - message(colnames(tim2)[i], " summary(b)") - tim2["summary(b)", i] <- timefun({ - summary(b) - })[3L] - - remcache(s) - remcache(b) - remcache(b2) - - tim3 = rbind(tim2, SESSION=tim1) - #tim2 <- tim2[, 1]/tim2 - - cat("seconds") - print(round(tim3, 3L)) - cat("factor") - print(round(tim3[, 1L]/tim3, 3L)) - - } - - - - # 32-bit 64-bit hashcache sortordercache ordercache allcache -# cache 0.000 0.000 0.775 1.330 6.500 2.660 -# match(s, b) 0.820 0.218 0.004 0.025 0.093 0.004 -# s %in% b 0.810 0.234 0.003 0.022 0.093 0.003 -# match(b, s) 0.450 0.228 0.232 0.224 0.224 0.226 -# b %in% s 0.510 0.226 0.224 0.222 0.218 0.222 -# match(b, b) 2.370 0.870 0.505 0.890 0.880 0.505 -# b %in% b 2.350 0.850 0.480 0.865 0.870 0.483 -# duplicated(b) 0.875 0.510 0.141 0.116 0.383 0.117 -# unique(b) 0.930 0.555 0.447 0.156 0.427 0.450 -# table(b) 110.340 0.725 0.680 0.234 0.575 0.202 -# sort(b) 2.440 0.400 0.433 0.072 0.460 0.069 -# order(b) 12.780 0.680 0.615 0.036 0.036 0.035 -# rank(b) 13.480 0.860 0.915 0.240 0.545 0.246 -# quantile(b) 0.373 0.400 0.410 0.000 0.000 0.000 -# summary(b) 0.645 0.423 0.427 0.016 0.016 0.016 -# TOTAL 149.173 7.179 6.291 4.448 11.320 5.239 - # 32-bit 64-bit hashcache sortordercache ordercache allcache -# cache 1 1.062 0.000 0.000 0.000 0.000 -# match(s, b) 1 3.761 230.420 32.475 8.843 217.300 -# s %in% b 1 3.462 234.090 36.450 8.735 237.386 -# match(b, s) 1 1.974 1.940 2.009 2.009 1.991 -# b %in% s 1 2.257 2.277 2.297 2.339 2.297 -# match(b, b) 1 2.724 4.693 2.663 2.693 4.693 -# b %in% b 1 2.765 4.896 2.717 2.701 4.862 -# duplicated(b) 1 1.716 6.195 7.572 2.283 7.500 -# unique(b) 1 1.676 2.082 5.972 2.180 2.067 -# table(b) 1 152.193 162.265 471.538 191.896 546.238 -# sort(b) 1 6.100 5.631 33.822 5.304 35.534 -# order(b) 1 18.794 20.780 357.840 354.297 366.950 -# rank(b) 1 15.674 14.732 56.167 24.734 54.797 -# quantile(b) 1 0.933 0.911 804.907 806.027 810.133 -# summary(b) 1 1.524 1.512 39.345 39.345 39.345 -# TOTAL 1 20.778 23.712 33.534 13.177 28.476 + + + # 32-bit 64-bit hashcache sortordercache ordercache allcache + # cache 0.000 0.000 0.775 1.330 6.500 2.660 + # match(s, b) 0.820 0.218 0.004 0.025 0.093 0.004 + # s %in% b 0.810 0.234 0.003 0.022 0.093 0.003 + # match(b, s) 0.450 0.228 0.232 0.224 0.224 0.226 + # b %in% s 0.510 0.226 0.224 0.222 0.218 0.222 + # match(b, b) 2.370 0.870 0.505 0.890 0.880 0.505 + # b %in% b 2.350 0.850 0.480 0.865 0.870 0.483 + # duplicated(b) 0.875 0.510 0.141 0.116 0.383 0.117 + # unique(b) 0.930 0.555 0.447 0.156 0.427 0.450 + # table(b) 110.340 0.725 0.680 0.234 0.575 0.202 + # sort(b) 2.440 0.400 0.433 0.072 0.460 0.069 + # order(b) 12.780 0.680 0.615 0.036 0.036 0.035 + # rank(b) 13.480 0.860 0.915 0.240 0.545 0.246 + # quantile(b) 0.373 0.400 0.410 0.000 0.000 0.000 + # summary(b) 0.645 0.423 0.427 0.016 0.016 0.016 + # factor(b) 0.645 0.423 0.427 0.016 0.016 0.016 + # TOTAL 149.173 7.179 6.291 4.448 11.320 5.239 + # 32-bit 64-bit hashcache sortordercache ordercache allcache + # cache 1 1.062 0.000 0.000 0.000 0.000 + # match(s, b) 1 3.761 230.420 32.475 8.843 217.300 + # s %in% b 1 3.462 234.090 36.450 8.735 237.386 + # match(b, s) 1 1.974 1.940 2.009 2.009 1.991 + # b %in% s 1 2.257 2.277 2.297 2.339 2.297 + # match(b, b) 1 2.724 4.693 2.663 2.693 4.693 + # b %in% b 1 2.765 4.896 2.717 2.701 4.862 + # duplicated(b) 1 1.716 6.195 7.572 2.283 7.500 + # unique(b) 1 1.676 2.082 5.972 2.180 2.067 + # table(b) 1 152.193 162.265 471.538 191.896 546.238 + # sort(b) 1 6.100 5.631 33.822 5.304 35.534 + # order(b) 1 18.794 20.780 357.840 354.297 366.950 + # rank(b) 1 15.674 14.732 56.167 24.734 54.797 + # quantile(b) 1 0.933 0.911 804.907 806.027 810.133 + # summary(b) 1 1.524 1.512 39.345 39.345 39.345 +# factor(b) 1 1.524 1.512 39.345 39.345 39.345 + # TOTAL 1 20.778 23.712 33.534 13.177 28.476 tim3 } @@ -402,18 +410,19 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { optimizer64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time, - what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile"), + what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile", "factor"), uniorder=c("original", "values", "any"), taborder=c("values", "counts"), plot=TRUE) { + what = match.arg(what, several.ok=TRUE) uniorder = match.arg(uniorder) taborder = match.arg(taborder) ret = vector("list", 2L*length(what)) - dim(ret) <- c(length(what), 2L) - dimnames(ret) <- list(what, c(nsmall, nbig)) + dim(ret) = c(length(what), 2L) + dimnames(ret) = list(what, c(nsmall, nbig)) if (plot) { - oldpar = par(no.readonly = TRUE) + oldpar = par(no.readonly=TRUE) on.exit(par(oldpar)) par(mfrow=c(2L, 1L)) } @@ -428,73 +437,73 @@ optimizer64 = function(nsmall=2L^16L, x1 = c(sample(n2, n1 - 1L, TRUE), NA) x2 = c(sample(n2, n2 - 1L, TRUE), NA) tim = matrix(0.0, 9L, 3L) - dimnames(tim) <- list( + dimnames(tim) = list( c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. c("prep", "both", "use") ) - tim["match", "both"] <- timefun({ + tim["match", "both"] = timefun({ p = match(x1, x2) })[3L] x1 = as.integer64(x1) x2 = as.integer64(x2) - tim["match.64", "both"] <- timefun({ + tim["match.64", "both"] = timefun({ p2 = match.integer64(x1, x2) })[3L] stopifnot(identical(p2, p)) - tim["hashpos", "prep"] <- timefun({ + tim["hashpos", "prep"] = timefun({ h2 = hashmap(x2) })[3L] - tim["hashpos", "use"] <- timefun({ + tim["hashpos", "use"] = timefun({ p2 = hashpos(h2, x1) })[3L] stopifnot(identical(p2, p)) - tim["hashrev", "prep"] <- timefun({ + tim["hashrev", "prep"] = timefun({ h1 = hashmap(x1) })[3L] - tim["hashrev", "use"] <- timefun({ + tim["hashrev", "use"] = timefun({ p1 = hashrev(h1, x2) })[3L] stopifnot(identical(p1, p)) - tim["sortorderpos", "prep"] <- system.time({ + tim["sortorderpos", "prep"] = system.time({ s2 = clone(x2) o2 = seq_along(x2) ramsortorder(s2, o2, na.last=FALSE) })[3L] - tim["sortorderpos", "use"] <- timefun({ + tim["sortorderpos", "use"] = timefun({ p2 = sortorderpos(s2, o2, x1) })[3L] stopifnot(identical(p2, p)) - tim["orderpos", "prep"] <- timefun({ + tim["orderpos", "prep"] = timefun({ o2 = seq_along(x2) ramorder(x2, o2, na.last=FALSE) })[3L] - tim["orderpos", "use"] <- timefun({ + tim["orderpos", "use"] = timefun({ p2 = orderpos(x2, o2, x1, method=2L) })[3L] stopifnot(identical(p2, p)) hashcache(x2) - tim["hashcache", "use"] <- timefun({ + tim["hashcache", "use"] = timefun({ p2 = match.integer64(x1, x2) })[3L] stopifnot(identical(p2, p)) remcache(x2) sortordercache(x2) - tim["sortorder.cache", "use"] <- timefun({ + tim["sortorder.cache", "use"] = timefun({ p2 = match.integer64(x1, x2) })[3L] stopifnot(identical(p2, p)) remcache(x2) ordercache(x2) - tim["order.cache", "use"] <- timefun({ + tim["order.cache", "use"] = timefun({ p2 = match.integer64(x1, x2) })[3L] stopifnot(identical(p2, p)) @@ -506,7 +515,7 @@ optimizer64 = function(nsmall=2L^16L, title(paste("match", n[1L], "in", n[2L])) } - ret[["match", as.character(n1)]] <- tim + ret[["match", as.character(n1)]] = tim } } @@ -520,77 +529,77 @@ optimizer64 = function(nsmall=2L^16L, x1 = c(sample(n2, n1 - 1L, TRUE), NA) x2 = c(sample(n2, n2 - 1L, TRUE), NA) tim = matrix(0.0, 10L, 3L) - dimnames(tim) <- list( + dimnames(tim) = list( c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. c("prep", "both", "use") ) - tim["%in%", "both"] <- timefun({ + tim["%in%", "both"] = timefun({ p = x1 %in% x2 })[3L] x1 = as.integer64(x1) x2 = as.integer64(x2) - tim["match.64", "both"] <- timefun({ + tim["match.64", "both"] = timefun({ p2 = match.integer64(x1, x2, nomatch = 0L) > 0L })[3L] stopifnot(identical(p2, p)) - tim["%in%.64", "both"] <- timefun({ + tim["%in%.64", "both"] = timefun({ p2 = "%in%.integer64"(x1, x2) # this is using the custom version })[3L] stopifnot(identical(p2, p)) - tim["hashfin", "prep"] <- timefun({ + tim["hashfin", "prep"] = timefun({ h2 = hashmap(x2) })[3L] - tim["hashfin", "use"] <- timefun({ + tim["hashfin", "use"] = timefun({ p2 = hashfin(h2, x1) })[3L] stopifnot(identical(p2, p)) - tim["hashrin", "prep"] <- timefun({ + tim["hashrin", "prep"] = timefun({ h1 = hashmap(x1) })[3L] - tim["hashrin", "use"] <- timefun({ + tim["hashrin", "use"] = timefun({ p1 = hashrin(h1, x2) })[3L] stopifnot(identical(p2, p)) - tim["sortfin", "prep"] <- timefun({ + tim["sortfin", "prep"] = timefun({ s2 = clone(x2) ramsort(s2, na.last=FALSE) })[3L] - tim["sortfin", "use"] <- timefun({ + tim["sortfin", "use"] = timefun({ p2 = sortfin(s2, x1) })[3L] stopifnot(identical(p2, p)) - tim["orderfin", "prep"] <- timefun({ + tim["orderfin", "prep"] = timefun({ o2 = seq_along(x2) ramorder(x2, o2, na.last=FALSE) })[3L] - tim["orderfin", "use"] <- timefun({ + tim["orderfin", "use"] = timefun({ p2 = orderfin(x2, o2, x1) })[3L] stopifnot(identical(p2, p)) hashcache(x2) - tim["hash.cache", "use"] <- timefun({ + tim["hash.cache", "use"] = timefun({ p2 = "%in%.integer64"(x1, x2) })[3L] stopifnot(identical(p2, p)) remcache(x2) sortordercache(x2) - tim["sortorder.cache", "use"] <- timefun({ + tim["sortorder.cache", "use"] = timefun({ p2 = "%in%.integer64"(x1, x2) })[3L] stopifnot(identical(p2, p)) remcache(x2) ordercache(x2) - tim["order.cache", "use"] <- timefun({ + tim["order.cache", "use"] = timefun({ p2 = "%in%.integer64"(x1, x2) })[3L] stopifnot(identical(p2, p)) @@ -602,7 +611,7 @@ optimizer64 = function(nsmall=2L^16L, title(paste(n[1L], "%in%", n[2L])) } - ret[["%in%", as.character(n1)]] <- tim + ret[["%in%", as.character(n1)]] = tim } } if ("duplicated" %in% what) { @@ -612,78 +621,78 @@ optimizer64 = function(nsmall=2L^16L, n = N[i] x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 10L, 3L) - dimnames(tim) <- list( + dimnames(tim) = list( c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. c("prep", "both", "use") ) - tim["duplicated", "both"] <- timefun({ + tim["duplicated", "both"] = timefun({ p = duplicated(x) })[3L] x = as.integer64(x) - tim["duplicated.64", "both"] <- timefun({ + tim["duplicated.64", "both"] = timefun({ p2 = duplicated(x) })[3L] stopifnot(identical(p2, p)) - tim["hashdup", "prep"] <- timefun({ + tim["hashdup", "prep"] = timefun({ h = hashmap(x) })[3L] - tim["hashdup", "use"] <- timefun({ + tim["hashdup", "use"] = timefun({ p2 = hashdup(h) })[3L] stopifnot(identical(p2, p)) - tim["sortorderdup1", "prep"] <- timefun({ + tim["sortorderdup1", "prep"] = timefun({ s = clone(x) o = seq_along(x) ramsortorder(s, o, na.last=FALSE) nunique = sortnut(s)[1L] })[3L] - tim["sortorderdup1", "use"] <- timefun({ + tim["sortorderdup1", "use"] = timefun({ p2 = sortorderdup(s, o, method=1L) })[3L] stopifnot(identical(p2, p)) - tim["sortorderdup2", "prep"] <- tim["sortorderdup1", "prep"] - tim["sortorderdup2", "use"] <- timefun({ + tim["sortorderdup2", "prep"] = tim["sortorderdup1", "prep"] + tim["sortorderdup2", "use"] = timefun({ p2 = sortorderdup(s, o, method=2L) })[3L] stopifnot(identical(p2, p)) - tim["orderdup1", "prep"] <- timefun({ + tim["orderdup1", "prep"] = timefun({ o = seq_along(x) ramorder(x, o, na.last=FALSE) nunique = ordernut(x, o)[1L] })[3L] - tim["orderdup1", "use"] <- timefun({ + tim["orderdup1", "use"] = timefun({ p2 = orderdup(x, o, method=1L) })[3L] stopifnot(identical(p2, p)) - tim["orderdup2", "prep"] <- tim["orderdup1", "prep"] - tim["orderdup2", "use"] <- timefun({ + tim["orderdup2", "prep"] = tim["orderdup1", "prep"] + tim["orderdup2", "use"] = timefun({ p2 = orderdup(x, o, method=2L) })[3L] stopifnot(identical(p2, p)) hashcache(x) - tim["hash.cache", "use"] <- timefun({ + tim["hash.cache", "use"] = timefun({ p2 = duplicated(x) })[3L] stopifnot(identical(p2, p)) remcache(x) sortordercache(x) - tim["sortorder.cache", "use"] <- timefun({ + tim["sortorder.cache", "use"] = timefun({ p2 = duplicated(x) })[3L] stopifnot(identical(p2, p)) remcache(x) ordercache(x) - tim["order.cache", "use"] <- timefun({ + tim["order.cache", "use"] = timefun({ p2 = duplicated(x) })[3L] stopifnot(identical(p2, p)) @@ -694,7 +703,7 @@ optimizer64 = function(nsmall=2L^16L, title(paste0("duplicated(", n, ")")) } - ret[["duplicated", as.character(n)]] <- tim + ret[["duplicated", as.character(n)]] = tim } } if ("unique" %in% what) { @@ -704,12 +713,12 @@ optimizer64 = function(nsmall=2L^16L, n = N[i] x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 15L, 3L) - dimnames(tim) <- list( + dimnames(tim) = list( c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. c("prep", "both", "use") ) - tim["unique", "both"] <- timefun({ + tim["unique", "both"] = timefun({ p = unique(x) })[3L] x = as.integer64(x) @@ -717,93 +726,93 @@ optimizer64 = function(nsmall=2L^16L, if (uniorder == "values") ramsort(p, na.last=FALSE) - tim["unique.64", "both"] <- timefun({ + tim["unique.64", "both"] = timefun({ p2 = unique(x, order=uniorder) })[3L] if (uniorder != "any") stopifnot(identical.integer64(p2, p)) - tim["hashmapuni", "both"] <- timefun({ + tim["hashmapuni", "both"] = timefun({ p2 = hashmapuni(x) })[3L] if (uniorder == "original") stopifnot(identical.integer64(p2, p)) - tim["hashuni", "prep"] <- timefun({ + tim["hashuni", "prep"] = timefun({ h = hashmap(x) - # for(r in 1:r)h <- hashmap(x, nunique=h$nunique) + # for(r in 1:r)h = hashmap(x, nunique=h$nunique) })[3L] - tim["hashuni", "use"] <- timefun({ + tim["hashuni", "use"] = timefun({ p2 = hashuni(h) })[3L] if (uniorder == "values") stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) - tim["hashunikeep", "prep"] <- tim["hashuni", "prep"] - tim["hashunikeep", "use"] <- timefun({ + tim["hashunikeep", "prep"] = tim["hashuni", "prep"] + tim["hashunikeep", "use"] = timefun({ p2 = hashuni(h, keep.order=TRUE) })[3L] if (uniorder == "original") stopifnot(identical.integer64(p2, p)) - tim["sortuni", "prep"] <- timefun({ + tim["sortuni", "prep"] = timefun({ s = clone(x) ramsort(s, na.last=FALSE) nunique = sortnut(s)[1L] })[3L] - tim["sortuni", "use"] <- timefun({ + tim["sortuni", "use"] = timefun({ p2 = sortuni(s, nunique) })[3L] if (uniorder == "values") stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) - tim["sortunikeep", "prep"] <- timefun({ + tim["sortunikeep", "prep"] = timefun({ s = clone(x) o = seq_along(x) ramsortorder(s, o, na.last=FALSE) nunique = sortnut(s)[1L] })[3L] - tim["sortunikeep", "use"] <- timefun({ + tim["sortunikeep", "use"] = timefun({ p2 = sortorderuni(x, s, o, nunique) })[3L] if (uniorder == "original") stopifnot(identical.integer64(p2, p)) - tim["orderuni", "prep"] <- timefun({ + tim["orderuni", "prep"] = timefun({ o = seq_along(x) ramorder(x, o, na.last=FALSE) nunique = ordernut(x, o)[1L] })[3L] - tim["orderuni", "use"] <- timefun({ + tim["orderuni", "use"] = timefun({ p2 = orderuni(x, o, nunique) })[3L] if (uniorder == "values") stopifnot(identical.integer64(sort(p2, na.last=FALSE), p)) - tim["orderunikeep", "prep"] <- tim["orderuni", "prep"] - tim["orderunikeep", "use"] <- timefun({ + tim["orderunikeep", "prep"] = tim["orderuni", "prep"] + tim["orderunikeep", "use"] = timefun({ p2 = orderuni(x, o, nunique, keep.order=TRUE) nunique = ordernut(x, o)[1L] })[3L] if (uniorder == "original") stopifnot(identical.integer64(p2, p)) - tim["hashdup", "prep"] <- tim["hashuni", "prep"] - tim["hashdup", "use"] <- timefun({ + tim["hashdup", "prep"] = tim["hashuni", "prep"] + tim["hashdup", "use"] = timefun({ p2 = x[!hashdup(h)] })[3L] if (uniorder == "original") stopifnot(identical.integer64(p2, p)) - tim["sortorderdup", "prep"] <- tim["sortunikeep", "prep"] - tim["sortorderdup", "use"] <- timefun({ + tim["sortorderdup", "prep"] = tim["sortunikeep", "prep"] + tim["sortorderdup", "use"] = timefun({ p2 = x[!sortorderdup(s, o)] })[3L] if (uniorder == "original") stopifnot(identical.integer64(p2, p)) hashcache(x) - tim["hash.cache", "use"] <- timefun({ + tim["hash.cache", "use"] = timefun({ p2 = unique(x, order=uniorder) })[3L] if (uniorder != "any") @@ -811,7 +820,7 @@ optimizer64 = function(nsmall=2L^16L, remcache(x) sortcache(x) - tim["sort.cache", "use"] <- timefun({ + tim["sort.cache", "use"] = timefun({ p2 = unique(x, order=uniorder) })[3L] if (uniorder != "any") @@ -819,7 +828,7 @@ optimizer64 = function(nsmall=2L^16L, remcache(x) sortordercache(x) - tim["sortorder.cache", "use"] <- timefun({ + tim["sortorder.cache", "use"] = timefun({ p2 = unique(x, order=uniorder) })[3L] if (uniorder != "any") @@ -827,7 +836,7 @@ optimizer64 = function(nsmall=2L^16L, remcache(x) ordercache(x) - tim["order.cache", "use"] <- timefun({ + tim["order.cache", "use"] = timefun({ p2 = unique(x, order=uniorder) })[3L] if (uniorder != "any") @@ -839,7 +848,7 @@ optimizer64 = function(nsmall=2L^16L, title(paste0("unique(", n, ", order=", uniorder, ")")) } - ret[["unique", as.character(n)]] <- tim + ret[["unique", as.character(n)]] = tim } } if ("unipos" %in% what) { @@ -849,103 +858,103 @@ optimizer64 = function(nsmall=2L^16L, n = N[i] x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 14L, 3L) - dimnames(tim) <- list( + dimnames(tim) = list( c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. c("prep", "both", "use") ) - tim["unique", "both"] <- timefun({ + tim["unique", "both"] = timefun({ unique(x) })[3L] x = as.integer64(x) - tim["unipos.64", "both"] <- timefun({ + tim["unipos.64", "both"] = timefun({ p = unipos(x, order=uniorder) })[3L] - tim["hashmapupo", "both"] <- timefun({ + tim["hashmapupo", "both"] = timefun({ p2 = hashmapupo(x) })[3L] if (uniorder == "original") stopifnot(identical(p2, p)) - tim["hashupo", "prep"] <- timefun({ + tim["hashupo", "prep"] = timefun({ h = hashmap(x) # if nunique is small we could re-build the hashmap at a smaller size - # h <- hashmap(x, nunique=h$nunique) + # h = hashmap(x, nunique=h$nunique) })[3L] - tim["hashupo", "use"] <- timefun({ + tim["hashupo", "use"] = timefun({ p2 = hashupo(h) })[3L] if (uniorder == "values") stopifnot(identical(sort(p2, na.last=FALSE), sort(p, na.last=FALSE))) - tim["hashupokeep", "prep"] <- tim["hashupo", "prep"] - tim["hashupokeep", "use"] <- timefun({ + tim["hashupokeep", "prep"] = tim["hashupo", "prep"] + tim["hashupokeep", "use"] = timefun({ p2 = hashupo(h, keep.order=TRUE) })[3L] if (uniorder == "original") stopifnot(identical(p2, p)) - tim["sortorderupo", "prep"] <- timefun({ + tim["sortorderupo", "prep"] = timefun({ s = clone(x) o = seq_along(x) ramsortorder(s, o, na.last=FALSE) nunique = sortnut(s)[1L] })[3L] - tim["sortorderupo", "use"] <- timefun({ + tim["sortorderupo", "use"] = timefun({ p2 = sortorderupo(s, o, nunique) })[3L] if (uniorder == "values") stopifnot(identical(p2, p)) - tim["sortorderupokeep", "prep"] <- timefun({ + tim["sortorderupokeep", "prep"] = timefun({ s = clone(x) o = seq_along(x) ramsortorder(s, o, na.last=FALSE) nunique = sortnut(s)[1L] })[3L] - tim["sortorderupokeep", "use"] <- timefun({ + tim["sortorderupokeep", "use"] = timefun({ p2 = sortorderupo(s, o, nunique, keep.order=TRUE) })[3L] if (uniorder == "original") stopifnot(identical(p2, p)) - tim["orderupo", "prep"] <- timefun({ + tim["orderupo", "prep"] = timefun({ o = seq_along(x) ramorder(x, o, na.last=FALSE) nunique = ordernut(x, o)[1L] })[3L] - tim["orderupo", "use"] <- timefun({ + tim["orderupo", "use"] = timefun({ p2 = orderupo(x, o, nunique) })[3L] if (uniorder == "values") stopifnot(identical(p2, p)) - tim["orderupokeep", "prep"] <- tim["orderupo", "prep"] - tim["orderupokeep", "use"] <- timefun({ + tim["orderupokeep", "prep"] = tim["orderupo", "prep"] + tim["orderupokeep", "use"] = timefun({ p2 = orderupo(x, o, nunique, keep.order=TRUE) nunique = ordernut(x, o)[1L] })[3L] if (uniorder == "original") stopifnot(identical(p2, p)) - tim["hashdup", "prep"] <- tim["hashupo", "prep"] - tim["hashdup", "use"] <- timefun({ + tim["hashdup", "prep"] = tim["hashupo", "prep"] + tim["hashdup", "use"] = timefun({ p2 = (1:n)[!hashdup(h)] })[3L] if (uniorder == "original") stopifnot(identical(p2, p)) - tim["sortorderdup", "prep"] <- tim["sortorderupokeep", "prep"] - tim["sortorderdup", "use"] <- timefun({ + tim["sortorderdup", "prep"] = tim["sortorderupokeep", "prep"] + tim["sortorderdup", "use"] = timefun({ p2 = (1:n)[!sortorderdup(s, o)] })[3L] if (uniorder == "original") stopifnot(identical(p2, p)) hashcache(x) - tim["hash.cache", "use"] <- timefun({ + tim["hash.cache", "use"] = timefun({ p2 = unipos(x, order=uniorder) })[3L] if (uniorder != "any") @@ -953,7 +962,7 @@ optimizer64 = function(nsmall=2L^16L, remcache(x) sortordercache(x) - tim["sortorder.cache", "use"] <- timefun({ + tim["sortorder.cache", "use"] = timefun({ p2 = unipos(x, order=uniorder) })[3L] if (uniorder != "any") @@ -961,7 +970,7 @@ optimizer64 = function(nsmall=2L^16L, remcache(x) ordercache(x) - tim["order.cache", "use"] <- timefun({ + tim["order.cache", "use"] = timefun({ p2 = unipos(x, order=uniorder) })[3L] if (uniorder != "any") @@ -973,7 +982,7 @@ optimizer64 = function(nsmall=2L^16L, title(paste0("unipos(", n, ", order=", uniorder, ")")) } - ret[["unipos", as.character(n)]] <- tim + ret[["unipos", as.character(n)]] = tim } } if ("table" %in% what) { @@ -983,42 +992,42 @@ optimizer64 = function(nsmall=2L^16L, n = N[i] x = c(sample.int(1024L, n - 1L, replace=TRUE), NA) tim = matrix(0.0, 13L, 3L) - dimnames(tim) <- list( + dimnames(tim) = list( c("tabulate", "table", "table.64", "hashmaptab", "hashtab", "hashtab2", "sorttab", "sortordertab", "ordertab", "ordertabkeep", "hash.cache", "sort.cache", "order.cache"), c("prep", "both", "use") ) - tim["tabulate", "both"] <- timefun({ + tim["tabulate", "both"] = timefun({ tabulate(x) })[3L] - tim["table", "both"] <- timefun({ + tim["table", "both"] = timefun({ p = table(x, exclude=NULL) })[3L] x = as.integer64(x) - tim["table.64", "both"] <- timefun({ + tim["table.64", "both"] = timefun({ p2 = table(x, exclude=NULL, order=taborder) })[3L] stopifnot(identical(p2, p)) - tim["hashmaptab", "both"] <- timefun({ + tim["hashmaptab", "both"] = timefun({ p = hashmaptab(x) })[3L] - tim["hashtab", "prep"] <- timefun({ + tim["hashtab", "prep"] = timefun({ h = hashmap(x) })[3L] - tim["hashtab", "use"] <- timefun({ + tim["hashtab", "use"] = timefun({ p2 = hashtab(h) })[3L] stopifnot(identical(p2, p)) - tim["hashtab2", "prep"] <- tim["hashtab", "prep"] + timefun({ + tim["hashtab2", "prep"] = tim["hashtab", "prep"] + timefun({ h = hashmap(x, nunique=h$nunique) })[3L] - tim["hashtab2", "use"] <- timefun({ + tim["hashtab2", "use"] = timefun({ p2 = hashtab(h) })[3L] @@ -1032,61 +1041,61 @@ optimizer64 = function(nsmall=2L^16L, p2 = sortp(p2) stopifnot(identical(p2, p)) - tim["sorttab", "prep"] <- timefun({ + tim["sorttab", "prep"] = timefun({ s = clone(x) ramsort(s, na.last=FALSE) nunique = sortnut(s)[1L] })[3L] - tim["sorttab", "use"] <- timefun({ + tim["sorttab", "use"] = timefun({ p2 = list(values=sortuni(s, nunique), counts=sorttab(s, nunique)) })[3L] stopifnot(identical(p2, p)) - tim["sortordertab", "prep"] <- timefun({ + tim["sortordertab", "prep"] = timefun({ s = clone(x) o = seq_along(x) ramsortorder(s, o, na.last=FALSE) nunique = sortnut(s)[1L] })[3L] - tim["sortordertab", "use"] <- timefun({ - p2 <- list(values=sortorderuni(x, s, o, nunique), counts=sortordertab(s, o)) + tim["sortordertab", "use"] = timefun({ + p2 = list(values=sortorderuni(x, s, o, nunique), counts=sortordertab(s, o)) })[3L] - p2 <- sortp(p2) + p2 = sortp(p2) stopifnot(identical(p2, p)) - tim["ordertab", "prep"] <- timefun({ - o <- seq_along(x) + tim["ordertab", "prep"] = timefun({ + o = seq_along(x) ramorder(x, o, na.last=FALSE) - nunique <- ordernut(x, o)[1L] + nunique = ordernut(x, o)[1L] })[3L] - tim["ordertab", "use"] <- timefun({ - p2 <- list(values=orderuni(x, o, nunique), counts=ordertab(x, o, nunique)) + tim["ordertab", "use"] = timefun({ + p2 = list(values=orderuni(x, o, nunique), counts=ordertab(x, o, nunique)) })[3L] stopifnot(identical(p2, p)) - tim["ordertabkeep", "prep"] <- tim["ordertab", "prep"] - tim["ordertabkeep", "use"] <- timefun({ - p2 <- list(values=orderuni(x, o, nunique, keep.order=TRUE), counts=ordertab(x, o, nunique, keep.order=TRUE)) + tim["ordertabkeep", "prep"] = tim["ordertab", "prep"] + tim["ordertabkeep", "use"] = timefun({ + p2 = list(values=orderuni(x, o, nunique, keep.order=TRUE), counts=ordertab(x, o, nunique, keep.order=TRUE)) })[3L] - p2 <- sortp(p2) + p2 = sortp(p2) stopifnot(identical(p2, p)) hashcache(x) - tim["hash.cache", "use"] <- timefun({ - p <- table(x, exclude=NULL, order=taborder) + tim["hash.cache", "use"] = timefun({ + p = table(x, exclude=NULL, order=taborder) })[3L] remcache(x) sortordercache(x, na.last=TRUE) - tim["sort.cache", "use"] <- timefun({ - p2 <- table(x, exclude=NULL, order=taborder) + tim["sort.cache", "use"] = timefun({ + p2 = table(x, exclude=NULL, order=taborder) })[3L] stopifnot(identical(p2, p)) remcache(x) ordercache(x, na.last=TRUE) - tim["order.cache", "use"] <- timefun({ - p2 <- table(x, exclude=NULL, order=taborder) + tim["order.cache", "use"] = timefun({ + p2 = table(x, exclude=NULL, order=taborder) })[3L] stopifnot(identical(p2, p)) remcache(x) @@ -1096,65 +1105,65 @@ optimizer64 = function(nsmall=2L^16L, title(paste0("table(", n, ", order=", taborder, ")")) } - ret[["table", as.character(n)]] <- tim + ret[["table", as.character(n)]] = tim } } if ("rank" %in% what) { message("rank: timings of different methods") - N <- c(nsmall, nbig) + N = c(nsmall, nbig) for (i in seq_along(N)) { - n <- N[i] - x <- c(sample(n, n - 1L, TRUE), NA) - tim <- matrix(0.0, 7L, 3L) - dimnames(tim) <- list( + n = N[i] + x = c(sample(n, n - 1L, TRUE), NA) + tim = matrix(0.0, 7L, 3L) + dimnames(tim) = list( c("rank", "rank.keep", "rank.64", "sortorderrnk", "orderrnk", "sort.cache", "order.cache"), c("prep", "both", "use") ) - tim["rank", "both"] <- timefun({ + tim["rank", "both"] = timefun({ rank(x) })[3L] - tim["rank.keep", "both"] <- timefun({ - p <- rank(x, na.last="keep") + tim["rank.keep", "both"] = timefun({ + p = rank(x, na.last="keep") })[3L] - x <- as.integer64(x) + x = as.integer64(x) - tim["rank.64", "both"] <- timefun({ - p2 <- rank.integer64(x) + tim["rank.64", "both"] = timefun({ + p2 = rank.integer64(x) })[3L] stopifnot(identical(p2, p)) - tim["sortorderrnk", "prep"] <- timefun({ - s <- clone(x) - o <- seq_along(x) - na.count <- ramsortorder(s, o, na.last=FALSE) + tim["sortorderrnk", "prep"] = timefun({ + s = clone(x) + o = seq_along(x) + na.count = ramsortorder(s, o, na.last=FALSE) })[3L] - tim["sortorderrnk", "use"] <- timefun({ - p2 <- sortorderrnk(s, o, na.count) + tim["sortorderrnk", "use"] = timefun({ + p2 = sortorderrnk(s, o, na.count) })[3L] stopifnot(identical(p2, p)) - tim["orderrnk", "prep"] <- timefun({ - o <- seq_along(x) - na.count <- ramorder(x, o, na.last=FALSE) + tim["orderrnk", "prep"] = timefun({ + o = seq_along(x) + na.count = ramorder(x, o, na.last=FALSE) })[3L] - tim["orderrnk", "use"] <- timefun({ - p2 <- orderrnk(x, o, na.count) + tim["orderrnk", "use"] = timefun({ + p2 = orderrnk(x, o, na.count) })[3L] stopifnot(identical(p2, p)) sortordercache(x) - tim["sort.cache", "use"] <- timefun({ - p2 <- rank.integer64(x) + tim["sort.cache", "use"] = timefun({ + p2 = rank.integer64(x) })[3L] stopifnot(identical(p2, p)) remcache(x) ordercache(x) - tim["order.cache", "use"] <- timefun({ - p2 <- rank.integer64(x) + tim["order.cache", "use"] = timefun({ + p2 = rank.integer64(x) })[3L] stopifnot(identical(p2, p)) remcache(x) @@ -1164,63 +1173,63 @@ optimizer64 = function(nsmall=2L^16L, title(paste0("rank.integer64(", n, ")")) } - ret[["rank", as.character(n)]] <- tim + ret[["rank", as.character(n)]] = tim } } if ("quantile" %in% what) { message("quantile: timings of different methods") - N <- c(nsmall, nbig) + N = c(nsmall, nbig) for (i in seq_along(N)) { - n <- N[i] - x <- c(sample(n, n - 1L, TRUE), NA) - tim <- matrix(0.0, 6L, 3L) - dimnames(tim) <- list( + n = N[i] + x = c(sample(n, n - 1L, TRUE), NA) + tim = matrix(0.0, 6L, 3L) + dimnames(tim) = list( c("quantile", "quantile.64", "sortqtl", "orderqtl", "sort.cache", "order.cache"), c("prep", "both", "use") ) - tim["quantile", "both"] <- timefun({ - p <- quantile(x, type=1L, na.rm=TRUE) + tim["quantile", "both"] = timefun({ + p = quantile(x, type=1L, na.rm=TRUE) })[3L] - p2 <- p - p <- as.integer64(p2) - names(p) <- names(p2) + p2 = p + p = as.integer64(p2) + names(p) = names(p2) - x <- as.integer64(x) + x = as.integer64(x) - tim["quantile.64", "both"] <- timefun({ - p2 <- quantile(x, na.rm=TRUE) + tim["quantile.64", "both"] = timefun({ + p2 = quantile(x, na.rm=TRUE) })[3L] stopifnot(identical(p2, p)) - tim["sortqtl", "prep"] <- timefun({ - s <- clone(x) - na.count <- ramsort(s, na.last=FALSE) + tim["sortqtl", "prep"] = timefun({ + s = clone(x) + na.count = ramsort(s, na.last=FALSE) })[3L] - tim["sortqtl", "use"] <- timefun({ - p2 <- sortqtl(s, na.count, seq(0.0, 1.0, 0.25)) + tim["sortqtl", "use"] = timefun({ + p2 = sortqtl(s, na.count, seq(0.0, 1.0, 0.25)) })[3L] stopifnot(identical(unname(p2), unname(p))) - tim["orderqtl", "prep"] <- timefun({ - o <- seq_along(x) - na.count <- ramorder(x, o, na.last=FALSE) + tim["orderqtl", "prep"] = timefun({ + o = seq_along(x) + na.count = ramorder(x, o, na.last=FALSE) })[3L] - tim["orderqtl", "use"] <- timefun({ - p2 <- orderqtl(x, o, na.count, seq(0.0, 1.0, 0.25)) + tim["orderqtl", "use"] = timefun({ + p2 = orderqtl(x, o, na.count, seq(0.0, 1.0, 0.25)) })[3L] stopifnot(identical(unname(p2), unname(p))) sortordercache(x) - tim["sort.cache", "use"] <- timefun({ - p2 <- quantile(x, na.rm=TRUE) + tim["sort.cache", "use"] = timefun({ + p2 = quantile(x, na.rm=TRUE) })[3L] stopifnot(identical(p2, p)) remcache(x) ordercache(x) - tim["order.cache", "use"] <- timefun({ - p2 <- quantile(x, na.rm=TRUE) + tim["order.cache", "use"] = timefun({ + p2 = quantile(x, na.rm=TRUE) })[3L] stopifnot(identical(p2, p)) remcache(x) @@ -1230,7 +1239,7 @@ optimizer64 = function(nsmall=2L^16L, title(paste0("quantile(", n, ")")) } - ret[["quantile", as.character(n)]] <- tim + ret[["quantile", as.character(n)]] = tim } } @@ -1241,37 +1250,37 @@ optimizer64 = function(nsmall=2L^16L, n = N[i] x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 5L, 3L) - dimnames(tim) <- list( + dimnames(tim) = list( c("factor", "factor.64", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. c("prep", "both", "use") ) - tim["factor", "both"] <- timefun({ + tim["factor", "both"] = timefun({ p = base::factor(x) })[3L] x = as.integer64(x) - tim["factor.64", "both"] <- timefun({ + tim["factor.64", "both"] = timefun({ p2 = factor(x) })[3L] stopifnot(identical(p2, p)) hashcache(x) - tim["hashcache", "use"] <- timefun({ + tim["hashcache", "use"] = timefun({ p2 = factor(x) })[3L] stopifnot(identical(p2, p)) remcache(x) sortordercache(x) - tim["sortorder.cache", "use"] <- timefun({ + tim["sortorder.cache", "use"] = timefun({ p2 = factor(x) })[3L] stopifnot(identical(p2, p)) remcache(x) ordercache(x) - tim["order.cache", "use"] <- timefun({ + tim["order.cache", "use"] = timefun({ p2 = factor(x) })[3L] stopifnot(identical(p2, p)) @@ -1282,7 +1291,7 @@ optimizer64 = function(nsmall=2L^16L, title(paste0("factor(", n, ")")) } - ret[["factor", as.character(n)]] <- tim + ret[["factor", as.character(n)]] = tim } } From aab8b8f2d1a9e70574e5a40c1dd2e753c36257e5 Mon Sep 17 00:00:00 2001 From: hcirellu Date: Sun, 22 Feb 2026 10:03:14 +0100 Subject: [PATCH 12/17] add factor to benchmark64 --- R/highlevel64.R | 12 ++++++------ man/benchmark64.Rd | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/R/highlevel64.R b/R/highlevel64.R index b2be2631..0c3f2ed9 100644 --- a/R/highlevel64.R +++ b/R/highlevel64.R @@ -221,7 +221,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { else cor(rank.integer64(b), rank.integer64(b2), use="na.or.complete") })[3L] - message('convert to factor') + message('coerce to factor') tim1[i] = tim1[i] + timefun({ factor(b) })[3L] @@ -243,7 +243,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { b = sample(nbig, nbig, TRUE) b2 = sample(nbig, nbig, TRUE) - tim2 = matrix(0.0, 15L, 6L) + tim2 = matrix(0.0, 16L, 6L) dimnames(tim2) = list( c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)", "factor(b)"), # nolint: line_length_linter. c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache") @@ -363,7 +363,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { - # 32-bit 64-bit hashcache sortordercache ordercache allcache + # 32-bit 64-bit hashcache sortordercache ordercache allcache # cache 0.000 0.000 0.775 1.330 6.500 2.660 # match(s, b) 0.820 0.218 0.004 0.025 0.093 0.004 # s %in% b 0.810 0.234 0.003 0.022 0.093 0.003 @@ -379,9 +379,9 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { # rank(b) 13.480 0.860 0.915 0.240 0.545 0.246 # quantile(b) 0.373 0.400 0.410 0.000 0.000 0.000 # summary(b) 0.645 0.423 0.427 0.016 0.016 0.016 - # factor(b) 0.645 0.423 0.427 0.016 0.016 0.016 + # factor(b) 6.655 8.212 7.834 7.812 7.241 8.332 # TOTAL 149.173 7.179 6.291 4.448 11.320 5.239 - # 32-bit 64-bit hashcache sortordercache ordercache allcache + # 32-bit 64-bit hashcache sortordercache ordercache allcache # cache 1 1.062 0.000 0.000 0.000 0.000 # match(s, b) 1 3.761 230.420 32.475 8.843 217.300 # s %in% b 1 3.462 234.090 36.450 8.735 237.386 @@ -397,7 +397,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { # rank(b) 1 15.674 14.732 56.167 24.734 54.797 # quantile(b) 1 0.933 0.911 804.907 806.027 810.133 # summary(b) 1 1.524 1.512 39.345 39.345 39.345 -# factor(b) 1 1.524 1.512 39.345 39.345 39.345 + # factor(b) 1 0.812 0.851 0.848 0.920 0.767 # TOTAL 1 20.778 23.712 33.534 13.177 28.476 tim3 diff --git a/man/benchmark64.Rd b/man/benchmark64.Rd index f250345e..865d94e2 100644 --- a/man/benchmark64.Rd +++ b/man/benchmark64.Rd @@ -12,7 +12,7 @@ optimizer64( nbig = 2L^25L, timefun = repeat.time, what = c("match", "\%in\%", "duplicated", "unique", "unipos", "table", "rank", - "quantile"), + "quantile", "factor"), uniorder = c("original", "values", "any"), taborder = c("values", "counts"), plot = TRUE @@ -77,6 +77,7 @@ Function for measuring algorithmic performance of high-level and low-level integ rank(b) \tab ranking of big vector \cr quantile(b) \tab quantiles of big vector \cr summary(b) \tab summary of of big vector \cr + factor(b) \tab coercion to factor of big vector \cr SESSION \tab exemplary session involving multiple calls (including cache filling costs) \cr } From 93b3016f4d61c9b7cc9f51c0a5d488ab91d7998c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 23 Feb 2026 08:03:04 +0700 Subject: [PATCH 13/17] refine compatibility test for ordered --- tests/testthat/test-integer64.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index b8d859e4..f694d73f 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -828,16 +828,16 @@ test_that("match works with zero length input", { }) test_that("factor and order for integer64 are still necessary", { - skip_unless_r(">= 4.0.0") # could not find function "expect_failure" in test-ancient + skip_unless_r(">= 4.1.0") # skip implementing "expect_failure" for ancient R skip_on_cran() - # make sure that factor and order for integer64 are still necessary + x = c(132724613L, -2143220989L, -1L, NA, 1L) expect_failure(expect_identical(base::factor(as.integer64(x)), base::factor(x))) expect_failure(expect_identical(factor(as.integer64(x)), base::factor(as.integer64(x)))) expect_identical(factor(as.integer64(x)), base::factor(x)) - expect_failure(expect_identical(base::ordered(as.integer64(x)), base::ordered(x))) - expect_failure(expect_identical(ordered(as.integer64(x)), base::ordered(as.integer64(x)))) - expect_identical(ordered(as.integer64(x)), base::ordered(x)) + + expect_identical(formals(ordered), formals(base::ordered)) + expect_identical(body(ordered), body(base::ordered)) }) with_parameters_test_that("factor and order work analogously to integer:", { From ba267cf164dac7cb194051a2faaf32441392a0db Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 23 Feb 2026 08:05:14 +0700 Subject: [PATCH 14/17] test formals of factor() too --- tests/testthat/test-integer64.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index f694d73f..982be07f 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -836,6 +836,7 @@ test_that("factor and order for integer64 are still necessary", { expect_failure(expect_identical(factor(as.integer64(x)), base::factor(as.integer64(x)))) expect_identical(factor(as.integer64(x)), base::factor(x)) + expect_identical(formals(factor), formals(base::factor)) expect_identical(formals(ordered), formals(base::ordered)) expect_identical(body(ordered), body(base::ordered)) }) From 0a9063cec4cb487fdb3cac3bf182bb87551c8903 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 23 Feb 2026 10:51:00 +0000 Subject: [PATCH 15/17] vestigial nolint in nolint start/end region --- R/highlevel64.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/highlevel64.R b/R/highlevel64.R index 0c3f2ed9..a6289824 100644 --- a/R/highlevel64.R +++ b/R/highlevel64.R @@ -245,7 +245,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { tim2 = matrix(0.0, 16L, 6L) dimnames(tim2) = list( - c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)", "factor(b)"), # nolint: line_length_linter. + c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)", "factor(b)"), c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache") ) @@ -438,7 +438,7 @@ optimizer64 = function(nsmall=2L^16L, x2 = c(sample(n2, n2 - 1L, TRUE), NA) tim = matrix(0.0, 9L, 3L) dimnames(tim) = list( - c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -530,7 +530,7 @@ optimizer64 = function(nsmall=2L^16L, x2 = c(sample(n2, n2 - 1L, TRUE), NA) tim = matrix(0.0, 10L, 3L) dimnames(tim) = list( - c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -622,7 +622,7 @@ optimizer64 = function(nsmall=2L^16L, x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 10L, 3L) dimnames(tim) = list( - c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -714,7 +714,7 @@ optimizer64 = function(nsmall=2L^16L, x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 15L, 3L) dimnames(tim) = list( - c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -859,7 +859,7 @@ optimizer64 = function(nsmall=2L^16L, x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 14L, 3L) dimnames(tim) = list( - c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -1251,7 +1251,7 @@ optimizer64 = function(nsmall=2L^16L, x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 5L, 3L) dimnames(tim) = list( - c("factor", "factor.64", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("factor", "factor.64", "hashcache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) From c58ff33246b6e7fef9a3aa4b980614915cf31071 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 23 Feb 2026 17:57:15 +0700 Subject: [PATCH 16/17] need covr skip --- tests/testthat/test-integer64.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index 982be07f..ee298d17 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -830,6 +830,7 @@ test_that("match works with zero length input", { test_that("factor and order for integer64 are still necessary", { skip_unless_r(">= 4.1.0") # skip implementing "expect_failure" for ancient R skip_on_cran() + skip_on_covr() # covr edits the function bodies x = c(132724613L, -2143220989L, -1L, NA, 1L) expect_failure(expect_identical(base::factor(as.integer64(x)), base::factor(x))) From e599abe7dbfccafffae2b149771ef4b8b7f05bd6 Mon Sep 17 00:00:00 2001 From: hcirellu Date: Mon, 23 Feb 2026 13:22:03 +0100 Subject: [PATCH 17/17] add test for short and long vectors to factor() --- tests/testthat/test-integer64.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index ee298d17..ff918ee5 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -844,6 +844,9 @@ test_that("factor and order for integer64 are still necessary", { with_parameters_test_that("factor and order work analogously to integer:", { x = c(132724613L, -2143220989L, -1L, NA, 1L) + # test factor() for integer64 with short (< 4000) and long (>= 4000) vectors, because of the different code paths for the two cases + if (isTRUE(long_input)) + x = rep_len(x, 5000L) expect_identical(factor(as.integer64(x)), factor(x)) @@ -856,11 +859,12 @@ with_parameters_test_that("factor and order work analogously to integer:", { tryCatch(ordered(as.integer64(x), levels=levels, labels=labels, exclude=exclude), error=conditionMessage), tryCatch(ordered(x, levels=levels, labels=labels, exclude=exclude), error=conditionMessage) ) - }, +}, .cases = expand.grid( levels=I(list(NULL, NA, 1L, c(-1L, 1L), "1")), labels=I(list(levels, NULL, letters[1L], letters[1:2])), exclude=I(list(NULL, NA, 1L, c(-1L, 1L))), - ordered=c(TRUE, FALSE) + ordered=c(TRUE, FALSE), + long_input=c(FALSE, TRUE) ) )