diff --git a/NAMESPACE b/NAMESPACE index 04bd7bd8..aad1dc30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -207,6 +207,7 @@ export(as.bitstring.integer64) export(as.character.integer64) export(as.data.frame.integer64) export(as.double.integer64) +export(as.factor) export(as.integer.integer64) export(as.integer64) export(as.integer64.NULL) @@ -219,6 +220,7 @@ export(as.integer64.integer64) export(as.integer64.logical) export(as.list.integer64) export(as.logical.integer64) +export(as.ordered) export(benchmark64) export(binattr) export(c.integer64) @@ -227,6 +229,7 @@ export(cbind.integer64) export(colSums) export(diff.integer64) export(duplicated.integer64) +export(factor) export(format.integer64) export(getcache) export(hashcache) @@ -297,6 +300,7 @@ export(order.integer64) export(ordercache) export(orderdup) export(orderdup.integer64) +export(ordered) export(orderfin) export(orderfin.integer64) export(orderkey) diff --git a/NEWS.md b/NEWS.md index 570fb462..5c910dd5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -61,6 +61,7 @@ - Ignores leading/trailing whitespace (as does `as.integer()`; #232). 1. `sortcache`, `sortordercache` and `ordercache` get a new argument `na.last`. 1. `matrix`, `array`, `%*%` and `as.matrix` get an `integer64` method (#45). Thanks @hcirellu. +1. `factor`, `as.factor`, `ordered`, and `as.ordered` support `integer64` input correctly, i.e. the levels are sorted according to `integer64` values. Thanks @hcirellu. ## BUG FIXES diff --git a/R/highlevel64.R b/R/highlevel64.R index bf037ea8..a6289824 100644 --- a/R/highlevel64.R +++ b/R/highlevel64.R @@ -47,6 +47,7 @@ #' | rank(b) | ranking of big vector | #' | quantile(b) | quantiles of big vector | #' | summary(b) | summary of of big vector | +#' | factor(b) | coercion to factor of big vector | #' | SESSION | exemplary session involving multiple calls (including cache filling costs) | #' #' Note that the timings for the cached variants do _not_ contain the @@ -220,6 +221,10 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { else cor(rank.integer64(b), rank.integer64(b2), use="na.or.complete") })[3L] + message('coerce to factor') + tim1[i] = tim1[i] + timefun({ + factor(b) + })[3L] remcache(s) remcache(b) @@ -238,9 +243,9 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { b = sample(nbig, nbig, TRUE) b2 = sample(nbig, nbig, TRUE) - tim2 = matrix(0.0, 15L, 6L) + tim2 = matrix(0.0, 16L, 6L) dimnames(tim2) = list( - c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)"), # nolint: line_length_linter. + c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)", "factor(b)"), c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache") ) @@ -338,6 +343,11 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { summary(b) })[3L] + message(colnames(tim2)[i], " factor(b)") + tim2["factor(b)", i] = timefun({ + factor(b) + })[3L] + remcache(s) remcache(b) remcache(b2) @@ -369,6 +379,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { # rank(b) 13.480 0.860 0.915 0.240 0.545 0.246 # quantile(b) 0.373 0.400 0.410 0.000 0.000 0.000 # summary(b) 0.645 0.423 0.427 0.016 0.016 0.016 + # factor(b) 6.655 8.212 7.834 7.812 7.241 8.332 # TOTAL 149.173 7.179 6.291 4.448 11.320 5.239 # 32-bit 64-bit hashcache sortordercache ordercache allcache # cache 1 1.062 0.000 0.000 0.000 0.000 @@ -386,6 +397,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { # rank(b) 1 15.674 14.732 56.167 24.734 54.797 # quantile(b) 1 0.933 0.911 804.907 806.027 810.133 # summary(b) 1 1.524 1.512 39.345 39.345 39.345 + # factor(b) 1 0.812 0.851 0.848 0.920 0.767 # TOTAL 1 20.778 23.712 33.534 13.177 28.476 tim3 @@ -398,7 +410,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) { optimizer64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time, - what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile"), + what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile", "factor"), uniorder=c("original", "values", "any"), taborder=c("values", "counts"), plot=TRUE) { @@ -426,7 +438,7 @@ optimizer64 = function(nsmall=2L^16L, x2 = c(sample(n2, n2 - 1L, TRUE), NA) tim = matrix(0.0, 9L, 3L) dimnames(tim) = list( - c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -518,7 +530,7 @@ optimizer64 = function(nsmall=2L^16L, x2 = c(sample(n2, n2 - 1L, TRUE), NA) tim = matrix(0.0, 10L, 3L) dimnames(tim) = list( - c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -601,8 +613,6 @@ optimizer64 = function(nsmall=2L^16L, ret[["%in%", as.character(n1)]] = tim } - - ret[["%in%", as.character(n1)]] <- tim } if ("duplicated" %in% what) { message("duplicated: timings of different methods") @@ -612,7 +622,7 @@ optimizer64 = function(nsmall=2L^16L, x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 10L, 3L) dimnames(tim) = list( - c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -695,8 +705,6 @@ optimizer64 = function(nsmall=2L^16L, ret[["duplicated", as.character(n)]] = tim } - - ret[["duplicated", as.character(n)]] <- tim } if ("unique" %in% what) { message("unique: timings of different methods") @@ -706,7 +714,7 @@ optimizer64 = function(nsmall=2L^16L, x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 15L, 3L) dimnames(tim) = list( - c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -786,6 +794,8 @@ optimizer64 = function(nsmall=2L^16L, p2 = orderuni(x, o, nunique, keep.order=TRUE) nunique = ordernut(x, o)[1L] })[3L] + if (uniorder == "original") + stopifnot(identical.integer64(p2, p)) tim["hashdup", "prep"] = tim["hashuni", "prep"] tim["hashdup", "use"] = timefun({ @@ -849,7 +859,7 @@ optimizer64 = function(nsmall=2L^16L, x = c(sample(n, n - 1L, TRUE), NA) tim = matrix(0.0, 14L, 3L) dimnames(tim) = list( - c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter. + c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), c("prep", "both", "use") ) @@ -1233,6 +1243,58 @@ optimizer64 = function(nsmall=2L^16L, } } + if ("factor" %in% what) { + message("factor: timings of different methods") + N = c(nsmall, nbig) + for (i in seq_along(N)) { + n = N[i] + x = c(sample(n, n - 1L, TRUE), NA) + tim = matrix(0.0, 5L, 3L) + dimnames(tim) = list( + c("factor", "factor.64", "hashcache", "sortorder.cache", "order.cache"), + c("prep", "both", "use") + ) + + tim["factor", "both"] = timefun({ + p = base::factor(x) + })[3L] + x = as.integer64(x) + + tim["factor.64", "both"] = timefun({ + p2 = factor(x) + })[3L] + stopifnot(identical(p2, p)) + + hashcache(x) + tim["hashcache", "use"] = timefun({ + p2 = factor(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + sortordercache(x) + tim["sortorder.cache", "use"] = timefun({ + p2 = factor(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + ordercache(x) + tim["order.cache", "use"] = timefun({ + p2 = factor(x) + })[3L] + stopifnot(identical(p2, p)) + remcache(x) + + if (plot) { + barplot(t(tim), cex.names=0.7) + title(paste0("factor(", n, ")")) + } + + ret[["factor", as.character(n)]] = tim + } + } + ret } # nolint end: brace_linter, line_length_linter. diff --git a/R/integer64.R b/R/integer64.R index c3b52056..48a11f4e 100644 --- a/R/integer64.R +++ b/R/integer64.R @@ -2,7 +2,7 @@ # R-Code # S3 atomic 64bit integers for R # (c) 2011-2024 Jens Oehlschägel -# (c) 2025 Michael Chirico +# (c) 2025-2026 Michael Chirico # Licence: GPL2 # Provided 'as is', use at your own risk # Created: 2011-12-11 @@ -392,6 +392,22 @@ NULL #' @name all.equal.integer64 NULL +#' Factors +#' +#' The function [factor] is used to encode a vector as a factor. +#' +#' @inheritParams base::factor +#' @param nmax an upper bound on the number of levels. +#' +#' @return An object of class "factor" or "ordered". +#' @seealso [factor][base::factor] +#' @examples +#' x <- as.integer64(c(132724613L, -2143220989L, -1L, NA, 1L)) +#' factor(x) +#' ordered(x) +#' @name factor +NULL + methods::setOldClass("integer64") # contributed by Leonardo Silvestri with modifications of JO @@ -792,6 +808,71 @@ as.POSIXct.integer64 = function(x, tz="", origin, ...) as.POSIXlt.integer64 = function(x, tz="", origin, ...) as.POSIXlt(as.double(x, ...), tz=tz, origin=origin, ...) +#' @rdname as.character.integer64 +#' @export as.factor +as.factor = function(x) factor(x=x) + +#' @rdname as.character.integer64 +#' @export as.ordered +as.ordered = function(x) ordered(x=x) + +#' @rdname factor +#' @export +factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.ordered(x), nmax=NA) { + if (!is.integer64(x)) { + sys_call = sys.call() + sys_call[[1L]] = base::factor + pf = parent.frame() + return(withCallingHandlers_and_choose_call(eval(sys_call, envir=pf), "factor")) + } + + nx = names(x) + if (missing(levels)) { + levels = sort(unique(x)) + } else if (length(x) >= 4000) { + levels = as.integer64(levels) + } + # use base::factor for short vectors because it is faster + if (length(x) < 4000) { + force(ordered) + x = as.character(x) + levels = as.character(levels) + if (missing(labels)) + return(withCallingHandlers_and_choose_call(base::factor(x=x, levels=levels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) + else + return(withCallingHandlers_and_choose_call(base::factor(x=x, levels=levels, labels=labels, exclude=exclude, ordered=ordered, nmax=nmax), "factor")) + } + + # basically copied from base::factor, but using the benefit from caching + levels = levels[is.na(match(levels, exclude))] + ret = match(x, levels) + if (!is.null(nx)) + names(ret) = nx + if (missing(labels)) { + levels(ret) = as.character(levels) + } else { + nlab = length(labels) + if (nlab == length(levels)) { + xlevs = as.character(labels) + nlevs = unique(xlevs) + at = attributes(ret) + at$levels = nlevs + ret = match(xlevs, nlevs)[ret] + attributes(ret) = at + } else if (nlab == 1L) { + levels(ret) = paste0(labels, seq_along(levels)) + } else { + stop(gettextf("invalid 'labels'; length %d should be 1 or %d", nlab, length(levels), domain="R-base"), domain=NA) + } + } + class(ret) <- c(if (ordered) "ordered", "factor") + ret +} + +#' @rdname factor +#' @export +ordered = function(x=character(), ...) factor(x, ..., ordered=TRUE) + #' @rdname as.character.integer64 #' @export print.bitstring = function(x, ...) { diff --git a/man/as.character.integer64.Rd b/man/as.character.integer64.Rd index 367fcc43..769ce956 100644 --- a/man/as.character.integer64.Rd +++ b/man/as.character.integer64.Rd @@ -13,6 +13,8 @@ \alias{as.Date.integer64} \alias{as.POSIXct.integer64} \alias{as.POSIXlt.integer64} +\alias{as.factor} +\alias{as.ordered} \alias{print.bitstring} \alias{as.list.integer64} \title{Coerce from integer64} @@ -41,6 +43,10 @@ as.bitstring(x, ...) \method{as.POSIXlt}{integer64}(x, tz = "", origin, ...) +as.factor(x) + +as.ordered(x) + \method{print}{bitstring}(x, ...) \method{as.list}{integer64}(x, ...) diff --git a/man/benchmark64.Rd b/man/benchmark64.Rd index f250345e..865d94e2 100644 --- a/man/benchmark64.Rd +++ b/man/benchmark64.Rd @@ -12,7 +12,7 @@ optimizer64( nbig = 2L^25L, timefun = repeat.time, what = c("match", "\%in\%", "duplicated", "unique", "unipos", "table", "rank", - "quantile"), + "quantile", "factor"), uniorder = c("original", "values", "any"), taborder = c("values", "counts"), plot = TRUE @@ -77,6 +77,7 @@ Function for measuring algorithmic performance of high-level and low-level integ rank(b) \tab ranking of big vector \cr quantile(b) \tab quantiles of big vector \cr summary(b) \tab summary of of big vector \cr + factor(b) \tab coercion to factor of big vector \cr SESSION \tab exemplary session involving multiple calls (including cache filling costs) \cr } diff --git a/man/factor.Rd b/man/factor.Rd new file mode 100644 index 00000000..dfcae56f --- /dev/null +++ b/man/factor.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integer64.R +\name{factor} +\alias{factor} +\alias{ordered} +\title{Factors} +\usage{ +factor( + x = character(), + levels, + labels = levels, + exclude = NA, + ordered = is.ordered(x), + nmax = NA +) + +ordered(x = character(), ...) +} +\arguments{ +\item{x}{a vector of data, usually taking a small number of distinct + values.} + +\item{levels}{an optional vector of the unique values (as character strings) + that \code{x} might have taken. The default is the unique set of + values taken by \code{\link[base]{as.character}(x)}, sorted into + increasing order \emph{of \code{x}}. Note that this set can be + specified as smaller than \code{sort(unique(x))}.} + +\item{labels}{\emph{either} an optional character vector of + labels for the levels (in the same order as \code{levels} after + removing those in \code{exclude}), \emph{or} a character string of + length 1. Duplicated values in \code{labels} can be used to map + different values of \code{x} to the same factor level.} + +\item{exclude}{a vector of values to be excluded when forming the + set of levels. This may be factor with the same level set as \code{x} + or should be a \code{character}.} + +\item{ordered}{logical flag to determine if the levels should be regarded + as ordered (in the order given).} + +\item{nmax}{an upper bound on the number of levels.} + +\item{...}{(in \code{ordered(.)}): any of the above, apart from + \code{ordered} itself.} +} +\value{ +An object of class "factor" or "ordered". +} +\description{ +The function \link{factor} is used to encode a vector as a factor. +} +\examples{ + x <- as.integer64(c(132724613L, -2143220989L, -1L, NA, 1L)) + factor(x) + ordered(x) +} +\seealso{ +\link[base:factor]{factor} +} diff --git a/tests/testthat/test-integer64.R b/tests/testthat/test-integer64.R index d483ae3a..ff918ee5 100644 --- a/tests/testthat/test-integer64.R +++ b/tests/testthat/test-integer64.R @@ -826,3 +826,45 @@ test_that("match works with zero length input", { expect_identical(match(x64, integer(), nomatch=10L), match(x32, integer(), nomatch=10L)) expect_identical(match(integer(), x64), match(integer(), x32)) }) + +test_that("factor and order for integer64 are still necessary", { + skip_unless_r(">= 4.1.0") # skip implementing "expect_failure" for ancient R + skip_on_cran() + skip_on_covr() # covr edits the function bodies + + x = c(132724613L, -2143220989L, -1L, NA, 1L) + expect_failure(expect_identical(base::factor(as.integer64(x)), base::factor(x))) + expect_failure(expect_identical(factor(as.integer64(x)), base::factor(as.integer64(x)))) + expect_identical(factor(as.integer64(x)), base::factor(x)) + + expect_identical(formals(factor), formals(base::factor)) + expect_identical(formals(ordered), formals(base::ordered)) + expect_identical(body(ordered), body(base::ordered)) +}) + +with_parameters_test_that("factor and order work analogously to integer:", { + x = c(132724613L, -2143220989L, -1L, NA, 1L) + # test factor() for integer64 with short (< 4000) and long (>= 4000) vectors, because of the different code paths for the two cases + if (isTRUE(long_input)) + x = rep_len(x, 5000L) + + expect_identical(factor(as.integer64(x)), factor(x)) + + expect_identical( + tryCatch(factor(as.integer64(x), levels=levels, labels=labels, exclude=exclude, ordered=ordered), error=conditionMessage), + tryCatch(factor(x, levels=levels, labels=labels, exclude=exclude, ordered=ordered), error=conditionMessage) + ) + if (isTRUE(ordered)) + expect_identical( + tryCatch(ordered(as.integer64(x), levels=levels, labels=labels, exclude=exclude), error=conditionMessage), + tryCatch(ordered(x, levels=levels, labels=labels, exclude=exclude), error=conditionMessage) + ) +}, + .cases = expand.grid( + levels=I(list(NULL, NA, 1L, c(-1L, 1L), "1")), + labels=I(list(levels, NULL, letters[1L], letters[1:2])), + exclude=I(list(NULL, NA, 1L, c(-1L, 1L))), + ordered=c(TRUE, FALSE), + long_input=c(FALSE, TRUE) + ) +)