Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ export(as.bitstring.integer64)
export(as.character.integer64)
export(as.data.frame.integer64)
export(as.double.integer64)
export(as.factor)
export(as.integer.integer64)
export(as.integer64)
export(as.integer64.NULL)
Expand All @@ -219,6 +220,7 @@ export(as.integer64.integer64)
export(as.integer64.logical)
export(as.list.integer64)
export(as.logical.integer64)
export(as.ordered)
export(benchmark64)
export(binattr)
export(c.integer64)
Expand All @@ -227,6 +229,7 @@ export(cbind.integer64)
export(colSums)
export(diff.integer64)
export(duplicated.integer64)
export(factor)
export(format.integer64)
export(getcache)
export(hashcache)
Expand Down Expand Up @@ -297,6 +300,7 @@ export(order.integer64)
export(ordercache)
export(orderdup)
export(orderdup.integer64)
export(ordered)
export(orderfin)
export(orderfin.integer64)
export(orderkey)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
- Ignores leading/trailing whitespace (as does `as.integer()`; #232).
1. `sortcache`, `sortordercache` and `ordercache` get a new argument `na.last`.
1. `matrix`, `array`, `%*%` and `as.matrix` get an `integer64` method (#45). Thanks @hcirellu.
1. `factor`, `as.factor`, `ordered`, and `as.ordered` support `integer64` input correctly, i.e. the levels are sorted according to `integer64` values. Thanks @hcirellu.

## BUG FIXES

Expand Down
86 changes: 74 additions & 12 deletions R/highlevel64.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#' | rank(b) | ranking of big vector |
#' | quantile(b) | quantiles of big vector |
#' | summary(b) | summary of of big vector |
#' | factor(b) | coercion to factor of big vector |
#' | SESSION | exemplary session involving multiple calls (including cache filling costs) |
#'
#' Note that the timings for the cached variants do _not_ contain the
Expand Down Expand Up @@ -220,6 +221,10 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) {
else
cor(rank.integer64(b), rank.integer64(b2), use="na.or.complete")
})[3L]
message('coerce to factor')
tim1[i] = tim1[i] + timefun({
factor(b)
})[3L]

remcache(s)
remcache(b)
Expand All @@ -238,9 +243,9 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) {
b = sample(nbig, nbig, TRUE)
b2 = sample(nbig, nbig, TRUE)

tim2 = matrix(0.0, 15L, 6L)
tim2 = matrix(0.0, 16L, 6L)
dimnames(tim2) = list(
c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)"), # nolint: line_length_linter.
c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)", "factor(b)"),
c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache")
)

Expand Down Expand Up @@ -338,6 +343,11 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) {
summary(b)
})[3L]

message(colnames(tim2)[i], " factor(b)")
tim2["factor(b)", i] = timefun({
factor(b)
})[3L]

remcache(s)
remcache(b)
remcache(b2)
Expand Down Expand Up @@ -369,6 +379,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) {
# rank(b) 13.480 0.860 0.915 0.240 0.545 0.246
# quantile(b) 0.373 0.400 0.410 0.000 0.000 0.000
# summary(b) 0.645 0.423 0.427 0.016 0.016 0.016
# factor(b) 6.655 8.212 7.834 7.812 7.241 8.332
# TOTAL 149.173 7.179 6.291 4.448 11.320 5.239
# 32-bit 64-bit hashcache sortordercache ordercache allcache
# cache 1 1.062 0.000 0.000 0.000 0.000
Expand All @@ -386,6 +397,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) {
# rank(b) 1 15.674 14.732 56.167 24.734 54.797
# quantile(b) 1 0.933 0.911 804.907 806.027 810.133
# summary(b) 1 1.524 1.512 39.345 39.345 39.345
# factor(b) 1 0.812 0.851 0.848 0.920 0.767
# TOTAL 1 20.778 23.712 33.534 13.177 28.476

tim3
Expand All @@ -398,7 +410,7 @@ benchmark64 = function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) {
optimizer64 = function(nsmall=2L^16L,
nbig=2L^25L,
timefun=repeat.time,
what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile"),
what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile", "factor"),
uniorder=c("original", "values", "any"),
taborder=c("values", "counts"),
plot=TRUE) {
Expand Down Expand Up @@ -426,7 +438,7 @@ optimizer64 = function(nsmall=2L^16L,
x2 = c(sample(n2, n2 - 1L, TRUE), NA)
tim = matrix(0.0, 9L, 3L)
dimnames(tim) = list(
c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"),
c("prep", "both", "use")
)

Expand Down Expand Up @@ -518,7 +530,7 @@ optimizer64 = function(nsmall=2L^16L,
x2 = c(sample(n2, n2 - 1L, TRUE), NA)
tim = matrix(0.0, 10L, 3L)
dimnames(tim) = list(
c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"),
c("prep", "both", "use")
)

Expand Down Expand Up @@ -601,8 +613,6 @@ optimizer64 = function(nsmall=2L^16L,

ret[["%in%", as.character(n1)]] = tim
}

ret[["%in%", as.character(n1)]] <- tim
Comment thread
hcirellu marked this conversation as resolved.
}
if ("duplicated" %in% what) {
message("duplicated: timings of different methods")
Expand All @@ -612,7 +622,7 @@ optimizer64 = function(nsmall=2L^16L,
x = c(sample(n, n - 1L, TRUE), NA)
tim = matrix(0.0, 10L, 3L)
dimnames(tim) = list(
c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"),
c("prep", "both", "use")
)

Expand Down Expand Up @@ -695,8 +705,6 @@ optimizer64 = function(nsmall=2L^16L,

ret[["duplicated", as.character(n)]] = tim
}

ret[["duplicated", as.character(n)]] <- tim
}
if ("unique" %in% what) {
message("unique: timings of different methods")
Expand All @@ -706,7 +714,7 @@ optimizer64 = function(nsmall=2L^16L,
x = c(sample(n, n - 1L, TRUE), NA)
tim = matrix(0.0, 15L, 3L)
dimnames(tim) = list(
c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"),
c("prep", "both", "use")
)

Expand Down Expand Up @@ -786,6 +794,8 @@ optimizer64 = function(nsmall=2L^16L,
p2 = orderuni(x, o, nunique, keep.order=TRUE)
nunique = ordernut(x, o)[1L]
})[3L]
if (uniorder == "original")
Comment thread
hcirellu marked this conversation as resolved.
stopifnot(identical.integer64(p2, p))

tim["hashdup", "prep"] = tim["hashuni", "prep"]
tim["hashdup", "use"] = timefun({
Expand Down Expand Up @@ -849,7 +859,7 @@ optimizer64 = function(nsmall=2L^16L,
x = c(sample(n, n - 1L, TRUE), NA)
tim = matrix(0.0, 14L, 3L)
dimnames(tim) = list(
c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"),
c("prep", "both", "use")
)

Expand Down Expand Up @@ -1233,6 +1243,58 @@ optimizer64 = function(nsmall=2L^16L,
}
}

if ("factor" %in% what) {
message("factor: timings of different methods")
N = c(nsmall, nbig)
for (i in seq_along(N)) {
n = N[i]
x = c(sample(n, n - 1L, TRUE), NA)
tim = matrix(0.0, 5L, 3L)
dimnames(tim) = list(
c("factor", "factor.64", "hashcache", "sortorder.cache", "order.cache"),
c("prep", "both", "use")
)

tim["factor", "both"] = timefun({
p = base::factor(x)
})[3L]
x = as.integer64(x)

tim["factor.64", "both"] = timefun({
p2 = factor(x)
})[3L]
stopifnot(identical(p2, p))

hashcache(x)
tim["hashcache", "use"] = timefun({
p2 = factor(x)
})[3L]
stopifnot(identical(p2, p))
remcache(x)

sortordercache(x)
tim["sortorder.cache", "use"] = timefun({
p2 = factor(x)
})[3L]
stopifnot(identical(p2, p))
remcache(x)

ordercache(x)
tim["order.cache", "use"] = timefun({
p2 = factor(x)
})[3L]
stopifnot(identical(p2, p))
remcache(x)

if (plot) {
barplot(t(tim), cex.names=0.7)
title(paste0("factor(", n, ")"))
}

ret[["factor", as.character(n)]] = tim
}
}

ret
}
# nolint end: brace_linter, line_length_linter.
Expand Down
83 changes: 82 additions & 1 deletion R/integer64.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# R-Code
# S3 atomic 64bit integers for R
# (c) 2011-2024 Jens Oehlschägel
# (c) 2025 Michael Chirico
# (c) 2025-2026 Michael Chirico
# Licence: GPL2
# Provided 'as is', use at your own risk
# Created: 2011-12-11
Expand Down Expand Up @@ -392,6 +392,22 @@ NULL
#' @name all.equal.integer64
NULL

#' Factors
#'
#' The function [factor] is used to encode a vector as a factor.
#'
#' @inheritParams base::factor
#' @param nmax an upper bound on the number of levels.
#'
#' @return An object of class "factor" or "ordered".
#' @seealso [factor][base::factor]
#' @examples
#' x <- as.integer64(c(132724613L, -2143220989L, -1L, NA, 1L))
#' factor(x)
#' ordered(x)
#' @name factor
NULL

methods::setOldClass("integer64")

# contributed by Leonardo Silvestri with modifications of JO
Expand Down Expand Up @@ -792,6 +808,71 @@ as.POSIXct.integer64 = function(x, tz="", origin, ...)
as.POSIXlt.integer64 = function(x, tz="", origin, ...)
as.POSIXlt(as.double(x, ...), tz=tz, origin=origin, ...)

#' @rdname as.character.integer64
#' @export as.factor
as.factor = function(x) factor(x=x)

#' @rdname as.character.integer64
#' @export as.ordered
as.ordered = function(x) ordered(x=x)

#' @rdname factor
#' @export
factor = function(x=character(), levels, labels=levels, exclude=NA, ordered=is.ordered(x), nmax=NA) {
if (!is.integer64(x)) {
sys_call = sys.call()
sys_call[[1L]] = base::factor
pf = parent.frame()
return(withCallingHandlers_and_choose_call(eval(sys_call, envir=pf), "factor"))
}

nx = names(x)
if (missing(levels)) {
levels = sort(unique(x))
} else if (length(x) >= 4000) {
levels = as.integer64(levels)
}
# use base::factor for short vectors because it is faster
if (length(x) < 4000) {
force(ordered)
x = as.character(x)
levels = as.character(levels)
if (missing(labels))
return(withCallingHandlers_and_choose_call(base::factor(x=x, levels=levels, exclude=exclude, ordered=ordered, nmax=nmax), "factor"))
else
return(withCallingHandlers_and_choose_call(base::factor(x=x, levels=levels, labels=labels, exclude=exclude, ordered=ordered, nmax=nmax), "factor"))
}

# basically copied from base::factor, but using the benefit from caching
levels = levels[is.na(match(levels, exclude))]
ret = match(x, levels)
if (!is.null(nx))
names(ret) = nx
if (missing(labels)) {
levels(ret) = as.character(levels)
} else {
nlab = length(labels)
if (nlab == length(levels)) {
xlevs = as.character(labels)
nlevs = unique(xlevs)
at = attributes(ret)
at$levels = nlevs
ret = match(xlevs, nlevs)[ret]
attributes(ret) = at
} else if (nlab == 1L) {
levels(ret) = paste0(labels, seq_along(levels))
} else {
stop(gettextf("invalid 'labels'; length %d should be 1 or %d", nlab, length(levels), domain="R-base"), domain=NA)
}
}
class(ret) <- c(if (ordered) "ordered", "factor")
ret
}

#' @rdname factor
#' @export
ordered = function(x=character(), ...) factor(x, ..., ordered=TRUE)
Comment thread
MichaelChirico marked this conversation as resolved.

#' @rdname as.character.integer64
#' @export
print.bitstring = function(x, ...) {
Expand Down
6 changes: 6 additions & 0 deletions man/as.character.integer64.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/benchmark64.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading