From a2bd60ea42326f399c2f3e23b0297ba1da5f7f55 Mon Sep 17 00:00:00 2001 From: Rick Saporta Date: Mon, 29 Sep 2014 22:33:30 -0400 Subject: [PATCH 1/3] setclass() added to data.table.r --- R/data.table.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/data.table.R b/R/data.table.R index c3e85fb444..ad56ba2724 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2044,6 +2044,19 @@ setnames = function(x,old,new) { invisible(x) } +setclass <- function(x, newclass, append=TRUE) { + if (length(newclass) == 1 && newclass=="data.table") { + warning ("use setDT() instead of setclass() to convert an object to data.table by reference") + return(setDT(x)) + } + + if (isTRUE(append)) + cl <- c(newclass, class(x)) + + setattr(x, "class", cl) +} + + setcolorder = function(x,neworder) { if (!is.data.table(x)) stop("x is not a data.table") From cf1851b45ae74de0e28b7a20391872d1a0a0e281 Mon Sep 17 00:00:00 2001 From: Rick Saporta Date: Tue, 30 Sep 2014 00:56:09 -0400 Subject: [PATCH 2/3] desc.r - functions to describe DT columns with clean output --- R/desc.r | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 R/desc.r diff --git a/R/desc.r b/R/desc.r new file mode 100644 index 0000000000..5b7b0bf0dc --- /dev/null +++ b/R/desc.r @@ -0,0 +1,119 @@ +desc <- function(x, ...) { + UseMethod ("desc") +} + +desc.default <- function(x, ...) { + stop ("Currently desc is only implemented for data.table / data.frame.\nPlease feel free to submit a feature request on https://github.com/Rdatatable/data.table/issues i") +} + +desc.data.frame <- function(x, ...) { + x.DT <- copy(x) + setDT(x.DT) + desc.data.table(x.DT) +} + +desc.data.table <- function(x, tight.output=FALSE, sort.column=TRUE, quietly=FALSE, ...) { +## NOTE: depends on as.data.table.matrix which returns strings instead of factors for character matrices +## +## TODO: Add 'details=TRUE' argument, such as min, max, etc, depending on class + + if (!length(x)) + return(null.data.table()) + + cl.list <- lapply(DT, class) + #find max length of cl.l and pad the rest of the values with "" + l <- sapply(cl.list, length) + if (mean(l) != min(l)) + ans <- rbindlist(mapply(function(cl, p) as.data.table(t(c(cl, rep("", p)))), cl.list, max(l) - l, SIMPLIFY=FALSE)) + else + ans <- as.data.table(do.call(rbind, cl.list)) + + ## TODO: Once characters are returned unofromly as characters, we can remove this check + if (is.factor(ans[[1]])) + ans <- ans[, lapply(.SD, as.character)] + + ## First column is called "class", subsequent are called "class2", "class3" etc + setnames(ans, 1, "class") + setnames(ans, gsub("V", "class", names(ans))) + + ## add in column names + ans[, column := names(DT)] + + setclass(ans, "desc", append=TRUE) + + ## Avoid printing the output twice, if user executes print(desc(DT)) + if (missing(quietly) && grepl("^print\\(|\\.", tail(sys.calls(), 3)[1:2])) + quietly <- TRUE + + ## This last portion, as well as the template definition of quietly is to allow + if (!missing(quietly) && isTRUE(quietly)) + return(invisible(ans)) + + print.desc(ans, tight.output=tight.output, sort.column=sort.column) + return(invisible(ans)) +} + +print.desc <- function(x, tight.output=FALSE, sort.column=TRUE, ...) { +## Some columns have more than one class. However, +## for the purposes of this clean output, we show only the primary class. + + ## error check. The dots are included to avoid crashes when print(x, some.arg) is called + ## However, notify user, as it might be sign of unexpected method deployment + if (length(dots <- list(...))) + warning("print.desc(x) does not use argument(s) ", paste(names(dtos), collapse=", ")) + + ## Output will be sorted according to this manual ordering + classOrder <- c("idcol", "character", "factor", "logical", "integer", "numeric" + , "Date", "POSIXct", "POSIXlt" + , "list", "data.table", "data.frame") + + if (!inherits(x, "desc")) + x <- desc(x) + else + x <- copy(x) + + ## Add in any missing classes + classOrder <- unique(c(classOrder, x$class, use.names=FALSE) ) + x[, class.factor := factor(class, levels=classOrder)] + + if (sort.column) + setkeyv(x, c("class", "column")) + + ## fmt for sprintf, check largest char size of class.factor + space.class <- sprintf("%%s %%-%is %%1s %%s", max(nchar(x$class))) ## eg: "%s %-12s %1s %s" for: "\n class : columns..." + ## combine the columns into a comma separated string, by class + ans <- x[, list(columns.out = paste(column, collapse=", ")), keyby=class.factor] + + ## chop each line between 30~60 characters + repl <- sprintf(space.class, "", "", "", "") ## thhe spae at the left of a new line for the same class + repl <- paste0("\\1,\n", repl) + ans[, columns.out := gsub("(.{30}.{0,30}?), ", repl, columns.out)] + + + out <- ans[, sprintf(fmt=space.class, ifelse(tight.output, "", "\n"), class.factor, ":", columns.out)] + cat(out, if (!tight.output) "", sep="\n") + + return(invisible(NULL)) +} + + +if (FALSE) +{ + desc(DT) + desc(DT, sort=FALSE) + desc(DT, tight=TRUE) + desc(DT, tight=TRUE, sort=FALSE) + desc(DT, tight=TRUE, sort=FALSE, quietly=FALSE) + desc(DT, tight=TRUE, sort=FALSE, quietly=TRUE) + desc(DT, quietly=FALSE) + desc(DT, quietly=TRUE) + desc(DT, tight=TRUE, sort=FALSE, quietly=TRUE) + + + ## Confirming that we do not output the ans twice. + ## (the calls to identity are simply to test sys.calls() is correctly used. + print(desc(DT)) + identity(print(desc(DT))) + identity(print(desc(identity(DT)))) +} + From c5af8a29ef2bee776ec3c1d3df67c7dcaf949a96 Mon Sep 17 00:00:00 2001 From: Rick Saporta Date: Tue, 30 Sep 2014 01:21:33 -0400 Subject: [PATCH 3/3] desc.r - bug fixes and better example --- R/desc.r | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/R/desc.r b/R/desc.r index 5b7b0bf0dc..c226de2c88 100644 --- a/R/desc.r +++ b/R/desc.r @@ -12,7 +12,7 @@ desc.data.frame <- function(x, ...) { desc.data.table(x.DT) } -desc.data.table <- function(x, tight.output=FALSE, sort.column=TRUE, quietly=FALSE, ...) { +desc.data.table <- function(x, tight.output=FALSE, sort.column=TRUE, quietly=FALSE) { ## NOTE: depends on as.data.table.matrix which returns strings instead of factors for character matrices ## ## TODO: Add 'details=TRUE' argument, such as min, max, etc, depending on class @@ -42,7 +42,7 @@ desc.data.table <- function(x, tight.output=FALSE, sort.column=TRUE, quietly=FAL setclass(ans, "desc", append=TRUE) ## Avoid printing the output twice, if user executes print(desc(DT)) - if (missing(quietly) && grepl("^print\\(|\\.", tail(sys.calls(), 3)[1:2])) + if (missing(quietly) && grepl("^print(\\(|\\.)", tail(sys.calls(), 3)[1:2])) quietly <- TRUE ## This last portion, as well as the template definition of quietly is to allow @@ -60,7 +60,7 @@ print.desc <- function(x, tight.output=FALSE, sort.column=TRUE, ...) { ## error check. The dots are included to avoid crashes when print(x, some.arg) is called ## However, notify user, as it might be sign of unexpected method deployment if (length(dots <- list(...))) - warning("print.desc(x) does not use argument(s) ", paste(names(dtos), collapse=", ")) + warning("print.desc(x) does not use argument(s) ", paste("'", names(dots), "'", collapse=", ")) ## Output will be sorted according to this manual ordering classOrder <- c("idcol", "character", "factor", "logical", "integer", "numeric" @@ -87,7 +87,8 @@ print.desc <- function(x, tight.output=FALSE, sort.column=TRUE, ...) { ## chop each line between 30~60 characters repl <- sprintf(space.class, "", "", "", "") ## thhe spae at the left of a new line for the same class repl <- paste0("\\1,\n", repl) - ans[, columns.out := gsub("(.{30}.{0,30}?), ", repl, columns.out)] + ## TODO: Take into account max(nchar(x$class) for the regex pattern + ans[, columns.out := gsub("(.{35}.{0,30}?), ", repl, columns.out)] out <- ans[, sprintf(fmt=space.class, ifelse(tight.output, "", "\n"), class.factor, ":", columns.out)] @@ -99,7 +100,24 @@ print.desc <- function(x, tight.output=FALSE, sort.column=TRUE, ...) { if (FALSE) { + L <- 1:5 + DT <- data.table(ID = LETTERS[1L] + , date = seq(Sys.Date(), length=max(L), by="day") + , last.occurance = seq(Sys.Date(), length=max(L), by="-2 month") + , value = rnorm(L) + , distance = runif(L, 100, 1e5) + , group = factor(letters[L]) + , days = as.integer(sample(30, L)) + , user = c("tammy", "tommmy", "billie", "zoe", "chloe") + , storeid = paste0("store", L) + , mileage = {set.seed(1); rnorm(L, 100, 1e5)} + , mileage.scaled = scale({set.seed(1); rnorm(L, 100, 1e5)}) + ) + desc(DT) + desc.data.table(DT, tight.output=TRUE) + + desc(DT, sort=FALSE) desc(DT, tight=TRUE) desc(DT, tight=TRUE, sort=FALSE) @@ -115,5 +133,6 @@ if (FALSE) print(desc(DT)) identity(print(desc(DT))) identity(print(desc(identity(DT)))) + }