Rdatatable · mattdowle · Jun 16, 2021 · May 14, 2020 · May 14, 2020 · May 19, 2020
@@ -154,7 +154,10 @@ grep -n "[^A-Za-z0-9]F[^A-Za-z0-9]" ./inst/tests/tests.Rraw
 grep -Enr "^[^#]*(?:\[|==|>|<|>=|<=|,|\(|\+)\s*[-]?[0-9]+[^0-9L:.e]" R | grep -Ev "stop|warning|tolerance"
 
 # Never use ifelse. fifelse for vectors when necessary (nothing yet)
- grep -Enr "\bifelse" R
+grep -Enr "\bifelse" R
+
+# use substr() instead of substring(), #4447
+grep -Fnr "substring" R
 
 # No system.time in main tests.Rraw. Timings should be in benchmark.Rraw
 grep -Fn "system.time" ./inst/tests/*.Rraw | grep -Fv "benchmark.Rraw" | grep -Fv "this system.time usage ok"

@@ -157,7 +157,7 @@ status0 = function(bioc=FALSE) {
     if (file.exists(fn)) {
       v = suppressWarnings(system(paste0("grep 'Status:' ",fn), intern=TRUE))
       if (!length(v)) return("RUNNING")
-      return(substring(v,9))
+      return(substr(v, 9L, nchar(v)))
     }
     if (file.exists(paste0("./",x,".Rcheck"))) return("RUNNING")
     return("NOT STARTED")

@@ -249,7 +249,7 @@ replace_dot_alias = function(e) {
     root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
     if (root == ":" ||
         (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') ||
-        ( (!length(av<-all.vars(jsub)) || all(substring(av,1L,2L)=="..")) &&
+        ( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) &&
           root %chin% c("","c","paste","paste0","-","!") &&
           missingby )) {   # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed)
       # When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find.
@@ -266,8 +266,8 @@ replace_dot_alias = function(e) {
       with=FALSE
       if (length(av)) {
         for (..name in av) {
-          name = substring(..name, 3L)
-          if (name=="") stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.")
+          name = substr(..name, 3L, nchar(..name))
+          if (!nzchar(name)) stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.")
           if (!exists(name, where=parent.frame())) {
             stop("Variable '",name,"' is not found in calling scope. Looking in calling scope because you used the .. prefix.",
               if (exists(..name, where=parent.frame()))
@@ -283,7 +283,7 @@ replace_dot_alias = function(e) {
         ..syms = av
       }
     } else if (is.name(jsub)) {
-      if (substring(jsub, 1L, 2L) == "..") stop("Internal error:  DT[, ..var] should be dealt with by the branch above now.") # nocov
+      if (startsWith(as.character(jsub), "..")) stop("Internal error:  DT[, ..var] should be dealt with by the branch above now.") # nocov
       if (!with && !exists(as.character(jsub), where=parent.frame()))
         stop("Variable '",jsub,"' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.")
     }
@@ -709,7 +709,7 @@ replace_dot_alias = function(e) {
         j = eval(jsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) # else j will be evaluated for the first time on next line
       } else {
         names(..syms) = ..syms
-        j = eval(jsub, lapply(substring(..syms,3L), get, pos=parent.frame()), parent.frame())
+        j = eval(jsub, lapply(substr(..syms, 3L, nchar(..syms)), get, pos=parent.frame()), parent.frame())
       }
       if (is.logical(j)) j <- which(j)
       if (!length(j) && !notj) return( null.data.table() )
@@ -815,7 +815,7 @@ replace_dot_alias = function(e) {
             # TODO: could be allowed if length(irows)>1 but then the index would need to be squashed for use by uniqlist, #3062
             # find if allbyvars is leading subset of any of the indices; add a trailing "__" to fix #3498 where a longer column name starts with a shorter column name
             tt = paste0(c(allbyvars,""), collapse="__")
-            w = which.first(substring(paste0(indices(x),"__"),1L,nchar(tt)) == tt)
+            w = which.first(startsWith(paste0(indices(x), "__"), tt))
             if (!is.na(w)) {
               byindex = indices(x)[w]
               if (!length(getindex(x, byindex))) {
@@ -921,8 +921,8 @@ replace_dot_alias = function(e) {
       jvnames = NULL
       drop_dot = function(x) {
         if (length(x)!=1L) stop("Internal error: drop_dot passed ",length(x)," items")  # nocov
-        if (identical(substring(x<-as.character(x), 1L, 1L), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY"))
-          substring(x, 2L)
+        if (startsWith(x<-as.character(x), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY"))
+          substr(x, 2L, nchar(x))
         else
           x
       }
@@ -1242,16 +1242,16 @@ replace_dot_alias = function(e) {
   }
 
   syms = all.vars(jsub)
-  syms = syms[ substring(syms,1L,2L)==".." ]
-  syms = syms[ substring(syms,3L,3L)!="." ]  # exclude ellipsis
+  syms = syms[ startsWith(syms, "..") ]
+  syms = syms[ substr(syms, 3L, 3L) != "." ]  # exclude ellipsis
   for (sym in syms) {
     if (sym %chin% names_x) {
       # if "..x" exists as column name, use column, for backwards compatibility; e.g. package socialmixr in rev dep checks #2779
       next
       # TODO in future, as warned in NEWS item for v1.11.0 :
       # warning(sym," in j is looking for ",getName," in calling scope, but a column '", sym, "' exists. Column names should not start with ..")
     }
-    getName = substring(sym, 3L)
+    getName = substr(sym, 3L, nchar(sym))
     if (!exists(getName, parent.frame())) {
       if (exists(sym, parent.frame())) next  # user did 'manual' prefix; i.e. variable in calling scope has .. prefix
       stop("Variable '",getName,"' is not found in calling scope. Looking in calling scope because this symbol was prefixed with .. in the j= parameter.")
@@ -1731,8 +1731,9 @@ replace_dot_alias = function(e) {
           # is.symbol() is for #1369, #1974 and #2949
           if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE)
           if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE)  # 875
-          if ((length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1 %chin% c("head","tail"))) return(TRUE)
-          # ... head-tail uses default value n=6 which as of now should not go gforce ^^
+          if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na"))) && (!q1 %chin% c("head","tail"))) return(TRUE)
+          #                       ^^ base::startWith errors on NULL unfortunately
+          #        head-tail uses default value n=6 which as of now should not go gforce ... ^^
           # otherwise there must be three arguments, and only in two cases:
           #   1) head/tail(x, 1) or 2) x[n], n>0
           length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) &&
@@ -1907,7 +1908,7 @@ replace_dot_alias = function(e) {
   if (length(expr)==2L)  # no parameters passed to mean, so defaults of trim=0 and na.rm=FALSE
     return(call(".External",quote(Cfastmean),expr[[2L]], FALSE))
     # return(call(".Internal",expr))  # slightly faster than .External, but R now blocks .Internal in coerce.c from apx Sep 2012
-  if (length(expr)==3L && identical("na",substring(names(expr)[3L], 1L, 2L)))   # one parameter passed to mean()
+  if (length(expr)==3L && startsWith(names(expr)[3L], "na"))   # one parameter passed to mean()
     return(call(".External",quote(Cfastmean),expr[[2L]], expr[[3L]]))  # faster than .Call
   assign("nomeanopt",TRUE,parent.frame())
   expr  # e.g. trim is not optimized, just na.rm

@@ -55,13 +55,11 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     if (input=="" || length(grep('\\n|\\r', input))) {
       # input is data itself containing at least one \n or \r
     } else {
-      if (substring(input,1L,1L)==" ") {
+      if (startsWith(input, " ")) {
         stop("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=")
       }
-      str6 = substring(input,1L,6L)   # avoid grepl() for #2531
-      str7 = substring(input,1L,7L)
-      str8 = substring(input,1L,8L)
-      if (str7=="ftps://" || str8=="https://") {
+      str7 = substr(input, 1L, 7L) # avoid grepl() for #2531
+      if (str7=="ftps://" || startsWith(input, "https://")) {
         # nocov start
         if (!requireNamespace("curl", quietly = TRUE))
           stop("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov
@@ -71,7 +69,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
         on.exit(unlink(tmpFile), add=TRUE)
         # nocov end
       }
-      else if (str6=="ftp://" || str7== "http://" || str7=="file://") {
+      else if (startsWith(input, "ftp://") || str7== "http://" || str7=="file://") {
         # nocov start
         method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto")
         # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668
@@ -107,12 +105,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
               if (data.table) 'data.table' else 'data.frame', ".")
       return(if (data.table) data.table(NULL) else data.frame(NULL))
     }
-    ext2 = substring(file, nchar(file)-2L, nchar(file))   # last 3 characters ".gz"
-    ext3 = substring(file, nchar(file)-3L, nchar(file))   # last 4 characters ".bz2"
-    if (ext2==".gz" || ext3==".bz2") {
+    if ((is_gz <- endsWith(file, ".gz")) || endsWith(file, ".bz2")) {
       if (!requireNamespace("R.utils", quietly = TRUE))
         stop("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov
-      FUN = if (ext2==".gz") gzfile else bzfile
+      FUN = if (is_gz) gzfile else bzfile
       R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE)   # ext is not used by decompressFile when destname is supplied, but isn't optional
       file = decompFile   # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download
       on.exit(unlink(decompFile), add=TRUE)
@@ -174,9 +170,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     yaml_border_re = '^#?---'
     if (!grepl(yaml_border_re, first_line)) {
       close(f)
-      stop('Encountered <', substring(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...', '> at the first ',
-           'unskipped line (', 1L+skip, '), which does not constitute the start to a valid YAML header ',
-           '(expecting something matching regex "', yaml_border_re, '"); please check your input and try again.')
+      stop(gettextf(
+        'Encountered <%s%s> at the first unskipped line (%d), which does not constitute the start to a valid YAML header (expecting something matching regex "%s"); please check your input and try again.',
+        substr(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...' else '', 1L+skip, yaml_border_re
+      ))
     }
 
     yaml_comment_re = '^#'

@@ -215,10 +215,10 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
 compactprint = function(DT, topn=2L) {
   tt = vapply_1c(DT,function(x)class(x)[1L])
   tt[tt=="integer64"] = "i64"
-  tt = substring(tt, 1L, 3L)
+  tt = substr(tt, 1L, 3L)
   makeString = function(x) paste(x, collapse = ",")  # essentially toString.default
   cn = paste0(" [Key=",makeString(key(DT)),
-             " Types=", makeString(substring(sapply(DT, typeof), 1L, 3L)),
+             " Types=", makeString(substr(sapply(DT, typeof), 1L, 3L)),
              " Classes=", makeString(tt), "]")
   if (nrow(DT)) {
     print(copy(DT)[,(cn):="",verbose=FALSE], topn=topn, class=FALSE)

@@ -25,6 +25,13 @@ if (base::getRversion() < "3.2.0") {  # Apr 2015
   isNamespaceLoaded = function(x) x %chin% loadedNamespaces()
 }
 
+if (!exists('startsWith', 'package:base', inherits=FALSE)) {  # R 3.3.0; Apr 2016
+  startsWith = function(x, stub) substr(x, 1L, nchar(stub))==stub
+}
+if (!exists('endsWith', 'package:base', inherits=FALSE)) {
+  endsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub}
+}
+
 # which.first
 which.first = function(x)
 {

@@ -161,10 +161,10 @@ set.seed(1)
 L = lapply(1:1e6, sample, x=100, size=2)
 x = capture.output(fwrite(L))
 test(1742.1, nchar(x), c(2919861L, 2919774L))   # tests 2 very long lines, too
-test(1742.2, substring(x,1,10), c("27,58,21,9","38,91,90,6"))
+test(1742.2, substr(x, 1L, 10L), c("27,58,21,9", "38,91,90,6"))
 test(1742.3, L[[1L]], c(27L,38L))
 test(1742.4, L[[1000000L]], c(76L, 40L))
-test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40"))
+test(1742.5, substr(x, nchar(x)-10L, nchar(x)), c("50,28,95,76","62,87,23,40"))
 
 # Add scaled-up non-ASCII forder test 1896
 
@@ -3470,7 +3470,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2,
   set.seed(3)
   DT = data.table(a=5:1, b=runif(5))
   ans = dcast(DT, a ~ b, value.var="b")[c(4,.N), c(2,6)]
-  setnames(ans, substring(names(ans),1,6))
+  setnames(ans, substr(names(ans), 1L, 6L))
   test(1102.06, ans, data.table("0.1680"=c(NA,DT[1,b]), "0.8075"=c(DT[2,b],NA)))
 
   # Fix for case 2 in bug report #71 - dcast didn't aggregate properly when formula RHS has "."
@@ -7346,7 +7346,7 @@ test(1530.4, which.last(x), tail(which(x), 1L))
 set.seed(2L)
 x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="")
 y = factor(sample(c(letters[1:5], x), 20, TRUE))
-xsub = substring(x, 1L, 1L)
+xsub = substr(x, 1L, 1L)
 test(1532.01, y %like% xsub[1L], grepl(xsub[1L], y))
 test(1532.02, y %like% xsub[2L], grepl(xsub[2L], y))
 test(1532.03, like(y, xsub[1L]), grepl(xsub[1L], y))
@@ -9564,7 +9564,7 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") {
   runcmb = as.data.table(runcmb[, 1:min(100L, ncol(runcmb)), drop=FALSE]) # max 100 combinations to test
   runops = lapply(runcmb, function(cols) {
     thisops = sample(ops, k, TRUE)
-    thisops[substring(cols,1,1)=="c"] = "=="
+    thisops[startsWith(cols, "c")] = "=="
     thisops
   })
   is_only_na <- function(x) is.na(x) & !is.nan(x)
@@ -17728,3 +17728,15 @@ if (test_bit64) {
   test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA)))
 }
 
+# compatibility of endsWith backport with base::endsWith
+if (exists('endsWith', 'package:base', inherits=FALSE)) {
+  DTendsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub}
+  BSendsWith = base::endsWith
+  test(2194.1, DTendsWith('abcd', 'd'), BSendsWith('abcd', 'd'))
+  test(2194.2, DTendsWith(letters, 'e'), BSendsWith(letters, 'e'))
+  test(2194.3, DTendsWith(NA_character_, 'a'), BSendsWith(NA_character_, 'a'))
+  test(2194.4, DTendsWith(character(), 'a'), BSendsWith(character(), 'a'))
+  # file used in encoding tests
+  txt = readLines(testDir("issue_563_fread.txt"))
+  test(2194.5, DTendsWith(txt, 'B'), BSendsWith(txt, 'B'))
+}