diff --git a/DESCRIPTION b/DESCRIPTION index 523a40f041..ff8fe0ebf6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -66,7 +66,8 @@ Authors@R: c( person("Ofek","Shilon", role="ctb"), person("Vadim","Khotilovich", role="ctb"), person("Hadley","Wickham", role="ctb"), - person("Bennet","Becker", role="ctb")) + person("Bennet","Becker", role="ctb"), + person("Kyle","Haynes", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NAMESPACE b/NAMESPACE index 55d660a871..fbd4f8df21 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,7 @@ exportClasses(data.table, IDate, ITime) export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) -export(last,first,like,"%like%","%ilike%","%flike%",between,"%between%",inrange,"%inrange%") +export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%") export(timetaken) export(truelength, setalloccol, alloc.col, ":=") export(setattr, setnames, setcolorder, set, setDT, setDF) diff --git a/NEWS.md b/NEWS.md index 4d58f97c43..f944a2ffb8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -95,6 +95,8 @@ 14. `.datatable.aware` is now recognized in the calling environment in addition to the namespace of the calling package, [dtplyr#184](https://github.com/tidyverse/dtplyr/issues/184). Thanks to Hadley Wickham for the idea and PR. +15. New convenience function `%plike%` maps to `like(..., perl=TRUE)`, [#3702](https://github.com/Rdatatable/data.table/issues/3702). `%plike%` uses Perl-compatible regular expressions (PCRE) which extend TRE, and may be more efficient in some cases. Thanks @KyleHaynes for the suggestion and PR. + ## BUG FIXES 1. 
`by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/like.R b/R/like.R index dd2a8c5b59..b86faca8d3 100644 --- a/R/like.R +++ b/R/like.R @@ -1,15 +1,15 @@ # Intended for use with a data.table 'where' # Don't use * or % like SQL's like. Uses regexpr syntax - more powerful. # returns 'logical' so can be combined with other where clauses. -like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE) { +like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE, perl = FALSE) { if (is.factor(vector)) { # indexing by factors is equivalent to indexing by the numeric codes, see ?`[` #4748 - ret = grepl(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed)[vector] + ret = grepl(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed, perl = perl)[vector] ret[is.na(ret)] = FALSE ret } else { # most usually character, but integer and numerics will be silently coerced by grepl - grepl(pattern, vector, ignore.case = ignore.case, fixed = fixed) + grepl(pattern, vector, ignore.case = ignore.case, fixed = fixed, perl = perl) } } @@ -19,3 +19,5 @@ like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE) { # as grep -F or fgrep -- grep against a fixed pattern (no regex) # (more efficient where applicable) "%flike%" = function(vector, pattern) like(vector, pattern, fixed = TRUE) +# Perl-compatible regex +"%plike%" = function(vector, pattern) like(vector, pattern, perl = TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a39d8bfac9..49dd28509f 100644 --- 
a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7366,7 +7366,7 @@ test(1530.2, which.first(x), which(x)[1L]) test(1530.3, which.last(1:5), error = "x not boolean") test(1530.4, which.last(x), tail(which(x), 1L)) -# test for like, %like%, %ilike%, %flike% +# test for like, %like%, %ilike%, %flike%, %plike% set.seed(2L) x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="") y = factor(sample(c(letters[1:5], x), 20, TRUE)) @@ -7382,10 +7382,11 @@ test(1532.06, like(x, '()'), c(TRUE, TRUE, TRUE)) test(1532.07, like(x, '()', fixed = TRUE), c(FALSE, FALSE, TRUE)) test(1532.08, x %ilike% 'hey', c(TRUE, TRUE, FALSE)) test(1532.09, x %flike% '()', c(FALSE, FALSE, TRUE)) -## %like% test for ordered factor with NA -x = c("A", "B", "C", NA_character_) +test(1532.10, like(x, "(?=h)(?=.*y)", perl = TRUE), c(FALSE, TRUE, FALSE)) +test(1532.11, x %plike% "(?=h)(?=.*y)", c(FALSE, TRUE, FALSE)) #3702 +x = c("A", "B", "C", NA_character_) # ordered factor with NA x = ordered(x, levels = rev(x)[-1L]) -test(1532.10, x %like% "A", c(TRUE, FALSE, FALSE, FALSE)) +test(1532.12, x %like% "A", c(TRUE, FALSE, FALSE, FALSE)) # coverage for setkey() to 100% dt1 = data.table(x=sample(5), y=1:5, key="y") diff --git a/man/like.Rd b/man/like.Rd index 4eadb98a81..81016d2843 100644 --- a/man/like.Rd +++ b/man/like.Rd @@ -3,6 +3,7 @@ \alias{\%like\%} \alias{\%ilike\%} \alias{\%flike\%} +\alias{\%plike\%} \title{ Convenience function for calling grep. } \description{ Intended for use in \code{i} in \code{\link[=data.table]{[.data.table}}, i.e., for subsetting/filtering. @@ -10,16 +11,18 @@ Syntax should be familiar to SQL users, with interpretation as regex. } \usage{ -like(vector, pattern, ignore.case = FALSE, fixed = FALSE) +like(vector, pattern, ignore.case = FALSE, fixed = FALSE, perl = FALSE) vector \%like\% pattern vector \%ilike\% pattern vector \%flike\% pattern +vector \%plike\% pattern } \arguments{ \item{vector}{ Either a \code{character} or a \code{factor} vector. 
} \item{pattern}{ Pattern to be matched } \item{ignore.case}{ \code{logical}; is \code{pattern} case-sensitive? } \item{fixed}{ \code{logical}; should \code{pattern} be interpreted as a literal string (i.e., ignoring regular expressions)? } + \item{perl}{ \code{logical}; is \code{pattern} a Perl-compatible regular expression? } } \details{ Internally, \code{like} is essentially a wrapper around \code{\link[base:grep]{base::grepl}}, except that it is smarter about handling \code{factor} input (\code{base::grep} uses slow \code{as.character} conversion). @@ -34,5 +37,6 @@ DT = data.table(Name=c("Mary","George","Martha"), Salary=c(2,3,4)) DT[Name \%like\% "^Mar"] DT[Name \%ilike\% "mar"] DT[Name \%flike\% "Mar"] +DT[Name \%plike\% "(?=Ma)(?=.*y)"] } \keyword{ data }