diff --git a/NAMESPACE b/NAMESPACE index 27aa1805f0..249f2b2d55 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,8 @@ exportClasses(data.table, IDate, ITime) export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) export(setindex, setindexv, indices) export(set2key, set2keyv, key2) # deprecated with helpful error; remove after May 2019 (see #3399) -export(as.data.table,is.data.table,test.data.table,last,first,like,"%like%",between,"%between%",inrange,"%inrange%") +export(as.data.table,is.data.table,test.data.table) +export(last,first,like,"%like%","%ilike%","%flike%",between,"%between%",inrange,"%inrange%") export(timetaken) export(truelength, alloc.col, ":=") export(setattr, setnames, setcolorder, set, setDT, setDF) diff --git a/NEWS.md b/NEWS.md index d803be59ac..e7f3eb0ba1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -75,6 +75,8 @@ 8. `between()` and `%between%` are faster for `POSIXct`, [#3519](https://github.com/Rdatatable/data.table/issues/3519), and now support the `.()` alias, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the reports. There is now also support for `bit64`'s `integer64` class and more robust coercion of types, [#3517](https://github.com/Rdatatable/data.table/issues/3517). +9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. 
diff --git a/R/like.R b/R/like.R index c19c8791e7..393853be01 100644 --- a/R/like.R +++ b/R/like.R @@ -1,16 +1,18 @@ -like <- function(vector, pattern) -{ - # Intended for use with a data.table 'where' - # Don't use * or % like SQL's like. Uses regexpr syntax - more powerful. +# Intended for use with a data.table 'where' +# Don't use * or % like SQL's like. Uses regexpr syntax - more powerful. +# returns 'logical' so can be combined with other where clauses. +like <- function(vector, pattern, ignore.case = FALSE, fixed = FALSE) { if (is.factor(vector)) { - as.integer(vector) %in% grep(pattern,levels(vector)) + as.integer(vector) %in% grep(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed) } else { # most usually character, but integer and numerics will be silently coerced by grepl - grepl(pattern,vector) + grepl(pattern, vector, ignore.case = ignore.case, fixed = fixed) } - # returns 'logical' so can be combined with other where clauses. } -"%like%" = like - - +"%like%" = function(vector, pattern) like(vector, pattern) +# as grep -i -- grep, ignoring case +"%ilike%" = function(vector, pattern) like(vector, pattern, ignore.case = TRUE) +# as grep -F or fgrep -- grep against a fixed pattern (no regex) +# (more efficient where applicable) +"%flike%" = function(vector, pattern) like(vector, pattern, fixed = TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 04b85b889a..749fd54f9c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7117,7 +7117,7 @@ test(1530.2, which.first(x), which(x)[1L]) test(1530.3, which.last(1:5), error = "x not boolean") test(1530.4, which.last(x), tail(which(x), 1L)) -# test for like, %like% +# test for like, %like%, %ilike%, %flike% set.seed(2L) x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="") y = factor(sample(c(letters[1:5], x), 20, TRUE)) @@ -7126,6 +7126,13 @@ test(1532.1, y %like% xsub[1L], grepl(xsub[1L], y)) test(1532.2, y %like% xsub[2L], grepl(xsub[2L], y)) 
test(1532.3, like(y, xsub[1L]), grepl(xsub[1L], y)) test(1532.4, like(y, xsub[2L]), grepl(xsub[2L], y)) +## %ilike% and %flike% for #3333 +x = c('HEY', 'hey', '()') +test(1532.5, like(x, 'hey', ignore.case = TRUE), c(TRUE, TRUE, FALSE)) +test(1532.6, like(x, '()'), c(TRUE, TRUE, TRUE)) +test(1532.7, like(x, '()', fixed = TRUE), c(FALSE, FALSE, TRUE)) +test(1532.8, x %ilike% 'hey', c(TRUE, TRUE, FALSE)) +test(1532.9, x %flike% '()', c(FALSE, FALSE, TRUE)) # coverage for setkey() to 100% dt1 = data.table(x=sample(5), y=1:5, key="y") diff --git a/man/like.Rd b/man/like.Rd index 22f88f706c..de6edae0fd 100644 --- a/man/like.Rd +++ b/man/like.Rd @@ -1,27 +1,38 @@ \name{like} \alias{like} \alias{\%like\%} -\title{ Convenience function for calling regexpr. } +\alias{\%ilike\%} +\alias{\%flike\%} +\title{ Convenience function for calling grep. } \description{ - Intended for use in \code{i} in \code{[.data.table}. + Intended for use in \code{i} in \code{\link[=data.table]{[.data.table}}, i.e., for subsetting/filtering. + + Syntax should be familiar to SQL users, with interpretation as regex. } \usage{ -like(vector,pattern) +like(vector, pattern, ignore.case = FALSE, fixed = FALSE) vector \%like\% pattern +vector \%ilike\% pattern +vector \%flike\% pattern } \arguments{ - \item{vector}{ Either a \code{character} vector or a \code{factor}. A \code{factor} is faster. } - \item{pattern}{ Passed on to \code{\link{grepl}}. } + \item{vector}{ Either a \code{character} or a \code{factor} vector. } + \item{pattern}{ Pattern to be matched } + \item{ignore.case}{ \code{logical}; should case be ignored when matching \code{pattern}? } + \item{fixed}{ \code{logical}; should \code{pattern} be interpreted as a literal string (i.e., ignoring regular expressions)? } +} +\details{ + Internally, \code{like} is essentially a wrapper around \code{\link[base]{grepl}}, except that it is smarter about handling \code{factor} input (\code{base::grep} uses slow \code{as.character} conversion). 
} -% \details{ -% } \value{ Logical vector, \code{TRUE} for items that match \code{pattern}. } \note{ Current implementation does not make use of sorted keys. } -\seealso{ \code{\link{data.table}}, \code{\link{grepl}} } +\seealso{ \code{\link[base]{grepl}} } \examples{ DT = data.table(Name=c("Mary","George","Martha"), Salary=c(2,3,4)) DT[Name \%like\% "^Mar"] +DT[Name \%ilike\% "mar"] +DT[Name \%flike\% "Mar"] } \keyword{ data }