From a2ffdc14ebfa67656e3598f0a0a0131f18f98aa5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 1 Feb 2018 03:10:53 +0000 Subject: [PATCH 1/5] R's substr should not reduce starting position by 1 when calling Scala API. --- R/pkg/R/column.R | 2 +- R/pkg/tests/fulltests/test_sparkSQL.R | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 3095adb918b67..6c10b4ccd3c1f 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -169,7 +169,7 @@ setMethod("alias", #' @note substr since 1.4.0 setMethod("substr", signature(x = "Column"), function(x, start, stop) { - jc <- callJMethod(x@jc, "substr", as.integer(start - 1), as.integer(stop - start + 1)) + jc <- callJMethod(x@jc, "substr", as.integer(start), as.integer(stop - start + 1)) column(jc) }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 5197838eaac66..bed26ec6a3752 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1649,6 +1649,7 @@ test_that("string operators", { expect_false(first(select(df, startsWith(df$name, "m")))[[1]]) expect_true(first(select(df, endsWith(df$name, "el")))[[1]]) expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi") + expect_equal(first(select(df, substr(df$name, 4, 6)))[[1]], "hae") if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { expect_true(startsWith("Hello World", "Hello")) expect_false(endsWith("Hello World", "a")) From 95c8a4e48e8f760bb9ca0df844136d19452521d7 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 1 Feb 2018 09:02:16 +0000 Subject: [PATCH 2/5] Add a note to migration guide of R doc. 
--- docs/sparkr.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/sparkr.md b/docs/sparkr.md index 6685b585a393a..7d73fe048f868 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -663,3 +663,7 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma - The `stringsAsFactors` parameter was previously ignored with `collect`, for example, in `collect(createDataFrame(iris), stringsAsFactors = TRUE))`. It has been corrected. - For `summary`, option for statistics to compute has been added. Its output is changed from that from `describe`. - A warning can be raised if versions of SparkR package and the Spark JVM do not match. + +## Upgrading to Spark 2.4.0 + + - The first parameter of `substr` method was wrongly subtracted by one, previously. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. It has been corrected. From d994d76d45e474b3e4a31fff8250c30efef6a757 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 2 Feb 2018 08:30:43 +0000 Subject: [PATCH 3/5] Fix doc. --- docs/sparkr.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sparkr.md b/docs/sparkr.md index 7d73fe048f868..96486db36d978 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -666,4 +666,4 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma ## Upgrading to Spark 2.4.0 - - The first parameter of `substr` method was wrongly subtracted by one, previously. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. It has been corrected. + - The `start` parameter of `substr` method was wrongly subtracted by one, previously. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. It has been corrected. 
From 0ebdf74942e0894bfaf6cbede4c03fd3f5d26411 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 6 Mar 2018 04:54:48 +0000 Subject: [PATCH 4/5] Improve doc. --- R/pkg/R/column.R | 8 +++++++- docs/sparkr.md | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 6c10b4ccd3c1f..3d6d9f9746ee6 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -164,8 +164,14 @@ setMethod("alias", #' @aliases substr,Column-method #' #' @param x a Column. -#' @param start starting position. +#' @param start starting position. It should be 1-based. #' @param stop ending position. +#' @examples +#' \dontrun{ +#' df <- createDataFrame(list(list(a="abcdef"))) +#' collect(select(df, substr(df$a, 1, 4))) # the result is `abcd`. +#' collect(select(df, substr(df$a, 2, 4))) # the result is `bcd`. +#' } #' @note substr since 1.4.0 setMethod("substr", signature(x = "Column"), function(x, start, stop) { diff --git a/docs/sparkr.md b/docs/sparkr.md index 96486db36d978..e2ad785def84e 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -666,4 +666,4 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma ## Upgrading to Spark 2.4.0 - - The `start` parameter of `substr` method was wrongly subtracted by one, previously. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. It has been corrected. + - The `start` parameter of `substr` method was wrongly subtracted by one, previously. In other words, the index specified by `start` parameter was considered as 0-base. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. It has been fixed so the `start` parameter of `substr` method is now 1-base, e.g., `substr(df$a, 2, 5)` should be changed to `substr(df$a, 1, 4)`. 
From 8c1a8ec46ea28ce17fcaae42aa7b9955cb34bfc8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 7 Mar 2018 06:30:57 +0000 Subject: [PATCH 5/5] Improve doc clarity. --- docs/sparkr.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sparkr.md b/docs/sparkr.md index e2ad785def84e..2909247e79e95 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -666,4 +666,4 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma ## Upgrading to Spark 2.4.0 - - The `start` parameter of `substr` method was wrongly subtracted by one, previously. In other words, the index specified by `start` parameter was considered as 0-base. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. It has been fixed so the `start` parameter of `substr` method is now 1-base, e.g., `substr(df$a, 2, 5)` should be changed to `substr(df$a, 1, 4)`. + - The `start` parameter of `substr` method was wrongly subtracted by one, previously. In other words, the index specified by `start` parameter was considered as 0-based. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. It has been fixed so the `start` parameter of `substr` method is now 1-based. For example, to get the same result that `substr(df$a, 2, 5)` gave previously, the call should be changed to `substr(df$a, 1, 4)`.