From a5ea3719c9f820c782aebe8aca9903b9791da760 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 12 Sep 2024 12:28:50 -0700 Subject: [PATCH 1/7] First take at this test case --- .ci/atime/tests.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 0420130cee..88a746c4bc 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -166,5 +166,15 @@ test.list <- atime::atime_test_list( Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue + # Improvement brought by: https://github.com/Rdatatable/data.table/pull/5054 + "melt improved in #5054" = atime::atime_test( + N = 10^seq(1, 8), + setup = { + dt = data.table(x = sample(c(1:N, NA), N, replace = TRUE), y = sample(c(letters, NA), N, replace = TRUE)) + }, + expr = data.table:::melt(dt, id.vars = "x", na.rm = TRUE), + Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that brought the improvement + Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that brought the improvement + NULL) # nolint end: undesirable_operator_linter. From 41814d995415c9c473b58e2a83865823c0ed4c54 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 12 Sep 2024 13:14:55 -0700 Subject: [PATCH 2/7] Isolate test --- .ci/atime/tests.R | 91 ----------------------------------------------- 1 file changed, 91 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 88a746c4bc..4ffeffb4f5 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -75,97 +75,6 @@ test.list <- atime::atime_test_list( paste0('useDynLib(', new.Package_)) }, - # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 - # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 - "shallow regression fixed in #4440" = atime::atime_test( - N = 10^seq(3, 8), - setup = { - set.seed(1L) - dt <- data.table(a = sample.int(N)) - setindexv(dt, "a") - }, - expr = data.table:::shallow(dt), - # Before = "", This needs to be updated later as there are two issues here: A) The source of regression (or the particular commit that led to it) is not clear; B) Older versions of data.table are having problems when being installed in this manner (This includes commits from before March 20 2020, when the issue that discovered or first mentioned the regression was created) - Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression - Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) - - # Test based on: https://github.com/Rdatatable/data.table/issues/5424 - # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 - # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 - "memrecycle regression fixed in #5463" = atime::atime_test( - N = 10^seq(3, 8), - setup = { - n <- N/100 - set.seed(2L) - dt <- data.table( - g = sample(seq_len(n), N, TRUE), - x = runif(N), - key = "g") - dt_mod <- copy(dt) - }, - expr = data.table:::`[.data.table`(dt_mod, , N := .N, by = g), - Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue - Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue - Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5463/commits) that fixed the regression - - # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426 - # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427 - "setDT improved in #5427" = atime::atime_test( - N = 10^seq(1, 7), - setup = { - L <- replicate(N, 1, simplify = FALSE) - setDT(L) - }, - expr = { - data.table:::setattr(L, "class", NULL) - data.table:::setDT(L) - }, - Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue - Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue - - # Issue reported in: https://github.com/Rdatatable/data.table/issues/4200 - # To be fixed in: https://github.com/Rdatatable/data.table/pull/4558 - "DT[by] fixed in #4558" = atime::atime_test( - N = 10^seq(1, 20), - setup = { - d <- data.table( - id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))), - v1 = sample(5L, N, TRUE), - v2 = sample(5L, N, TRUE) - ) - }, - expr = data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id), - Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression - Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression - Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression - - # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498 - # Fixed in: https://github.com/Rdatatable/data.table/pull/4501 - "DT[,.SD] improved in #4501" = atime::atime_test( - N = 10^seq(1, 10, by=0.5), - setup = { - set.seed(1) - L = as.data.table(as.character(rnorm(N, 1, 0.5))) - setkey(L, V1) - }, - ## New DT can safely retain key. - expr = data.table:::`[.data.table`(L, , .SD), - Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue - Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue - Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue - - # Issue reported in: https://github.com/Rdatatable/data.table/issues/6286 - # Fixed in: https://github.com/Rdatatable/data.table/pull/6296 - "DT[by, verbose = TRUE] improved in #6296" = atime::atime_test( - N = 10^seq(1, 9), - setup = { - dt = data.table(a = 1:N) - dt_mod <- copy(dt) - }, - expr = data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE), - Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue - Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue - # Improvement brought by: https://github.com/Rdatatable/data.table/pull/5054 "melt improved in #5054" = atime::atime_test( N = 10^seq(1, 8), From 50bf503130a46c401a8265924d7b9f7cb8bb68d6 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 12 Sep 2024 13:59:06 -0700 Subject: [PATCH 3/7] N++ --- .ci/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 4ffeffb4f5..4d2c8ac21d 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -77,7 +77,7 @@ test.list <- atime::atime_test_list( # Improvement brought by: https://github.com/Rdatatable/data.table/pull/5054 "melt improved in #5054" = atime::atime_test( - N = 10^seq(1, 8), + N = 10^seq(1, 9), setup = { dt = data.table(x = sample(c(1:N, NA), N, replace = TRUE), y = sample(c(letters, NA), N, replace = TRUE)) }, From b63c338e91d08734dd56b3d65816b45ac45fe859 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 12 Sep 2024 14:32:03 -0700 Subject: [PATCH 4/7] More columns, making the operation more complex and memory-consuming --- .ci/atime/tests.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 4d2c8ac21d..c245515297 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -79,9 +79,14 @@ test.list <- atime::atime_test_list( "melt improved in #5054" = atime::atime_test( N = 10^seq(1, 9), setup = { - dt = data.table(x = sample(c(1:N, NA), N, replace = TRUE), y = sample(c(letters, NA), N, replace = TRUE)) + dt = data.table(x = sample(c(1:N, NA), N, replace = TRUE), + y = sample(c(letters, NA), N, replace = TRUE), + z = sample(c(NA, 1:N), N, replace = TRUE), + w = replicate(N, paste0(sample(letters, 20, replace = TRUE), collapse = "")), + v = I(lapply(1:N, function(i) list(sample(1:100, 10, replace = TRUE)))) + ) }, - expr = data.table:::melt(dt, id.vars = "x", na.rm = TRUE), + expr = data.table:::melt(dt, id.vars = c("x", "z"), na.rm = TRUE), Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that brought the improvement Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that brought the improvement From 25af4a8de59527bc88f879cbd127b2f0764b4969 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 13 Sep 2024 11:33:29 -0700 Subject: [PATCH 5/7] N++ --- .ci/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index c245515297..241287d91e 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -77,7 +77,7 @@ test.list <- atime::atime_test_list( # Improvement brought by: https://github.com/Rdatatable/data.table/pull/5054 "melt improved in #5054" = atime::atime_test( - N = 10^seq(1, 9), + N = 10^seq(1, 10), setup = { dt = data.table(x = sample(c(1:N, NA), N, replace = TRUE), y = sample(c(letters, NA), N, replace = TRUE), From 9f3f65ac2766caa34e4b98f4f33b48e8580997a2 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 20 Sep 2024 11:48:32 -0700 Subject: [PATCH 6/7] Use the example Toby provided (or utilizing measure.vars with NAs in it) --- .ci/atime/tests.R | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 241287d91e..fcb1619e97 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -75,18 +75,28 @@ test.list <- atime::atime_test_list( paste0('useDynLib(', new.Package_)) }, + setup={ + DT <- as.data.table(as.list(1:N)) + measure.vars <- lapply(1:N,function(i){ + x=rep(NA,N) + x[i]=i + x + }) + }, + expr=data.table:::melt(DT, measure.vars=measure.vars) + # Improvement brought by: https://github.com/Rdatatable/data.table/pull/5054 "melt improved in #5054" = atime::atime_test( N = 10^seq(1, 10), setup = { - dt = data.table(x = sample(c(1:N, NA), N, replace = TRUE), - y = sample(c(letters, NA), N, replace = TRUE), - z = sample(c(NA, 1:N), N, replace = TRUE), - w = replicate(N, paste0(sample(letters, 20, replace = TRUE), collapse = "")), - v = I(lapply(1:N, function(i) list(sample(1:100, 10, replace = TRUE)))) - ) + DT <- as.data.table(as.list(1:N)) + measure.vars <- lapply(1:N, function(i) { + x = rep(NA, N) + x[i] = i + x + }) }, - expr = data.table:::melt(dt, id.vars = c("x", "z"), na.rm = TRUE), + expr = data.table:::melt(DT, measure.vars=measure.vars), Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that brought the improvement Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that brought the improvement From e9ca17da48c5b9da19fa647b39b62c6d8668a658 Mon Sep 17 00:00:00 2001 From: Ani Date: Sun, 22 Sep 2024 23:41:17 -0700 Subject: [PATCH 7/7] Update tests.R --- .ci/atime/tests.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index fcb1619e97..b419c6d166 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -75,16 +75,6 @@ test.list <- atime::atime_test_list( paste0('useDynLib(', new.Package_)) }, - setup={ - DT <- as.data.table(as.list(1:N)) - measure.vars <- lapply(1:N,function(i){ - x=rep(NA,N) - x[i]=i - x - }) - }, - expr=data.table:::melt(DT, measure.vars=measure.vars) - # Improvement brought by: https://github.com/Rdatatable/data.table/pull/5054 "melt improved in #5054" = atime::atime_test( N = 10^seq(1, 10),