From 2a7eee6a98f8ca63963baa73406b64b0064bfab6 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 20 Jul 2024 21:44:06 -0500 Subject: [PATCH 1/6] check if altrep --- r/DESCRIPTION | 1 + r/src/arrow_cpp11.h | 2 +- r/tests/testthat/test-csv.R | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index ff74c566ffe..f50bc840d6e 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -62,6 +62,7 @@ Suggests: lubridate, pillar, pkgload, + readr, reticulate, rmarkdown, stringi, diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index b2ed66b83c3..b81f144ea6b 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -148,7 +148,7 @@ inline SEXP utf8_strings(SEXP x) { for (R_xlen_t i = 0; i < n; i++, ++p_x) { SEXP s = *p_x; - if (s != NA_STRING) { + if (s != NA_STRING && ALTREP(s)) { SET_STRING_ELT(x, i, Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8)); } } diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index 36f1f229a60..b1966d28cb8 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -738,5 +738,14 @@ test_that("read_csv2_arrow correctly parses comma decimals", { tf <- tempfile() writeLines("x;y\n1,2;c", con = tf) expect_equal(read_csv2_arrow(tf), tibble(x = 1.2, y = "c")) +}) + +test_that("lazy readr can roundtrip to table", { + tf <- tempfile() + on.exit(unlink(tf)) + + write.csv(tbl, tf, row.names = FALSE) + tab <- arrow::as_arrow_table(readr::read_csv(tf, lazy = TRUE, show_col_types = FALSE)) + expect_equal(tbl, as_tibble(tab)) }) From 2a0da1e24c796a1456e8faf81fca3b7306b5e39d Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 22 Jul 2024 21:47:48 -0500 Subject: [PATCH 2/6] ensure altrep strings aren't altrep first --- r/src/arrow_cpp11.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index b81f144ea6b..21a3d3743db 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -138,6 +138,11 @@ inline R_xlen_t r_string_size(SEXP s) { } // namespace unsafe inline SEXP utf8_strings(SEXP x) { + // ensure that x is not actually altrep first + if (ALTREP(x)) { + x = PROTECT(Rf_duplicate(x)); + UNPROTECT(1); + } return cpp11::unwind_protect([x] { R_xlen_t n = XLENGTH(x); @@ -148,7 +153,7 @@ inline SEXP utf8_strings(SEXP x) { for (R_xlen_t i = 0; i < n; i++, ++p_x) { SEXP s = *p_x; - if (s != NA_STRING && ALTREP(s)) { + if (s != NA_STRING) { SET_STRING_ELT(x, i, Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8)); } } From 206e94dc8c8541ce40bb44159c999443a9b861a1 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Tue, 23 Jul 2024 17:42:50 -0500 Subject: [PATCH 3/6] a slightly different setup --- r/src/arrow_cpp11.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index 21a3d3743db..5b5a1de49c8 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -138,12 +138,12 @@ inline R_xlen_t r_string_size(SEXP s) { } // namespace unsafe inline SEXP utf8_strings(SEXP x) { - // ensure that x is not actually altrep first - if (ALTREP(x)) { - x = PROTECT(Rf_duplicate(x)); - UNPROTECT(1); - } - return cpp11::unwind_protect([x] { + return cpp11::unwind_protect([&] { + // ensure that x is not actually altrep first + bool was_altrep = ALTREP(x); + if (was_altrep) { + x = PROTECT(Rf_duplicate(x)); + } R_xlen_t n = XLENGTH(x); // if `x` is an altrep of some sort, this will @@ -157,6 +157,9 @@ inline SEXP utf8_strings(SEXP x) { SET_STRING_ELT(x, i, Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8)); } } + if (was_altrep) { + UNPROTECT(1); + } return x; }); } From a22cb2efe4273f5633e05d445346069aa5a77c2b Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 24 Jul 2024 20:58:13 -0500 Subject: [PATCH 4/6] more comment --- r/src/arrow_cpp11.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index 5b5a1de49c8..073b577d63a 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -139,7 +139,8 @@ inline R_xlen_t r_string_size(SEXP s) { inline SEXP utf8_strings(SEXP x) { return cpp11::unwind_protect([&] { - // ensure that x is not actually altrep first + // ensure that x is not actually altrep first this also ensures that + // x is not altrep even after it is materialized bool was_altrep = ALTREP(x); if (was_altrep) { x = PROTECT(Rf_duplicate(x)); From 2a5746030db174cf11ba8727ac10d35cbddde49c Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 27 Jul 2024 10:03:56 -0500 Subject: [PATCH 5/6] An even simpler reprex --- r/DESCRIPTION | 1 - r/tests/testthat/test-csv.R | 17 +++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index f50bc840d6e..ff74c566ffe 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -62,7 +62,6 @@ Suggests: lubridate, pillar, pkgload, - readr, reticulate, rmarkdown, stringi, diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index b1966d28cb8..083a46ed387 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -740,12 +740,21 @@ test_that("read_csv2_arrow correctly parses comma decimals", { expect_equal(read_csv2_arrow(tf), tibble(x = 1.2, y = "c")) }) -test_that("lazy readr can roundtrip to table", { +test_that("altrep columns can roundtrip to table", { tf <- tempfile() on.exit(unlink(tf)) - write.csv(tbl, tf, row.names = FALSE) - tab <- arrow::as_arrow_table(readr::read_csv(tf, lazy = TRUE, show_col_types = FALSE)) - expect_equal(tbl, as_tibble(tab)) + # read in, some columns will be altrep by default + new_df <- read_csv_arrow(tf) + expect_equal(tbl, as_tibble(arrow_table(new_df))) + + # but also if we materialize the vector + # this could also be accomplished with printing, + # though with that the error goes from + new_df <- read_csv_arrow(tf) + test_arrow_altrep_force_materialize(new_df$chr) + + # we should still be able to turn this into a table + expect_equal(tbl, as_tibble(arrow_table(new_df))) }) From 990e2cf694034a8aa0d512b4d41123d5a310c064 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 27 Jul 2024 10:50:45 -0500 Subject: [PATCH 6/6] ugh lintr --- r/tests/testthat/test-csv.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index 083a46ed387..a6291ebda09 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -750,8 +750,7 @@ test_that("altrep columns can roundtrip to table", { expect_equal(tbl, as_tibble(arrow_table(new_df))) # but also if we materialize the vector - # this could also be accomplished with printing, - # though with that the error goes from + # this could also be accomplished with printing new_df <- read_csv_arrow(tf) test_arrow_altrep_force_materialize(new_df$chr)