From a770b00d1ded8f09a2d71f46da951f6fedad20ac Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 6 Nov 2024 00:24:41 +0300 Subject: [PATCH 01/44] Use of non-API: introduction and gather links --- posts/2024-12-12-non-api-use/index.qmd | 41 ++++++++++++++++++++++++++ posts/2024-12-12-non-api-use/refs.bib | 32 ++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 posts/2024-12-12-non-api-use/index.qmd create mode 100644 posts/2024-12-12-non-api-use/refs.bib diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd new file mode 100644 index 00000000..23937d27 --- /dev/null +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -0,0 +1,41 @@ +--- +title: "Use of non-API functions in data.table" +author: "Ivan Krylov" +date: "2024-12-12" +categories: [code] +# image: "image.jpg" +draft: true +bibliography: refs.bib +--- + +In the late 1970's, people at Bell Laboratories designed the S +programming language in order to facilitate interactive exploratory data +analysis [@Chambers2016]. Instead of writing, compiling, scheduling, and +interpreting the output of individual Fortran programs, the goal of S +was to conduct all the necessary steps of the analysis on the fly. +S achieved this not by replacing the extensive collection of Fortran +subroutines, but by providing a special interface language [@Becker1985] +through which S could communicate with compiled code. + +Fast forward more than four decades and an increase by three orders of +magnitude in storage and processing capability of computers around us. +The [dominant implementation of S is now R][is.R]. It is now feasible to +implement algorithms solely in R, recouping the potential performance +losses in programmer effort debugging and maintaining the code +[@Nash2024]. Still, the capability of R to be extended by +special-purpose compiled code is as important as ever. 
+ +Since the implementation language of R is C, not Fortran, the +programming interface for R is also defined in the C terms. + +What's in an API? +================= + + + +[is.R]: https://developer.r-project.org/blosxom.cgi/R-devel/NEWS/2024/03/08#n2024-03-09 + +[WRE]: https://cran.r-project.org/doc/manuals/R-exts.html +[NativeAPI2016]: https://wiki.r-consortium.org/view/R_Native_API +[Tierney2024]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html diff --git a/posts/2024-12-12-non-api-use/refs.bib b/posts/2024-12-12-non-api-use/refs.bib new file mode 100644 index 00000000..cceb86ab --- /dev/null +++ b/posts/2024-12-12-non-api-use/refs.bib @@ -0,0 +1,32 @@ +@book{Becker1985, + address = {Monterey, Calif}, + series = {The {Wadsworth} statistics/probability series}, + title = {Extending the {S} system}, + isbn = {978-0-534-05016-0}, + language = {eng}, + publisher = {Wadsworth}, + author = {Becker, Richard A. and Chambers, John M.}, + year = {1985}, +} +@book{Chambers2016, + address = {Milton}, + series = {Chapman \& {Hall} / {CRC} {The} {R} {Series}}, + title = {Extending {R}}, + isbn = {978-1-4987-7572-4 978-1-4987-7571-7}, + language = {eng}, + publisher = {CRC Press}, + author = {Chambers, John M.}, + year = {2016}, +} +@article{Nash2023, + author = {Nash, John C. 
and Bhattacharjee, Arkajyoti}, + title = {A Comparison of R Tools for Nonlinear Least Squares Modeling}, + journal = {The R Journal}, + year = {2024}, + note = {https://doi.org/10.32614/RJ-2023-091}, + doi = {10.32614/RJ-2023-091}, + volume = {15}, + issue = {4}, + issn = {2073-4859}, + pages = {198-215} +} From c71330ff159e1fb07fd3393debaf293b31f1d5b6 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 6 Nov 2024 13:37:11 +0300 Subject: [PATCH 02/44] Provide a skeleton for the post --- posts/2024-12-12-non-api-use/index.qmd | 123 +++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 9 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 23937d27..5f3538d3 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -17,25 +17,130 @@ S achieved this not by replacing the extensive collection of Fortran subroutines, but by providing a special interface language [@Becker1985] through which S could communicate with compiled code. -Fast forward more than four decades and an increase by three orders of -magnitude in storage and processing capability of computers around us. -The [dominant implementation of S is now R][is.R]. It is now feasible to -implement algorithms solely in R, recouping the potential performance -losses in programmer effort debugging and maintaining the code -[@Nash2024]. Still, the capability of R to be extended by +Fast forward more than four decades and an increase by more than three +orders of magnitude in storage and processing capability of computers +around us. The [dominant implementation of S is now R][is.R]. It is now +feasible to implement algorithms solely in R, recouping the potential +performance losses in programmer effort debugging and maintaining the +code [@Nash2024]. Still, the capability of R to be extended by special-purpose compiled code is as important as ever. 
- + Since the implementation language of R is C, not Fortran, the programming interface for R is also defined in the C terms. What's in an API? ================= + + + + + + +Use of non-API entry points in `data.table` +=========================================== + +> checking compiled code ... NOTE +> File ‘data.table/libs/data_table.so’: +> Found non-API calls to R: ‘LEVELS’, ‘SETLENGTH’, ‘SET_GROWABLE_BIT’, +> ‘SET_TRUELENGTH’, ‘STRING_PTR’, ‘TRUELENGTH’ +> +> Compiled code should not call non-API entry points in R. + -- ` R CMD check --as-cran ` on a released version of `data.table` + +Strings as C arrays of `CHARSXP` values: `STRING_PTR` +----------------------------------------------------- + +Fixed in git by switching to `STRING_PTR_RO`, present on CRAN for now + + + +Why non-API: writes to arrays of `SEXP` values *must* go through the +write barrier for GC to work, hence the need for `SET_STRING_ELT` and +`SET_VECTOR_ELT` + +See also: [PR18775] + +Encoding bits: `LEVELS` +----------------------- + +Waiting for R-4.5.0 to release with the new API + + +Why used: need to know the encoding. Distinguish between `CE_UTF8` and +string actually in UTF-8 (can also happen with `CE_NATIVE` in a UTF-8 +locale) + + + + + +Growable vectors: `SETLENGTH`, `SET_GROWABLE_BIT`, `SET_TRUELENGTH` +------------------------------------------------------------------- + + + +Why used: need to create new columns by reference, which requires free +column and name slots + +Why non-API: make a length too long and the list is broken. + + + + +What to do about it: reimplement in ALTREP on R => 4.1 + +Fast string matching: `SET_TRUELENGTH`, `TRUELENGTH` +---------------------------------------------------- + +Why used: to exploit the `CHARSXP` cache. R interns strings, so a string +with the given contents and encoding bits exists as a single object, +even if manually recreated using `mkCharLenCE()` and friends. +Convert everything into UTF-8 and you can use pointer comparison. 
+Given `x` and `table` of strings to find elements of `x` in, `chmatch()` +puts indices into `table` into the `TRUELENGTH` field of the `CHARSXP` +contents of `table`, then walks `x` and reads the indices back from the +matching `CHARSXP`s, then carefully restores everything. + + + +Why non-API: this field is not always used (cf. `data.table` having to +work with it being completely uninitialised in old versions of R), but R +does use it for internal purposes sometimes (cf. `data.table` having to +restore nonzero `TRUELENGTH` for `CHARSXP` values used inside `SYMSXP` +values). + +Why this is hard to fix: the current happy path is very fast. +``O(length(table)) + O(length(x))` to convert encodings, +O(length(table))` to mark indices, `O(length(x))` to look them up, +`O(length(table))` to restore everything. Done. Pointer comparisons will +take `O(length(table)*length(x))`, which is Bad. How expensive is it to +build a hash for `O(length(table))` entries? Best case lookup will be +once again `O(length(x))`, but only without collisions, the constants +are unknown, and the C standard says that hashing pointers is fraught +with peril. 
+ + [is.R]: https://developer.r-project.org/blosxom.cgi/R-devel/NEWS/2024/03/08#n2024-03-09 - [WRE]: https://cran.r-project.org/doc/manuals/R-exts.html [NativeAPI2016]: https://wiki.r-consortium.org/view/R_Native_API [Tierney2024]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html +[PR18775]: https://bugs.r-project.org/show_bug.cgi?id=18775 From 04e391b12d36290cd6c574e6b9278f407517d3ee Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 6 Nov 2024 18:51:35 +0300 Subject: [PATCH 03/44] More links and details --- posts/2024-12-12-non-api-use/index.qmd | 77 +++++++++++++++++----- posts/2024-12-12-non-api-use/precomputed.R | 5 ++ posts/2024-12-12-non-api-use/refs.bib | 2 +- 3 files changed, 65 insertions(+), 19 deletions(-) create mode 100644 posts/2024-12-12-non-api-use/precomputed.R diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 5f3538d3..b1ae6560 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -8,26 +8,31 @@ draft: true bibliography: refs.bib --- +```{r} +#| echo: false +load('precomputed.rda') +``` + In the late 1970's, people at Bell Laboratories designed the S programming language in order to facilitate interactive exploratory data analysis [@Chambers2016]. Instead of writing, compiling, scheduling, and interpreting the output of individual Fortran programs, the goal of S -was to conduct all the necessary steps of the analysis on the fly. -S achieved this not by replacing the extensive collection of Fortran +was to conduct all the necessary steps of the analysis on the fly. S +achieved this not by replacing the extensive collection of Fortran subroutines, but by providing a special interface language [@Becker1985] through which S could communicate with compiled code. Fast forward more than four decades and an increase by more than three orders of magnitude in storage and processing capability of computers -around us. 
The [dominant implementation of S is now R][is.R]. It is now +around us. The [dominant implementation of S is now R][is.R]. It is now feasible to implement algorithms solely in R, recouping the potential performance losses in programmer effort debugging and maintaining the code [@Nash2024]. Still, the capability of R to be extended by -special-purpose compiled code is as important as ever. - -Since the implementation language of R is C, not Fortran, the -programming interface for R is also defined in the C terms. +special-purpose compiled code is as important as ever. As of `r when`, +`r with(cpdb, round(sum(NeedsCompilation=='yes')/length(NeedsCompilation)*100))`% +of CRAN packages use compiled code. Since the implementation language of +R is C, not Fortran, the programming interface for R is also defined in +the C terms. What's in an API? ================= @@ -41,9 +46,10 @@ Example: serialization API --> +After the [latest conflict][ALTREPnonAPI], Luke Tierney [started +work][clarifyingAPI] on declaring functions and symbols (variables or +preprocessor constants or enums?) as API / experimental API / embedding +API --> Use of non-API entry points in `data.table` =========================================== + + > checking compiled code ... NOTE > File ‘data.table/libs/data_table.so’: > Found non-API calls to R: ‘LEVELS’, ‘SETLENGTH’, ‘SET_GROWABLE_BIT’, @@ -60,11 +68,33 @@ Use of non-API entry points in `data.table` > Compiled code should not call non-API entry points in R. 
-- ` R CMD check --as-cran ` on a released version of `data.table` +Tracked in [#6180][remove_non_API] + +Testing for a `data.frame`: `isFrame` +------------------------------------- + +[#6325][remove_isframe] + +Operating on the S4 bit of R objects: `SET_S4_OBJECT`, `UNSET_S4_OBJECT` +------------------------------------------------------------------------ + +[#6183][remove_set_s4_object] + +Converting between calls and pairlists: `SET_TYPEOF` +---------------------------------------------------- + +[#6313][remove_set_typeof] + +Reading the reference counts: `NAMED` +------------------------------------- + +[#6420][remove_named] + Strings as C arrays of `CHARSXP` values: `STRING_PTR` ----------------------------------------------------- -Fixed in git by switching to `STRING_PTR_RO`, present on CRAN for now - +[Fixed in git][remove_string_ptr] by switching to `STRING_PTR_RO`, +present on CRAN for now. Why non-API: writes to arrays of `SEXP` values *must* go through the @@ -76,7 +106,7 @@ See also: [PR18775] Encoding bits: `LEVELS` ----------------------- -Waiting for R-4.5.0 to release with the new API +[Waiting for R-4.5.0 to release with the new API][remove_levels] Why used: need to know the encoding. Distinguish between `CE_UTF8` and @@ -105,7 +135,7 @@ reachable from the GC point of view? --> -What to do about it: reimplement in ALTREP on R => 4.1 +What to do about it: reimplement in ALTREP on R ≥ 4.1 Fast string matching: `SET_TRUELENGTH`, `TRUELENGTH` ---------------------------------------------------- @@ -128,8 +158,8 @@ restore nonzero `TRUELENGTH` for `CHARSXP` values used inside `SYMSXP` values). Why this is hard to fix: the current happy path is very fast. -``O(length(table)) + O(length(x))` to convert encodings, -O(length(table))` to mark indices, `O(length(x))` to look them up, +`O(length(table)) + O(length(x))` to convert encodings, +`O(length(table))` to mark indices, `O(length(x))` to look them up, `O(length(table))` to restore everything. Done. 
Pointer comparisons will take `O(length(table)*length(x))`, which is Bad. How expensive is it to build a hash for `O(length(table))` entries? Best case lookup will be @@ -139,8 +169,19 @@ with peril. +References +========== + [is.R]: https://developer.r-project.org/blosxom.cgi/R-devel/NEWS/2024/03/08#n2024-03-09 [WRE]: https://cran.r-project.org/doc/manuals/R-exts.html [NativeAPI2016]: https://wiki.r-consortium.org/view/R_Native_API -[Tierney2024]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html +[ALTREPnonAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-April/083349.html +[clarifyingAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html +[remove_non_API]: https://github.com/Rdatatable/data.table/issues/6180 +[remove_isframe]: https://github.com/Rdatatable/data.table/pull/6235 +[remove_set_s4_object]: https://github.com/Rdatatable/data.table/pull/6183 +[remove_set_typeof]: https://github.com/Rdatatable/data.table/pull/6313 +[remove_named]: https://github.com/Rdatatable/data.table/pull/6420 +[remove_levels]: https://github.com/Rdatatable/data.table/pull/6422 +[remove_string_ptr]: https://github.com/Rdatatable/data.table/pull/6312 [PR18775]: https://bugs.r-project.org/show_bug.cgi?id=18775 diff --git a/posts/2024-12-12-non-api-use/precomputed.R b/posts/2024-12-12-non-api-use/precomputed.R new file mode 100644 index 00000000..f0638226 --- /dev/null +++ b/posts/2024-12-12-non-api-use/precomputed.R @@ -0,0 +1,5 @@ +cpdb <- tools::CRAN_package_db() +checks <- subset(tools::CRAN_check_details(), Package == 'data.table') + +when <- Sys.Date() +save(cpdb, checks, when, file = 'precomputed.rda') diff --git a/posts/2024-12-12-non-api-use/refs.bib b/posts/2024-12-12-non-api-use/refs.bib index cceb86ab..60afca46 100644 --- a/posts/2024-12-12-non-api-use/refs.bib +++ b/posts/2024-12-12-non-api-use/refs.bib @@ -18,7 +18,7 @@ @book{Chambers2016 author = {Chambers, John M.}, year = {2016}, } -@article{Nash2023, +@article{Nash2024, author = {Nash, John C. 
and Bhattacharjee, Arkajyoti}, title = {A Comparison of R Tools for Nonlinear Least Squares Modeling}, journal = {The R Journal}, From 75812665918c769e3bb0164ae52309e19a850bfd Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 7 Nov 2024 00:28:59 +0300 Subject: [PATCH 04/44] Mostly document isFrame, *_S4_OBJECT --- posts/2024-12-12-non-api-use/index.qmd | 75 +++++++++++++++++++++----- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index b1ae6560..819e81eb 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -10,6 +10,7 @@ bibliography: refs.bib ```{r} #| echo: false +library(tools) # format.check_details load('precomputed.rda') ``` @@ -55,17 +56,17 @@ API --> from CRAN? download the one compiled by Luke Tierney?) and try to extract neither-official-API nor-official-nonAPI counts --> + + Use of non-API entry points in `data.table` =========================================== - +`r gsub( + '(?m)^', '> ', perl = TRUE, + format(subset(checks, grepl('API', Output))[1,]) +)` -> checking compiled code ... NOTE -> File ‘data.table/libs/data_table.so’: -> Found non-API calls to R: ‘LEVELS’, ‘SETLENGTH’, ‘SET_GROWABLE_BIT’, -> ‘SET_TRUELENGTH’, ‘STRING_PTR’, ‘TRUELENGTH’ -> -> Compiled code should not call non-API entry points in R. 
-- ` R CMD check --as-cran ` on a released version of `data.table` Tracked in [#6180][remove_non_API] @@ -73,12 +74,51 @@ Tracked in [#6180][remove_non_API] Testing for a `data.frame`: `isFrame` ------------------------------------- -[#6325][remove_isframe] - -Operating on the S4 bit of R objects: `SET_S4_OBJECT`, `UNSET_S4_OBJECT` ------------------------------------------------------------------------- - -[#6183][remove_set_s4_object] +Back in 2012, Matt Dowle needed to quickly test an object for being a +`data.frame`, and the internal function `isFrame` seemed like it +[did the job][datatable_isframe_added]. Since `isFrame` was not part of +the documented API, in 2024 Luke Tierney gave the function a +better-fitting name, [`isDataFrame`][R_isdataframe_added], and made it +an experimental API, while retaining the original function as a wrapper. + +Use of `isFrame` [doesn't give a NOTE yet][remove_isframe], but when +R-4.5.0 is released together with the new name for the function, +`data.table` will be able to use it, falling back to `isFrame` on older +versions of R. `isDataFrame` is documented among other [replacement +entry point names][WRE_replacement_entrypoints] in Writing R Extensions. + +Operating on the S4 bit: `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` +---------------------------------------------------------------------------------------- + +The `data.table` class is [registered][setOldClass] with the S4 OOP +system, making it possible to create S4 classes containing `data.table`s +as members (`setClass(slots = c(mytable = 'data.table'))`) or even +inheriting from `data.table` (and, in turn, from `data.frame`: +`setClass(contains = 'data.table')`). This latter case requires care +from the C code: when creating a copy of an S4 `data.table` from scratch +(or setting all attributes from one object onto another), the +destination value must also end up being an S4 object. 
This is +controlled by the special "S4" bit in the header of every R object, so +the code must read and set it correctly. + +The internal functions `IS_S4_OBJECT`, `SET_S4_OBJECT`, +`UNSET_S4_OBJECT` exist as bare interfaces to [the internal +macros][IS_S4_OBJECT] of the same names and directly access the flag +inside their argument. + +The [`Rf_isS4`][isS4] function is a wrapper for `IS_S4_OBJECT` that +follows the usual naming convention for remapped functions, has been +part of the API for a long time and could implement additional checks if +they are needed by R. The [`Rf_asS4`][asS4] function (experimental API) +is more involved, making sure to operate on a shallow copy of an object +instead of overwriting it in place and trying to "deconstruct" S4 +objects into S3 objects if possible and requested. + +Solution: [use `Rf_isS4` instead of +`IS_S4_OBJECT`][remove_set_s4_object], as +[documented][WRE_replacement_entrypoints] in Writing R Extensions. Use +`Rf_asS4` to control the S4 object bit, but be careful +around shared objects. 
Converting between calls and pairlists: `SET_TYPEOF` ---------------------------------------------------- @@ -178,7 +218,14 @@ References [ALTREPnonAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-April/083349.html [clarifyingAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html [remove_non_API]: https://github.com/Rdatatable/data.table/issues/6180 -[remove_isframe]: https://github.com/Rdatatable/data.table/pull/6235 +[datatable_isframe_added]: https://github.com/Rdatatable/data.table/commit/87666e70ce1a69b28f0e92ec7504d80e3d53a824#diff-4fc47a9752ba4edfef0cabcc1958eda943545ad3859e48d498b0e3f87a9ae5aeR192 +[R_isdataframe_added]: https://github.com/r-devel/r-svn/commit/4ef83b9dc3c6874e774195d329cbb6c11a71c414 +[WRE_replacement_entrypoints]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Some-API-replacements-for-non_002dAPI-entry-points +[remove_isframe]: https://github.com/Rdatatable/data.table/issues/6244 +[setOldClass]: https://search.r-project.org/R/refmans/methods/html/setOldClass.html +[IS_S4_OBJECT]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/memory.c#L4033-L4035 +[isS4]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/objects.c#L1838-L1841 +[asS4]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/objects.c#L1843 [remove_set_s4_object]: https://github.com/Rdatatable/data.table/pull/6183 [remove_set_typeof]: https://github.com/Rdatatable/data.table/pull/6313 [remove_named]: https://github.com/Rdatatable/data.table/pull/6420 From c044ba0c7ca2e1afe25c8e17355f8c6e5db0bb5f Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 7 Nov 2024 12:43:16 +0300 Subject: [PATCH 05/44] Some history --- posts/2024-12-12-non-api-use/index.qmd | 102 +++++++++++++++++++------ 1 file changed, 77 insertions(+), 25 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 
819e81eb..95047edf 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -1,5 +1,5 @@ --- -title: "Use of non-API functions in data.table" +title: "Use of non-API functions in `data.table`" author: "Ivan Krylov" date: "2024-12-12" categories: [code] @@ -27,31 +27,74 @@ Fast forward more than four decades and an increase by more than three orders of magnitude in storage and processing capability of computers around us. The [dominant implementation of S is now R][is.R]. It is now feasible to implement algorithms solely in R, recouping the potential -performance losses in programmer effort debugging and maintaining the -code [@Nash2024]. Still, the capability of R to be extended by -special-purpose compiled code is as important as ever. As of `r when`, +performance losses in reducing the programmer effort spent debugging and +maintaining the code [@Nash2024]. Still, the capability of R to be +extended by special-purpose compiled code is as important as ever. As of +`r when`, `r with(cpdb, round(sum(NeedsCompilation=='yes')/length(NeedsCompilation)*100))`% of CRAN packages use compiled code. Since the implementation language of R is C, not Fortran, the programming interface for R is also defined in -the C terms. +terms of C. What's in an API? ================= - - - + + + + @@ -75,20 +118,20 @@ Testing for a `data.frame`: `isFrame` ------------------------------------- Back in 2012, Matt Dowle needed to quickly test an object for being a -`data.frame`, and the internal function `isFrame` seemed like it +`data.frame`, and the undocumented function `isFrame` seemed like it [did the job][datatable_isframe_added]. Since `isFrame` was not part of the documented API, in 2024 Luke Tierney gave the function a better-fitting name, [`isDataFrame`][R_isdataframe_added], and made it an experimental API, while retaining the original function as a wrapper. 
-Use of `isFrame` [doesn't give a NOTE yet][remove_isframe], but when +Use of `isFrame` [doesn't give a `NOTE` yet][remove_isframe], but when R-4.5.0 is released together with the new name for the function, `data.table` will be able to use it, falling back to `isFrame` on older versions of R. `isDataFrame` is documented among other [replacement entry point names][WRE_replacement_entrypoints] in Writing R Extensions. Operating on the S4 bit: `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` ----------------------------------------------------------------------------------------- +--------------------------------------------------------------------------- The `data.table` class is [registered][setOldClass] with the S4 OOP system, making it possible to create S4 classes containing `data.table`s @@ -101,17 +144,17 @@ destination value must also end up being an S4 object. This is controlled by the special "S4" bit in the header of every R object, so the code must read and set it correctly. -The internal functions `IS_S4_OBJECT`, `SET_S4_OBJECT`, +The undocumented functions `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` exist as bare interfaces to [the internal macros][IS_S4_OBJECT] of the same names and directly access the flag inside their argument. The [`Rf_isS4`][isS4] function is a wrapper for `IS_S4_OBJECT` that follows the usual naming convention for remapped functions, has been -part of the API for a long time and could implement additional checks if -they are needed by R. The [`Rf_asS4`][asS4] function (experimental API) -is more involved, making sure to operate on a shallow copy of an object -instead of overwriting it in place and trying to "deconstruct" S4 +part of the API for a long time, and could implement additional checks +if they are needed by R. 
The [`Rf_asS4`][asS4] function (experimental +API) is more involved, making sure to operate on a shallow copy of an +object instead of overwriting it in place and trying to "deconstruct" S4 objects into S3 objects if possible and requested. Solution: [use `Rf_isS4` instead of @@ -147,7 +190,6 @@ Encoding bits: `LEVELS` ----------------------- [Waiting for R-4.5.0 to release with the new API][remove_levels] - Why used: need to know the encoding. Distinguish between `CE_UTF8` and string actually in UTF-8 (can also happen with `CE_NATIVE` in a UTF-8 @@ -209,12 +251,22 @@ with peril. +Conclusion +========== + References ========== [is.R]: https://developer.r-project.org/blosxom.cgi/R-devel/NEWS/2024/03/08#n2024-03-09 [WRE]: https://cran.r-project.org/doc/manuals/R-exts.html -[NativeAPI2016]: https://wiki.r-consortium.org/view/R_Native_API +[CRANpolicy]: https://cran.r-project.org/web/packages/policies.html +[WRE33API]: https://web.archive.org/web/20160609093632/https://cran.r-project.org/doc/manuals/R-exts.html#The-R-API +[ltierney_serialize]: https://homepage.divms.uiowa.edu/~luke/R/serialize/serialize.html +[WRE45serialize]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Custom-serialization-input-and-output +[fastdigest]: https://cran.r-project.org/package=fastdigest +[WRE33wilcox]: https://web.archive.org/web/20160609093632/https://cran.r-project.org/doc/manuals/R-exts.html#Distribution-functions +[wilcox_declared]: https://github.com/r-devel/r-svn/commit/1638b0106279aa1944b17742054bc6882656596e +[wilcox_api]: https://github.com/r-devel/r-svn/commit/32ea1f67f842e3247f782a91684023b0b5eec6c5 [ALTREPnonAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-April/083349.html [clarifyingAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html [remove_non_API]: https://github.com/Rdatatable/data.table/issues/6180 From c589fc151db859dd741e914272497b87e2a652af Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 7 Nov 2024 13:22:57 +0300 Subject: [PATCH 06/44] 
Count symbols exported by libR.so --- posts/2024-12-12-non-api-use/index.qmd | 11 ++++++---- posts/2024-12-12-non-api-use/precomputed.R | 24 +++++++++++++++++++--- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 95047edf..7c4ac1fa 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -10,6 +10,7 @@ bibliography: refs.bib ```{r} #| echo: false +library(data.table) library(tools) # format.check_details load('precomputed.rda') ``` @@ -41,10 +42,12 @@ What's in an API? [Writing R Extensions][WRE] is the definitive guide for R package development. Together with the [CRAN policy][CRANpolicy] it forms the -"rules as written" that the maintainers of CRAN packages must follow. -Even back in R-3.3.0, the oldest version currently supported by -`data.table`, the [chapter 6, "The R API"][WRE33API] classified R's -entry points into four categories: +"rules as written" that the maintainers of CRAN packages must follow. A +recent version of R exports `r nrow(symbols)` symbols, including +`r symbols[,sum(type=='function')]` functions and +`r symbols[,sum(type!='function')]` variables. 
Even back in R-3.3.0, the +oldest version currently supported by `data.table`, the [chapter 6, "The +R API"][WRE33API] classified R's entry points into four categories: > * __API__ > Entry points which are documented in this manual and declared in an diff --git a/posts/2024-12-12-non-api-use/precomputed.R b/posts/2024-12-12-non-api-use/precomputed.R index f0638226..d78d893f 100644 --- a/posts/2024-12-12-non-api-use/precomputed.R +++ b/posts/2024-12-12-non-api-use/precomputed.R @@ -1,5 +1,23 @@ -cpdb <- tools::CRAN_package_db() -checks <- subset(tools::CRAN_check_details(), Package == 'data.table') +library(depcache) # TODO: uncache everything before submission +library(data.table) + +symbols %<-% fread( + # most likely implies R on GNU/Linux built with --enable-R-shlib + paste('nm -gDP', file.path(R.home('lib'), 'libR.so')), + fill = TRUE, col.names = c('name', 'type', 'value', 'size') +)[ + type %in% c('B', 'D', 'R', 'T') # don't care about [weak] imports +][, + type := fcase( + type == 'B', 'variable', + type == 'D', 'data', + type == 'R', 'read-only data', + type == 'T', 'function' + ) +][] + +cpdb %<-% tools::CRAN_package_db() +checks %<-% subset(tools::CRAN_check_details(), Package == 'data.table') when <- Sys.Date() -save(cpdb, checks, when, file = 'precomputed.rda') +save(cpdb, checks, symbols, when, file = 'precomputed.rda', compress = 'xz') From 3931a7b7af38a22c9919ff9cf72e5ada21fd4211 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 7 Nov 2024 14:06:31 +0300 Subject: [PATCH 07/44] Count tools:::nonAPI entries --- posts/2024-12-12-non-api-use/index.qmd | 51 ++++++++++++---------- posts/2024-12-12-non-api-use/precomputed.R | 29 +++++++++++- 2 files changed, 56 insertions(+), 24 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 7c4ac1fa..de33b0e1 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -28,10 +28,10 @@ Fast forward more than four decades 
and an increase by more than three orders of magnitude in storage and processing capability of computers around us. The [dominant implementation of S is now R][is.R]. It is now feasible to implement algorithms solely in R, recouping the potential -performance losses in reducing the programmer effort spent debugging and -maintaining the code [@Nash2024]. Still, the capability of R to be -extended by special-purpose compiled code is as important as ever. As of -`r when`, +performance losses by reducing the programmer effort +spent debugging and maintaining the code [@Nash2024]. Still, the +capability of R to be extended by special-purpose compiled code is as +important as ever. As of `r when`, `r with(cpdb, round(sum(NeedsCompilation=='yes')/length(NeedsCompilation)*100))`% of CRAN packages use compiled code. Since the implementation language of R is C, not Fortran, the programming interface for R is also defined in @@ -40,13 +40,14 @@ terms of C. What's in an API? ================= -[Writing R Extensions][WRE] is the definitive guide for R package -development. Together with the [CRAN policy][CRANpolicy] it forms the -"rules as written" that the maintainers of CRAN packages must follow. A -recent version of R exports `r nrow(symbols)` symbols, including -`r symbols[,sum(type=='function')]` functions and -`r symbols[,sum(type!='function')]` variables. Even back in R-3.3.0, the -oldest version currently supported by `data.table`, the [chapter 6, "The +[Writing R Extensions][WRE] ("WRE") is the definitive guide for R +package development. Together with the [CRAN policy][CRANpolicy] it +forms the "rules as written" that the maintainers of CRAN packages must +follow. A recent version of R exports `r nrow(symbols)` symbols, +including `r symbols[,sum(type=='function')]` functions ("entry points", +not counting C preprocessor macros) and +`r symbols[,sum(type!='function')]` variables. 
Even back in R-3.3.0, +the oldest version currently supported by `data.table`, [chapter 6, "The R API"][WRE33API] classified R's entry points into four categories: > * __API__ @@ -75,18 +76,22 @@ still are) listed in the character vector `tools:::nonAPI`: `R CMD check` looks at the functions imported by the package and signals a `NOTE` if it finds any listed there. -The remaining functions, neither documented as API nor forbidden by `R -CMD check`, sat there, alluring the package developers with their -offers. For example, the [serialization interface][ltierney_serialize] -is only [documented in WRE since R-4.5][WRE45serialize], but it has been -powering the [fastdigest] CRAN package since 2015 at the latest, the -maintainer having successfully gambled on it not to change too -drastically. Some of the inclusions in `tools:::nonAPI` could have been -historical mistakes: while WRE has been saying [back in version -3.3.0][WRE33wilcox] that `wilcox_free` should be called after a call to -the (API) functions `dwilcox`, `pwilcox` or `qwilcox`, the function was -only [declared in the public headers][wilcox_declared] and [removed from -`tools:::nonAPI`][wilcox_api] in R-4.2.0. +The remaining _public_ functions, neither documented as API nor +explicitly forbidden by `R CMD check`, sat there, alluring the package +developers with their offers. For example, the [serialization +interface][ltierney_serialize] is only [documented in WRE since +R-4.5][WRE45serialize], but it has been powering the [fastdigest] CRAN +package since 2015 at the latest, the maintainer having successfully +gambled on it not to change too drastically. 
Some of the inclusions in +`tools:::nonAPI` could have been historical mistakes: while WRE has been +saying [back in version 3.3.0][WRE33wilcox] that `wilcox_free` should be +called after a call to the (API) functions `dwilcox`, `pwilcox` or +`qwilcox`, the function was only [declared in the public +headers][wilcox_declared] and [removed from +`tools:::nonAPI`][wilcox_api] in R-4.2.0. Still, between R-3.3.3 and +R-4.4.2, `tools:::nonAPI` grew from `r length(nonAPI.3_3)` to +`r length(nonAPI.4_4)` entries, and the package maintainers had to adapt or face archival of their packages. +A [recent question on R-devel][ALTREPnonAPI] (whether the [ALTREP] +interface should be considered "API" for the purpose of CRAN package +development) sparked a series of events and an extensive discussion +containing the highest count of occurrences of the word "API" per month +ever seen on R-devel (234), topping [October 2002][Rd200210] (package +versioning and API breakage, 150), [October 2005][Rd200510] (API for +graphical interfaces and console output, 124), and [May 2019][Rd201905] +(discussions of the ALTREP interface and multi-threading, 121). As a +result, Luke Tierney [started work][clarifyingAPI] on programmatically +describing the functions and other symbols exported by R (including +variables and preprocessor and enumeration constants), giving a +stronger definition to the interface. His changes add the currently +unexported function `tools:::funAPI()` that lists entry points and two +more of their categories: + +> * __experimental__ +> Entry points declared in an installed header file that are part of +> an experimental API, such as `R_ext/Altrep.h`. These are subject to +> change, so package authors wishing to use these should be prepared +> to adapt. +> * __embedding__ +> Entry points intended primarily for embedding and creating new +> front-ends. It is not clear that this needs to be a separate +> category but it may be useful to keep it separate for now.
+ +Additionally, WRE now spells out that entry points not explicitly +documented or at least listed in the output of `tools:::funAPI` (or +something that will replace it) are now off-limits, even if not +currently present in `tools:::nonAPI` (emphasis added): - +> * __public__ +> Entry points declared in an installed header file that are exported +> on all R platforms but are not documented and subject to change +> without notice. _Do not use these in distributed code. Their +> declarations will eventually be moved out of installed header +> files._ + + @@ -113,6 +147,12 @@ packages still on CRAN --> Use of non-API entry points in `data.table` =========================================== +The first version of the `data.table` package in the CRAN archive dates +back to April 2006 (which corresponds to R version 2.3.0). It has been +evolving together with R and its API and thus has accumulated a number +of uses of R internals that are [no longer considered part of the +API][remove_non_API]: + `r gsub( '(?m)^', '> ', perl = TRUE, format(subset(checks, grepl('API', Output))[1,]) @@ -120,8 +160,6 @@ Use of non-API entry points in `data.table` -- ` R CMD check --as-cran ` on a released version of `data.table` -Tracked in [#6180][remove_non_API] - Testing for a `data.frame`: `isFrame` ------------------------------------- @@ -212,6 +250,8 @@ being able to read --> Growable vectors: `SETLENGTH`, `SET_GROWABLE_BIT`, `SET_TRUELENGTH` ------------------------------------------------------------------- +Introduced in [v1.7.3, November 2011][news173]. 
+ Why used: need to create new columns by reference, which requires free @@ -275,7 +315,11 @@ References [WRE33wilcox]: https://web.archive.org/web/20160609093632/https://cran.r-project.org/doc/manuals/R-exts.html#Distribution-functions [wilcox_declared]: https://github.com/r-devel/r-svn/commit/1638b0106279aa1944b17742054bc6882656596e [wilcox_api]: https://github.com/r-devel/r-svn/commit/32ea1f67f842e3247f782a91684023b0b5eec6c5 -[ALTREPnonAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-April/083349.html +[ALTREPnonAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-April/083339.html +[ALTREP]: https://svn.r-project.org/R/branches/ALTREP/ALTREP.html +[Rd200210]: https://stat.ethz.ch/pipermail/r-devel/2002-October/thread.html +[Rd200510]: https://stat.ethz.ch/pipermail/r-devel/2005-October/thread.html +[Rd201905]: https://stat.ethz.ch/pipermail/r-devel/2019-May/thread.html [clarifyingAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html [remove_non_API]: https://github.com/Rdatatable/data.table/issues/6180 [datatable_isframe_added]: https://github.com/Rdatatable/data.table/commit/87666e70ce1a69b28f0e92ec7504d80e3d53a824#diff-4fc47a9752ba4edfef0cabcc1958eda943545ad3859e48d498b0e3f87a9ae5aeR192 @@ -292,3 +336,4 @@ References [remove_levels]: https://github.com/Rdatatable/data.table/pull/6422 [remove_string_ptr]: https://github.com/Rdatatable/data.table/pull/6312 [PR18775]: https://bugs.r-project.org/show_bug.cgi?id=18775 +[news173]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-13 From ee0b290f5b1cc95f485b7fa09ae62e2df546814b Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 7 Nov 2024 18:23:43 +0300 Subject: [PATCH 09/44] Document SET_TYPEOF --- posts/2024-12-12-non-api-use/index.qmd | 53 +++++++++++++++++---- posts/2024-12-12-non-api-use/langsxp.pikchr | 21 ++++++++ posts/2024-12-12-non-api-use/langsxp.svg | 41 ++++++++++++++++ 3 files changed, 105 insertions(+), 10 deletions(-) create mode 
100644 posts/2024-12-12-non-api-use/langsxp.pikchr create mode 100644 posts/2024-12-12-non-api-use/langsxp.svg diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 686278cb..6273ec56 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -188,7 +188,8 @@ from the C code: when creating a copy of an S4 `data.table` from scratch (or setting all attributes from one object onto another), the destination value must also end up being an S4 object. This is controlled by the special "S4" bit in the header of every R object, so -the code must read and set it correctly. +the code must read and set it correctly. The undocumented functions `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` exist as bare interfaces to [the internal @@ -212,12 +213,37 @@ around shared objects. Converting between calls and pairlists: `SET_TYPEOF` ---------------------------------------------------- -[#6313][remove_set_typeof] - -Reading the reference counts: `NAMED` -------------------------------------- - -[#6420][remove_named] +In R, [function calls][call] are internally represented as Lisp-style +pairlists where the first pair is of special type `LANGSXP` instead of +`LISTSXP`. For example, the following diagram illustrates the data +structure of the call `print(x = 42L)`: + +![](langsxp.svg){width=40em} + +Here, every list item is a separate R object, a "cons cell"; each cell +contains the value in its `CAR` field and a reference to the rest of the +list in its `CDR` field. Argument names, if provided, are stored in the +third field, `TAG`. The list is terminated by `R_NilValue`, which is of +type `NILSXP`. These structures must be constructed every time C code +wants to evaluate a function call. 
+ +Previously, R API contained a function to allocate `LISTSXP` pairlists +of arbitrary length, `allocList()`, but not function calls, so it became +a somewhat common idiom to first allocate the list and then use +`SET_TYPEOF` to change the type of the head pair to `LANGSXP`. This +did not immediately break, since the two types have the same internal +memory layout. + +The danger of `SET_TYPEOF` lies in the possibility to set the type of an +R value to one with an incompatible memory layout. (For example, vector +types `REALSXP` and `INTSXP` are built very differently from cons cells +`LISTSXP` and `LANGSXP`.) Starting with R-4.4.1, [R contains the +`allocLang` function in addition to the `allocList` function][WRE511] +that directly allocates a function call object with a head pair of type +`LANGSXP`. In order to stay compatible with previous R versions, +packages may [allocate the `LISTSXP` tail first and then use `lcons()` +to construct the `LANGSXP` head pair of the call][remove_set_typeof]. 
Strings as C arrays of `CHARSXP` values: `STRING_PTR` ----------------------------------------------------- @@ -232,6 +258,11 @@ write barrier for GC to work, hence the need for `SET_STRING_ELT` and See also: [PR18775] +Reading the reference counts: `NAMED` +------------------------------------- + +[#6420][remove_named] + Encoding bits: `LEVELS` ----------------------- @@ -324,16 +355,18 @@ References [remove_non_API]: https://github.com/Rdatatable/data.table/issues/6180 [datatable_isframe_added]: https://github.com/Rdatatable/data.table/commit/87666e70ce1a69b28f0e92ec7504d80e3d53a824#diff-4fc47a9752ba4edfef0cabcc1958eda943545ad3859e48d498b0e3f87a9ae5aeR192 [R_isdataframe_added]: https://github.com/r-devel/r-svn/commit/4ef83b9dc3c6874e774195d329cbb6c11a71c414 -[WRE_replacement_entrypoints]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Some-API-replacements-for-non_002dAPI-entry-points [remove_isframe]: https://github.com/Rdatatable/data.table/issues/6244 +[WRE_replacement_entrypoints]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Some-API-replacements-for-non_002dAPI-entry-points [setOldClass]: https://search.r-project.org/R/refmans/methods/html/setOldClass.html [IS_S4_OBJECT]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/memory.c#L4033-L4035 [isS4]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/objects.c#L1838-L1841 [asS4]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/objects.c#L1843 [remove_set_s4_object]: https://github.com/Rdatatable/data.table/pull/6183 +[call]: https://search.r-project.org/R/refmans/base/html/call.html +[WRE511]: https://cran.r-project.org/doc/manuals/R-exts.html#Evaluating-R-expressions-from-C [remove_set_typeof]: https://github.com/Rdatatable/data.table/pull/6313 -[remove_named]: https://github.com/Rdatatable/data.table/pull/6420 -[remove_levels]: 
https://github.com/Rdatatable/data.table/pull/6422 [remove_string_ptr]: https://github.com/Rdatatable/data.table/pull/6312 [PR18775]: https://bugs.r-project.org/show_bug.cgi?id=18775 +[remove_named]: https://github.com/Rdatatable/data.table/pull/6420 +[remove_levels]: https://github.com/Rdatatable/data.table/pull/6422 [news173]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-13 diff --git a/posts/2024-12-12-non-api-use/langsxp.pikchr b/posts/2024-12-12-non-api-use/langsxp.pikchr new file mode 100644 index 00000000..14831a63 --- /dev/null +++ b/posts/2024-12-12-non-api-use/langsxp.pikchr @@ -0,0 +1,21 @@ +Head: ellipse "LANGSXP" fit +ellipse "SYMSXP" "print" fit with .ne at Head.sw + (-.3in, -.3in) +arrow <- from last ellipse.ne to Head.sw "CAR" above aligned + +ellipse "NILSXP" fit with .n at Head.s + (0,-.3in) +arrow from Head.s to last ellipse.n "TAG" above aligned + +Arg1: ellipse "LISTSXP" fit with .nw at Head.se + (.3in, -.3in) +arrow -> from Head.se to Arg1.nw "CDR" above aligned + +ellipse "INTSXP" "42" fit with .ne at Arg1.sw + (-.3in, -.3in) +arrow <- from last ellipse.ne to Arg1.sw "CAR" above aligned + +ellipse "SYMSXP" "x" fit with .n at Arg1.s + (0,-.42in) +arrow from Arg1.s to last ellipse.n "TAG" above aligned + +ellipse "NILSXP" fit with .nw at Arg1.se + (.3in, -.3in) +arrow -> from Arg1.se to last ellipse.nw "CDR" above aligned + +"SEXP call" mono with .s at Head.n + (0,.2in) +arrow <- from Head.n to last text.s diff --git a/posts/2024-12-12-non-api-use/langsxp.svg b/posts/2024-12-12-non-api-use/langsxp.svg new file mode 100644 index 00000000..292db794 --- /dev/null +++ b/posts/2024-12-12-non-api-use/langsxp.svg @@ -0,0 +1,41 @@ + + +LANGSXP + +SYMSXP +print + + +CAR + +NILSXP + + +TAG + +LISTSXP + + +CDR + +INTSXP +42 + + +CAR + +SYMSXP +x + + +TAG + +NILSXP + + +CDR +SEXP call + + + + From e7610e016e33a80fe8069844ed6bb464598fe9d1 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 7 Nov 
2024 19:30:00 +0300 Subject: [PATCH 10/44] tweaks --- posts/2024-12-12-non-api-use/index.qmd | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 6273ec56..f49439cf 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -248,13 +248,16 @@ to construct the `LANGSXP` head pair of the call][remove_set_typeof]. Strings as C arrays of `CHARSXP` values: `STRING_PTR` ----------------------------------------------------- + + [Fixed in git][remove_string_ptr] by switching to `STRING_PTR_RO`, present on CRAN for now. - Why non-API: writes to arrays of `SEXP` values *must* go through the write barrier for GC to work, hence the need for `SET_STRING_ELT` and -`SET_VECTOR_ELT` +`SET_VECTOR_ELT`. Also, R assumes that `STRSXP` vectors only contain +`CHARSXP` values, and a writeable pointer is an invitation to violate +that assumption. See also: [PR18775] From 770c859ee49df270d8758bab9cc2287ed3fc3694 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 8 Nov 2024 10:27:29 +0300 Subject: [PATCH 11/44] tweaks --- posts/2024-12-12-non-api-use/index.qmd | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index f49439cf..f7a2aa38 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -131,6 +131,10 @@ currently present in `tools:::nonAPI` (emphasis added): > declarations will eventually be moved out of installed header > files._ +Correspondingly, the number of `tools:::nonAPI` entry points in the +current development version of R rose to `r length(nonAPI.trunk)`, hence +the present blog post. 
+ @@ -232,8 +236,8 @@ Previously, R API contained a function to allocate `LISTSXP` pairlists of arbitrary length, `allocList()`, but not function calls, so it became a somewhat common idiom to first allocate the list and then use `SET_TYPEOF` to change the type of the head pair to `LANGSXP`. This -did not immediately break, since the two types have the same internal -memory layout. +did not previously lead to problems, since the two types have the same +internal memory layout. The danger of `SET_TYPEOF` lies in the possibility to set the type of an R value to one with an incompatible memory layout. (For example, vector From e6a1fef6ced2b5d58486646290b18cdeab70f744 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 8 Nov 2024 18:26:31 +0300 Subject: [PATCH 12/44] Significantly reduce the size of precomputed.rda --- posts/2024-12-12-non-api-use/index.qmd | 9 ++++----- posts/2024-12-12-non-api-use/precomputed.R | 9 +++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index f7a2aa38..4cd98b1d 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -32,10 +32,9 @@ performance losses in performance by reducing the programmer effort spent debugging and maintaining the code [@Nash2024]. Still, the capability of R to be extended by special-purpose compiled code is as important as ever. As of `r when`, -`r with(cpdb, round(sum(NeedsCompilation=='yes')/length(NeedsCompilation)*100))`% -of CRAN packages use compiled code. Since the implementation language of -R is C, not Fortran, the programming interface for R is also defined in -terms of C. +`r round(sum(needscomp)/length(needscomp)*100)`% of CRAN packages use +compiled code. Since the implementation language of R is C, not Fortran, +the programming interface for R is also defined in terms of C. What's in an API? 
================= @@ -159,7 +158,7 @@ API][remove_non_API]: `r gsub( '(?m)^', '> ', perl = TRUE, - format(subset(checks, grepl('API', Output))[1,]) + format(subset(dtchecks, grepl('API', Output))[1,]) )` -- ` R CMD check --as-cran ` on a released version of `data.table` diff --git a/posts/2024-12-12-non-api-use/precomputed.R b/posts/2024-12-12-non-api-use/precomputed.R index a9df2341..d167d3d4 100644 --- a/posts/2024-12-12-non-api-use/precomputed.R +++ b/posts/2024-12-12-non-api-use/precomputed.R @@ -31,8 +31,7 @@ getNonAPI <- function(ver, identical(e[[1]], quote(`<-`)) && identical(e[[2]], quote(`nonAPI`)) ) - # FIXME: extract the plain strings instead of evaluating c(...) from the Internet - return(eval(e[[3]])) + return(do.call(c, as.list(e[[3]])[-1])) } } @@ -41,10 +40,12 @@ nonAPI.4_4 <- getNonAPI('4-4') nonAPI.trunk <- getNonAPI(url = 'https://svn.r-project.org/R/trunk/src/library/tools/R/sotools.R') cpdb %<-% tools::CRAN_package_db() -checks %<-% subset(tools::CRAN_check_details(), Package == 'data.table') +needscomp <- cpdb[,'NeedsCompilation'] == 'yes' +checks %<-% tools::CRAN_check_details() +dtchecks <- subset(checks, Package == 'data.table') when <- Sys.Date() save( - cpdb, checks, symbols, nonAPI.3_3, nonAPI.4_4, nonAPI.trunk, + needscomp, dtchecks, symbols, nonAPI.3_3, nonAPI.4_4, nonAPI.trunk, when, file = 'precomputed.rda', compress = 'xz' ) From d1fae921e01c5a696733dfad793c34481768bfd4 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 8 Nov 2024 20:24:12 +0300 Subject: [PATCH 13/44] tweaks --- posts/2024-12-12-non-api-use/index.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 4cd98b1d..57c10fe3 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -242,7 +242,7 @@ The danger of `SET_TYPEOF` lies in the possibility to set the type of an R value to one with an incompatible memory layout. 
(For example, vector types `REALSXP` and `INTSXP` are built very differently from cons cells `LISTSXP` and `LANGSXP`.) Starting with R-4.4.1, [R contains the -`allocLang` function in addition to the `allocList` function][WRE511] +`allocLang` function in addition to the `allocList` function][WRE_call] that directly allocates a function call object with a head pair of type `LANGSXP`. In order to stay compatible with previous R versions, packages may [allocate the `LISTSXP` tail first and then use `lcons()` @@ -369,7 +369,7 @@ References [asS4]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/objects.c#L1843 [remove_set_s4_object]: https://github.com/Rdatatable/data.table/pull/6183 [call]: https://search.r-project.org/R/refmans/base/html/call.html -[WRE511]: https://cran.r-project.org/doc/manuals/R-exts.html#Evaluating-R-expressions-from-C +[WRE_call]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Creating-call-expressions [remove_set_typeof]: https://github.com/Rdatatable/data.table/pull/6313 [remove_string_ptr]: https://github.com/Rdatatable/data.table/pull/6312 [PR18775]: https://bugs.r-project.org/show_bug.cgi?id=18775 From d5e40c1ef85ad32542023f6d2dd0e4f3b8fb4361 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Mon, 11 Nov 2024 19:30:55 +0300 Subject: [PATCH 14/44] Document STRING_PTR() Provide examples for other cases. More tweaks. --- posts/2024-12-12-non-api-use/index.qmd | 130 +++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 9 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 57c10fe3..e8ac99be 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -179,6 +179,32 @@ R-4.5.0 is released together with the new name for the function, versions of R. `isDataFrame` is documented among other [replacement entry point names][WRE_replacement_entrypoints] in Writing R Extensions. 
+Problem (the only instance in `data.table`): + +```c +if (!isVector(thiscol) || isFrame(thiscol)) + /* ^^^^^^^ may disappear in a future R version */ +``` + +Solution: + +```c +// include non-R headers first + +// include R headers last +#include +#include + +// provide overrides after the R headers +#if R_VERSION < R_Version(4, 5, 0) +// R versions older than 4.5.0 released use the old name of the function +#define isDataFrame(x) isFrame(x) +#endif + +// later: +if (!isVector(thiscol) || isDataFrame(thiscol)) +``` + Operating on the S4 bit: `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` --------------------------------------------------------------------------- @@ -213,6 +239,8 @@ Solution: [use `Rf_isS4` instead of `Rf_asS4` to control the S4 object bit, but be careful around shared objects. + + Converting between calls and pairlists: `SET_TYPEOF` ---------------------------------------------------- @@ -248,21 +276,103 @@ that directly allocates a function call object with a head pair of type packages may [allocate the `LISTSXP` tail first and then use `lcons()` to construct the `LANGSXP` head pair of the call][remove_set_typeof]. 
+ +Problem (the only instance in `data.table`): + +```c + SEXP s = PROTECT(allocList(2)); + SET_TYPEOF(s, LANGSXP); +// ^^^^^^^^^^ unsafe operation, could be used to corrupt objects + SETCAR(s, install("format.POSIXct")); + SETCAR(CDR(s), column); +``` + +Solutions: + +```c +// for fixed-size calls with contents known ahead of time +SEXP s = lang2(install("format.POSIXct"), column); +``` +or: +```c +// partially pre-populate +SEXP s = lang2(install("format.POSIXct"), R_NilValue); +// later, when 'column' is known: +SETCAR(CDR(s), column); +``` +or: +```c +// allocate a call with 'n' elements +SEXP call = lcons(R_NilValue, allocList(n - 1)); + +// in R >= 4.4.1 only: +SEXP call = allocLang(n); +``` + +Unfortunately, the `LCONS` macro didn't work with `#define R_NO_REMAP` +prior to R-4.4, because it expanded to `lcons()` instead of +`Rf_lcons()`. + Strings as C arrays of `CHARSXP` values: `STRING_PTR` ----------------------------------------------------- - +From the point of view of R code, strings are very simple things, much +like numbers: they live in atomic vectors and can be directly compared +with other objects. It is only natural to desire to work with them as +easily from C code as it's possible with other atomic types, where +functions `REAL()`, `INTEGER()`, or `COMPLEX()` can be used to access +the buffer containing the numbers. + +The underlying reality of strings is more complicated: since they +internally manage memory buffers containing text in a given encoding, +they must be subject to garbage collection. Like other managed objects +in R, they are represented as `SEXP` values of special type `CHARSXP`. +R's garbage collector is [generational][Tierney_gengc] and requires the +use of a [write barrier][Tierney_writebr] any time a `SEXP` value (such as +a `STRSXP` vector) references another `SEXP` value (such as a `CHARSXP` +string). In a generational garbage collector, "younger" generations are +marked and swept more frequently than "older" ones.
If package C code +manually writes a reference to a "young" `CHARSXP` object into an "old" +`STRSXP` vector without taking generations into account, a following +collection of the "young" pool of objects will miss the `CHARSXP` being +referenced by the "old" `STRSXP` and remove the `CHARSXP` as "garbage". +This makes the `SEXP *` pointers returned by `STRING_PTR` unsafe and +requires the use of `STRING_PTR_RO` function, which returns a read-only +`const SEXP *`. + +Thankfully, `data.table` has already been using read-only `const` +pointers when working with `STRSXP` vectors, so the required changes to +the code were [not too substantial][remove_string_ptr], only changing +the name of the accessor function used: + +Example of the problem: + +```c +const SEXP *sourceD = STRING_PTR(source); +// ^^^^^^^^^^ +// returns a writeable SEXP * pointer, therefore unsafe +``` + +Solution: -[Fixed in git][remove_string_ptr] by switching to `STRING_PTR_RO`, -present on CRAN for now. +```c +// first include non-R headers -Why non-API: writes to arrays of `SEXP` values *must* go through the -write barrier for GC to work, hence the need for `SET_STRING_ELT` and -`SET_VECTOR_ELT`. Also, R assumes that `STRSXP` vectors only contain -`CHARSXP` values, and a writeable pointer is an invitation to violate -that assumption. +// next include R headers + +// then provide version-specific overrides +#if R_VERSION < R_Version(3, 5, 0) +// STRING_PTR_RO only appeared in R-3.5 +#define STRING_PTR_RO(x) STRING_PTR(x) +#endif + +// later: +const SEXP *sourceD = STRING_PTR_RO(source); +// ^^^^^^^^^^^^^ +// returns a const SEXP * pointer, which prevents accidental writes +``` -See also: [PR18775] +See also: [PR18775]. 
Reading the reference counts: `NAMED` ------------------------------------- @@ -371,6 +481,8 @@ References [call]: https://search.r-project.org/R/refmans/base/html/call.html [WRE_call]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Creating-call-expressions [remove_set_typeof]: https://github.com/Rdatatable/data.table/pull/6313 +[Tierney_gengc]: https://homepage.stat.uiowa.edu/~luke/R/gengcnotes.html +[Tierney_writebr]: https://homepage.stat.uiowa.edu/~luke/R/barrier.html [remove_string_ptr]: https://github.com/Rdatatable/data.table/pull/6312 [PR18775]: https://bugs.r-project.org/show_bug.cgi?id=18775 [remove_named]: https://github.com/Rdatatable/data.table/pull/6420 From 4ed778ebfdcc51bb08b95ac3415aef31da48c2b0 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Mon, 11 Nov 2024 23:58:46 +0300 Subject: [PATCH 15/44] Document NAMED --- posts/2024-12-12-non-api-use/index.qmd | 74 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index e8ac99be..a1330369 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -340,7 +340,7 @@ This makes the `SEXP *` pointers returned by `STRING_PTR` unsafe and requires the use of `STRING_PTR_RO` function, which returns a read-only `const SEXP *`. -Thankfully, `data.table` has already been using read-only `const` +Thankfully, `data.table` has already been using read-only `const SEXP *` pointers when working with `STRSXP` vectors, so the required changes to the code were [not too substantial][remove_string_ptr], only changing the name of the accessor function used: @@ -377,7 +377,73 @@ See also: [PR18775]. Reading the reference counts: `NAMED` ------------------------------------- -[#6420][remove_named] +In plain R, all value types -- numbers, strings, lists -- have +pass-by-value semantics. 
Without dark and disturbing things in play, such +as non-standard evaluation or active bindings, R code can give a plain +value (`x <- 1:10`) to a function (`f(x)`) or store it in a variable (`y +<- x`), have the function modify its argument (`f <- \(x) { x[1] <- 0 +}`) or change the duplicate variable (`y[2] <- 3`), and still have the +original value intact (`stopifnot(identical(x, 1:10))`). Only the +inherently mutable types, such as environments, external pointers and +weak references, will stay shared between all assignments and function +arguments; the value types behave as if R copies them every time. + +And yet actually making these copies is wasteful when the code only +reads the variable and does not alter it. (In fact, one of the original +motivations of `data.table` was to further reduce wasteful copying that +R used to perform during certain sub-assignment operations.) Until +version 4.0.0, `NAMED` was R's mechanism to save memory and CPU time +instead of creating and storing these copies. A temporary object such as +the value of `1:10` was not bound to a symbol and thus could be modified +right away. Assigning it to a variable, as in `x <- 1:10`, gave it a +`NAMED(x)` count of 1, for which R had an internal optimisation in +replacement function calls like `foo(x) <- 3`. Assigning the same value +to yet another symbol (by copying `y <- x` or calling a function +`foo(x)`) increased the `NAMED()` count to 2 or more, for which there +was no optimisation: in order to modify one of the symbols, R was +required to duplicate `x` first. `NAMED()` was not necessarily decreased +after the bindings disappeared, and decreasing it after having reached +`NAMEDMAX` was impossible. During the lifetime of R-3.x, `NAMEDMAX` was +increased from 2 to 3 and later to 7. + +Between R-3.1.0 and R-4.0.0, R [migrated from `NAMED` to reference +counting][Tierney_refcnt].
Reference counts are easier to properly +decrement than `NAMED`, thus preventing unneeded copies of objects that +became unreferenced. R-3.5.0 [documented the symbols][Rnews_setnamed] +`MAYBE_REFERENCED(.)` / `NO_REFERENCES(.)` for use instead of checking +`NAMED(.) == 0`, `MAYBE_SHARED(.)` / `NOT_SHARED(.)` instead of checking +`NAMED(.) > 1`, and `MARK_NOT_MUTABLE(.)` instead of setting `NAMED(.)` +to `NAMEDMAX`, which later became part of the API instead of the +`NAMED(.)` and `REFCNT(.)` functions. The hard rules are that a value is +safe to modify in place if it has `NO_REFERENCES()` (reference count of +0), definitely unsafe to modify in place (requiring a call to +`duplicate` or `shallow_duplicate`) if it is `MAYBE_SHARED()` (reference +count above 1), and almost certainly unsafe to modify in place if it is +`MAYBE_REFERENCED()` (reference count of 1). + +`data.table`'s only uses of `NAMED()` were in the [verbose output during +assignment][remove_named]: + +```c +if (verbose) { + Rprintf(_("RHS for item %d has been duplicated because NAMED==%d MAYBE_SHARED==%d, but then is being plonked. length(values)==%d; length(cols)==%d)\n"), + i+1, NAMED(thisvalue), MAYBE_SHARED(thisvalue), length(values), length(cols)); + ^^^^^ non-API function +} +``` + +Since the correctness of the modification operation hinges on the +reference count being 0 (and it may be important whether it's exactly 1 +or above 1), the same amount of _useful_ information can be conveyed by +printing `MAYBE_REFERENCED()` and `MAYBE_SHARED()` instead of `NAMED()`: + +```c +if (verbose) { + Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d, but then is being plonked. 
length(values)==%d; length(cols)==%d)\n"), + i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), length(values), length(cols)); + ^^^^^^^^^^^^^^^^ API function +} +``` Encoding bits: `LEVELS` ----------------------- @@ -485,6 +551,8 @@ References [Tierney_writebr]: https://homepage.stat.uiowa.edu/~luke/R/barrier.html [remove_string_ptr]: https://github.com/Rdatatable/data.table/pull/6312 [PR18775]: https://bugs.r-project.org/show_bug.cgi?id=18775 -[remove_named]: https://github.com/Rdatatable/data.table/pull/6420 +[Tierney_refcnt]: https://developer.r-project.org/Refcnt.html +[Rnews_setnamed]: https://developer.r-project.org/blosxom.cgi/R-devel/NEWS/2017/09/02#n2017-09-03 +[remove_named]: https://github.com/Rdatatable/data.table/pull/6420/files#diff-22b103646a1efab9bbfc374791ccfc3fd1422eefc48918a3e126fc2f30d1f572L552 [remove_levels]: https://github.com/Rdatatable/data.table/pull/6422 [news173]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-13 From 04d0817576c8b7db3699920e6985fd7014dd010e Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 12 Nov 2024 19:53:33 +0300 Subject: [PATCH 16/44] The many uses of sxpinfo.gp --- posts/2024-12-12-non-api-use/index.qmd | 68 ++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index a1330369..67cfcb0c 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -448,6 +448,46 @@ if (verbose) { Encoding bits: `LEVELS` ----------------------- +`LEVELS` is the name of the internal R [macro][LEVELS_macro] and a +"public" [function][LEVELS_function] accessing a [16-bit field called +`gp`][LEVELS_field] that is present in the header of every `SEXP` value. +Not every access to this field is done using the `LEVELS()` macro; there +are bits of R code that access `(sexp)->sxpinfo.gp` directly. 
R uses +this field for many purposes: + + * matching given arguments against the formals of a function + ([1][gp_for_match1], [2][gp_for_match2], [3][gp_for_match3]) + * remembering the previous [type][gp_for_gc] of a garbage-collected value + * [finalizing][gp_for_finalize] the reference-semantics objects before + garbage-collecting them + * [marking][gp_for_calling] condition handlers as "calling" (executing + on top of where the condition was signalled in the call stack), as + opposed to "non-calling" (executing at the site of the `tryCatch` + call) + * [marking][gp_for_assignment] objects in complex assignment calls + * storing the [S4 object bit][gp_for_s4] + * [marking][gp_for_jit] functions as (un)suitable for bytecode + compilation + * [marking][gp_for_growable] vectors as growable + * [marking][gp_for_missing] provided ("actual") function arguments as + [missing][gp_for_missing2] + * [marking][gp_for_ddval] the `..1`, `..2`, etc symbols as + corresponding to the [given element of the `...` + argument][Rhelp_dots] + * [marking][gp_for_env] environments as [locked][envflags_locked] or + for [caching][envflags_global] the global variable lookup + ([1][gp_for_basesym], [2][basesym2]) + + * [marking][gp_for_hashash] symbols naming environment contents for + [hash lookup][hashash2] + * [marking][gp_for_active] bindings inside environments as + [active][active_binding] + * [marking][gp_for_promsxp] promise objects (which are themselves not + part of the API) as already evaluated + * [marking][gp_for_charsxp] `CHARSXP` values as present in the global + cache or being in a certain encoding (bytes, Latin-1, ASCII, UTF-8, + or "unknown" (native)) + [Waiting for R-4.5.0 to release with the new API][remove_levels] Why used: need to know the encoding.
Distinguish between `CE_UTF8` and @@ -554,5 +594,33 @@ References [Tierney_refcnt]: https://developer.r-project.org/Refcnt.html [Rnews_setnamed]: https://developer.r-project.org/blosxom.cgi/R-devel/NEWS/2017/09/02#n2017-09-03 [remove_named]: https://github.com/Rdatatable/data.table/pull/6420/files#diff-22b103646a1efab9bbfc374791ccfc3fd1422eefc48918a3e126fc2f30d1f572L552 +[LEVELS_macro]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L228 +[LEVELS_function]:https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/memory.c#L3902 +[LEVELS_field]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L132 +[gp_for_match1]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/match.c#L175 +[gp_for_match2]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/match.c#L233-L236 +[gp_for_match3]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/unique.c#L53 +[gp_for_gc]:https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/memory.c#L151-L155 +[gp_for_finalize]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/memory.c#L1364-L1374 +[gp_for_calling]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/errors.c#L1660-L1665 +[gp_for_assignment]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L280-L324 +[gp_for_s4]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L359-L362 +[gp_for_jit]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L364-L371 +[gp_for_growable]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L373-L377 +[gp_for_missing]: 
https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L449-L456 +[gp_for_missing2]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/eval.c#L2260-L2281 +[gp_for_ddval]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L519-L523 +[Rhelp_dots]: https://search.r-project.org/R/refmans/base/html/dots.html +[gp_for_env]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L529-L530 +[envflags_locked]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L106-L108 +[envflags_global]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L613-L655 +[gp_for_hashash]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L1182-L1186 +[hashash2]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L517-L520 +[gp_for_active]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L1205-L1210 +[active_binding]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L3466-L3483 +[gp_for_basesym]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L1225-L1228 +[basesym2]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L754-L768 +[gp_for_promsxp]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L1165-L1166 +[gp_for_charsxp]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L843-L853 [remove_levels]: https://github.com/Rdatatable/data.table/pull/6422 [news173]: 
https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-13 From 62503bff0823753b365471e8450a5b6b4da1f90c Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 13 Nov 2024 13:26:27 +0300 Subject: [PATCH 17/44] Tweaks --- posts/2024-12-12-non-api-use/index.qmd | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 67cfcb0c..43d36c70 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -474,10 +474,11 @@ this field for many purposes: * [marking][gp_for_ddval] the `..1`, `..2`, etc symbols as corresponding to the [given element of the `...` argument][Rhelp_dots] - * [marking][gp_for_env] environments as [locked][envflags_locked] or - for [caching][envflags_global] the global variable lookup - ([1][gp_for_basesym], [2][basesym2]) - + * [marking][gp_for_env] environments as [locked][envflags_locked], or + for [caching][envflags_global] the global variable lookup, or for + looking up values in the base environment or the special functions + ([1][gp_for_basesym], [2][basesym2]) * [marking][gp_for_hashash] symbols naming environment contents for [hash lookup][hashash2] * [marking][gp_for_active] bindings inside environments as @@ -620,6 +621,7 @@ References [active_binding]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L3466-L3483 [gp_for_basesym]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L1225-L1228 [basesym2]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L754-L768 + [gp_for_promsxp]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L1165-L1166 [gp_for_charsxp]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L843-L853 
[remove_levels]: https://github.com/Rdatatable/data.table/pull/6422 From b13a024d379b7ce435feed9541bb41c9b8222e45 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 13 Nov 2024 17:12:32 +0300 Subject: [PATCH 18/44] refs: escape the title from auto-capitalisation --- posts/2024-12-12-non-api-use/refs.bib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/posts/2024-12-12-non-api-use/refs.bib b/posts/2024-12-12-non-api-use/refs.bib index 60afca46..cf280acd 100644 --- a/posts/2024-12-12-non-api-use/refs.bib +++ b/posts/2024-12-12-non-api-use/refs.bib @@ -20,7 +20,7 @@ @book{Chambers2016 } @article{Nash2024, author = {Nash, John C. and Bhattacharjee, Arkajyoti}, - title = {A Comparison of R Tools for Nonlinear Least Squares Modeling}, + title = {A Comparison of {R} Tools for Nonlinear Least Squares Modeling}, journal = {The R Journal}, year = {2024}, note = {https://doi.org/10.32614/RJ-2023-091}, From 30693a26f48e92faf4daa2020e7cf491c224c6da Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 13 Nov 2024 17:33:24 +0300 Subject: [PATCH 19/44] LEVELS and how to replace it --- posts/2024-12-12-non-api-use/index.qmd | 91 ++++++++++++++++++-------- 1 file changed, 65 insertions(+), 26 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 43d36c70..77130873 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -448,12 +448,12 @@ if (verbose) { Encoding bits: `LEVELS` ----------------------- -`LEVELS` is the name of the internal R [macro][LEVELS_macro] and a -"public" [function][LEVELS_function] accessing a [16-bit field called -`gp`][LEVELS_field] that is present in the header of every `SEXP` value. -Not every access to this field is done using the `LEVELS()` macro; there -are bits of R code that access `(sexp)->sxpinfo.gp` directly. 
R uses -this field for many purposes: +`LEVELS` is the name of the internal R [macro][LEVELS_macro] and an +exported non-API [function][LEVELS_function] accessing a [16-bit field +called `gp`][LEVELS_field] that is present in the header of every `SEXP` +value. Not every access to this field is done using the `LEVELS()` +macro; there are bits of R code that access `(sexp)->sxpinfo.gp` +directly. R uses this field for many purposes: * matching given arguments against the formals of a function ([1][gp_for_match1], [2][gp_for_match2], [3][gp_for_match3]) @@ -477,29 +477,59 @@ this field for many purposes: * [marking][gp_for_env] environments as [locked][envflags_locked], or for [caching][envflags_global] the global variable lookup, or for looking up values in the base environment or the special functions - ([1][gp_for_basesym], [2][basesym2]) + ([1][gp_for_basesym], [2][basesym2], [3][gp_for_special], + [4][specialsym2]) * [marking][gp_for_hashash] symbols naming environment contents for [hash lookup][hashash2] * [marking][gp_for_active] bindings inside environments as [active][active_binding] - * [marking][gp_for_promsxp] promise objects (which are themselves not - part of the API) as already evaluated + * [marking][gp_for_promsxp] promise objects as already evaluated * [marking][gp_for_charsxp] `CHARSXP` values as present in the global - cache or being in a certain encoding (bytes, Latin-1, ASCII, UTF-8, - or "unknown" (native)) - -[Waiting for R-4.5.0 to release with the new API][remove_levels] - -Why used: need to know the encoding. Distinguish between `CE_UTF8` and -string actually in UTF-8 (can also happen with `CE_NATIVE` in a UTF-8 -locale) - - - - + cache or being in a certain encoding + +Although the value of `gp` is directly stored in R's serialized data +stream, neither of these are part of the API. Out of all possible uses +for this flag, `data.table` is only used in string encodings. 
From the +viewpoints of [plain R][R_Encoding] and the [C API][WRE_encoding], an +individual string (`CHARSXP` value) can be represented in the following +encodings: + +R-level encoding name | C-level encoding constant | Meaning +:----------------:|:----------------:|------------------------------ +`"latin1"` | `CE_LATIN1` | ISO/IEC 8859-1 or CP1252 +`"UTF-8"` | `CE_UTF8` | ISO/IEC 10646 +`"unknown"` | `CE_NATIVE` | Encoding of the current locale +`"bytes"` | `CE_BYTES` | Not text, `translateChar` will fail + +Internally, R also [marks strings as encoded in ASCII][R_SET_ASCII]: +since all three encodings are ASCII-compatible, an ASCII string will +never need to be translated into a different encoding. Note that there +is a subtle difference between a string _marked_ in a certain encoding +and actually _being_ in a certain encoding: in an R session running with +a UTF-8 locale (which includes most modern Unix-alikes and Windows ≥ +10, November 2019 update) a string marked as `CE_NATIVE` will also be in +UTF-8. (Similarly, with an increasingly rare Latin-1 locale, a +`CE_NATIVE` string will be in Latin-1.) + +The `data.table` code is interested in knowing whether a string is +[marked as UTF-8, Latin-1, or ASCII][datatable_isencoded]. This is used +to [convert strings to UTF-8 when needed][datatable_needUTF8] (also: +[output to native encoding or UTF-8 in +`fwrite`][datatable_ENCODED_CHAR], [automatic conversion in +`forder`][datatable_anynotascii]). The `getCharCE` API function appeared +in R-2.7.0 together with the encoding support, so switching the +`IS_UTF8` and `IS_LATIN` macros from `LEVELS` to API calls [was +relatively straightforward][datatable_levels1]. + +R-4.5.0 is expected to introduce the `charIsASCII` "experimental" API +function that returns the value of the ASCII marker for a `CHARSXP` +value, which [will replace the use of `LEVELS` in the `IS_ASCII` +macro][remove_levels]. 
Curiously, while it looks like the code could +benefit from switching from the `getCharCE()` tests (which only look at +the value of the flags and so may needlessly translate strings from +`CE_NATIVE`) to the new experimental `charIs(ASCII|UTF8|Latin1)` +functions that will also return `TRUE` for a matching native encoding, +actually making the change breaks a number of unit tests. Growable vectors: `SETLENGTH`, `SET_GROWABLE_BIT`, `SET_TRUELENGTH` ------------------------------------------------------------------- @@ -621,8 +651,17 @@ References [active_binding]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L3466-L3483 [gp_for_basesym]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L1225-L1228 [basesym2]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/envir.c#L754-L768 - +[gp_for_special]: https://github.com/r-devel/r-svn/blob/2753df314f7d8e154bc42b5abd99daaf6472dbe1/src/include/Defn.h#L1230-L1236 +[specialsym2]: https://github.com/r-devel/r-svn/blob/2753df314f7d8e154bc42b5abd99daaf6472dbe1/src/main/names.c#L1019-L1046 [gp_for_promsxp]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L1165-L1166 [gp_for_charsxp]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L843-L853 -[remove_levels]: https://github.com/Rdatatable/data.table/pull/6422 +[R_Encoding]: https://search.r-project.org/R/refmans/base/html/Encoding.html +[WRE_Encoding]: https://cran.r-project.org/doc/manuals/R-exts.html#Character-encoding-issues +[R_SET_ASCII]: https://github.com/r-devel/r-svn/blob/2753df314f7d8e154bc42b5abd99daaf6472dbe1/src/main/envir.c#L4312-L4375 +[datatable_isencoded]: https://github.com/Rdatatable/data.table/blob/40ad2e6978202ecc626db9eaae3a18ed5e4df769/src/data.table.h#L36-L38 +[datatable_needUTF8]: 
https://github.com/Rdatatable/data.table/blob/40ad2e6978202ecc626db9eaae3a18ed5e4df769/src/data.table.h#L63-L73 +[datatable_ENCODED_CHAR]: https://github.com/Rdatatable/data.table/blob/40ad2e6978202ecc626db9eaae3a18ed5e4df769/src/fwriteR.c#L8-L12 +[datatable_anynotascii]: https://github.com/Rdatatable/data.table/blob/40ad2e6978202ecc626db9eaae3a18ed5e4df769/src/forder.c#L312-L331 +[datatable_levels1]: https://github.com/Rdatatable/data.table/pull/6420/commits/46dbfa93e72776c59dacb286de9831fa28c481b5#diff-3b83136e49e2df4f5df80b312d7d4199fed9e0d283401dbf7bd9159a5096bcaaL36 +[remove_levels]: https://github.com/Rdatatable/data.table/pull/6422/commits/72cbd170fd16844dd8094b8d049d2e56d0926d22 [news173]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-13 From c6892a56f920845552575b7a4ed21eb305a858e5 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 13 Nov 2024 17:41:44 +0300 Subject: [PATCH 20/44] We regret to inform you of fastdigest archival --- posts/2024-12-12-non-api-use/index.qmd | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 77130873..3fb9a682 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -80,14 +80,13 @@ The remaining _public_ functions, neither documented as API nor explicitly forbidden by ` R CMD check `, sat there, alluring the package developers with their offers. For example, the [serialization interface][ltierney_serialize] is only [documented in WRE since -R-4.5][WRE45serialize], but it has been powering the [fastdigest] CRAN -package since 2015 at the latest, the maintainer having successfully -gambled on it not to change too drastically. 
Some of the inclusions in -`tools:::nonAPI` could have been historical mistakes: while WRE has been -saying [back in version 3.3.0][WRE33wilcox] that `wilcox_free` should be -called after a call to the (API) functions `dwilcox`, `pwilcox` or -`qwilcox`, the function was only [declared in the public -headers][wilcox_declared] and [removed from +R-4.5][WRE45serialize], but it has been powering part of the [digest] +CRAN package since 2019 (and other packages before it) without any +drastic changes. Some of the inclusions in `tools:::nonAPI` could have +been historical mistakes: while WRE has been saying [back in version +3.3.0][WRE33wilcox] that `wilcox_free` should be called after a call to +the (API) functions `dwilcox`, `pwilcox` or `qwilcox`, the function was +only [declared in the public headers][wilcox_declared] and [removed from `tools:::nonAPI`][wilcox_api] in R-4.2.0. Still, between R-3.3.3 and R-4.4.2, `tools:::nonAPI` grew from `r length(nonAPI.3_3)` to `r length(nonAPI.4_4)` entries, and the package maintainers had to adapt @@ -595,7 +594,7 @@ References [WRE33API]: https://web.archive.org/web/20160609093632/https://cran.r-project.org/doc/manuals/R-exts.html#The-R-API [ltierney_serialize]: https://homepage.divms.uiowa.edu/~luke/R/serialize/serialize.html [WRE45serialize]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Custom-serialization-input-and-output -[fastdigest]: https://cran.r-project.org/package=fastdigest +[digest]: https://cran.r-project.org/package=digest [WRE33wilcox]: https://web.archive.org/web/20160609093632/https://cran.r-project.org/doc/manuals/R-exts.html#Distribution-functions [wilcox_declared]: https://github.com/r-devel/r-svn/commit/1638b0106279aa1944b17742054bc6882656596e [wilcox_api]: https://github.com/r-devel/r-svn/commit/32ea1f67f842e3247f782a91684023b0b5eec6c5 From 59fb0eaabf63d620477b25751be2bc5aef2a85c2 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 13 Nov 2024 18:00:14 +0300 Subject: [PATCH 21/44] Tweaks --- 
posts/2024-12-12-non-api-use/index.qmd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 3fb9a682..3bec50a6 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -488,9 +488,9 @@ directly. R uses this field for many purposes: Although the value of `gp` is directly stored in R's serialized data stream, neither of these are part of the API. Out of all possible uses -for this flag, `data.table` is only used in string encodings. From the -viewpoints of [plain R][R_Encoding] and the [C API][WRE_encoding], an -individual string (`CHARSXP` value) can be represented in the following +for this flag, `data.table` is only interested in string encodings. From +the viewpoints of [plain R][R_Encoding] and the [C API][WRE_encoding], +an individual string (`CHARSXP` value) can be marked with the following encodings: R-level encoding name | C-level encoding constant | Meaning @@ -498,7 +498,7 @@ R-level encoding name | C-level encoding constant | Meaning `"latin1"` | `CE_LATIN1` | ISO/IEC 8859-1 or CP1252 `"UTF-8"` | `CE_UTF8` | ISO/IEC 10646 `"unknown"` | `CE_NATIVE` | Encoding of the current locale -`"bytes"` | `CE_BYTES` | Not text, `translateChar` will fail +`"bytes"` | `CE_BYTES` | Not necessarily text; `translateChar` will fail Internally, R also [marks strings as encoded in ASCII][R_SET_ASCII]: since all three encodings are ASCII-compatible, an ASCII string will @@ -526,9 +526,9 @@ value, which [will replace the use of `LEVELS` in the `IS_ASCII` macro][remove_levels]. 
Curiously, while it looks like the code could benefit from switching from the `getCharCE()` tests (which only look at the value of the flags and so may needlessly translate strings from -`CE_NATIVE`) to the new experimental `charIs(ASCII|UTF8|Latin1)` -functions that will also return `TRUE` for a matching native encoding, -actually making the change breaks a number of unit tests. +`CE_NATIVE`) to the new experimental `charIs(UTF8|Latin1)` functions +that will also return `TRUE` for a matching native encoding, actually +making the change breaks a number of unit tests. Growable vectors: `SETLENGTH`, `SET_GROWABLE_BIT`, `SET_TRUELENGTH` ------------------------------------------------------------------- From be38ad9b168f973ea29c325e3ee5e58b3ec86e16 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 13 Nov 2024 21:01:39 +0300 Subject: [PATCH 22/44] Start work on over-allocated lists & names --- posts/2024-12-12-non-api-use/index.qmd | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 3bec50a6..a75337ab 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -533,7 +533,11 @@ making the change breaks a number of unit tests. Growable vectors: `SETLENGTH`, `SET_GROWABLE_BIT`, `SET_TRUELENGTH` ------------------------------------------------------------------- -Introduced in [v1.7.3, November 2011][news173]. +Over-allocated `data.table`s have been introduced in [v1.7.3, November +2011][news173]. It was a simpler time, when package code could [just set +`LENGTH(newdt) = l` and `TRUELENGTH(newdt) = +n`][datatable_overallocation] for the newly allocated `VECSXP` list. 
+ @@ -664,3 +668,4 @@ References [datatable_levels1]: https://github.com/Rdatatable/data.table/pull/6420/commits/46dbfa93e72776c59dacb286de9831fa28c481b5#diff-3b83136e49e2df4f5df80b312d7d4199fed9e0d283401dbf7bd9159a5096bcaaL36 [remove_levels]: https://github.com/Rdatatable/data.table/pull/6422/commits/72cbd170fd16844dd8094b8d049d2e56d0926d22 [news173]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-13 +[datatable_overallocation]: https://github.com/Rdatatable/data.table/commit/e09d91beccc862eebcd9497c27b422058320396b#diff-22b103646a1efab9bbfc374791ccfc3fd1422eefc48918a3e126fc2f30d1f572R262-R276 From 3a9235a60035a8011f9b88d1b03bdb828b393dbb Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 15 Nov 2024 19:42:23 +0300 Subject: [PATCH 23/44] Tweaks --- posts/2024-12-12-non-api-use/index.qmd | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index a75337ab..e05173c5 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -534,12 +534,20 @@ Growable vectors: `SETLENGTH`, `SET_GROWABLE_BIT`, `SET_TRUELENGTH` ------------------------------------------------------------------- Over-allocated `data.table`s have been introduced in [v1.7.3, November -2011][news173]. It was a simpler time, when package code could [just set -`LENGTH(newdt) = l` and `TRUELENGTH(newdt) = -n`][datatable_overallocation] for the newly allocated `VECSXP` list. - - - +2011][news173], together with the `:=` operator for changing the columns +by reference. Since `data.frame`s and `data.table`s are lists, and lists +in R are value types with pass-by-value semantics (see above), adding or +removing a column to one the normally involves allocating a new vector +of pointers to columns (performing a "shallow duplicate"). 
+ +The [over-allocated vector of column pointers][datatable_overallocation] +(together with the `names` vector) remember their original length in the +`TRUELENGTH` field and can be resized in place by gradually increasing +their `LENGTH`. The walrus operator, `:=`, has since become [the +defining feature of data.table][datatable_logo]. `data.table` version +1.8.8 (March 2013) [saw][news188] the addition of the `fread` function +that [over-allocated and later truncated the individual +columns][datatable_stretch_column] using the same approach. Why used: need to create new columns by reference, which requires free column and name slots @@ -669,3 +677,7 @@ References [remove_levels]: https://github.com/Rdatatable/data.table/pull/6422/commits/72cbd170fd16844dd8094b8d049d2e56d0926d22 [news173]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-13 [datatable_overallocation]: https://github.com/Rdatatable/data.table/commit/e09d91beccc862eebcd9497c27b422058320396b#diff-22b103646a1efab9bbfc374791ccfc3fd1422eefc48918a3e126fc2f30d1f572R262-R276 +[datatable_logo]: https://raw.githubusercontent.com/Rdatatable/data.table/master/.graphics/logo.png +[news188]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-5 +[datatable_stretch_column]: https://github.com/Rdatatable/data.table/commit/b4e023df736fed8c4dc536ac0061e895a565b375#diff-697a3094ef3d287d25b94aa344f7ed0262aa3fdb97af9b7e04e3b0ef585b05bcR30-R56 +[RI113]: https://cran.r-project.org/doc/manuals/R-ints.html#The-_0027data_0027 From d028fcb1ca07f3a4f3de6a6fa1f8f5aa3e622203 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sat, 16 Nov 2024 21:25:26 +0300 Subject: [PATCH 24/44] History of TRUELENGTH Also, add references to R Internals and restructure the (TRUE)LENGTH sections into related sub-sections. 
--- posts/2024-12-12-non-api-use/index.qmd | 102 ++++++++++++++++--------- 1 file changed, 64 insertions(+), 38 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index e05173c5..74e264aa 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -326,18 +326,18 @@ The underlying reality of strings is more complicated: since they internally manage memory buffers containing text in a given encoding, they must be subject to garbage collection. Like other managed objects in R, they are represented as `SEXP` values of special type `CHARSXP`. -R's garbage collector is [generational][Tierney_gengc] and requires the -use of [write barrier][Tierney_writebr] any time a `SEXP` value (such as -a `STRSXP` vector) references another `SEXP` value (such as a `CHARSXP` -string). In a generational garbage collector, "younger" generations are -marked and sweeped more frequently than "older" ones. If package C code -manually writes a reference to a "young" `CHARSXP` object into an "old" -`STRSXP` vector without taking generations into account, a following -collection of the "young" pool of objects will miss the `CHARSXP` being -referenced by the "old" `STRSXP` and remove the `CHARSXP` as "garbage". -This makes the `SEXP *` pointers returned by `STRING_PTR` unsafe and -requires the use of `STRING_PTR_RO` function, which returns a read-only -`const SEXP *`. +R's garbage collector is [generational and requires the use of a write +barrier][RI17] ([1][Tierney_gengc], [2][Tierney_writebr]) any time a +`SEXP` value (such as a `STRSXP` vector) references another `SEXP` value +(such as a `CHARSXP` string). In a generational garbage collector, +"younger" generations are marked and swept more frequently than +"older" ones.
If package C code manually writes a reference to a "young" +`CHARSXP` object into an "old" `STRSXP` vector without taking +generations into account, a following collection of the "young" pool of +objects will miss the `CHARSXP` being referenced by the "old" `STRSXP` +and remove the `CHARSXP` as "garbage". This makes the `SEXP *` pointers +returned by `STRING_PTR` unsafe and requires the use of `STRING_PTR_RO` +function, which returns a read-only `const SEXP *`. Thankfully, `data.table` has already been using read-only `const SEXP *` pointers when working with `STRSXP` vectors, so the required changes to @@ -449,10 +449,10 @@ Encoding bits: `LEVELS` `LEVELS` is the name of the internal R [macro][LEVELS_macro] and an exported non-API [function][LEVELS_function] accessing a [16-bit field -called `gp`][LEVELS_field] that is present in the header of every `SEXP` -value. Not every access to this field is done using the `LEVELS()` -macro; there are bits of R code that access `(sexp)->sxpinfo.gp` -directly. R uses this field for many purposes: +called `gp`][LEVELS_field] ([general-purpose][RI112]) that is present in +the header of every `SEXP` value. Not every access to this field is +done using the `LEVELS()` macro; there are bits of R code that access +`(sexp)->sxpinfo.gp` directly. R uses this field for many purposes: * matching given arguments against the formals of a function ([1][gp_for_match1], [2][gp_for_match2], [3][gp_for_match3]) @@ -530,40 +530,55 @@ the value of the flags and so may needlessly translate strings from that will also return `TRUE` for a matching native encoding, actually making the change breaks a number of unit tests. 
-Growable vectors: `SETLENGTH`, `SET_GROWABLE_BIT`, `SET_TRUELENGTH` -------------------------------------------------------------------- +`SETLENGTH`, `SET_GROWABLE_BIT`, `(SET_)TRUELENGTH` +--------------------------------------------------- -Over-allocated `data.table`s have been introduced in [v1.7.3, November +### Growable vectors + +Since `data.frame`s and `data.table`s are lists, and lists in R are +value types with pass-by-value semantics (see above), adding or removing +a column to one of them normally involves allocating a new list referencing +the same columns (performing a "shallow duplicate"). By contrast, the +[over-allocated vectors][datatable_overallocation] can be resized in +place by gradually increasing their `LENGTH` (remembering their original +length in the `TRUELENGTH` field), obviating the need for shallow +duplicates at the cost of making `data.table` shared, by-reference +values. The change has been introduced in [v1.7.3, November 2011][news173], together with the `:=` operator for changing the columns -by reference. Since `data.frame`s and `data.table`s are lists, and lists -in R are value types with pass-by-value semantics (see above), adding or -removing a column to one the normally involves allocating a new vector -of pointers to columns (performing a "shallow duplicate"). - -The [over-allocated vector of column pointers][datatable_overallocation] -(together with the `names` vector) remember their original length in the -`TRUELENGTH` field and can be resized in place by gradually increasing -their `LENGTH`. The walrus operator, `:=`, has since become [the -defining feature of data.table][datatable_logo]. `data.table` version -1.8.8 (March 2013) [saw][news188] the addition of the `fread` function -that [over-allocated and later truncated the individual -columns][datatable_stretch_column] using the same approach.
- -Why used: need to create new columns by reference, which requires free -column and name slots +by reference (which has since become [the defining feature of +data.table][datatable_logo]). `data.table` version 1.8.8 (March 2013) +[saw][news188] the addition of the `fread` function that [over-allocated +and later truncated the individual columns][datatable_stretch_column] +using the same approach. + +R's own use of `TRUELENGTH` is [varied][RI113]. The field itself +appeared in [R-0.63][R_truelength] together with the `VECSXP` lists (to +replace the Lisp-style linked pairlists). R [uses this +field][R_hashvalue] in `CHARSXP` strings to store the hash values +[belonging to symbols][R_install_truelen]. R's many `VECSXP`-based hash +tables use it to count the primary slots in use: hashes are used for +reference tracking during (un)serialization ([1][R_serialize_hash], +[2][R_saveload_hash]) and looking up environment contents +([1][R_envir_hashpri], [2][R_envir_hashval]). R-3.3 (May 2016) saw the +inclusion of [radix sort][R_radixsort] from `data.table` itself, which +uses `TRUELENGTH` to sort strings (more on that below). R-3.4 (April +2017) [introduced][R_growable] over-allocation when growing vectors due +to assignment outside their bounds. The [growable bit][gp_for_growable] +was added to prevent the mismanagement of the allocated memory counter: +without the bit set on the over-allocated vectors, the garbage collector +only counted `LENGTH(x)` instead of `TRUELENGTH(x)` units as released +when garbage-collecting the vector, inflating the counter over time. Why non-API: make a length too long and the list is broken. - What to do about it: reimplement in ALTREP on R ≥ 4.1 -Fast string matching: `SET_TRUELENGTH`, `TRUELENGTH` ----------------------------------------------------- +### Fast string matching Why used: to exploit the `CHARSXP` cache. 
R interns strings, so a string with the given contents and encoding bits exists as a single object, @@ -629,6 +644,7 @@ References [call]: https://search.r-project.org/R/refmans/base/html/call.html [WRE_call]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Creating-call-expressions [remove_set_typeof]: https://github.com/Rdatatable/data.table/pull/6313 +[RI17]: https://cran.r-project.org/doc/manuals/R-ints.html#The-write-barrier [Tierney_gengc]: https://homepage.stat.uiowa.edu/~luke/R/gengcnotes.html [Tierney_writebr]: https://homepage.stat.uiowa.edu/~luke/R/barrier.html [remove_string_ptr]: https://github.com/Rdatatable/data.table/pull/6312 @@ -639,6 +655,7 @@ References [LEVELS_macro]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L228 [LEVELS_function]:https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/memory.c#L3902 [LEVELS_field]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/include/Defn.h#L132 +[RI112]: https://cran.r-project.org/doc/manuals/R-ints.html#Rest-of-header [gp_for_match1]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/match.c#L175 [gp_for_match2]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/match.c#L233-L236 [gp_for_match3]: https://github.com/r-devel/r-svn/blob/c9437a83b9677074fe01310caac6a2a66cc7f680/src/main/unique.c#L53 @@ -681,3 +698,12 @@ References [news188]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-5 [datatable_stretch_column]: https://github.com/Rdatatable/data.table/commit/b4e023df736fed8c4dc536ac0061e895a565b375#diff-697a3094ef3d287d25b94aa344f7ed0262aa3fdb97af9b7e04e3b0ef585b05bcR30-R56 [RI113]: https://cran.r-project.org/doc/manuals/R-ints.html#The-_0027data_0027 +[R_truelength]: 
https://github.com/r-devel/r-svn/commit/2d4ae2c4bd593bc2aa2273076997b6e63bbcb782 +[R_hashvalue]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/include/Defn.h#L1184-L1187 +[R_install_truelen]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/names.c#L1256-L1272 +[R_serialize_hash]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/serialize.c#L617-L634 +[R_saveload_hash]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/saveload.c#L807-L834 +[R_envir_hashpri]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/envir.c#L193-L207 +[R_envir_hashval]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/envir.c#L497-L520 +[R_radixsort]: https://github.com/r-devel/r-svn/commit/4907092c953bd0b9c059474f77e40990ecf312b1 +[R_growable]: https://github.com/r-devel/r-svn/commit/287b8316232aea7c619d0cadcb515507b1e3ebfa From 82e5d1fc7270d4b463b42b1291fa474778dac131 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sat, 16 Nov 2024 23:44:48 +0300 Subject: [PATCH 25/44] List all uses of SETLENGTH, SET_GROWABLE_BIT --- posts/2024-12-12-non-api-use/index.qmd | 42 +++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 74e264aa..1b25d64b 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -546,10 +546,7 @@ duplicates at the cost of making `data.table` shared, by-reference values. The change has been introduced in [v1.7.3, November 2011][news173], together with the `:=` operator for changing the columns by reference (which has since become [the defining feature of -data.table][datatable_logo]). 
`data.table` version 1.8.8 (March 2013) -[saw][news188] the addition of the `fread` function that [over-allocated -and later truncated the individual columns][datatable_stretch_column] -using the same approach. +data.table][datatable_logo]). R's own use of `TRUELENGTH` is [varied][RI113]. The field itself appeared in [R-0.63][R_truelength] together with the `VECSXP` lists (to @@ -569,12 +566,32 @@ without the bit set on the over-allocated vectors, the garbage collector only counted `LENGTH(x)` instead of `TRUELENGTH(x)` units as released when garbage-collecting the vector, inflating the counter over time. +Nowadays, `data.table` uses vectors whose length is different from their +allocated size in many places. `src/dogroups.c` reuses the same memory +for the [`data.table` subset object `.SD`][datatable_docols_SD] and for +the [virtual row number column `.I`][datatable_docols_I] by shortening +the vectors to the size of the current group and later [restoring their +natural length][datatable_docols_restore]. Naturally, it [extends the +`data.table` for new columns][datatable_docols_extend] as needed. +`src/freadR.c` works with an over-estimated line count and so can +[truncate the columns][datatable_freadR_truncate] after the value is +known precisely. It may also [drop columns by +reference][datatable_freadR_drop]. In `src/subset.c`, the `subsetDT` +function [prepares an over-allocated +`data.table`][datatable_subset_alloc] together with its names. 
The most +important uses can be found in `src/assign.c`: the `shallow` function +[prepares][datatable_assign_shallow] an over-allocated `data.table` +referencing the columns of an existing `data.table`, `assign` +[creates][datatable_assign_create] or [removes][datatable_assign_remove] +columns by reference; finally, the `finalizer` causes an `INTSXP` vector +[with the fake length][datatable_assign_finalizer] to be (eventually) +garbage-collected to fix up a discrepancy in R's vector size accounting +caused by the existence of the over-allocated `data.table`. + Why non-API: make a length too long and the list is broken. - What to do about it: reimplement in ALTREP on R ≥ 4.1 @@ -695,7 +712,6 @@ References [news173]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-13 [datatable_overallocation]: https://github.com/Rdatatable/data.table/commit/e09d91beccc862eebcd9497c27b422058320396b#diff-22b103646a1efab9bbfc374791ccfc3fd1422eefc48918a3e126fc2f30d1f572R262-R276 [datatable_logo]: https://raw.githubusercontent.com/Rdatatable/data.table/master/.graphics/logo.png -[news188]: https://github.com/Rdatatable/data.table/blob/6a15f8617de121a406cee97b22e83e0c2c4bb034/NEWS.0.md#new-features-5 [datatable_stretch_column]: https://github.com/Rdatatable/data.table/commit/b4e023df736fed8c4dc536ac0061e895a565b375#diff-697a3094ef3d287d25b94aa344f7ed0262aa3fdb97af9b7e04e3b0ef585b05bcR30-R56 [RI113]: https://cran.r-project.org/doc/manuals/R-ints.html#The-_0027data_0027 [R_truelength]: https://github.com/r-devel/r-svn/commit/2d4ae2c4bd593bc2aa2273076997b6e63bbcb782 @@ -707,3 +723,15 @@ References [R_envir_hashval]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/envir.c#L497-L520 [R_radixsort]: https://github.com/r-devel/r-svn/commit/4907092c953bd0b9c059474f77e40990ecf312b1 [R_growable]: https://github.com/r-devel/r-svn/commit/287b8316232aea7c619d0cadcb515507b1e3ebfa +[datatable_docols_SD]: 
https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L197 +[datatable_docols_I]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L230-L237 +[datatable_docols_restore]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L482-L485 +[datatable_docols_extend]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L318-L324 +[datatable_freadR_truncate]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/freadR.c#L536-L538 +[datatable_freadR_drop]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/freadR.c#L551-L552 +[datatable_subset_alloc]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/subset.c#L300-L334 +[datatable_assign_shallow]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L192-L196 +[datatable_assign_create]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L535-L536 +[datatable_assign_remove]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L733-L734 +[datatable_assign_finalizer]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L21 + From 88030b6bd7f77aa3a2f66d956258929f67068238 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 00:23:32 +0300 Subject: [PATCH 26/44] SETLENGTH: use bullet lists Enumerate the problems with SETLENGTH --- posts/2024-12-12-non-api-use/index.qmd | 87 ++++++++++++++++++-------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 1b25d64b..a9fb2b9f 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ 
b/posts/2024-12-12-non-api-use/index.qmd @@ -567,31 +567,68 @@ only counted `LENGTH(x)` instead of `TRUELENGTH(x)` units as released when garbage-collecting the vector, inflating the counter over time. Nowadays, `data.table` uses vectors whose length is different from their -allocated size in many places. `src/dogroups.c` reuses the same memory -for the [`data.table` subset object `.SD`][datatable_docols_SD] and for -the [virtual row number column `.I`][datatable_docols_I] by shortening -the vectors to the size of the current group and later [restoring their -natural length][datatable_docols_restore]. Naturally, it [extends the -`data.table` for new columns][datatable_docols_extend] as needed. -`src/freadR.c` works with an over-estimated line count and so can -[truncate the columns][datatable_freadR_truncate] after the value is -known precisely. It may also [drop columns by -reference][datatable_freadR_drop]. In `src/subset.c`, the `subsetDT` -function [prepares an over-allocated -`data.table`][datatable_subset_alloc] together with its names. The most -important uses can be found in `src/assign.c`: the `shallow` function -[prepares][datatable_assign_shallow] an over-allocated `data.table` -referencing the columns of an existing `data.table`, `assign` -[creates][datatable_assign_create] or [removes][datatable_assign_remove] -columns by reference; finally, the `finalizer` causes an `INTSXP` vector -[with the fake length][datatable_assign_finalizer] to be (eventually) -garbage-collected to fix up a discrepancy in R's vector size accounting -caused by the existence of the over-allocated `data.table`. - -Why non-API: make a length too long and the list is broken. 
- +allocated size in many places: + +* `src/dogroups.c` + * reuses the same memory for the [`data.table` subset object + `.SD`][datatable_docols_SD] and for the [virtual row number column + `.I`][datatable_docols_I] by shortening the vectors to the size of + the current group + * later [restores their natural length][datatable_docols_restore] + * [extends the `data.table` for new columns][datatable_docols_extend] + as needed +* `src/freadR.c` + * works with an over-estimated line count and so can [truncate the + columns][datatable_freadR_truncate] after the value is known + precisely + * may also [drop columns by reference][datatable_freadR_drop] +* `src/subset.c` + * the `subsetDT` function [prepares an over-allocated + `data.table`][datatable_subset_alloc] together with its names. +* `src/assign.c` + * the `shallow` function [prepares][datatable_assign_shallow] an + over-allocated `data.table` referencing the columns of an existing + `data.table` + * `assign` [creates][datatable_assign_create] or + [removes][datatable_assign_remove] columns by reference + * `finalizer` causes an `INTSXP` vector [with the fake + length][datatable_assign_finalizer] to be (eventually) + garbage-collected to fix up a discrepancy in R's vector size + accounting caused by the existence of the over-allocated + `data.table` + +`SETLENGTH` presents many opportunities to create inconsistencies within +R: + + +* When copying shortened objects without the `GROWABLE_BIT` set, R + allocates and copies only `XLENGTH` elements, but preserves the + `TRUELENGTH`. + * For this and other reasons, `data.table`s have a special + `.internal.selfrep` attribute attached containing an `EXTPTR` back + to itself. A copy of a table can be detected because it will have a + different address. + * Setting the `GROWABLE_BIT` on the `data.table` would make R set the + `TRUELENGTH` equal to the length of the vector when copying it. 
+* When deallocating shortened objects without the `GROWABLE_BIT` set, R + accounts only for the `XLENGTH` elements being released, overcounting + the total amount of allocated memory. + * `data.table` accounts for this using the `finalizer` on the + `.internal.selfrep` attribute. + * Setting the `GROWABLE_BIT` on the `data.table` would make R account + for `TRUELENGTH` elements instead of `LENGTH` elements. + +Unfortunately, `GROWABLE_BIT` is not part of the API, so using it is not +a real solution. + +* Setting `LENGTH` larger than the allocated length may cause the + process to access undefined or even unmapped memory. +* For `VECSXP` lists: when reducing the `LENGTH`, having anything other + than `R_NilValue` in the newly unaccessible cells will also make them + unreachable from the viewpoint of the garbage collector, potentially + prompting it to reuse or unmap the pointed-to memory. Increasing the + `LENGTH` again with invalid pointers in the newly accessible slots + will make an invalid list that cannot be safely altered or discarded. What to do about it: reimplement in ALTREP on R ≥ 4.1 From 67ad762959433afbdc40a52a9ed093cf4ad9b09a Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 11:24:15 +0300 Subject: [PATCH 27/44] Links and elaborations for SETLENGTH --- posts/2024-12-12-non-api-use/index.qmd | 63 ++++++++++++++++++-------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index a9fb2b9f..62b7cdff 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -600,35 +600,59 @@ allocated size in many places: `SETLENGTH` presents many opportunities to create inconsistencies within R: - * When copying shortened objects without the `GROWABLE_BIT` set, R - allocates and copies only `XLENGTH` elements, but preserves the - `TRUELENGTH`. 
+ allocates and copies only `XLENGTH` elements and [duplicates the value + of `TRUELENGTH`][R_duplicate_truelength]. * For this and other reasons, `data.table`s have a special - `.internal.selfrep` attribute attached containing an `EXTPTR` back - to itself. A copy of a table can be detected because it will have a - different address. - * Setting the `GROWABLE_BIT` on the `data.table` would make R set the - `TRUELENGTH` equal to the length of the vector when copying it. + [`.internal.selfref` attribute][datatable_assign_selfref] attached + containing an `EXTPTR` back to itself. A copy of a table can be + detected because it will have a different address. + * Setting the `GROWABLE_BIT` on the `data.table` would make R keep the + default `TRUELENGTH` (0) instead of copying it. * When deallocating shortened objects without the `GROWABLE_BIT` set, R - accounts only for the `XLENGTH` elements being released, overcounting - the total amount of allocated memory. - * `data.table` accounts for this using the `finalizer` on the - `.internal.selfrep` attribute. + [accounts only for the `XLENGTH` elements][R_memory_getVecSize] being + released, overcounting the total amount of allocated memory. + * `data.table` compensates for this using the + [`finalizer`][datatable_assign_finalizer] on the `.internal.selfref` + attribute. * Setting the `GROWABLE_BIT` on the `data.table` would make R account - for `TRUELENGTH` elements instead of `LENGTH` elements. + for `TRUELENGTH` elements instead of `XLENGTH` elements. -Unfortunately, `GROWABLE_BIT` is not part of the API, so using it is not -a real solution. +Unfortunately, `GROWABLE_BIT` is not part of the API, so using it will +not help in the long run. * Setting `LENGTH` larger than the allocated length may cause the process to access undefined or even unmapped memory. 
-* For `VECSXP` lists: when reducing the `LENGTH`, having anything other - than `R_NilValue` in the newly unaccessible cells will also make them +* For vectors containing other `SEXP` values (`VECSXP`, `EXPRSXP`, + `STRSXP`) vectors: when reducing the `LENGTH`, having a non-persistent + value (something other than `R_NilValue` or `R_BlankString` or + `R_NaString`) in the newly inaccessible cells will also make them unreachable from the viewpoint of the garbage collector, potentially prompting it to reuse or unmap the pointed-to memory. Increasing the `LENGTH` again with invalid pointers in the newly accessible slots - will make an invalid list that cannot be safely altered or discarded. + will make an invalid vector that cannot be safely altered or + discarded: + + ```c + #include <R.h> + #include <Rinternals.h> + void foo(void) { + { + SEXP list = PROTECT(allocVector(VECSXP, 100)), elt; + SET_VECTOR_ELT(list, 99, elt = allocVector(REALSXP, 1000)); + + double * p = REAL(elt); // initialise the vector + for (R_xlen_t i = 0; i < xlength(elt); ++i) p[i] = i; + + SETLENGTH(list, 1); // elt is unreachable + R_gc(); // elt is collected + SETLENGTH(list, 100); // invalid elt is reachable again + R_gc(); // invalid elt is accessed + UNPROTECT(1); + } + R_gc(); // crash here if not above + } + ``` What to do about it: reimplement in ALTREP on R ≥ 4.1 @@ -771,4 +795,7 @@ References [datatable_assign_create]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L535-L536 [datatable_assign_remove]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L733-L734 [datatable_assign_finalizer]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L21 +[R_duplicate_truelength]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/duplicate.c#L43-L81 +[datatable_assign_selfref]: 
https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L27-L63 +[R_memory_getVecSize]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/memory.c#L1108-L1109 From 65b4a8ee056f38d5cd63951efbdaebc7e036fcc1 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 13:14:00 +0300 Subject: [PATCH 28/44] ALTREP over-allocated data.tables --- posts/2024-12-12-non-api-use/index.qmd | 82 +++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 62b7cdff..a676f1bd 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -565,6 +565,9 @@ was added to prevent the mismanagement of the allocated memory counter: without the bit set on the over-allocated vectors, the garbage collector only counted `LENGTH(x)` instead of `TRUELENGTH(x)` units as released when garbage-collecting the vector, inflating the counter over time. +[ALTREP] objects introduced in R-3.5 (April 2018) don't have a +`TRUELENGTH`: it [cannot be set][R_altrep_set_truelen] and is [returned +as 0][R_altrep_truelen]. Nowadays, `data.table` uses vectors whose length is different from their allocated size in many places: @@ -654,7 +657,73 @@ not help in the long run. } ``` -What to do about it: reimplement in ALTREP on R ≥ 4.1 +[Starting with R-4.3][R_PR17620], R packages can implement their own +`VECSXP`-like objects using the [ALTREP] framework; `STRSXP` objects +have been supported since R-3.5. An `ALTREP` object is defined by its +_class_ (a collection of methods) and two arbitrary R values, `data1` +and `data2`. A simple implementation of a shortened, expandable vector +could hold a full-length vector in the `data1` slot and its +pretend-length as a one-element `REALSXP` value in the `data2` slot. 
+(Currently, `R_xlen_t` values are limited by the largest integer +precisely representable in an IEEE `double` value, which is $2^{52}$.) +The overallocated class will need to implement the following methods: + +* [Common ALTREP methods][Rapi_altrep_methods]: + * `Length()`, returning the pretend-length of the vector. Required. + * `Duplicate(deep)`. If not implemented, R will create a copy as an + ordinary object using the length and the data pointer provided by + the class. + * There is also `DuplicateEX(deep)`, which is responsible for + copying the value _and_ the attributes, but it may be hard to + implement within the API bounds (`ATTRIB` is not API), and R + provides a default implementation that calls `Duplicate` above. + * Shared mutable vectors [can cause problems][Tierney_mutable], so a + decision to let the `Duplicate()` return value share the original + vector will require a lot of thought and testing. + * `Serialized_state()`, `Unserialize(state)`. If not implemented, R + will serialize the value as an ordinary object, which is what + currently happens for `data.table`s. Once an R package implements an + ALTREP class with a `Serialized_state` method, the format is set in + stone; any changes will have to introduce a new class. + * Similarly, there is `UnserializeEX(state, attr, + objf, levs)` responsible for setting `LEVELS`, the object bit, and + the attributes; the default implementation should suffice. + * `Inspect(pre, deep, pvec, inspect_subtree)`. May `Rprintf` some + information from the ALTREP fields before returning `FALSE` to let R + continue `inspect`ing the object. +* [Common `altvec` methods][Rapi_altvec_methods] required for most code + to work with the class: + * `Dataptr(writable)`, returning the pointer to the start of the array + backing the underlying vector. For `VECSXP` or `STRSXP` vectors, + `writable` should always be `FALSE`, so `DATAPTR_RO` can be used. + * `Dataptr_or_null()`. May delegate to `Dataptr(FALSE)` above. 
+ * `Extract_subset(indx, call)`. Must allocate and return `x[indx]` for + 1-based numeric `indx` that may be outside the bounds of `x`. +* Class-specific methods: + * [`altstring` methods][Rapi_altstring_methods]: + * `Elt(i)`. Must return `x[[i]]` for 0-based `i` or signal an error. + Required. + * `Set_elt(i, v)`. Must perform `x[[i]] <- v` for 0-based `i` or + signal an error. Required. + * `Is_sorted()`. If not implemented, will always return + `UNKNOWN_SORTEDNESS`. + * `No_NA()`. If not implemented, will always return 0 (unknown whether + contains missing values). + * [`altlist` methods][Rapi_altlist_methods]: + * `Elt(i)` and `Set_elt(i, v)` like above. + +Additionally, `data.table` will need a function to [create new ALTREP +tables][Rapi_new_altrep] and to resize the vector in place. The resize +function will need to check whether the given value +[`R_altrep_inherits`][Rapi_altrep_inherits] from the overallocated class +and then modify the ALTREP data slots as needed. The function may even +reallocate the payload to enlarge the vector in place past the original +allocation limit without requiring a `setDT` call from the user. Since a +reallocation will invalidate any cached data pointers, it must be only +used from inside `data.table`, not from the ALTREP methods. + +The original implementation that uses `SETLENGTH` can be kept behind +`#if R_VERSION < R_Version(4,3,0)` for backwards compatibility. 
### Fast string matching @@ -784,6 +853,8 @@ References [R_envir_hashval]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/envir.c#L497-L520 [R_radixsort]: https://github.com/r-devel/r-svn/commit/4907092c953bd0b9c059474f77e40990ecf312b1 [R_growable]: https://github.com/r-devel/r-svn/commit/287b8316232aea7c619d0cadcb515507b1e3ebfa +[R_altrep_set_truelen]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/include/Defn.h#L391 +[R_altrep_truelen]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/altrep.c#L345 [datatable_docols_SD]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L197 [datatable_docols_I]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L230-L237 [datatable_docols_restore]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L482-L485 @@ -798,4 +869,11 @@ References [R_duplicate_truelength]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/duplicate.c#L43-L81 [datatable_assign_selfref]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L27-L63 [R_memory_getVecSize]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/memory.c#L1108-L1109 - +[R_PR17620]: https://bugs.r-project.org/show_bug.cgi?id=17620 +[Rapi_altrep_methods]: https://aitap.codeberg.page/R-api/#R_005fset_005faltrep_005f_002e_002e_002e_005fmethod +[Tierney_mutable]: https://github.com/ALTREP-examples/Rpkg-mutable/blob/master/vignettes/mutable.Rmd +[Rapi_altvec_methods]: https://aitap.codeberg.page/R-api/#R_005fset_005faltvec_005f_002e_002e_002e_005fmethod +[Rapi_altstring_methods]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltstring_005fclass +[Rapi_altlist_methods]: 
https://aitap.codeberg.page/R-api/#R_005fmake_005faltlist_005fclass +[Rapi_new_altrep]: https://aitap.codeberg.page/R-api/#R_005fnew_005faltrep +[Rapi_altrep_inherits]: https://aitap.codeberg.page/R-api/#index-R_005faltrep_005finheritsaltrep_005finherits From 5dcb5dc2cf447d6ed69a84a9b0fa64c9acecf4bb Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 14:07:13 +0300 Subject: [PATCH 29/44] Tweaks, more on ALTREP methods --- posts/2024-12-12-non-api-use/index.qmd | 89 ++++++++++++++++---------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index a676f1bd..0c6e141d 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -28,13 +28,13 @@ Fast forward more than four decades and an increase by more than three orders of magnitude in storage and processing capability of computers around us. The [dominant implementation of S is now R][is.R]. It is now feasible to implement algorithms solely in R, recouping the potential -performance losses in performance by reducing the programmer effort -spent debugging and maintaining the code [@Nash2024]. Still, the -capability of R to be extended by special-purpose compiled code is as -important as ever. As of `r when`, -`r round(sum(needscomp)/length(needscomp)*100)`% of CRAN packages use -compiled code. Since the implementation language of R is C, not Fortran, -the programming interface for R is also defined in terms of C. +performance losses by reducing the programmer effort spent debugging and +maintaining the code [@Nash2024]. Still, the capability of R to be +extended by special-purpose compiled code is as important as ever. As of +`r when`, `r round(sum(needscomp)/length(needscomp)*100)`% of CRAN +packages use compiled code. Since the implementation language of R is C, +not Fortran, the application programming interface (API) for R is also +defined in terms of C. What's in an API? 
================= @@ -130,8 +130,8 @@ currently present in `tools:::nonAPI` (emphasis added): > files._ Correspondingly, the number of `tools:::nonAPI` entry points in the -current development version of R rose to `r length(nonAPI.trunk)`, hence -the present blog post. +current development version of R rose to `r length(nonAPI.trunk)`, +prompting the blog post you are currently reading. @@ -146,14 +146,14 @@ extract neither-official-API nor-official-nonAPI counts --> -Use of non-API entry points in `data.table` -=========================================== +Non-API entry points used in `data.table` +========================================= The first version of the `data.table` package in the CRAN archive dates back to April 2006 (which corresponds to R version 2.3.0). It has been evolving together with R and its API and thus has accumulated a number -of uses of R internals that are [no longer considered part of the -API][remove_non_API]: +of uses of R internals that are [now flagged by ` R CMD check ` as +non-API][remove_non_API]: `r gsub( '(?m)^', '> ', perl = TRUE, @@ -168,9 +168,10 @@ Testing for a `data.frame`: `isFrame` Back in 2012, Matt Dowle needed to quickly test an object for being a `data.frame`, and the undocumented function `isFrame` seemed like it [did the job][datatable_isframe_added]. Since `isFrame` was not part of -the documented API, in 2024 Luke Tierney gave the function a +the documented interface, in 2024 Luke Tierney gave the function a better-fitting name, [`isDataFrame`][R_isdataframe_added], and made it -an experimental API, while retaining the original function as a wrapper. +an experimental entry point, while retaining the original function as a +wrapper. 
Use of `isFrame` [doesn't give a `NOTE` yet][remove_isframe], but when R-4.5.0 is released together with the new name for the function, @@ -182,7 +183,7 @@ Problem (the only instance in `data.table`): ```c if (!isVector(thiscol) || isFrame(thiscol)) - /* ^^^^^^^ may disappear in a future R version */ + // ^^^^^^^ may disappear in a future R version ``` Solution: @@ -328,8 +329,8 @@ they must be subject to garbage collection. Like other managed objects in R, they are represented as `SEXP` values of special type `CHARSXP`. R's garbage collector is [generational and requires the use of write barrier][RI17] ([1][Tierney_gengc], [2][Tierney_writebr]) any time a -`SEXP` value (such as a `STRSXP` vector) references another `SEXP` value -(such as a `CHARSXP` string). In a generational garbage collector, +`SEXP` value (such as an `STRSXP` vector) references another `SEXP` +value (such as a `CHARSXP` string). In a generational garbage collector, "younger" generations are marked and sweeped more frequently than "older" ones. If package C code manually writes a reference to a "young" `CHARSXP` object into an "old" `STRSXP` vector without taking @@ -389,12 +390,12 @@ arguments; the value types behave as if R copies them every time. And yet actually making these copies is wasteful when the code only reads the variable and does not alter it. (In fact, one of the original -motivations of `data.table` was to further reduce wasteful copying that -R used to perform during certain sub-assignment operations.) Until -version 4.0.0, `NAMED` was R's mechanism to save memory and CPU time -instead of creating and storing these copies. A temporary object such as -the value of `1:10` was not bound to a symbol and thus could be modified -right away. Assigning it to a variable, as in `x <- 1:10`, gave it a +motivations of `data.table` was to reduce certain wasteful copying of +data that happens during normal R computations.) 
Until version 4.0.0, +`NAMED` was R's mechanism to save memory and CPU time instead of +creating and storing these copies. A temporary object such as the value +of `1:10` was not bound to a symbol and thus could be modified right +away. Assigning it to a variable, as in `x <- 1:10`, gave it a `NAMED(x)` count of 1, for which R had an internal optimisation in replacement function calls like `foo(x) <- 3`. Assigning the same value to yet another symbol (by copying `y <- x` or calling a function @@ -624,8 +625,8 @@ R: Unfortunately, `GROWABLE_BIT` is not part of the API, so using it will not help in the long run. -* Setting `LENGTH` larger than the allocated length may cause the - process to access undefined or even unmapped memory. +* Setting `LENGTH` larger than the allocated length may cause R to + access undefined or even unmapped memory. * For vectors containing other `SEXP` values (`VECSXP`, `EXPRSXP`, `STRSXP`) vectors: when reducing the `LENGTH`, having a non-persistent value (something other than `R_NilValue` or `R_BlankString` or @@ -661,12 +662,14 @@ not help in the long run. `VECSXP`-like objects using the [ALTREP] framework; `STRSXP` objects have been supported since R-3.5. An `ALTREP` object is defined by its _class_ (a collection of methods) and two arbitrary R values, `data1` -and `data2`. A simple implementation of a shortened, expandable vector -could hold a full-length vector in the `data1` slot and its -pretend-length as a one-element `REALSXP` value in the `data2` slot. -(Currently, `R_xlen_t` values are limited by the largest integer -precisely representable in an IEEE `double` value, which is $2^{52}$.) -The overallocated class will need to implement the following methods: +and `data2`. (Attributes are not a part of the ALTREP representation and +exist the same way as on normal R objects.) 
A simple implementation of a +shortened, expandable vector could hold a full-length vector in the +`data1` slot and its pretend-length as a one-element `REALSXP` value in +the `data2` slot. (Currently, `R_xlen_t` values are limited to +$2^{52}$, which an IEEE `double` value can represent exactly, so no +precision is lost.) The overallocated class will need to implement the +following methods: * [Common ALTREP methods][Rapi_altrep_methods]: * `Length()`, returning the pretend-length of the vector. Required. @@ -711,6 +714,19 @@ The overallocated class will need to implement the following methods: contains missing values). * [`altlist` methods][Rapi_altlist_methods]: * `Elt(i)` and `Set_elt(i, v)` like above. + * The rest of the atomic vector types ([integer][Rapi_altinteger], + [logical][Rapi_altlogical], [numeric][Rapi_altreal], + [complex][Rapi_altcomplex], [raw][Rapi_altraw]) will each need a + subset of the following methods: + * `Elt(i)`, `Is_sorted()`, `No_NA()`, as above. + * `Get_region(i, n, buf)` to populate the buffer `buf[n]` of the + corresponding C type with up to `n` elements starting at the 0-based + index `i`. The range is not guaranteed to be within bounds; the + number of actually copied elements must be returned. If not + implemented, R will use the `Elt(i)` method, which is slower. + * `Sum(narm)`, `Min(narm)`, `Max(narm)` to compute a summary of the + vector, optionally ignoring the missing values. If not + implemented, R will use `Get_region` to compute the summaries. Additionally, `data.table` will need a function to [create new ALTREP tables][Rapi_new_altrep] and to resize the vector in place. The resize @@ -719,8 +735,8 @@ function will need to check whether the given value and then modify the ALTREP data slots as needed. The function may even reallocate the payload to enlarge the vector in place past the original allocation limit without requiring a `setDT` call from the user.
Since a -reallocation will invalidate any cached data pointers, it must be only -used from inside `data.table`, not from the ALTREP methods. +reallocation will invalidate the data pointer, it must be only used from +inside `data.table`, not from the ALTREP methods. The original implementation that uses `SETLENGTH` can be kept behind `#if R_VERSION < R_Version(4,3,0)` for backwards compatibility. @@ -875,5 +891,10 @@ References [Rapi_altvec_methods]: https://aitap.codeberg.page/R-api/#R_005fset_005faltvec_005f_002e_002e_002e_005fmethod [Rapi_altstring_methods]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltstring_005fclass [Rapi_altlist_methods]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltlist_005fclass +[Rapi_altinteger]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltinteger_005fclass +[Rapi_altlogical]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltlogical_005fclass +[Rapi_altreal]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltreal_005fclass +[Rapi_altcomplex]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltcomplex_005fclass +[Rapi_altraw]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltraw_005fclass [Rapi_new_altrep]: https://aitap.codeberg.page/R-api/#R_005fnew_005faltrep [Rapi_altrep_inherits]: https://aitap.codeberg.page/R-api/#index-R_005faltrep_005finheritsaltrep_005finherits From 6efc704059856316e1b5d982fd911ba229abe5b9 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 18:55:26 +0300 Subject: [PATCH 30/44] Gather the remaining uses of (SET_)TRUELENGTH --- posts/2024-12-12-non-api-use/index.qmd | 145 +++++++++++++++++++------ 1 file changed, 114 insertions(+), 31 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 0c6e141d..8d96928e 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -568,7 +568,9 @@ only counted `LENGTH(x)` instead of `TRUELENGTH(x)` units as released when 
garbage-collecting the vector, inflating the counter over time. [ALTREP] objects introduced in R-3.5 (April 2018) don't have a `TRUELENGTH`: it [cannot be set][R_altrep_set_truelen] and is [returned -as 0][R_altrep_truelen]. +as 0][R_altrep_truelen]. In very old versions of R, `TRUELENGTH` wasn't +initialised, but it is nowadays set to 0, which `data.table` [depends +upon][datatable_init_testtl]. Nowadays, `data.table` uses vectors whose length is different from their allocated size in many places: @@ -585,6 +587,7 @@ allocated size in many places: * works with an over-estimated line count and so can [truncate the columns][datatable_freadR_truncate] after the value is known precisely + * the columns are [prepared to be truncated][datatable_freadR_settl] * may also [drop columns by reference][datatable_freadR_drop] * `src/subset.c` * the `subsetDT` function [prepares an over-allocated @@ -608,9 +611,11 @@ R: allocates and copies only `XLENGTH` elements and [duplicates the value of `TRUELENGTH`][R_duplicate_truelength]. * For this and other reasons, `data.table`s have a special - [`.internal.selfrep` attribute][datatable_assign_selfref] attached + [`.internal.selfref` attribute][datatable_assign_selfref] attached containing an `EXTPTR` back to itself. A copy of a table can be detected because it will have a different address. + * The `_selfrefok` function tries to [restore the correct + `TRUELENGTH`][datatable_assign_selfrefok] if it detects a copy. * Setting the `GROWABLE_BIT` on the `data.table` would make R keep the default `TRUELENGTH` (0) instead of copying it. * When deallocating shortened objects without the `GROWABLE_BIT` set, R @@ -739,38 +744,93 @@ reallocation will invalidate the data pointer, it must be only used from inside `data.table`, not from the ALTREP methods. The original implementation that uses `SETLENGTH` can be kept behind -`#if R_VERSION < R_Version(4,3,0)` for backwards compatibility. 
+`#if R_VERSION < R_Version(4, 3, 0)` for backwards compatibility. ### Fast string matching -Why used: to exploit the `CHARSXP` cache. R interns strings, so a string -with the given contents and encoding bits exists as a single object, -even if manually recreated using `mkCharLenCE()` and friends. -Convert everything into UTF-8 and you can use pointer comparison. -Given `x` and `table` of strings to find elements of `x` in, `chmatch()` -puts indices into `table` into the `TRUELENGTH` field of the `CHARSXP` -contents of `table`, then walks `x` and reads the indices back from the -matching `CHARSXP`s, then carefully restores everything. - - - -Why non-API: this field is not always used (cf. `data.table` having to -work with it being completely uninitialised in old versions of R), but R -does use it for internal purposes sometimes (cf. `data.table` having to -restore nonzero `TRUELENGTH` for `CHARSXP` values used inside `SYMSXP` -values). - -Why this is hard to fix: the current happy path is very fast. -`O(length(table)) + O(length(x))` to convert encodings, -`O(length(table))` to mark indices, `O(length(x))` to look them up, -`O(length(table))` to restore everything. Done. Pointer comparisons will -take `O(length(table)*length(x))`, which is Bad. How expensive is it to -build a hash for `O(length(table))` entries? Best case lookup will be -once again `O(length(x))`, but only without collisions, the constants -are unknown, and the C standard says that hashing pointers is fraught -with peril. - - +`data.table`'s use of `TRUELENGTH` is not limited to growable buffers. A +common idiom is to set the `TRUELENGTH`s of `CHARSXP` values from a +vector to their negative 1-based indices in that vector, then look up +other `CHARSXP`s in the original vector using `-TRUELENGTH(s)`. This +technique relies on [R's `CHARSXP` cache][RI110]: for the given string +contents and encoding, only one copy of a string created by +`mkCharLenCE` (and related functions) will exist in the memory. 
As a +result, if a string does exist in the original vector, it will be the +_same_ `CHARSXP` object whose `TRUELENGTH` had been set to its negative +index. R does not currently set negative `TRUELENGTH`s by itself, so any +non-negative `TRUELENGTH`s can be safely discarded as non-matches. + +In the best case scenario, this lookup is very fast: for a table of size +`n` and `k` strings to look up in it, it takes $\mathrm{O}(1)$ memory +(the `TRUELENGTH` is already there, unused) and $\mathrm{O}(n)$ time for +overhead plus $\mathrm{O}(k)$ time for the actual lookups. + +Care must be taken for the technique to work properly: + +* The strings must be converted to UTF-8. Two copies of the same text in + different encodings will be stored in different objects at different + addresses, preventing the technique from working: + ```r + packageVersion('data.table') + # [1] ‘1.16.99’ + x <- data.table(factor(rep(enc2utf8('ø'), 3))) + # memrecycle() forgot to account for encodings + x[1,V1 := iconv('ø', to='latin1')] + as.numeric(x$V1) + # [1] 2 1 1 + levels(x$V1) # duplicated levels! + # [1] "ø" "ø" + identical(levels(x$V1)[[1]], levels(x$V1)[[2]]) + # [1] TRUE + levels(x$V1) <- levels(x$V1) + levels(x$V1) # R restores unique levels + # [1] "ø" + ``` +* Any non-zero `TRUELENGTH` values resulting from R-internal usage must + be [saved][datatable_assign_savetl] beforehand and restored + afterwards. +* R itself uses the `TRUELENGTH`s to look up variables in hashed + environments, so R code should not run while the values are disturbed.
+ +The fast string lookup is used in the following places: + +* `src/assign.c`: [factor level merging in + `memrecycle`][datatable_assign_memrecycle], [`savetl` + helper][datatable_assign_savetl] +* `src/rbindlist.c`: [matching column + names][datatable_rbindlist_matchcolumns], [matching factor + levels][datatable_rbindlist_matchfactors] +* `src/forder.c`: (different purpose, same technique) [storing the + group numbers][datatable_forder_truelen], [looking them + up][datatable_forder_truelen], [restoring the original + values][datatable_forder_free_ustr] +* `src/chmatch.c`: [saving the original + `TRUELENGTH`s][datatable_chmatch_savetl], [remembering the positions + of `CHARSXP`s in the table][datatable_chmatch_settl], [cleaning up on + error][datatable_chmatch_cleanup1], [looking up strings in the + table][datatable_chmatch_lookup], [cleaning up before + exit][datatable_chmatch_cleanup2] +* `src/fmelt.c`: [combining factor levels by merging their `CHARSXP`s in + a common array with indices in `TRUELENGTH`][datatable_fmelt_truelen] + + + +### Marking columns for copying + +* `src/dogroups.c`: special symbols `.BY`, `.I`, `.N`, `.GRP` live in + their own special vectors that must not appear inside regular + `data.table`s; [setting the marker][datatable_dogroups_setlen-1], + [checking the marker][datatable_dogroups_anyspecialstatic] +* `src/utils.c`: need to copy columns that share memory or are ALTREP + [preparing zero `TRUELENGTH`s][datatable_copyShared1] (wouldn't the + `SET_TRUELENGTH` call always fail for `ALTREP` columns?), [marking + ALTREP, special, and already marked columns for + copy][datatable_copyShared2], [marking not previously marked columns + with their column number][datatable_copyShared3], [restoring the + `TRUELENGTH`s for columns that won't be + overwritten][datatable_copyShared4] + + Conclusion ========== @@ -871,11 +931,13 @@ References [R_growable]: https://github.com/r-devel/r-svn/commit/287b8316232aea7c619d0cadcb515507b1e3ebfa 
[R_altrep_set_truelen]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/include/Defn.h#L391 [R_altrep_truelen]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/altrep.c#L345 +[datatable_init_testtl]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/init.c#L206 [datatable_docols_SD]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L197 [datatable_docols_I]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L230-L237 [datatable_docols_restore]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L482-L485 [datatable_docols_extend]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L318-L324 [datatable_freadR_truncate]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/freadR.c#L536-L538 +[datatable_freadR_settl]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/freadR.c#L519 [datatable_freadR_drop]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/freadR.c#L551-L552 [datatable_subset_alloc]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/subset.c#L300-L334 [datatable_assign_shallow]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L192-L196 @@ -884,6 +946,7 @@ References [datatable_assign_finalizer]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L21 [R_duplicate_truelength]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/duplicate.c#L43-L81 [datatable_assign_selfref]: 
https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L27-L63 +[datatable_assign_selfrefok]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L99-L138 [R_memory_getVecSize]: https://github.com/r-devel/r-svn/blob/04a3b015e7d20598f66954b88ae2d39068451494/src/main/memory.c#L1108-L1109 [R_PR17620]: https://bugs.r-project.org/show_bug.cgi?id=17620 [Rapi_altrep_methods]: https://aitap.codeberg.page/R-api/#R_005fset_005faltrep_005f_002e_002e_002e_005fmethod @@ -898,3 +961,23 @@ References [Rapi_altraw]: https://aitap.codeberg.page/R-api/#R_005fmake_005faltraw_005fclass [Rapi_new_altrep]: https://aitap.codeberg.page/R-api/#R_005fnew_005faltrep [Rapi_altrep_inherits]: https://aitap.codeberg.page/R-api/#index-R_005faltrep_005finheritsaltrep_005finherits +[datatable_assign_savetl]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L1274-L1328 +[RI110]: https://cran.r-project.org/doc/manuals/R-ints.html#The-CHARSXP-cache +[datatable_assign_memrecycle]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L833-L867 +[datatable_rbindlist_matchcolumns]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/rbindlist.c#L70-L179 +[datatable_rbindlist_matchfactors]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/rbindlist.c#L367-L516 +[datatable_forder_range_str]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/forder.c#L295-L383 +[datatable_forder_truelen]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/forder.c#L769 +[datatable_forder_free_ustr]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/forder.c#L75 +[datatable_chmatch_savetl]: 
https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/chmatch.c#L58-L64 +[datatable_chmatch_settl]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/chmatch.c#L78-L80 +[datatable_chmatch_cleanup1]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/chmatch.c#L103 +[datatable_chmatch_lookup]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/chmatch.c#L108-L130 +[datatable_chmatch_cleanup2]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/chmatch.c#L135-L136 +[datatable_fmelt_truelen]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 +[datatable_dogroups_setlen-1]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L105-L152 +[datatable_dogroups_anyspecialstatic]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L6-L64 +[datatable_copyShared1]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L260-L261 +[datatable_copyShared2]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L266-L267 +[datatable_copyShared3]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 +[datatable_copyShared4]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 From 5ba6b0f0240de2f1e2804d53234db8266da7da71 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 21:00:31 +0300 Subject: [PATCH 31/44] Of course there's more non-API entry points And other tweaks --- posts/2024-12-12-non-api-use/index.qmd | 113 ++++++++++++--------- posts/2024-12-12-non-api-use/precomputed.R | 36 ++++++- 2 files changed, 101 insertions(+), 48 
deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 8d96928e..a4a9d173 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -162,49 +162,6 @@ non-API][remove_non_API]: -- ` R CMD check --as-cran ` on a released version of `data.table` -Testing for a `data.frame`: `isFrame` -------------------------------------- - -Back in 2012, Matt Dowle needed to quickly test an object for being a -`data.frame`, and the undocumented function `isFrame` seemed like it -[did the job][datatable_isframe_added]. Since `isFrame` was not part of -the documented interface, in 2024 Luke Tierney gave the function a -better-fitting name, [`isDataFrame`][R_isdataframe_added], and made it -an experimental entry point, while retaining the original function as a -wrapper. - -Use of `isFrame` [doesn't give a `NOTE` yet][remove_isframe], but when -R-4.5.0 is released together with the new name for the function, -`data.table` will be able to use it, falling back to `isFrame` on older -versions of R. `isDataFrame` is documented among other [replacement -entry point names][WRE_replacement_entrypoints] in Writing R Extensions. 
- -Problem (the only instance in `data.table`): - -```c -if (!isVector(thiscol) || isFrame(thiscol)) - // ^^^^^^^ may disappear in a future R version -``` - -Solution: - -```c -// include non-R headers first - -// include R headers last -#include -#include - -// provide overrides after the R headers -#if R_VERSION < R_Version(4, 5, 0) -// R versions older than 4.5.0 released use the old name of the function -#define isDataFrame(x) isFrame(x) -#endif - -// later: -if (!isVector(thiscol) || isDataFrame(thiscol)) -``` - Operating on the S4 bit: `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` --------------------------------------------------------------------------- @@ -822,18 +779,80 @@ The fast string lookup is used in the following places: `data.table`s; [setting the marker][datatable_dogroups_setlen-1], [checking the marker][datatable_dogroups_anyspecialstatic] * `src/utils.c`: need to copy columns that share memory or are ALTREP - [preparing zero `TRUELENGTH`s][datatable_copyShared1] (wouldn't the - `SET_TRUELENGTH` call always fail for `ALTREP` columns?), [marking + [preparing zero `TRUELENGTH`s][datatable_copyShared1], [marking ALTREP, special, and already marked columns for copy][datatable_copyShared2], [marking not previously marked columns with their column number][datatable_copyShared3], [restoring the `TRUELENGTH`s for columns that won't be overwritten][datatable_copyShared4] + * The `SET_TRUELENGTH` call in `copySharedColumns` would fail if it + ever got an ALTREP column, but the only use of `copySharedColumns` + in `reorder` guards against those. 
-Conclusion -========== +But there's more +================ + +Using `tools:::funAPI` together with the lists of symbols exported from +R and imported by `data.table` gives a number of non-API entry points +which ` R CMD check ` doesn't complain about yet: +`r paste(paste0('', sort(DTnonAPI_yet), ''), collapse = ', ')` + +`(SET_)ATTRIB` +-------------- + +`findVar` +--------- + +`GetOption` +----------- + +Testing for a `data.frame`: `isFrame` +------------------------------------- + +Back in 2012, Matt Dowle needed to quickly test an object for being a +`data.frame`, and the undocumented function `isFrame` seemed like it +[did the job][datatable_isframe_added]. Since `isFrame` was not part of +the documented interface, in 2024 Luke Tierney gave the function a +better-fitting name, [`isDataFrame`][R_isdataframe_added], and made it +an experimental entry point, while retaining the original function as a +wrapper. + +Use of `isFrame` [doesn't give a `NOTE` yet][remove_isframe], but when +R-4.5.0 is released together with the new name for the function, +`data.table` will be able to use it, falling back to `isFrame` on older +versions of R. `isDataFrame` is documented among other [replacement +entry point names][WRE_replacement_entrypoints] in Writing R Extensions. 
+ +Problem (the only instance in `data.table`): + +```c +if (!isVector(thiscol) || isFrame(thiscol)) + // ^^^^^^^ may disappear in a future R version +``` + +Solution: + +```c +// include non-R headers first + +// include R headers last +#include +#include + +// provide overrides after the R headers +#if R_VERSION < R_Version(4, 5, 0) +// R versions older than 4.5.0 released use the old name of the function +#define isDataFrame(x) isFrame(x) +#endif + +// later: +if (!isVector(thiscol) || isDataFrame(thiscol)) +``` + +`(SET_)OBJECT` +-------------- References ========== diff --git a/posts/2024-12-12-non-api-use/precomputed.R b/posts/2024-12-12-non-api-use/precomputed.R index d167d3d4..09c26e32 100644 --- a/posts/2024-12-12-non-api-use/precomputed.R +++ b/posts/2024-12-12-non-api-use/precomputed.R @@ -1,6 +1,9 @@ library(depcache) # TODO: uncache everything before submission library(data.table) +# The results are not reproducible because they depend on both the R-devel +# version and the data.table-git version, hence the pre-computation. + symbols %<-% fread( # most likely implies R on GNU/Linux built with --enable-R-shlib paste('nm -gDP', file.path(R.home('lib'), 'libR.so')), @@ -16,6 +19,36 @@ symbols %<-% fread( ) ][] +DTsymbols %<-% fread( + # again, only tested on GNU/Linux + paste('nm -gDP', system.file( + file.path('libs', 'data_table.so'), package = 'data.table' + )), + fill = TRUE, col.names = c('name', 'type', 'value', 'size') +)[type %in% c('U', 'w')][, + type := fcase( + type == 'U', 'undefined', + type == 'w', 'weak' + ) +][, + name := sub('@.*', '', name) +][] + +# this is entirely different on late-2024 tools:::{funAPI,nonAPI} +setdiff( + # symbols exported by R and imported by data.table... 
+ intersect(symbols$name, DTsymbols$name) |> + tools:::unmap(), # renamed according to how R API entry points are named + # except those listed among API entry points + tools:::funAPI()$name |> tools:::unmap() +) |> setdiff( + # and also skip variables because they are omitted in funAPI + symbols[type == 'variable', name] +) %->% DTnonAPI +# which ones does R CMD check _not_ complain about... yet? +DTnonAPI_yet <- setdiff(DTnonAPI, tools:::nonAPI) + +# History of tools:::nonAPI getNonAPI <- function(ver, url = sprintf( "https://svn.r-project.org/R/branches/R-%s-branch/src/library/tools/R/sotools.R", @@ -34,11 +67,11 @@ getNonAPI <- function(ver, return(do.call(c, as.list(e[[3]])[-1])) } } - nonAPI.3_3 <- getNonAPI('3-3') nonAPI.4_4 <- getNonAPI('4-4') nonAPI.trunk <- getNonAPI(url = 'https://svn.r-project.org/R/trunk/src/library/tools/R/sotools.R') +# CRAN package metadata and check results cpdb %<-% tools::CRAN_package_db() needscomp <- cpdb[,'NeedsCompilation'] == 'yes' checks %<-% tools::CRAN_check_details() @@ -47,5 +80,6 @@ dtchecks <- subset(checks, Package == 'data.table') when <- Sys.Date() save( needscomp, dtchecks, symbols, nonAPI.3_3, nonAPI.4_4, nonAPI.trunk, + DTnonAPI, DTnonAPI_yet, when, file = 'precomputed.rda', compress = 'xz' ) From b3d0917f1275a031ab71d17455769cd54a2c9d1c Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 22:00:58 +0300 Subject: [PATCH 32/44] Solutions for findVar, getOption Expand other solutions, tweak the rest of the text --- posts/2024-12-12-non-api-use/index.qmd | 144 +++++++++++++++++++------ 1 file changed, 113 insertions(+), 31 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index a4a9d173..c767e5a4 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -136,8 +136,8 @@ prompting the blog post you are currently reading. 
- + -Non-API entry points used in `data.table` -========================================= +Non-API entry points marked by ` R CMD check ` +============================================== The first version of the `data.table` package in the CRAN archive dates back to April 2006 (which corresponds to R version 2.3.0). It has been @@ -196,7 +196,7 @@ Solution: [use `Rf_isS4` instead of `Rf_asS4` to control the S4 object bit, but be careful around shared objects. - + Converting between calls and pairlists: `SET_TYPEOF` ---------------------------------------------------- @@ -261,7 +261,9 @@ or: ```c // allocate a call with 'n' elements SEXP call = lcons(R_NilValue, allocList(n - 1)); - +``` +or: +```c // in R >= 4.4.1 only: SEXP call = allocLang(n); ``` @@ -313,14 +315,9 @@ const SEXP *sourceD = STRING_PTR(source); Solution: ```c -// first include non-R headers - -// next include R headers - -// then provide version-specific overrides #if R_VERSION < R_Version(3, 5, 0) // STRING_PTR_RO only appeared in R-3.5 -#define STRING_PTR_RO(x) STRING_PTR(x) +#define STRING_PTR_RO(x) (STRING_PTR(x)) #endif // later: @@ -770,7 +767,7 @@ The fast string lookup is used in the following places: * `src/fmelt.c`: [combining factor levels by merging their `CHARSXP`s in a common array with indices in `TRUELENGTH`][datatable_fmelt_truelen] - + ### Marking columns for copying @@ -795,19 +792,95 @@ But there's more ================ Using `tools:::funAPI` together with the lists of symbols exported from -R and imported by `data.table` gives a number of non-API entry points -which ` R CMD check ` doesn't complain about yet: -`r paste(paste0('', sort(DTnonAPI_yet), ''), collapse = ', ')` +R and imported by `data.table`, we can find a number of non-API entry +points which ` R CMD check ` doesn't complain about yet: +`r paste(paste0('', sort(DTnonAPI_yet), ''), collapse = ', ')`. -`(SET_)ATTRIB` +`(SET_)ATTRIB`, `SET_OBJECT` -------------- +> Use `getAttrib` for individual attributes. 
<...> Use `setAttrib` for +> individual attributes, `DUPLICATE_ATTRIB` or +> `SHALLOW_DUPLICATE_ATTRIB` for copying attributes from one object to +> another. Use `CLEAR_ATTRIB` for removing all attributes, added in R +> 4.5.0. + +-- [WRE 6.21.1][WRE_replacement_entrypoints] + `findVar` --------- +[Used in `dogroups`][datatable_dogroups_findVar] to look up the +pre-created variables corresponding to the special symbols `.SDall`, +`.SD`, `.N`, `.GRP`, `.iSD`, `.xSD` in their environment. + +> The functions `findVar` and `findVarInFrame` have been used in a +> number of packages but are too low level to be part of the API. For +> most uses the functions `R_getVar` and `R_getVarEx` added in R 4.5.0 +> will be sufficient. These are analogous to the R functions `get` and +> `get0`. + +-- [WRE 6.21.7] + +The new function `R_getVar` is different in that it will never return a +`PROMSXP` (which are an internal implementation detail) or an +`R_UnboundValue`, but the current code doesn't try to care about either. + +Example of the problem: + +```c +SEXP SD = PROTECT(findVar(install(".SD"), env)); + // ^^^^^^^ non-API function +``` + +Solution: + + +```c +#if R_VERSION < R_Version(4, 5, 0) +#define R_getVar(sym, rho, inherits) \ + ((inherits) ? findVar((sym), (rho)) : findVarInFrame((rho), (sym))) +#endif + +SEXP SD = PROTECT(R_getVar(install(".SD"), env, TRUE)); + // ^^^^^^^^ introduced in R-4.5 +``` + `GetOption` ----------- +Used in `src/rbindlist.c` to read the +[`datatable.rbindlist.check`][datatable_rbindlist_getoption] option, +`src/freadR.c` to read the +[`datatable.old.fread.datetime.character`][datatable_freadR_getoption] +option, `src/init.c` to read the +[`datatable.verbose`][datatable_init_getoption] option, `src/forder.c` +to get the [`datatable.use.index` and +`datatable.forder.auto.index`][datatable_forder_getoption] options, and +`src/subset.c` to read the +[`datatable.alloccol`][datatable_subset_getoption] option. + +> Use `GetOption1`. 
+ +-- [WRE 6.21.1][WRE_replacement_entrypoints] + +The difference is that `GetOption1` doesn't take a second argument +`rho`, which `GetOption` has been ignoring anyway. + +Example of the problem: + +```c +SEXP opt = GetOption(install("datatable.use.index"), R_NilValue); + // ^^^^^^^^^ non-API function +``` + +Solution: + +```c +SEXP opt = GetOption1(install("datatable.use.index")); + // ^^^^^^^^^^ API function introduced in R-2.13 +``` + Testing for a `data.frame`: `isFrame` ------------------------------------- @@ -835,24 +908,22 @@ if (!isVector(thiscol) || isFrame(thiscol)) Solution: ```c -// include non-R headers first - -// include R headers last -#include -#include - -// provide overrides after the R headers #if R_VERSION < R_Version(4, 5, 0) // R versions older than 4.5.0 released use the old name of the function -#define isDataFrame(x) isFrame(x) +#define isDataFrame(x) (isFrame(x)) #endif // later: if (!isVector(thiscol) || isDataFrame(thiscol)) + // ^^^^^^^^^^^ introduced in R-4.5 ``` -`(SET_)OBJECT` --------------- +`OBJECT` +-------- + +> Use `isObject`. 
+ +-- [WRE 6.21.1][WRE_replacement_entrypoints] References ========== @@ -874,10 +945,6 @@ References [Rd201905]: https://stat.ethz.ch/pipermail/r-devel/2019-May/thread.html [clarifyingAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html [remove_non_API]: https://github.com/Rdatatable/data.table/issues/6180 -[datatable_isframe_added]: https://github.com/Rdatatable/data.table/commit/87666e70ce1a69b28f0e92ec7504d80e3d53a824#diff-4fc47a9752ba4edfef0cabcc1958eda943545ad3859e48d498b0e3f87a9ae5aeR192 -[R_isdataframe_added]: https://github.com/r-devel/r-svn/commit/4ef83b9dc3c6874e774195d329cbb6c11a71c414 -[remove_isframe]: https://github.com/Rdatatable/data.table/issues/6244 -[WRE_replacement_entrypoints]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Some-API-replacements-for-non_002dAPI-entry-points [setOldClass]: https://search.r-project.org/R/refmans/methods/html/setOldClass.html [IS_S4_OBJECT]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/memory.c#L4033-L4035 [isS4]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/objects.c#L1838-L1841 @@ -1000,3 +1067,18 @@ References [datatable_copyShared2]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L266-L267 [datatable_copyShared3]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 [datatable_copyShared4]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 +[datatable_dogroups_findVar]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L90-L118 +[WRE 6.21.7]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Working-with-variable-bindings +[datatable_rbindlist_getoption]: https://github.com/Rdatatable/data.table/blob/master/src/rbindlist.c#L231 +[datatable_freadR_getoption]: 
https://github.com/Rdatatable/data.table/blob/master/src/freadR.c#L132 +[datatable_init_getoption]: https://github.com/Rdatatable/data.table/blob/master/src/init.c#L331 +[datatable_forder_getoption]: https://github.com/Rdatatable/data.table/blob/master/src/forder.c#L1619-L1637 +[datatable_subset_getoption]: https://github.com/Rdatatable/data.table/blob/master/src/subset.c#L299 +[datatable_isframe_added]: https://github.com/Rdatatable/data.table/commit/87666e70ce1a69b28f0e92ec7504d80e3d53a824#diff-4fc47a9752ba4edfef0cabcc1958eda943545ad3859e48d498b0e3f87a9ae5aeR192 +[R_isdataframe_added]: https://github.com/r-devel/r-svn/commit/4ef83b9dc3c6874e774195d329cbb6c11a71c414 +[remove_isframe]: https://github.com/Rdatatable/data.table/issues/6244 +[WRE_replacement_entrypoints]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Some-API-replacements-for-non_002dAPI-entry-points +[datatable_isframe_added]: https://github.com/Rdatatable/data.table/commit/87666e70ce1a69b28f0e92ec7504d80e3d53a824#diff-4fc47a9752ba4edfef0cabcc1958eda943545ad3859e48d498b0e3f87a9ae5aeR192 +[R_isdataframe_added]: https://github.com/r-devel/r-svn/commit/4ef83b9dc3c6874e774195d329cbb6c11a71c414 +[remove_isframe]: https://github.com/Rdatatable/data.table/issues/6244 +[WRE_replacement_entrypoints]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Some-API-replacements-for-non_002dAPI-entry-points From bccf6120df2d27bc4655fed3bbfa4888cf420745 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 22:40:03 +0300 Subject: [PATCH 33/44] Solution for standalone OBJECT --- posts/2024-12-12-non-api-use/index.qmd | 28 +++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index c767e5a4..9e870bf1 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -797,7 +797,9 @@ points which ` R CMD check ` doesn't complain about yet: `r 
paste(paste0('', sort(DTnonAPI_yet), ''), collapse = ', ')`. `(SET_)ATTRIB`, `SET_OBJECT` --------------- +---------------------------- + + > Use `getAttrib` for individual attributes. <...> Use `setAttrib` for > individual attributes, `DUPLICATE_ATTRIB` or @@ -807,6 +809,8 @@ points which ` R CMD check ` doesn't complain about yet: -- [WRE 6.21.1][WRE_replacement_entrypoints] + + `findVar` --------- @@ -921,10 +925,31 @@ if (!isVector(thiscol) || isDataFrame(thiscol)) `OBJECT` -------- +Used in `src/assign.c` to [test whether S3 dispatch is possible on an +object][datatable_assign_OBJECT] before spending CPU time on +constructing and evaluating an R-level call to `as.character` instead of +`coerceVector`. + > Use `isObject`. -- [WRE 6.21.1][WRE_replacement_entrypoints] +Problem: +```c +if (OBJECT(source) && getAttrib(source, R_ClassSymbol)!=R_NilValue) { + // ^^^^^^ non-API entry point +``` + +Solution: +```c +if (isObject(source)) { + // ^^^^^^^^ API entry point +``` + +Most likely, the check for `getAttrib(source, R_ClassSymbol)` is +superfluous, because when used correctly, R API maintains the object bit +set only when the `class` attribute is non-empty. 
+ References ========== @@ -1082,3 +1107,4 @@ References [R_isdataframe_added]: https://github.com/r-devel/r-svn/commit/4ef83b9dc3c6874e774195d329cbb6c11a71c414 [remove_isframe]: https://github.com/Rdatatable/data.table/issues/6244 [WRE_replacement_entrypoints]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Some-API-replacements-for-non_002dAPI-entry-points +[datatable_assign_OBJECT]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L1158 From 53233661560130bbaf0a80aaada78d553dcb76de Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 17 Nov 2024 23:29:31 +0300 Subject: [PATCH 34/44] ATTRIB may pose a problem in the future --- posts/2024-12-12-non-api-use/index.qmd | 74 +++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 9e870bf1..22214530 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -174,8 +174,8 @@ from the C code: when creating a copy of an S4 `data.table` from scratch (or setting all attributes from one object onto another), the destination value must also end up being an S4 object. This is controlled by the special "S4" bit in the header of every R object, so -the code must read and set it correctly. +the code must read and set it correctly. The undocumented functions `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` exist as bare interfaces to [the internal @@ -213,8 +213,8 @@ contains the value in its `CAR` field and a reference to the rest of the list in its `CDR` field. Argument names, if provided, are stored in the third field, `TAG`. The list is terminated by `R_NilValue`, which is of type `NILSXP`. These structures must be constructed every time C code -wants to evaluate a function call. +wants to evaluate a function call. 
Previously, R API contained a function to allocate `LISTSXP` pairlists of arbitrary length, `allocList()`, but not function calls, so it became @@ -799,9 +799,8 @@ points which ` R CMD check ` doesn't complain about yet: `(SET_)ATTRIB`, `SET_OBJECT` ---------------------------- - - -> Use `getAttrib` for individual attributes. <...> Use `setAttrib` for +> Use `getAttrib` for individual attributes. To test whether there are +> any attributes use `ANY_ATTRIB`, added in R 4.5.0. Use `setAttrib` for > individual attributes, `DUPLICATE_ATTRIB` or > `SHALLOW_DUPLICATE_ATTRIB` for copying attributes from one object to > another. Use `CLEAR_ATTRIB` for removing all attributes, added in R @@ -809,7 +808,61 @@ points which ` R CMD check ` doesn't complain about yet: -- [WRE 6.21.1][WRE_replacement_entrypoints] - +### Testing for presence of attributes + +`src/nafill.c` [checks][datatable_nafill_ATTRIB] whether the source +object has any attributes before trying to copy them using +`copyMostAttrib`. + +Problem: + +```c +if (!isNull(ATTRIB(VECTOR_ELT(x, i)))) + // ^^^^^^ non-API entry point +``` + +Solution: + +```c +#if R_VERSION < R_Version(4, 5, 0) +#define ANY_ATTRIB(x) (!isNull(ATTRIB(x))) +#endif + +if (ANY_ATTRIB(VECTOR_ELT(x, i))) + // ^^^^^^^^^^ introduced in R-4.5 +``` + +### Iterating over all attributes + +* The code in `src/assign.c` needs to [iterate over all the attributes of +`attr(dt, 'index')`][datatable_assign_ATTRIB] in order to find indices +that use the given column. +* The code in `src/dogroups.c` needs to [iterate over all attributes of + a column][datatable_dogroups_ATTRIB] in case a reference to the value + of a special symbol has been stashed there and must be duplicated. + +Without `ATTRIB`, this will only be possible using an R-level call to +`attributes()`. 
+ + + +### Raw `c(NA, n)` row names + +The code in `src/dogroups.c` needs to [access the raw `rownames` +attribute][datatable_dogroups_rownames] of a `data.table`, even if it's +in the compact form as a 2-element integer vector starting with `NA`. +The `getAttrib` function has a special case for the `R_RowNamesSymbol`, +which returns an ALTREP representation of this attribute. + + + +### Direct transplantation of attributes + +The code in `src/dogroups.c` needs to +[transplant][datatable_dogroups_SETATTR] the attributes from one object +to another without duplicating them, even shallowly. + + `findVar` --------- @@ -1092,6 +1145,11 @@ References [datatable_copyShared2]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L266-L267 [datatable_copyShared3]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 [datatable_copyShared4]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 +[datatable_nafill_ATTRIB]: https://github.com/Rdatatable/data.table/blob/master/src/nafill.c#L216 +[datatable_assign_ATTRIB]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L618-L629 +[datatable_dogroups_ATTRIB]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L57-L58 +[datatable_dogroups_rownames]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L131-L134 +[datatable_dogroups_SETATTR]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L509-L515 [datatable_dogroups_findVar]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L90-L118 [WRE 6.21.7]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Working-with-variable-bindings [datatable_rbindlist_getoption]: 
https://github.com/Rdatatable/data.table/blob/master/src/rbindlist.c#L231 From 315106762c0a3e0b9a8bfb120a70e821fc9fde32 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 19 Nov 2024 12:50:29 +0300 Subject: [PATCH 35/44] Commit precomputed.rda to share the draft Otherwise rendering the document would require first compiling R-devel on GNU/Linux with --enable-R-shlib. --- posts/2024-12-12-non-api-use/precomputed.rda | Bin 0 -> 22124 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 posts/2024-12-12-non-api-use/precomputed.rda diff --git a/posts/2024-12-12-non-api-use/precomputed.rda b/posts/2024-12-12-non-api-use/precomputed.rda new file mode 100644 index 0000000000000000000000000000000000000000..5eb4e3ea91ea580740ef3f4779bb000977943b1c GIT binary patch literal 22124 zcmV(zK<2;wH+ooF0004LBHlIv03iV!0000G&sfakG-XyWT>vQ&2UJ%gRpOV=m z?~(SQxTJTDlT=umZ9S9Ru-A5y_s#$48QHPe&L0!MwZzt;{Jq$&1Vi#sk>Fv2f;ebg zq0zx;o~qmBWge0zVyjBoE9cZwf*@-}b+v~O>rsBukcgTi$|rJE-Ih4vn>$`4zR&WL zqS4nle1dwT$_b9&Ga>0>*3MkKoMlnxg;bsd=28>2ODJ7?2L55&)7KkOHK6hB$MFO| z_1m_X)B67me$6bl%0>3fS{$*@sc6Q5UR6_Rqn27Q@l`Q-x`bNdq2zn!D$q6XUg0G2z>5W{+V!oyq7r9SpX34a zpiUC1Q0T5Ne`{M7X8U{l_LPXUJ`9hiDGv5Q}zU{-99yy|CNQ8GvQ_-hnhh)uR_o+}m2VyQ9(DA(t=4${RMCx_Se&2oX0irs z|D4$>Vm2NO9gO~aGpfO6X~9SVqk<>gh-!(utC}{y8rrqnuAQ9hTDRXvmIOM*oI=iH z3s?BqO@RE{|_A)z=1-$wo$Rm{z z9E~fI0+*P{UTCtinG8X-N)P&%7H!*EA0hh3QZ0Z`Mj?d5Z@f(5Pq z?T^kH3JX??bLiW_neqVs0`@UTntyHJD$MU2u*(VTz@B^lBH9(K+_o z^mX4LHw`&n{JEeu32EG8jn@6C6k$USLFPKb$-4{3A4aD=-MXNV=i}B6Tls}sq_K?+l*xvu z(|B|fjxlimLgTm`Y;mU3PrTD zsC6D$XJ4w7r>FbmPyW+Rdb|M3j;Vd4JvUXSAPJ4B?xLz-Go%=or4k&qv~>hy18v>{ z=bn4x@j3C;qV#!({Yq1_?)YO=&5B{#ExRGG91}`Qdo!szHuR}))rl43T?tW=6b-EK zfzCr%!||o8PS(CM!@I^I5$FHE)#2Z$}jKp7tz>YVulV9yQ86;vAz*UT%0GlZyh5;q* ziHiR!`QjX(W!79^_>`rzUB8`&tjZy`%9ixA-@^v!VH~(jkk=SU8~5#QEsX>g=F{?g z8_$@Mj8i-ok{O3yYg6s$^YJ85FvdG~B=b<8r@rZ9Q8#V${Ij7d>RZa9ZmQZeiP)Fa zf(280Vh?(@@~C5gt0a9pR*_K>cFm9LPc+b43#(J5gfZ;zx3dyY%5YW81^C*vE_Upa 
zDt7m-z0@X}sP6THzZnKLhAKB{!_`OpA4_@?+E!9+$R9B%PR>x@9cQ=mA4Pkn>6bGK z)+d9wBfbThOFH^p-xn-D2FHFmGho}wQklY3@7e|OS%nz`A^OdvPVfCrX+lYG#?Vg3 zyzk=PeQYjZWq9uBOX&B5P}MD#wY4{}k>%SvH6KTMH=~ri{v40V2xv) z=~A($!tO|iGVCcN+S1w@9HYh=IY|7tt|O3mbd1VqD`DS8s17e+%heGAOzmmu#l%9& z4kFGM7CI%>!9stYK(ruTs#BGqIxMe(EU|c(>>`~OwsqSCF7l>XXW``4Uk6}A(3hE6 z`)~(CCG2ShZP92|wZ`G)-ua3EZREI}83udNCFT<&!ta4vK4n^U$JdfxcbpFbM>7;p!L zIg#5fT-xI%^w(qz5AOec&JwX%=T$Hra*?ooDV56|Y{6ea=`iw<>ka^(5I}BPEp+lw2MPHkJaTt?Z7qXgy$fp=hyL5X~ zv9L9u&WpYgS$JAb;@OP{ra8dx_|1^zyduK<#*^X*$#{4)=a}KbuJ~bNnm(_8(6mkG1(o{k!7`{-{&Fn+A-(DJ)qzefwS?cqHJ!~jz_ z-KefwB_@oW9efzr^^g>?-OOJWOQ9O>iiI-6DKaKfj2?Q@&e)~gif5O?E`Puek1=jDVV>U&AvX3 z3laTHtud+_-1(wC#oGgbmY7hgQ;IXh8vlSX#Rn&j9r4Oa!$Q5`@FPy&UpaW#kplMB zUoIc0ir}}<@rIZun=F!MnTuUUq8(A9R8$piH^e;HI;5r+C56_y2`_ykbj_C=aDY#=AXAcF>Y zOd}0E^?dc{lvMHAl?~?;eI+HvzigN}5kVbB7?voqka>@>Ch?%^g{n@@Pyc~^G25G& zA^|{uEFaB}u6vU+Hgb*#6h`_0dD96`;)`UtA;4R19r^Z!kJ8$0`t$+$G1{mh`^03? z7^C_u-s^oPI9Y7%*Q)t&0kospw@^Z0n=7PgaV?c{)#=LNI}EGEHOJ2 zfUj~n<^2N!ZLrqV`ZV>edY!fTwK6c{^wCgK6d30c9h6XsJI2SULv9e5Nfh|hq?Na* zKaX-Hw&*RA91c_w!tkIJEW^^V(J9{C^=>2enK~_BbnM}7`)44eAxj+lgV(vc-?+UJ zQncZsOR!t7trnu9!&PZPsf84h)ScleJqSL8(cW|KI9PmD0$68lbUO<$tiVOw)s9=f zNhs_tAcwQ7ZnxZ8I6$s+55AvC7q-3|=~*w7Ot42=S{^sWQpFOz#Nx=Kqx&nS=oef? 
zxu9Wan;+^At&RHhb)ubtynem}I`uF#?wUzf2)1P|4_(mLV8)Ab=&is6HfEMteD0{<5nE*%I1WGYsKGQ|vt?SWtOQ^MCBmUi9N z?DC(vv0GKPEWMitD$u5Uo6r|3+BndE!3s8OB+$)xNpJj&GC#lapu9^=PF_9j0ts!{ z`qh9BaTVw5wr*Avw^=TC8h|f0)PYLS!%u#{Rv6%^{5HANMI6p1;96Bga~=9DSA4>`^QZUpvY>oC6+b6Sfn;%^_LaP zQNd6RKa2Se%L;=PwRrJ;RecOHjCfFDRixN*VUr`=%I2~isGj&Xd+)YjJ1)gY{DL_2 z3p0!w81N3^9bc=EiHeT4cg&_pkYlF32j~TEh&m;kpd&w+B5A<7w~M3A7xvg4+DTTi zj0zzMPgLn|tRk#FRRWp%Y7bIb3OIU*D^d%`bbfP70py$ayQwzIP!2ekBnJ4ZP7mPw zF*(q)ol-}0;QjVyGx7l}whV;LWo>PlHuN|mY^u$(%U}XEQ`mlP_50NDT~(6$!}ZQ+ zH*8)~#!eYKUc&%ORhTs!T?=0uvQ*@4iuen(K33V&Gjrc)=jn5sGJ61Hc`(|cMn%xuO_66i1Cj=#ZJ9n9RobIgC+zwU&WlPy zh=DB9Re&r7JX{uAnJ5f#!Gu$Ebe3w4w0H5Vbx_iV#mz6(TXl@B{abx z2?}1_S_1;Dfy9!~ib+3mJ-Zc+T@=oB4G(ak*AcnDKW|I>&;H8^zr=#7Vt4tXv$x9X zjX{FlfNXMT_EP(It=0qcWOQD@sk!xHJVZ1-*a4(4x1J<~DDC$>9bVVCcz|Oldn|w{ z@M++G%q$G=dO@dBErt^88NcbWM9n}p?8mVV=AZ-Mm z)3lNSA+7)fh7X~PI8tnVsP33l43RD$OXT8g_I+%-&YH|z-~|IEasUcYP8v+OR82s* zrkHf^{-|v2y>k>>pts#?KB4;^>6WqgkJSLPjE1(vX4 z{Rzy8r%llk2>qPqZm)vUgIca_DC-aal338hVoJ$>$WDUEQHT1}^A6~ti6X?e7jzT; z{5QnHqZOzfgMVAcd;r*W8sKPMVtn}vle%{4S0!aB7tyi9*WHT|0w3 zk6hU{Q>^OjTz`Orj)o;PqQih#Pg-k^3nG5-G5jY^8xM_rH-r=hR@-~qt@8AOH+~NZ z>}CkLPX^4<=MVH81FP6IY^v<5^&qn%CWdGLiUW`q6Wz;ObYEX_z)NRC5$FLvcQmW> z^)n8!FD~C{JEEj-9+fMP3n3=^?ij~|3+&U)A1T_OtBQwJ-SE-?3;Xo^;w7aQ0AVND zA}D8sr#|AB)TD%r2$CpEQJVZpxs6f^=Af$|7w$-&8s!!{dzF)1!|}x;c#?-)fe9#S zO=$a#3Eq=G7k=jE!cC88?&AtKyd2cMC_E>t#}{k=9Bu<4!O#c3TSNgmFD_1D3tCf* zl?=%yl71c8P9hVV&^DPzv&u911>{;Sr=$2@(`>tfjIeq@ygNf<4|igs+~U0KsM-id z76`@KB!VtwQ28tZ#9R0tY*gsio6{%T^lJ)7jf49yagqbygH%gE6eVH2s_i>60jqxa zatG4gi-og-vez^513OxCY%t&TWc;Ns#Uv~soKhv#(r$|)U`lSD=Bm*`$iFmf6d;sp zvAo$yye$sy;#k@Yb9^TD5NToxIJMQ`1%Pr{iTo9{A~==-aF=wFr% z^=4uZT-395FrvJ5>HU6+J>dT1SfoTs^KWM z&&1by* zg>s1Az%h0Sv^r)X@7}%8%0x~7$#_|Yz+xnX2Uu(Dxs~-nVQ`ZCxij=0q3p{zu>zpfTY+5!E=4o+H zF|Cw^mCJb-oXmwc-={aQVtjR|f-q>bV3uRE>CF1W!ZeOB5x5S8wrF}Q(X0qOyo7gE zrGIN_gR$$k&w|$EMl^kxtSWpHpjARU@8K!)eMs*-2-U0lg{mF`=xFVSDIiZ-d8jRr zd0{R(>`oW+4trTxu)W@Tg&XLbtDyhd?}hiV!f2A{GCD7 
z_RVMKiZtZjF<$_xnEHyNVQ_d(BI$;Z4rxaT*hOmo0*BU>vuUYlRgTL9?a7xhDMbP! z&#RxM_36zOmPL9 z>db(bIgW#2C%StiODn%V@S%M@U$YCrC`f0GAXG2^6`ZwdIC)MS0So;OsndsS6D4Jw zFwz>^nQft5=CQK0?RV*8&YbXGuY&hWCjHgG!*0~C;7Z6ZYaL8~i`-*wISV2S9*6_I zTZd|k0o#GszeCN_&EaSMMXmoR-}2pUdqgARAibG(B`f=JcC9^wARy?>L$_V6fVT-h zx1sp~2REFK1TfoP_N&#{i6853$^|ry1*(t6Gy*9k_jI9l#^-!o;9r^G(Wx;R{f^+I znl}oPCyKm_$`J8ujN1$F>1V(SQVC(igZE<|fa$C8Sl4=^xSClh@;9yN-@mb7b?G`{^m z@!ac|cIODw%xjzAsz;F#UQ|OlObu*uCc+^^VEm~Zcg^@Ml`sN^6j;uEC!_)oKhT0l zT4^hqe5pZi)s{;0Bb&l36M-Zf!f^j}&aK+7a>i#3OMSbxTSG@koNSrlEO36)pkBT_ zn9op z@4tv@`n#Ig(dW~pR)$?ww>bTTQgDHMD()+>VJ$Z8rlgfj^1le8J5uP!Yts_x{ z8c(|2V4}M|eD;!O+25#WY1T5;y7lY0y>b*cp84jm@SO|gc`_prp6e=q%d{@#D?tPW zaa29Ilsi;V^uVvEaR`RgwFF%Ca_z{$^!VqnRZEfYC_J~%cB#IjtoYs4|Dd(0m?)=2niERsi)_OqJS9KWC642DJYkDYOsbNOEQ9IA=M_3e=lgfg4z z2xJ?^aw^6;71qRAkubv`TrpuHwO>XFRUh_cPP^fCm+Mm+y9-aHG~9jA)b;A#7<34h zH?xoM;Ce-!6Lq=Rq1ejcJ3Kgd(p&Dc81beNfk`Pc6>JE93N*BxDKFI_~v8ioA zbfPI~HMJig%!z{hTl1#uZK!K!PsKMD*mpl6=aeX4??u?bppJOflKFjQ7xtVk*VqFo zW@H>*Ohr>K6z^n4=%6AVo0f!!i!ujz6c#a&?~-tSCi3T9-9uUo#JG3MQ^Ci+xnmT| zOgLJI*<@8Pz42{^pRD4M`O%KT`OY zs9ghu=H_-%iofhy0Ja#Hv1L5Lz5YIR0MQRYz8Ls;y?Ts8k~B$5k=qa5Dt4#Zbry)O zUB_G_39SMAD0u9rzD@rt=JD}hGlHi1*YiG?wAOP#5g_w zj;e*vlJAA~<2LO*gq(-DC9m)Bf;yRH&ojt=MQ2_-LggwS4X4HK`w{#*iqk3?+!a;A zdS#rI@jKYM@-4tCp=&jg+;3r#7#qC3MzqG4e8TdLvk(i?4+0w_rV(|$w$cMyV=cm_ zj!gCy0c#HH5LDq*DYX?w+GDB#8OIc|A;L;gEW!z7p3mRzJ>wlIV-yfy-jwUv4g1fE z5_@7D^_$ARVh){7XV24O-J6sjWBy8;)!lYDQz|MdLPr;IN+?8YBPyQGC=c+8 z>4ry{HG@;Rh4Rw}6KpNaENb<2@=w#YQmeRN7F?HUe>#)sr%_W!TgZ{=*EU9DZ!Vm& zZy%tB_Ok=On&+*~JFYVPS9PvD2MbaEPJrVD3|8jA^pAdHs`ep%Vw8 zLPmWD)@D4vYqW9)So#J1v_&us^m*(z9{w_KBLKfYUyFhl zS`aO&CoB0F#ut7080dIibhYBEOPkL*NbXDpQcnK;_99v5iqg7xS%$^7({O6=lf1u& zl+QeS z&ECtVoR8=eEfk*FE#A;-%0VVwGySW6o0d^qp=cP>*dBn#=-2kHp_?y!PClw&G|L0iYn1f zW0VWz49olIQfbJXdi7Kv*G(DUR|A`P05)C6e05#68m;cizI9{ePVfz#PDLWEMeP&sGR?hcf|1a_^V9qA-%Sw>kL~Ta(Ge3b`?GSUfN} zl%;yBYpPri(eo%Rt){zQpaP$0>dzsK082L5qLd`FqS5>%LNWba-*Dkxi>48aG$JAE z(tzWK2tRdrj|CrC4$r2WNEzR{UTkIMu 
zLyI=wm6x)4Y2rGx_3r$52{$cSAz2M9AYzm!^1vHh_bW{qMB&S!81jL*7QtcK!X2^+ zeNF&NJ(VMk#shko?Cz6$2!yU7AmcV$Fk8A6$!3bQKgXWG6;YYBl<3ufAyn{J;lan# z*_R7ki-GpdC_+wfKIr+5@S=)GiAE0C&0ZnvMmX(Pu7G*%ILRF6!n9pwWa>E-WKWk5 zdmZ+B84K^YzE<#3)5*P4s`wPF(VBevhY{XjR*N?MJ^dp*T_I4>I%arv25lt5F3^~u zMt`O{_{8L$F4O*r-|(HbMzz6}h9E&}Dm%raCw}qsqRF5FyE2WY=gm){eZlR z8i@sFwTgHmrS_jNvE+SzMc@CSiYQX&BX|MwlNT4HjW`9sd1a&t2ijh;JvEn7Jx6(S zQml>jD$(s{%F_z}ETn1jvxZOB0ZgOY`UGbipKm+7wVzdD%t8B!@+eZKl8lHOA0i?J zjpd<(+%8-NU>^$8w3<>XFFsmF&}V|DNHN9^2+*8WsbvG!5NZAsskXz(AtT*P*n5UO z>f#{*9Hj6tvYge^ck`@)2C%flVkIFyQc?~rKjWj-zXn{mhmqSo&~8F`FJE|X2J#vx zJo>}RBdLqH!;d;M=Dprx0BJtnVgtl_CDi zN};?F^duRMlr4IsLn0ODL|KzDmX);b`5>}uaVw+zVxNJoY1B=3P~JlPbItf16fSwm zkS=$Mz(j?{W2xKb?LSuczPQT!%d<>7T2w7#5HNNbr$CiyKV3awQ5_Av*8V0#^E5ne z`UXS54IVxKH6X<}p(9Mw@L3>ym;M^$IdDF1H6gg#B)5N~-+20wvH7?NzymOCpToN~ zHHR#NZWpC@L3wQsQy;A;lr3xQ;`UCdCgT9*1|K*ZchYT;-)O{jh!|s2nl9$-ox50T za{CO23GQ+<_ruYN%yGncTs0T1R=v+LE%;3Mw6l(lkKQr{4qE2zzRZ_HB7?o?yN{n< zb6A{N5%*YRUVj5R+0xFi$)a1cK4ac0_@}!B?=!Gl3J<=%PXa_0keQz1uo&;TK^R2n zz@DfnPKLUdqP?n6E}o_Rs>W)ij9d=3(yUVwz>)pf@jxU>WL}5SnHjFwXcgk3Z$HnX z262z@m5s~v3oJuoKxYh`7s3*q3mU6zqOh1@v$u|eKdzZ=N(*BbQ&}oKMm#=FGDAS4 zdviqPjW5Ux3CPtyac}B(7 zhjr{OZMDe|Fb$K)2kh&v^M`^{gD(rN=oMX(g_0Us{-RgIY!9bOgaEI)i`rCEW@kHM7f}%XBw9~Ef0;#DQBWy7AyX_x5*ue;riYU3}P<;3LJZZ9&R5mBlk#m3ZKDW=}Qm| zQ&q8&*O!-JQ5#>Xf0xQNy*byz_z8NDD7=|RRdb|COT5;A>JEjsN57HvNg)kEB;5n#=l;9kEeM5)?ujGy*P4wXnc$5a);B0s#5l0Zv+G_)Z)e8h@=7_DEgK=+b z6&;`-A-=_1NXDkVKYGsNq~@vj6wQ;QBN1?X##ocLP~FaIPVz*x4ue4n1PfLDeEAuQ z_nexxiphh*Y;9+ey~~FF*I^^}lHXErm}IID910m?bRK}%SUGh!P3N94O15qL9ujzP z2O1y`FS@raz$^g&L6edMCJo5< zTrX1lIU&vjSu|`|@!&HZ4fXA!6V|x`+;29IeU;gPIlb+|*o)wn$nip9Nv=m<0iPe5 zn8X1xwc`4~qBx`Huvh@L=zgE#53bm^#TZp_Hhd&=Dua4waAWzV3nNY0%xtJaM-$&I z7X}J3StuW!RQ}-sP*$m)f)@>->?g@ht*)n568RVc-9eu#pichdzAVx*P_Fm6@RLfF zAehKpJq%?UZ15o13pMu74<$X8o8;T8_+C32&=AyIuhb)e5~-Y0AZ*i^O>V1Mus-D! 
zCpy7s5qN5JLaA3#AuG7nRVv#3(VsrEwCd*#zNeW-RGU;eD57MMnGOw+6lEpa*x6M>C#=t37B7t@vu`R9}JGRgdoscN`=9+r~O??S7L)vI`N!-|XeOEXr?!2TnxE7@ptzQ+*rP!)CEO7s7=zQ={Zn+;&p zG{7^0%g;SZ0FqrXq@)1MjP4AJstLXdDOnp?EFnocH)N@s>7* z0Oo^hi8OcjaVnQ8=(!ViP{e*0Ao3FlG>oLX1KiTC@nOnmS@LT5^9TMvXr$7<78>uG zZ@eYT#Ly#+vjSZ~&NJCafIl0=%`}6nr}?n2&VZDnxX6MV?I_u92iBLO5gfiX%HJ-- zz2R9>0x2}o4E2bkMD7Tj$jUTi+SH(UV|dzy=KQpF-i(H{0W19~hT~T9S}TJxIevs} z3_^4Dw?Y!@`2?Ae`$_?U{HJ5{IVQ$tXJ5T8JRz~||9*mEN-#}PeSttJ{6$M;&j3y1i zIp?iH7-JK>2as7Q1`w?5B$y`lzxe{|=|Alm*d@By;|w@X&zsgiCdHn0_#FNF>p0Dc z&L=HJqr5IB^oge%_tzw8d9MoWeAM-3`L(0WOO*>isR2Z#EYULT zOWv-A?qaeR5j(X(S1SD4Wf}l^Lj|Mfo@9kH^;IxB6fWS;3mM8t-}Ay z@$QO-XN#}NRAw`H%)fyEK=7>!eh|qPRyoO!og#5So3DH~p#KUh>v^YwQjwh`s(!yO zMr`&vdlgXCk%EH6j26EdUK>BP#2l%mOY9bwZ_ixOLUz>xUL_c>#qG`pOn}R(N*HAn zwuto;`rv9JH^X8(Im`*Z>$*5ab_12X6KAh7WU;;4 zWU?gly-jPps*lu)NRNy8WHE*Bf3dsX@r9^J0B6erIh~~$@TWU z)CQ{L%SHa?t=1_^Nj!D}J4T-JrEGV1v?WJhOk*ll*r)>W(<%GXb^~HRH^_@$tnjxk zJUVm%(Y}yn8aQFwQf;WAi0EmWx9EyYIf%epX^aFCdu`Icc^k$PG=?+_lKwqY8)^RT zTlARHLKcJ}po>c8lh;9#_;bo~8M#A9e;Q+{`1~a4XH~9rKa)OZYfJO2mhtoS)6LT!xSbx%5D*uBNasB zZwii-NH*|dLLAuTT5t6$?#aubGQjDDL|xI+o~aV`slj`&EQ_vl{AlReFK*<0o<^)aM8KL-}tp;1U@cyFZk>P*h zR~b3LK^r}>gid<9VToyvuMJ@^HStT{J!SXTWn3pBz5SoSdG9jUw)=*E(y}raV0&T&}_`eQ)4mUirsf`Isvqx#srNlQ;M~982Vd* zA?L}Cdb}=C8Ip>yIP;o7sBqsu^lM&fPXqhsx2bLKvp0n@hIx;wuZKVUoDuNIF<@K* z$jK{`!B)cYwICPlJJa^{P%&Iu8$#KQQcyO6_0~D>Jk0dnjq|a7D_2ewMO-a-Aw=}ajRKvZVqkJ!qk38BwBx*pXz*1d>Cz+e>4@CBRA0UL1s`708wK=}bD}|(lQL_)@#&^Owpq(^ zBlO42uUhNI_?`j8J~kiK}xDNBig( zoctG9NL0xMy}uGxFNWZtNhqx<`{?FVK*Ha7QhoHrGGd4QQ9xFIJRWrBvj`Aha2?oG z&wn-xD(3)F++B=|Z%3u=QSWMOLyCGl{o6yPRlz`WrikaT5{BG+?934<7{tzD>)o~! 
zIBTxqwZ$Qm5F?pH<7x*QI_T0c?BdkzLH#4Trw#`m>h93afRFxE=p_8~aQkfcm5cNe z@$Dt!5I3*mq1tj70(Px#2(KSb?gY2|uw`F&4^17O526Zf=G^NaW>c`(BE?}kb(_pW z0I5y>O_gYhY~8mry~{03kH`@mUsTuyqE2)Aoil$`PH8M}-~-3pk_3U7cOx^(a}X{i zU|$a5LdF7S2ppH$mC<;{rmPM>ubtWu@J_lmaqCg~S2e@V?v8>O6=PIEtPzMV`I*th zr~@ijj?kpGfW)kLc`^mKgCMAN7325e_eOseDOg5$=Y(bEL3|W^X`@z@;%Vwvi849{ zHQ&*MbqGZ<&La|KD}|VpdQ=U%Rac8X^XvE_mUl*Dn$yZj$K1q251YQ212tl$)uOe(F;z?wM2oTuLi84yU$$(Aauxy{B|F3rm)Tp|PcKKojD_ z-|dU)_wOzMom%2b05INy^~J>*ty}2$lsh8o0~{!#;LmX96eQx-)62O)oe0$%Gc1g; zbQ=~lWrnqpc>ftA>nKerf?T?~2LIt1N`mIhm)!jtE2+W<#~$Q(B8)J0!8y6D2$CqSe{|)_UXivXzHYAH|C=f7L zj@>YiNEYszux(CoPs9s%y6*oGOhyA$J&fDE{jUw-ZP>)gvlnpnz;EZ zy41`zu(GspH^0=JWry~;{D)xLdHoHY)I3*vyQ)}w?+{hjJ_lmL_%vBPNWFzculb<= z5T@uij&|G6qFYc3UwZM@x?HUU24QrzhczNxH6}~e1;=P#^>u8s|C@AOlG0&NJ?jdz zmzVEEUuqLHh&&zTMduM+0l@#NIfn-iFz3!JJ^l_h=5bars+w+KHr7Rj!)BOcKsa!| ztv%4(@Ln8E?Jr(vBTWU6G8nzs42d49I;8DRfhauvB2Eugm`=)8R8p=76KwoNW8Z7<$Kc~-cE#M^=)Ez}1-cZ$YI zc^MhA5)*!fll{8@D6mT?{(Dl*Yl&T|7Q=S5(IV(xRvY^G(3G8I)Lp=Zo(%BFma)PM zek|+o^4PIGFb4--|FBKE(*VFV!*X~ZBf|9u$F&}BTF5fpCq=u7ka%>`)lJap=vImm z$>uK87sWI;3-Au`V{bb{rD2+M+Z2#H#6b!nd7x8D)p}p zvCj4}#|_o?aju-p$c?PdDr5+f>9o5rk?i|y{6+%qD-Q*(vpLqvSbjde{|$cn*f$q! 
z4Mwq_6ufVwy&C=WK>pqrnAWMLNRr-S{OCNN33yCY&W z169?ze&tNC`f20Mst7=L+ez(pB1P&p@lD@HegpHZj zCg6IsM8wtKy=hec3E#_j=xtKW9h3sfBp0OXYd2B&34c$j%|r8}GT zSuPoLfzXZa`~Nk4Q;F*OeKYd}QD06s!BphJ&>+WBP=C$&#c2<65K_Gx6L+W`Y<p68vviE; z1?Kt#pd)X$<#CD7E1;rAT?;3FGuiu%A+p<8L%T-URvy}PH^9{B2Mis%^H{soEIosR z>Y-4+lXhG2FFQ#-N$2Y4{nf?a**c8ms7g?Y?^cNMq*BLwks;69vc}mvJyJ4|=34Ti zsy>$Ji>V>V!Za29+rG0riza_-Pn9Jw!2^((T)=w10)quU9^j`Vn_8*6au{)w<_+VF zR$S20v4~td6mVtjeX6FRyasNmR3;u=2TY9AiFX+qL9@uF;Xps3&2x2l!%9cl>~3rz z=SL?pphaiRy^H@*5VZjemB6w3$)2#s%9Rwtx0X}te7c8Rixpyq(f4BOzRP=?`eFB% z2@nxPrA$BRiy-+i;aWK87x5AjDJl*y5d{ijY|vA`iQVD;vfcCyDCw94X9;3UDRe$o z#{?gKK53%!h&dZNvJW0@Vx#waR^r>raO}T094(D!8u18!zZ;s|L*VB?n++p|L+2Kkuc4!*W?)9~%OD+x!Jq7h-Ck~*6s6aC?H)qSX8!1F z!QI=AU)5)uqEaUsggY`y$CQIvdbSBSm1P^~1F|cG)PCowGvLwKK?sXr?D zNx@%^l=P<3E08!uJ@M&FU^WKkV zE(WWo2juX(!AR~w2O1*$4q1$(mN2jd{cE4tM}TGltlv-ib> zKp{Iz;RQ?(Pk@f44>7Y}C8zkg-Mdjjq(Xi3*V(V|zqadsTMvlLNy_hv%el;!a?GJH z(|Q|`LJ5G2o&IGA&p%r+0kQLbU6eZ+rM!Fd_LrqFDJ(gkIK6N7KYiY}5b+a#9awSP zmXf&|8+r>9d4H3M`+iG%{T86TuNggc)kE#MtFmaGQps_@1|1OIK@0uK{tXJnZzUq3 zqFO;h8l&?*v7EB;0F-!}L96MvEAFimhg(vxIYw)Ehc$90Fd%f}>YAeJAFABJ+TohA zRm~KK5a-TfUYJ2Wx^g3C0(B5`Lm?p*k@66da(WDhoj)O6VgFBj##)*aH9C-PgV+c2^7YcQE>iOLQC9%wul930ZE zU#NNXiB(GX63iWBew*r}6^8W%V}tRv-54|JGNkD_#etE}$jfS#Dmlt!e8Nl*f2#|Y zVQRLsslg|tTXD0pG|PFAM5^5K7!GNI!G;B`GDbB>(#(gDoyBEX*VD$rm}*e-QL+e( zBBP}lAyd=^uYDWn+}fVF&xZb`4}I37^3j`|y|dKpga}^vopeZG>sy^u+zEP;A`OOy zdBYo`I>l%ZGj`;`8_9&Se;d)5+5=8WQ46l+aMMQ zh!b6)oY*ID>jh48VLKk#;s8?02nCSJAfrk-Dl*k7AK`Je z&)+2!jl3Ir5w*pe#usfG!0qu=spi#pw=&(vU20t1fjIFlfy{FHYW#A|(^G2C0%Wk4 zbAB!xWT6=rVueqU6I338+z;))GIMIs7_K~8`_Dt7rGp((rS8?JAo3*J;&p)VKxk2L zfTq^H3T;}cTzZQwxd#2o!|~bg>f=lPth)k|ZC=G>qbB(JdjH6+kDLMW64QpDu*r%8 zI_7ux<Nlm~WkHFm9ScZzL6vel_q~P2{|Zp1K8%{JGE=y8nEp^wAR=Zi?Bdvi;XSQo zOx5GOFe8BvBNVz&hf)rPV%Y^S8DtiG8;ur5qB|^XiE>^9jrykw(X8L&H+HqtS+nq6 zxJcy!DddVUQ*^*Bi~^Rbhy>adHhQTMr_pVf$vXUA#*?T9-O@B2d6s zL*$LG4&CGQ*9td zh(?>njNDo14?tA3U~q1JC<~iwDHv$cfE4Pl_)tBg)lMw;G|#e`13P9!hrQokAyqva zP&(!p_unL8gw%#3yDLZ 
zA4O;E0k6p{5Yvo^aDM=)2=i36kABVHsuur$bQBYk^KSewVFry17Ba6#=Lcuw$kCc))zfnTk247M;La~|GQ2dzfL4uYOrQolfyyk1q1s3(QDEs zC1YO`B_a%*W?A(s$a;0tA+1@0Gd&ytn^{JRueiXO07=jm-zZ0yQx~+h>W0*W=N5Q9 z>$61z^jH_l7M3^8n2NT1V)-EwmxXxP695O{%|J3#RV)iP07m-rUFWYq#5Hzvun7oh zu-hL}^gDHvbzCM$)uizjPV)hhhJwH;8fIqktP-R361#f%SOziZ* z-XHw8Rb8Gnjik7wiQRgOgu!KjW7!@Ne212uZXrvYOZzW6H9A?#LDv^;6U^0~c_ zYX`DW*lwAo4hK^%ioH(+(|u;i1e{Dq%^oOZyk)vK_1|RGzxuJiM$fUvL}ai6IeE3rBl_P}b{i@z)m3%I}Uiv?#1cE4u<3isFn_FX@#{&JEW4|UEU0%T8ubSBJx^(3UPUD zvY$XZ^m9&#|A;C5?Q+-6H%zb2i|=DJj4%)aoOFCVk;aX2xx0kZ*^0_KK?aa;96Zhl ziov<(<$yW$_?h5`_coqOoi*nlo-k7(bB$TC$r8$G!Z&-qYHk!W1%Erq#aTad#!o%~ zcF+v6x)u}b6Ry0Y5WHaV%>+4kD&p|(Iw1ljYxD)#afIPoS#Euyjr^aZ>7LUevD z^Pc+n8)6}^SNaF~#>a2y1rH*CyUTLS#>pUqHGBAqrD#9T-kCEVQb)kIP)0IHw{504 z%s5nV(&xH010oGac`K@zw;_3wX7g`?vz%}#x=UVmq zKLxs$9&eJPn6I^ghJKju2h>0lE=@F1jWLPIBj#&reltZaai6^!rjiv_t|J+UP}xXj zn2&MH>QMnZAwfA@HVNjDwMK{9`>XI3f~o<5_3@b+oJ-c+y>D3lT|=Nux3G%oWn8Mk zH*&k{^V}CPW3!)?SzKn%w>>X`X2g|gT&U?p@v6myr6|vI^V}O4UqU8yc~$Cfpg2(Y z-7YpZCWtjRAx)bdJNfO|DzRNPq^J5E6`;l|NVE~GfU!a&*SPJS-%}QW8bY_Q+4GS{@#-({t|1?wMBpAvrzhja3R>t}E6!GcJKLoC zW^0etZ*8kZdaQlbuEuVVzx(1H>MUQ9!UfOk%8b)SgP3m~Rl*^cqNe-Suw|J2c`LKp zTX6yT14FtGTTEdnj0k@z%K}XEj6;b*#BbY_$z2yvxLSj$$qlUwicn%o2PHJ-y!sMp z8&*RHyKjD`qC2i`XGd3Ur!$UGZJ_8cSH=Z2K-b5J6B~cRlx&|qHOq|F-H3(2#4khc z1k0IM;l&k7uXHJjpptw~@H3a45JVrCCeqY?L-oQSc9w>zwW2A(s1aHz`o#Cs9q;Nh z=ia54nQYS%{DV*grRLCzsbjp=)^+Cy=7MAQdjJk6}2ZzN13hKco;P$ zo*5Z}=GYjL=k;D=oQUKjLVDHPDc7_|+lFFd0NCyqu5DT!W9}ij5~+Gh)23IDeC|9D z%G%Z3JFi(-FsgMg8N@-NT)phXmM`(H*;WNvh@Yjc`334R3%RkVBT~X=ebpDQbKy|{ zS4EcRnoUITf!?^`@h`EA8Pxhr~0z3*QY zh6gi>02Vh>wE%WUsRuIgDSa$m%fnGWOKSmzfHwLNtIIr1yG3A~r@$rsPQ$dn`+yK9 zG7_@(@KsRj(zG)|8X)lUAn{to5uj(g4;(viTQwhCh+f@CFQ%_|2k06l-$N$k?Y9uU z%e=1ln{de7jA7#XkwBZAq{)ScOr{j<`|kET`&9ZKvSGb(peosi0;_k)X8Bb5Ft6W2 zL+s<8nJW)XpavvWoUZGz61b7#rN&4ckATuw)3sDGAICt^s(6v6mi^kY+qO7Gn_V9!wX7ePNDEyK9sOAN3$@q5dT#y{(^K z(4``&(HtW?Y!}4v+(N`EXyW@+T$0_J*Wbwg9Tbg-Bc2xHMmuu;Um 
z`f30XjeY3*Axe+3Bt@bZ2iIw&bi4NczZ4xt+$5&MbI)Js>!z1mh?pkj90ZO?7w0HR z*hsqVtPnK-a@)$3=TAN;WAFC?8F>(sg~4BQON~BnZaSkU@m@HJAQWVmHN7r zA^U;}J-$jSW(spNEG+03*oz0Jw&G}e*S+&D-$g;6z4t8xu9y>Hm-i)%%sxZ5A;`h5 zASsxDWEDYBc&QUWmYaHF5>)AgzUFKAHS?_I46;w)m6VU}V8xCH!=k2Q{fw>gf(_<^ z-F^Z$O!De5Yx^B{A1S#6R{dza9^$Im@L8T4(bwFyPoPGaIPV4#$xARmW6?7`!5o5c z_8Ocy=OD_$EmA(SUCo!xHl%TQkpd8@BqpD&O}mncn-=LbhY-=#m$jlNdxg*vxqKi=+SG2gZ==f2%X{mwmBF0kS!Nl^yWi=LcFYuf*g57^no8rJ=3-X^6_NWRNFj9 z{+QqXw)FYe1X}{<|03NKW#O1HtLz!eys0d#0C-A3F{peo5g0_2SxjFDr*?3 z@;=G;1}Q(_uEqnPt02fx2Qe1*2d zp~pcj`UYBRA4_sHflrJEJc|#!lyH*b-*FagIZq^&v}Jl(}1S|FurJ{X|!z$TZbD8m+~aKzuHt&RjWc=^O=h;`Nl!_p$9QV^e? zWY$fikl12y-dRNwlQ;Db^;?7Qth;Q$~%W`X^m!9Gf_-5f=|crckQCCCR{J{T-ScqtfrVq^|of$j}b1vg9U#AxKrsw zFR7?4SXJQrj6Bs*N#J1?6@~a}m{Bdw$TQX1+lg7Uq{XyvS2JK`PLZvr(`?8ZACLs(&$%) zO$3=2H6W-DcECltq}0Oaf47U(2w(&|=JG#c#mFv}nV-z3bA5PF^E7e+$}R%UKRKuh ziLpLvmETmV3>-vmC?i)JkN)q`)Hybod1p5J5Bo5S3p~u-Xf4dYA%nl2$hqlF6o?d1o=&n`{DeVb@T(dqE*q=xMQ%ew{_aWXX zJ;9d#*XeWR#7*)Hfp9hnv&eKOHkeY)8Omazv78IfDoZkt0zeX3bqh0K5D;GmL`Rmt zUNBEYIXq1$E2N^C292rSd}%q|erT^%ur}VTz?yN4ELzNul8Kp%rkRBHM-fg>og%|1 zLItwv6tqsjzLp^m~8j)2ib| zSo(Z@Qt-`eo#`)pVd!ikN#%-|_b*#cJ4utHn)7p2i?J5HAZX)dfxxs;btT(GF<}L0 zkv*8Zf$QQ%>A#mJWk9Z~u||tdd=c>Q4H?v&%+jF~aX?*B{)MpUp;Z^d9L6);bNH)6CB5u%S)3C3B*w^SqR+hCD42J}zqv%3vwVWOkpPYG1 z!d~oRivM7RLwI$?QXWO6OCS-=Un8uZAdrkeDelk-iGHMF*_iSugV>kd^JzJM{?1(R zOj7!7f~}EEbbH{V?2TxZVNw|Vvw)hIxB>fMmSiYZlpvKH1hkeZ!~QKaCdfUQc{r#P zx)t@gYU)YtJz~s_l#ygxenyTr5{Ewj+9|WKf_9ix=4%D$r_|`=k!I@rV+`VbRI3IU ztYdc=(7@H#uxtDpSlb4T3nXcH0oJUkWjVUu6pBYdl5mmZk#<#G5m+BUidxRXoOE;D zyaNCYAU?L@T*E@ z1P}dU5c`JIG>Wz2c}0li07(tynRF?V9H*gkSQH#c6^OE|R^-w4XPr_+#-)A=xKqc1 z(N-v2V?IJ(;`KF>5LeAXYp+}m*h;%x^RZ(~qKR@?&A=}%kH~I1* n|3;|sOn8U@#$)(|00GFX0p{ooE5!Q&J}?c70ssI200CKA?CbDS literal 0 HcmV?d00001 From fad4ed5a2c2a758e76fa666b7a7f1cfbfea2f3c0 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 26 Nov 2024 11:07:08 +0300 Subject: [PATCH 36/44] Rewrite the S4 section data.table needs support for both S4 tables and 
S4 columns. shallow() has an API solution. Link between sections, both S4 -> ATTRIB and in other places. Other tweaks. --- posts/2024-12-12-non-api-use/index.qmd | 121 ++++++++++++--------- posts/2024-12-12-non-api-use/precomputed.R | 2 +- 2 files changed, 73 insertions(+), 50 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 22214530..1c447af4 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -1,5 +1,5 @@ --- -title: "Use of non-API functions in `data.table`" +title: "Use of non-API entry points in `data.table`" author: "Ivan Krylov" date: "2024-12-12" categories: [code] @@ -33,7 +33,7 @@ maintaining the code [@Nash2024]. Still, the capability of R to be extended by special-purpose compiled code is as important as ever. As of `r when`, `r round(sum(needscomp)/length(needscomp)*100)`% of CRAN packages use compiled code. Since the implementation language of R is C, -not Fortran, the application programming interface (API) for R is also +not Fortran, the application programming interface (API) for R is mainly defined in terms of C. What's in an API? @@ -88,9 +88,10 @@ been historical mistakes: while WRE has been saying [back in version the (API) functions `dwilcox`, `pwilcox` or `qwilcox`, the function was only [declared in the public headers][wilcox_declared] and [removed from `tools:::nonAPI`][wilcox_api] in R-4.2.0. Still, between R-3.3.3 and -R-4.4.2, `tools:::nonAPI` grew from `r length(nonAPI.3_3)` to -`r length(nonAPI.4_4)` entries, and the package maintainers had to adapt -or face archival of their packages. +R-4.4.2, the `#define USE_RINTERNALS` escape hatch finally closed, +`tools:::nonAPI` grew from `r length(nonAPI.3_3)` to +`r length(nonAPI.4_4)` entries, and the package maintainers had to adapt or +face archival of their packages. 
A [recent question on R-devel][ALTREPnonAPI] (whether the [ALTREP] interface should be considered "API" for the purpose of CRAN package @@ -136,9 +137,6 @@ prompting the blog post you are currently reading. - - @@ -165,38 +163,57 @@ non-API][remove_non_API]: Operating on the S4 bit: `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` --------------------------------------------------------------------------- +In R's "S4" OOP system, objects can have a primitive base type (e.g. +`setClass("PrimitiveBaseType", contains = "numeric")`) or no base type at +all (e.g. `setClass("NoBaseType")`). In the former case, their +`SEXPTYPE` code is that of their base class (e.g. `REALSXP`). In the +latter case, their type code is `OBJSXP` (previously `S4SXP`, which is +now an alias for `OBJSXP`). To make both cases work consistently, R uses +a [special "S4" bit][RI_S4rep] in the header of the object. + The `data.table` class is [registered][setOldClass] with the S4 OOP system, making it possible to create S4 classes containing `data.table`s as members (`setClass(slots = c(mytable = 'data.table'))`) or even inheriting from `data.table` (and, in turn, from `data.frame`: -`setClass(contains = 'data.table')`). This latter case requires care -from the C code: when creating a copy of an S4 `data.table` from scratch -(or setting all attributes from one object onto another), the -destination value must also end up being an S4 object. This is -controlled by the special "S4" bit in the header of every R object, so -the code must read and set it correctly. +`setClass(contains = 'data.table')`). Additionally, `data.table`s may +contain columns that are themselves S4 objects, and both of these cases +require care from the C code. The undocumented functions `IS_S4_OBJECT`, `SET_S4_OBJECT`, `UNSET_S4_OBJECT` exist as bare interfaces to [the internal macros][IS_S4_OBJECT] of the same names and directly access the flag -inside their argument. +inside their argument. 
Writing R Extensions +[documents][WRE_replacement_entrypoints] `Rf_isS4` and `Rf_asS4` as +their replacements. The [`Rf_isS4`][isS4] function is a wrapper for `IS_S4_OBJECT` that follows the usual naming convention for remapped functions, has been part of the API for a long time, and could implement additional checks if they are needed by R. The [`Rf_asS4`][asS4] function (experimental -API) is more involved, making sure to operate on a shallow copy of an -object instead of overwriting it in place and trying to "deconstruct" S4 -objects into S3 objects if possible and requested. - -Solution: [use `Rf_isS4` instead of -`IS_S4_OBJECT`][remove_set_s4_object], as -[documented][WRE_replacement_entrypoints] in Writing R Extensions. Use -`Rf_asS4` to control the S4 object bit, but be careful -around shared objects. - - +API) is more involved, trying to "deconstruct" S4 objects into S3 +objects if possible and requested to. If the reference +count of its argument is _above_ 1, it will operate upon and return +its shallow duplicate. + +`data.table` used to directly operate on the S4 bit in two places, the +[`shallow` function in `src/assign.c`][datatable_assign_shallow_S4] and +the [`keepattr` function in +`src/dogroups.c`][datatable_dogroups_keepattr_S4]. In both cases, this +was required after directly modifying the attribute list using the +undocumented function `SET_ATTRIB`. For +`shallow`, the solution was to replace the manual operation of +attributes with +[`SHALLOW_DUPLICATE_ATTRIB`][datatable_assign_SHALLOW_ATTRIB] (API, +available since 3.3.0), which itself takes care of invariants like the +object bit and the S4 bit. + +The `keepattr` function is only used in +[`growVector`][datatable_dogroups_grow_keepattr] to transplant all +attributes from a vector to its enlarged copy without duplicating them, +for which no API exists. 
The solution is to +[use `Rf_asS4` to control the S4 object bit][remove_set_s4_object], +knowing that the new vector is freshly allocated and thus cannot be +shared yet. Converting between calls and pairlists: `SET_TYPEOF` ---------------------------------------------------- @@ -328,7 +345,7 @@ const SEXP *sourceD = STRING_PTR_RO(source); See also: [PR18775]. -Reading the reference counts: `NAMED` +Reading the reference counts: `NAMED` {#NAMED} ------------------------------------- In plain R, all value types -- numbers, strings, lists -- have @@ -491,14 +508,14 @@ making the change breaks a number of unit tests. ### Growable vectors Since `data.frame`s and `data.table`s are lists, and lists in R are -value types with pass-by-value semantics (see above), adding or removing -a column to one the normally involves allocating a new list referencing -the same columns (performing a "shallow duplicate"). By contrast, the -[over-allocated vectors][datatable_overallocation] can be resized in -place by gradually increasing their `LENGTH` (remembering their original -length in the `TRUELENGTH` field), obviating the need for shallow -duplicates at the cost of making `data.table` shared, by-reference -values. The change has been introduced in [v1.7.3, November +value types with pass-by-value semantics, adding or +removing a column to one of them normally involves allocating a new list +referencing the same columns (performing a "shallow duplicate"). By +contrast, the [over-allocated vectors][datatable_overallocation] can be +resized in place by gradually increasing their `LENGTH` (remembering +their original length in the `TRUELENGTH` field), obviating the need for +shallow duplicates at the cost of making `data.table` shared, +by-reference values. The change has been introduced in [v1.7.3, November 2011][news173], together with the `:=` operator for changing the columns by reference (which has since become [the defining feature of data.table][datatable_logo]). 
@@ -513,17 +530,18 @@ reference tracking during (un)serialization ([1][R_serialize_hash], [2][R_saveload_hash]) and looking up environment contents ([1][R_envir_hashpri], [2][R_envir_hashval]). R-3.3 (May 2016) saw the inclusion of [radix sort][R_radixsort] from `data.table` itself, which -uses `TRUELENGTH` to sort strings (more on that below). R-3.4 (April -2017) [introduced][R_growable] over-allocation when growing vectors due -to assignment outside their bounds. The [growable bit][gp_for_growable] -was added to prevent the mismanagement of the allocated memory counter: -without the bit set on the over-allocated vectors, the garbage collector -only counted `LENGTH(x)` instead of `TRUELENGTH(x)` units as released -when garbage-collecting the vector, inflating the counter over time. -[ALTREP] objects introduced in R-3.5 (April 2018) don't have a -`TRUELENGTH`: it [cannot be set][R_altrep_set_truelen] and is [returned -as 0][R_altrep_truelen]. In very old versions of R, `TRUELENGTH` wasn't -initialised, but it is nowadays set to 0, which `data.table` [depends +uses `TRUELENGTH` to sort strings. R-3.4 +(April 2017) [introduced][R_growable] over-allocation when growing +vectors due to assignment outside their bounds. The [growable +bit][gp_for_growable] was added to prevent the mismanagement of the +allocated memory counter: without the bit set on the over-allocated +vectors, the garbage collector only counted `LENGTH(x)` instead of +`TRUELENGTH(x)` units as released when garbage-collecting the vector, +inflating the counter over time. [ALTREP] objects introduced in R-3.5 +(April 2018) don't have a `TRUELENGTH`: it [cannot be +set][R_altrep_set_truelen] and is [returned as 0][R_altrep_truelen]. In +very old versions of R, `TRUELENGTH` wasn't initialised, but it is +nowadays set to 0, which `data.table` [depends upon][datatable_init_testtl]. 
Nowadays, `data.table` uses vectors whose length is different from their @@ -700,7 +718,7 @@ inside `data.table`, not from the ALTREP methods. The original implementation that uses `SETLENGTH` can be kept behind `#if R_VERSION < R_Version(4, 3, 0)` for backwards compatibility. -### Fast string matching +### Fast string matching {#TRUELENGTH-mark} `data.table`'s use of `TRUELENGTH` is not limited to growable buffers. A common idiom is to set the `TRUELENGTH`s of `CHARSXP` values from a @@ -796,7 +814,7 @@ R and imported by `data.table`, we can find a number of non-API entry points which ` R CMD check ` doesn't complain about yet: `r paste(paste0('', sort(DTnonAPI_yet), ''), collapse = ', ')`. -`(SET_)ATTRIB`, `SET_OBJECT` +`(SET_)ATTRIB`, `SET_OBJECT` {#ATTRIB-all} ---------------------------- > Use `getAttrib` for individual attributes. To test whether there are @@ -1024,9 +1042,14 @@ References [clarifyingAPI]: https://stat.ethz.ch/pipermail/r-devel/2024-June/083449.html [remove_non_API]: https://github.com/Rdatatable/data.table/issues/6180 [setOldClass]: https://search.r-project.org/R/refmans/methods/html/setOldClass.html +[RI_S4rep]: https://cran.r-project.org/doc/manuals/R-ints.html#Representation-of-S4-objects [IS_S4_OBJECT]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/memory.c#L4033-L4035 [isS4]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/objects.c#L1838-L1841 [asS4]: https://github.com/r-devel/r-svn/blob/c20ebd2d417d9ebb915e32bfb0bfdad768f9a80a/src/main/objects.c#L1843 +[datatable_assign_shallow_S4]: https://github.com/Rdatatable/data.table/blob/a2e20d6cab0bc3cd00f8e47d10603e8c04c89759/src/assign.c#L156 +[datatable_dogroups_keepattr_S4]: https://github.com/Rdatatable/data.table/blob/a2213177283f0f15823e1ff823c1fdf63746da3d/src/dogroups.c#L485 +[datatable_assign_SHALLOW_ATTRIB]: https://github.com/Rdatatable/data.table/commit/f952062030e6657bef83de2748c65120990031c1 
+[datatable_dogroups_grow_keepattr]: https://github.com/Rdatatable/data.table/blob/a2213177283f0f15823e1ff823c1fdf63746da3d/src/dogroups.c#L522 [remove_set_s4_object]: https://github.com/Rdatatable/data.table/pull/6183 [call]: https://search.r-project.org/R/refmans/base/html/call.html [WRE_call]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Creating-call-expressions diff --git a/posts/2024-12-12-non-api-use/precomputed.R b/posts/2024-12-12-non-api-use/precomputed.R index 09c26e32..28e94d46 100644 --- a/posts/2024-12-12-non-api-use/precomputed.R +++ b/posts/2024-12-12-non-api-use/precomputed.R @@ -34,7 +34,7 @@ DTsymbols %<-% fread( name := sub('@.*', '', name) ][] -# this is entirely different on late-2024 tools:::{funAPI,nonAPI} +# this is entirely dependent on late-2024 tools:::{funAPI,nonAPI} setdiff( # symbols exported by R and imported by data.table... intersect(symbols$name, DTsymbols$name) |> From 611aa541c04e197dad2d7a7ff2fd6a128c94b673 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 26 Nov 2024 14:14:18 +0300 Subject: [PATCH 37/44] Thoughts on TRUELENGTH --- posts/2024-12-12-non-api-use/index.qmd | 56 +++++++++++++++++++------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 1c447af4..a7ecabdd 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -510,11 +510,11 @@ making the change breaks a number of unit tests. Since `data.frame`s and `data.table`s are lists, and lists in R are value types with pass-by-value semantics, adding or removing a column to one the normally involves allocating a new list -referencing the same columns (performing a "shallow duplicate"). By -contrast, the [over-allocated vectors][datatable_overallocation] can be +referencing the rest of the columns (performing a "shallow duplicate"). 
+By contrast, the [over-allocated lists][datatable_overallocation] can be resized in place by gradually increasing their `LENGTH` (remembering their original length in the `TRUELENGTH` field), obviating the need for -shallow duplicates at the cost of making `data.table` shared, +shallow duplicates at the cost of making `data.table`s shared, by-reference values. The change has been introduced in [v1.7.3, November 2011][news173], together with the `:=` operator for changing the columns by reference (which has since become [the defining feature of @@ -599,20 +599,21 @@ R: * Setting the `GROWABLE_BIT` on the `data.table` would make R account for `TRUELENGTH` elements instead of `XLENGTH` elements. -Unfortunately, `GROWABLE_BIT` is not part of the API, so using it will -not help in the long run. +Unfortunately, `GROWABLE_BIT` is not part of the API and was only +introduced in R-3.4, so it does not present a full solution to the +problems. Moreover, * Setting `LENGTH` larger than the allocated length may cause R to access undefined or even unmapped memory. -* For vectors containing other `SEXP` values (`VECSXP`, `EXPRSXP`, - `STRSXP`) vectors: when reducing the `LENGTH`, having a non-persistent - value (something other than `R_NilValue` or `R_BlankString` or - `R_NaString`) in the newly inaccessible cells will also make them - unreachable from the viewpoint of the garbage collector, potentially - prompting it to reuse or unmap the pointed-to memory. 
Increasing the - `LENGTH` again with invalid pointers in the newly accessible slots - will make an invalid vector that cannot be safely altered or - discarded: +* For vectors containing other `SEXP` values (of type `VECSXP`, + `EXPRSXP`, `STRSXP`): when reducing the `LENGTH`, having a + non-persistent value (something unlike the persistent values + `R_NilValue` or `R_BlankString` or `R_NaString` provided by R itself) + in the newly inaccessible cells will also make them unreachable from + the viewpoint of the garbage collector, potentially prompting it to + reuse or unmap the pointed-to memory. Increasing the `LENGTH` again + with invalid pointers in the newly accessible slots will make an + invalid vector that cannot be safely altered or discarded: ```c #include @@ -718,6 +719,29 @@ inside `data.table`, not from the ALTREP methods. The original implementation that uses `SETLENGTH` can be kept behind `#if R_VERSION < R_Version(4, 3, 0)` for backwards compatibility. +Replacing `TRUELENGTH`-based growable vectors with `ALTREP`-based ones +will conform to the API, allow growing the vector in place, and avoid +the various inconsistencies that happen when R duplicates or deallocates +these vectors, but also has the following downsides: + + * Every place in `data.table` that uses growable vectors will have to + be refactored to use the new abstraction layer (`SETLENGTH` in R < + 4.3, ALTREP in R ≥ 4.3). + * Both implementations will have to be maintained as long as + `data.table` supports R < 4.3. + * The data pointer access is slower for ALTREP vectors than for + ordinary vectors: having checked the ALTREP bit in the header, R will + have to access the method table and call the method instead of adding + a fixed offset to the original `SEXP` pointer. This shouldn't be + noticeable unless `data.table` puts data pointer access inside a + "hot" loop. 
+ * For numeric ALTREP classes, ALTREP-aware operations that use + `*_GET_REGION` instead of the data pointer will become slower unless + the class implements a `Get_region` method. + * The current implementation performs extra work to un-ALTREP lists and + vectors given to `data.table`, precisely because it's impossible to + `SET_TRUELENGTH` on them. This will also need to be refactored. + ### Fast string matching {#TRUELENGTH-mark} `data.table`'s use of `TRUELENGTH` is not limited to growable buffers. A @@ -785,7 +809,9 @@ The fast string lookup is used in the following places: * `src/fmelt.c`: [combining factor levels by merging their `CHARSXP`s in a common array with indices in `TRUELENGTH`][datatable_fmelt_truelen] - + ### Marking columns for copying From ebbce1201b13725e123a8c4c0f3c07cf4fad83be Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 26 Nov 2024 19:57:41 +0300 Subject: [PATCH 38/44] Hashing to maybe replace TRUELENGTH(CHARSXP) --- posts/2024-12-12-non-api-use/index.qmd | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index a7ecabdd..2014aa9c 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -757,7 +757,7 @@ index. R does not currently set negative `TRUELENGTH`s by itself, so any positive `TRUELENGTH`s can be safely discarded as non-matches. In the best case scenario, this lookup is very fast: for a table of size -`n` and `k` strings to look up in it, it takes $\mathrm{O}(1)$ memory +$n$ and $k$ strings to look up in it, it takes $\mathrm{O}(1)$ memory (the `TRUELENGTH` is already there, unused) and $\mathrm{O}(n)$ time for overhead plus $\mathrm{O}(k)$ time for the actual lookups. 
@@ -809,9 +809,17 @@ The fast string lookup is used in the following places: * `src/fmelt.c`: [combining factor levels by merging their `CHARSXP`s in a common array with indices in `TRUELENGTH`][datatable_fmelt_truelen] - +Since there doesn't seem to be any intent to allow using R API to place +arbitrary integer values inside unused `SEXP` fields, `data.table` will +have to look up the `CHARSXP` values using the externally available +information. Performing $O(nk)$ direct pointer comparisons would scale +poorly, so for an $O(1)$ individual lookup `data.table` could build a +hash table of `SEXP` pointers. While pointer hashing [isn't strictly +guaranteed by the C standard to work][Wellons_hashptr], it has been used +[in R itself][R_unique_PTRHASH]. A hash table would need $O(n)$ memory +and average $O(k)$ time for lookup. ### Marking columns for copying @@ -1188,6 +1196,8 @@ References [datatable_chmatch_lookup]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/chmatch.c#L108-L130 [datatable_chmatch_cleanup2]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/chmatch.c#L135-L136 [datatable_fmelt_truelen]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 +[Wellons_hashptr]: https://nullprogram.com/blog/2016/05/30/ +[R_unique_PTRHASH]: https://github.com/r-devel/r-svn/blob/3713345283787c928e563cdcdf01cc4a9dc1c708/src/main/unique.c#L185-L208 [datatable_dogroups_setlen-1]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L105-L152 [datatable_dogroups_anyspecialstatic]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L6-L64 [datatable_copyShared1]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L260-L261 From b0ea392816fd07b75287514d61ecca233b6cacba Mon Sep 17 00:00:00 2001 From: 
Ivan K Date: Wed, 27 Nov 2024 14:43:55 +0300 Subject: [PATCH 39/44] Finish the section on hashing for now Also, * expand the cases under "there's more" * provide the bibliographic references * finish all the other small TODOs --- posts/2024-12-12-non-api-use/index.qmd | 121 +++++++++++++++++-------- posts/2024-12-12-non-api-use/refs.bib | 23 +++++ 2 files changed, 108 insertions(+), 36 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index 2014aa9c..e58a8fb7 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -230,8 +230,7 @@ contains the value in its `CAR` field and a reference to the rest of the list in its `CDR` field. Argument names, if provided, are stored in the third field, `TAG`. The list is terminated by `R_NilValue`, which is of type `NILSXP`. These structures must be constructed every time C code -wants to evaluate a function call. +wants to evaluate a function call ([e.g.][datatable_rbindlist_eval]). Previously, R API contained a function to allocate `LISTSXP` pairlists of arbitrary length, `allocList()`, but not function calls, so it became @@ -308,13 +307,14 @@ barrier][RI17] ([1][Tierney_gengc], [2][Tierney_writebr]) any time a `SEXP` value (such as an `STRSXP` vector) references another `SEXP` value (such as a `CHARSXP` string). In a generational garbage collector, "younger" generations are marked and sweeped more frequently than -"older" ones. If package C code manually writes a reference to a "young" -`CHARSXP` object into an "old" `STRSXP` vector without taking -generations into account, a following collection of the "young" pool of -objects will miss the `CHARSXP` being referenced by the "old" `STRSXP` -and remove the `CHARSXP` as "garbage". This makes the `SEXP *` pointers -returned by `STRING_PTR` unsafe and requires the use of `STRING_PTR_RO` -function, which returns a read-only `const SEXP *`. 
+"older" ones, because in a typical R session, most objects are temporary +[@Jones2012, chapter 9]. If package C code manually writes a reference +to a "young" `CHARSXP` object into an "old" `STRSXP` vector without +taking generations into account, a following collection of the "young" +pool of objects will miss the `CHARSXP` being referenced by the "old" +`STRSXP` and remove the `CHARSXP` as "garbage". This makes the `SEXP *` +pointers returned by `STRING_PTR` unsafe and requires the use of +`STRING_PTR_RO` function, which returns a read-only `const SEXP *`. Thankfully, `data.table` has already been using read-only `const SEXP *` pointers when working with `STRSXP` vectors, so the required changes to @@ -399,7 +399,7 @@ assignment][remove_named]: if (verbose) { Rprintf(_("RHS for item %d has been duplicated because NAMED==%d MAYBE_SHARED==%d, but then is being plonked. length(values)==%d; length(cols)==%d)\n"), i+1, NAMED(thisvalue), MAYBE_SHARED(thisvalue), length(values), length(cols)); - ^^^^^ non-API function + // ^^^^^ non-API function } ``` @@ -412,7 +412,7 @@ printing `MAYBE_REFERENCED()` and `MAYBE_SHARED()` instead of `NAMED()`: if (verbose) { Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d, but then is being plonked. length(values)==%d; length(cols)==%d)\n"), i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), length(values), length(cols)); - ^^^^^^^^^^^^^^^^ API function + // ^^^^^^^^^^^^^^^^ API function } ``` @@ -729,6 +729,10 @@ these vectors, but also has the following downsides: 4.3, ALTREP in R ≥ 4.3). * Both implementations will have to be maintained as long as `data.table` supports R < 4.3. + * The current implementation in `data.table` re-creates ALTREP + objects as ordinary ones precisely because it's impossible to + `SET_TRUELENGTH` on ALTREP objects. This will also need to be + refactored. 
* The data pointer access is slower for ALTREP vectors than for ordinary vectors: having checked the ALTREP bit in the header, R will have to access the method table and call the method instead of adding @@ -738,9 +742,6 @@ these vectors, but also has the following downsides: * For numeric ALTREP classes, ALTREP-aware operations that use `*_GET_REGION` instead of the data pointer will become slower unless the class implements a `Get_region` method. - * The current implementation performs extra work to un-ALTREP lists and - vectors given to `data.table`, precisely because it's impossible to - `SET_TRUELENGTH` on them. This will also need to be refactored. ### Fast string matching {#TRUELENGTH-mark} @@ -788,6 +789,10 @@ Care must be taken for the technique to work properly: * The `TRUELENGTH`s are used to look up variables in hashed environments, so R code should not run while the values are disturbed. +The encoding conversions take $O(n+k)$ time and space; the `TRUELENGTH` +bookkeeping takes $O(n)$ space and time (thanks to the exponential +`realloc` trick). + The fast string lookup is used in the following places: * `src/assign.c`: [factor level merging in @@ -816,29 +821,58 @@ information. Performing $O(nk)$ direct pointer comparisons would scale poorly, so for an $O(1)$ individual lookup `data.table` could build a hash table of `SEXP` pointers. While pointer hashing [isn't strictly guaranteed by the C standard to work][Wellons_hashptr], it has been used -[in R itself][R_unique_PTRHASH]. A hash table would need $O(n)$ memory -and average $O(k)$ time for lookup. +[in R itself][R_unique_PTRHASH]. A hash table for $n$ `CHARSXP` pointers +would need $O(n)$ memory, $O(n)$ time to initialise, and average $O(k)$ +time for $k$ lookups [@Cormen2009, chapter 11]. 
+ +Taking the `savetl` bookkeeping into account, the _average asymptotic_ +performance of `TRUELENGTH` and hashing for string lookup is the same in +both time and space, but the constants are most likely lower for +`TRUELENGTH`. Transitioning to a hash will probably involve a +performance hit. + +A truly lazy implementation could just use [`std::unordered_map`][cppreference_unordered_map] (at the cost of requiring C++11, +which was supported but far from required in R-3.3, and having to shield +R from the C++ exceptions) or the permissively-licensed [uthash]. Since +the upper bound on the size of the table is known ahead of time, a +custom-made open-addressing hash table [@Cormen2009, section 11.4] could +be implemented with a fixed load factor, requiring only one allocation +and no linked lists to walk. ### Marking columns for copying -* `src/dogroups.c`: special symbols `.BY`, `.I`, `.N`, `.GRP` live in - their own special vectors that must not appear inside regular - `data.table`s; [setting the marker][datatable_dogroups_setlen-1], - [checking the marker][datatable_dogroups_anyspecialstatic] -* `src/utils.c`: need to copy columns that share memory or are ALTREP - [preparing zero `TRUELENGTH`s][datatable_copyShared1], [marking - ALTREP, special, and already marked columns for - copy][datatable_copyShared2], [marking not previously marked columns - with their column number][datatable_copyShared3], [restoring the - `TRUELENGTH`s for columns that won't be - overwritten][datatable_copyShared4] +The use of `TRUELENGTH` in `data.table` to mark objects is not limited +to `CHARSXP` strings. 
Individual columns are also marked in a similar +manner for later copying: + +* In `src/dogroups.c`, the vectors allocated for the special symbols + `.BY`, `.I`, `.N`, `.GRP` must not be returned by the grouping + operations evaluated with `dt[..., ..., by=...]`, so they are [marked + with a `TRUELENGTH` of -1][datatable_dogroups_setlen-1], and the + [marked columns][datatable_dogroups_anyspecialstatic] are later + re-created. +* In `src/utils.c`, columns that share memory or are ALTREP must be copied. + Memory sharing between columns may lead to confusing results when they + are altered by reference, and ALTREP columns cannot have `TRUELENGTH` + set. The code uses the same trick as with `CHARSXP` objects: if + `TRUELENGTH` is set on an object, accessing the object through a + different pointer and seeing `TRUELENGTH` set will prove that the + object has been previously visited. The code first [prepares zero + `TRUELENGTH`s][datatable_copyShared1], then [marks ALTREP, special, + and already marked columns for copying][datatable_copyShared2], then + [marks columns not previously marked with their column + number][datatable_copyShared3], then finally [restores the + `TRUELENGTH`s for the columns that won't be + overwritten][datatable_copyShared4]. * The `SET_TRUELENGTH` call in `copySharedColumns` would fail if it ever got an ALTREP column, but the only use of `copySharedColumns` in `reorder` guards against those. - +The same solution as above can be used +here, with the same downsides of having to allocate memory for the hash +table and the potential to have worst-case $O(kn)$ time for $k$ lookups +fundamental to hash tables. But there's more ================ @@ -851,6 +885,10 @@ points which ` R CMD check ` doesn't complain about yet: `(SET_)ATTRIB`, `SET_OBJECT` {#ATTRIB-all} ---------------------------- +`data.table` performs some direct operations on the attribute pairlists. +Accessing attributes directly requires manually maintaining the object +bit. 
+ > Use `getAttrib` for individual attributes. To test whether there are > any attributes use `ANY_ATTRIB`, added in R 4.5.0. Use `setAttrib` for > individual attributes, `DUPLICATE_ATTRIB` or @@ -894,9 +932,10 @@ that use the given column. of a special symbol has been stashed there and must be duplicated. Without `ATTRIB`, this will only be possible using an R-level call to -`attributes()`. - - +`attributes()`. While the indices could be changed to use a different data +structure (a named `VECSXP` list?), necessitating an update step for +`data.table`s loaded from storage, the code in `src/dogroups.c` cannot +avoid having to see all the attributes. ### Raw `c(NA, n)` row names @@ -906,15 +945,21 @@ in the compact form as a 2-element integer vector starting with `NA`. The `getAttrib` function has a special case for the `R_RowNamesSymbol`, which returns an ALTREP representation of this attribute. - +`data.table` needs this access in order to [temporarily +overwrite][datatable_dogroups_rownames2] the `rownames` attribute for +the specially-prepared subset `data.table` named `.SD` (which has a +different number of rows and therefore needs different `rownames`). +Creating a full-sized `rownames` attribute instead of its compact form +would take more time than desirable. ### Direct transplantation of attributes The code in `src/dogroups.c` needs to [transplant][datatable_dogroups_SETATTR] the attributes from one object to another without duplicating them, even shallowly. - - +`SHALLOW_DUPLICATE_ATTRIB` could work as a replacement, but with worse +performance because it would waste time copying attributes from an +object that is about to be discarded. 
`findVar` --------- @@ -1086,6 +1131,7 @@ References [datatable_dogroups_grow_keepattr]: https://github.com/Rdatatable/data.table/blob/a2213177283f0f15823e1ff823c1fdf63746da3d/src/dogroups.c#L522 [remove_set_s4_object]: https://github.com/Rdatatable/data.table/pull/6183 [call]: https://search.r-project.org/R/refmans/base/html/call.html +[datatable_rbindlist_eval]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/rbindlist.c#L237 [WRE_call]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Creating-call-expressions [remove_set_typeof]: https://github.com/Rdatatable/data.table/pull/6313 [RI17]: https://cran.r-project.org/doc/manuals/R-ints.html#The-write-barrier @@ -1198,6 +1244,8 @@ References [datatable_fmelt_truelen]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L273 [Wellons_hashptr]: https://nullprogram.com/blog/2016/05/30/ [R_unique_PTRHASH]: https://github.com/r-devel/r-svn/blob/3713345283787c928e563cdcdf01cc4a9dc1c708/src/main/unique.c#L185-L208 +[cppreference_unordered_map]: https://en.cppreference.com/w/cpp/container/unordered_map +[uthash]: https://troydhanson.github.io/uthash/ [datatable_dogroups_setlen-1]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L105-L152 [datatable_dogroups_anyspecialstatic]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L6-L64 [datatable_copyShared1]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/utils.c#L260-L261 @@ -1208,6 +1256,7 @@ References [datatable_assign_ATTRIB]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/assign.c#L618-L629 [datatable_dogroups_ATTRIB]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L57-L58 [datatable_dogroups_rownames]: 
https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L131-L134 +[datatable_dogroups_rownames2]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L195 [datatable_dogroups_SETATTR]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L509-L515 [datatable_dogroups_findVar]: https://github.com/Rdatatable/data.table/blob/03c647f9a44710aad834c0718e0b34e8c5341bf1/src/dogroups.c#L90-L118 [WRE 6.21.7]: https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Working-with-variable-bindings diff --git a/posts/2024-12-12-non-api-use/refs.bib b/posts/2024-12-12-non-api-use/refs.bib index cf280acd..b51c71ee 100644 --- a/posts/2024-12-12-non-api-use/refs.bib +++ b/posts/2024-12-12-non-api-use/refs.bib @@ -30,3 +30,26 @@ @article{Nash2024 issn = {2073-4859}, pages = {198-215} } +@book{Jones2012, + address = {Boca Raton, FL}, + series = {Applied algorithms and data structures series}, + title = {The garbage collection handbook: the art of automatic memory management}, + isbn = {978-1-4200-8279-1}, + shorttitle = {The garbage collection handbook}, + language = {eng}, + publisher = {CRC Press}, + author = {Jones, Richard and Hosking, Antony and Moss, Eliot}, + year = {2012}, + note = {OCLC: ocn212844102}, + keywords = {Memory management (Computer science)}, +} +@book{Cormen2009, + address = {Cambridge, Massachusetts London, England}, + edition = {Third edition}, + title = {Introduction to algorithms}, + isbn = {978-0-262-03384-8 978-0-262-27083-0}, + language = {eng}, + publisher = {MIT Press}, + author = {Cormen, Thomas H. 
and Leiserson, Charles Eric and Rivest, Ronald Linn and Stein, Clifford}, + year = {2009}, +} From 229c77deb1f6c48738444648a3c58f899cbbfdc4 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 29 Nov 2024 14:33:22 +0300 Subject: [PATCH 40/44] Spell check Thanks @Anirban166 --- posts/2024-12-12-non-api-use/index.qmd | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/posts/2024-12-12-non-api-use/index.qmd b/posts/2024-12-12-non-api-use/index.qmd index e58a8fb7..e98c58ae 100644 --- a/posts/2024-12-12-non-api-use/index.qmd +++ b/posts/2024-12-12-non-api-use/index.qmd @@ -95,7 +95,7 @@ face archival of their packages. A [recent question on R-devel][ALTREPnonAPI] (whether the [ALTREP] interface should be considered "API" for the purpose of CRAN package -developent) sparked a series of events and an extensive discussion +development) sparked a series of events and an extensive discussion containing the highest count of occurrences of the word "API" per month ever seen on R-devel (234), topping [October 2002][Rd200210] (package versioning and API breakage, 150), [October 2005][Rd200510] (API for @@ -103,7 +103,7 @@ graphical interfaces and console output, 124), and [May 2019][Rd201905] (discussions of the ALTREP interface and multi-threading, 121). As a result, Luke Tierney [started work][clarifyingAPI] on programmatically describing the functions and other symbols exported by R (including -variables and preprocessor and enumeriation constants), giving a +variables and preprocessor and enumeration constants), giving a stronger definition to the interface. His changes add the currently unexported function `tools:::funAPI()` that lists entry points and two more of their categories: @@ -138,7 +138,7 @@ prompting the blog post you are currently reading. point -->