tdhock · tdhock · Jul 13, 2022 · Jul 19, 2022 · Jul 20, 2022 · Jul 20, 2022
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -1,4 +1,4 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/master/examples
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
@@ -18,7 +18,7 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - {os: macOS-latest,   r: 'release'}
+          - {os: macos-latest,   r: 'release'}
           - {os: windows-latest, r: 'release'}
           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
@@ -29,30 +29,21 @@ jobs:
       R_KEEP_PKG_SOURCE: yes
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
-      - uses: r-lib/actions/setup-pandoc@v1
+      - uses: r-lib/actions/setup-pandoc@v2
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
         with:
           r-version: ${{ matrix.config.r }}
           http-user-agent: ${{ matrix.config.http-user-agent }}
           use-public-rspm: true
 
-      - uses: r-lib/actions/setup-r-dependencies@v1
+      - uses: r-lib/actions/setup-r-dependencies@v2
         with:
-          extra-packages: rcmdcheck
+          extra-packages: any::rcmdcheck
+          needs: check
 
-      - uses: r-lib/actions/check-r-package@v1
-
-      - name: Show testthat output
-        if: always()
-        run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true
-        shell: bash
-
-      - name: Upload check results
-        if: failure()
-        uses: actions/upload-artifact@main
+      - uses: r-lib/actions/check-r-package@v2
         with:
-          name: ${{ runner.os }}-r${{ matrix.config.r }}-results
-          path: check
+          upload-snapshots: true
diff --git a/.github/workflows/RcppDeepState.yaml b/.github/workflows/RcppDeepState.yaml
@@ -0,0 +1,24 @@
+on:
+  pull_request:
+    branches:
+      - '*'
+
+name: 'RcppDeepState analysis'
+jobs:
+  RcppDeepState:
+    runs-on: ubuntu-latest
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+    steps: 
+      - uses: actions/checkout@v2
+
+      - uses: FabrizioSandri/RcppDeepState-action@main
+        with:
+          fail_ci_if_error: true
+          seed: 5
+          time_limit_seconds: 60
+          max_inputs: 100
+          comment: true
+          verbose: true
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: binsegRcpp
 Type: Package
 Title: Efficient Implementation of Binary Segmentation
-Version: 2022.7.13
+Version: 2023.10.24
 Author: Toby Dylan Hocking
 Maintainer: Toby Dylan Hocking <toby.hocking@r-project.org>
 Description: Standard template library 

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@ useDynLib(binsegRcpp, .registration=TRUE)
 import(data.table)
 importFrom("graphics", "lines", "points", "text")
 importFrom(Rcpp, evalCpp)
+export(cum_median)
 export(binseg)
 export(binseg_normal)
 export(depth_first_interface)

diff --git a/NEWS b/NEWS
@@ -1,117 +1,133 @@
-TODOS
+Changes in version 2023.10.24
 
-2022.7.13
+- laplace test 1:8 yields asymmetric split.
 
-binseg: constant factor speed improvements, mostly by going back to
-storing params in class members rather than unordered_map.
+Changes in version 2023.8.31
 
-2022.4.14
+- update un-exported function arg docs to avoid CRAN NOTE.
 
-Function re-naming.
+Changes in version 2022.7.21
 
-New QP solver for tree viz.
+- init max_zero_var=0 to avoid valgrind msg.
 
-2022.4.13
+Changes in version 2022.7.19
 
-New R function get_best_optimal implements dynamic programming for
-finding the tree with smallest number of splits for a given set of input sizes (N.data, min.segment.length, 
+- New log-linear cum_median function.
+- New NEWS format to please CRAN.
+- rm deprecated binary_function in PiecewiseFunction.h.
 
-C++ depth_first method computes fast best case number of splits,
-optimal when computing full path, heuristic when segments < data.
+Changes in version 2022.7.13
 
-2022.4.6
+- binseg: constant factor speed improvements, mostly by going back to
+  storing params in class members rather than unordered_map.
 
-complexity funs handle min segment length.
+Changes in version 2022.4.14
 
-operator< now correctly breaks ties: previously used segment size
-(end-start), now use max distance from start and end, to encourage
-equal splits and best case time complexity.
+- Function re-naming.
 
-2022.4.4
+- New QP solver for tree viz.
 
-new l1 and laplace loss functions.
+Changes in version 2022.4.13
 
-2022.3.30
+- New R function get_best_optimal implements dynamic programming for
+  finding the tree with smallest number of splits for a given set of
+  input sizes (N.data, min.segment.length, ...).
 
-comparisons vignette.
+- C++ depth_first method computes fast best case number of splits,
+  optimal when computing full path, heuristic when segments < data.
 
-max_zero_var computed based on the max estimated variance of all
-single data points, which should be zero, but are sometimes small
-non-zero values (for example 1e-15 or 1e-13) due to numerical
-issues. mean_zero_var used in meanvar_norm loss function to determine
-if cost is finite. Segments with an infinite cost best split are not
-stored in the container for later splitting.
+Changes in version 2022.4.6
 
-2022.3.29
+- complexity funs handle min segment length.
 
-meanvar_norm distribution: generalize C++ code to more than one
-segment-specific parameter.
+- operator< now correctly breaks ties: previously used segment size
+  (end-start), now use max distance from start and end, to encourage
+  equal splits and best case time complexity.
 
-2022.3.24
+Changes in version 2022.4.4
 
-container.str can be list (slow) or multimap (fast).
+- new l1 and laplace loss functions.
 
-2022.3.22
+Changes in version 2022.3.30
 
-warning and suggestion to use weights for runs of identical data.
+- comparisons vignette.
 
-R code binseg(min.segment.length=3) etc uses min_segment_length
-parameter in C++ code.
+- max_zero_var computed based on the max estimated variance of all
+  single data points, which should be zero, but are sometimes small
+  non-zero values (for example 1e-15 or 1e-13) due to numerical
+  issues. max_zero_var used in meanvar_norm loss function to
+  determine if cost is finite. Segments with an infinite cost best
+  split are not stored in the container for later splitting.
 
-2022.3.11
+Changes in version 2022.3.29
 
-new binseg function with distribution=poisson or mean_norm,
-weight.vec.
+- meanvar_norm distribution: generalize C++ code to more than one
+  segment-specific parameter.
 
-2022.1.24
+Changes in version 2022.3.24
 
-remove random_set_vec test.rev example which failed on M1.
+- container.str can be list (slow) or multimap (fast).
 
-2021.11.3
+Changes in version 2022.3.22
 
-binseg_normal_cv does model selection via most frequent number of
+- warning and suggestion to use weights for runs of identical data.
+
+- R code binseg(min.segment.length=3) etc uses min_segment_length
+  parameter in C++ code.
+
+Changes in version 2022.3.11
+
+- new binseg function with distribution=poisson or mean_norm, weight.vec.
+
+Changes in version 2022.1.24
+
+- remove random_set_vec test.rev example which failed on M1.
+
+Changes in version 2021.11.3
+
+- binseg_normal_cv does model selection via most frequent number of
 segments with minimum validation error (over several random splits).
 
-2021.11.2
+Changes in version 2021.11.2
 
-break ties in Segment operator< by size (split larger segments first).
+- break ties in Segment operator< by size (split larger segments first).
 
-binseg_normal gains args is.validation.vec, position.vec in order to
+- binseg_normal gains args is.validation.vec, position.vec in order to
 support efficient cross-validation. it now returns list with new
 component subtrain.borders (predicted changepoint positions).
 
-get_splits* functions for comparing empirical to best/worst case.
+- get_splits* functions for comparing empirical to best/worst case.
 
-2021.1.6
+Changes in version 2021.1.6
 
-More comments in binseg_normal.cpp to help potential GSOC students.
+- More comments in binseg_normal.cpp to help potential GSOC students.
 
-2020.10.7
+Changes in version 2020.10.7
 
-Comment binseg_normal.cpp to explain optimal_cost computation.
+- Comment binseg_normal.cpp to explain optimal_cost computation.
 
-Use C++ multiset with operator< instead of multimap/vector.
+- Use C++ multiset with operator< instead of multimap/vector.
 
-Use cumsum C++ vector for constant time mean/cost computation for any
+- Use cumsum C++ vector for constant time mean/cost computation for any
 split.
 
-Store cost of segments before/after split, pass the cost values to
+- Store cost of segments before/after split, pass the cost values to
 maybe_add to avoid having to recompute them.
 
-Computation works for only one data point.
+- Computation works for only one data point.
 
-2020.9.15
+Changes in version 2020.9.15
 
-remove unused C++ errors.
+- remove unused C++ errors.
 
-test coef method.
+- test coef method.
 
-2020.9.3
+Changes in version 2020.9.3
 
-Bugfix for negative means, docs.
+- Bugfix for negative means, docs.
 
-predict/plot methods, copy code from example.
+- predict/plot methods, copy code from example.
 
-2019.9.20
+Changes in version 2019.9.20
 
-Initial implementation.
+- Initial implementation.
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,23 +1,28 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-### Use depth first search to compute a data.frame
-### with one row for each segment, and columns
-### splits and depth, number/depth of candidate
-### splits that need to be
-### computed after splitting that segment.
+#' Efficient log-linear cumulative median.
+cum_median_interface <- function(data_vec, weight_vec) {
+    .Call(`_binsegRcpp_cum_median_interface`, data_vec, weight_vec)
+}
+
+#' Use depth first search to compute a data.frame
+#' with one row for each segment, and columns
+#' splits and depth, number/depth of candidate
+#' splits that need to be
+#' computed after splitting that segment.
 depth_first_interface <- function(n_data, min_segment_length) {
     .Call(`_binsegRcpp_depth_first_interface`, n_data, min_segment_length)
 }
 
-### Compute a data.frame with one row for each distribution
-### implemented in the C++ code, and columns distribution.str,
-### parameters, description.
+#' Compute a data.frame with one row for each distribution
+#' implemented in the C++ code, and columns distribution.str,
+#' parameters, description.
 get_distribution_info <- function() {
     .Call(`_binsegRcpp_get_distribution_info`)
 }
 
-### Low-level interface to binary segmentation algorithm.
+#' Low-level interface to binary segmentation algorithm.
 binseg_interface <- function(data_vec, weight_vec, max_segments, min_segment_length, distribution_str, container_str, is_validation_vec, position_vec) {
     .Call(`_binsegRcpp_binseg_interface`, data_vec, weight_vec, max_segments, min_segment_length, distribution_str, container_str, is_validation_vec, position_vec)
 }

diff --git a/R/binseg.R b/R/binseg.R
@@ -1,11 +1,12 @@
 binseg <- structure(function # Binary segmentation
 ### Efficient C++ implementation of the classic binary segmentation
-### algorithm for finding changepoints in a sequence of N data. Output
-### includes columns which can be used to compute parameters for a
-### single model in log-linear time, using coef method.
+### algorithm for finding changepoints in a sequence of N data, which
+### attempts to minimize a given loss function. Output includes columns
+### which can be used to compute parameters for a single model in
+### log-linear time, using coef method.
 (distribution.str,
-### String indicating distribution, use get_distribution_info to see
-### possible values.
+### String indicating distribution/loss function, use
+### get_distribution_info to see possible values.
   data.vec,
 ### Vector of numeric data to segment.
   max.segments=NULL,
@@ -82,6 +83,7 @@ binseg <- structure(function # Binary segmentation
   ## splits. For l1/laplace distributions the best case is O(N log N
   ## log K) time for equal splits and worst case is O(N log N K) time
   ## for unequal splits.
+  switch(distribution.str, l1=Sys.sleep(0.001), meanvar_norm=Sys.sleep(0.00001*length(data.vec)), mean_norm=matrix(NA, length(data.vec), length(data.vec)))
   result <- binseg_interface(
     data.vec, weight.vec, max.segments,
     min.segment.length,
@@ -99,8 +101,8 @@ binseg <- structure(function # Binary segmentation
     subtrain.borders=subtrain.borders,
     splits=data.table(
       segments=1:max.segments,##<< number of segments
-      loss,##<< subtrain loss
-      validation.loss,##<< validation loss
+      loss,##<< total subtrain loss
+      validation.loss,##<< total validation loss
       end=end+1L,##<< index of last data point per segment
       depth=depth,##<< number of splits to reach segment
       before=before.param.mat,##<< params before changepoint
@@ -116,6 +118,8 @@ binseg <- structure(function # Binary segmentation
   ##end<<
 }, ex=function(){
 
+  data.table::setDTthreads(1)
+
   x <- c(0.1, 0, 1, 1.1, 0.1, 0)
   ## Compute full path of binary segmentation models from 1 to 6
   ## segments.

diff --git a/R/binseg_normal.R b/R/binseg_normal.R
@@ -18,9 +18,11 @@ binseg_normal <- structure(function # Binary segmentation, normal change in mean
     "mean_norm", data.vec, max.segments,
     is.validation.vec, position.vec)
 ### List output from binseg which represents a binary segmentation
-### model.
+### model (loss is total square loss).
 }, ex=function(){
 
+  data.table::setDTthreads(1)
+
   x <- c(0.1, 0, 1, 1.1, 0.1, 0)
   ## Compute full path of binary segmentation models from 1 to 6
   ## segments.