Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
114dbb6
Merge pull request #12 from tdhock/opt-speed
tdhock Jul 13, 2022
e95abfe
cum median
tdhock Jul 19, 2022
5ca755f
reformat for new CRAN checks
tdhock Jul 20, 2022
fd9d60b
rm deprecated binary_function
tdhock Jul 20, 2022
c8f5e75
init max_zero_var=0 to avoid valgrind msg
tdhock Jul 21, 2022
f4c9463
version++
tdhock Jul 21, 2022
546025c
RcppDeepsTate action
tdhock Jul 26, 2022
4d55934
jobs
tdhock Jul 26, 2022
47b013d
ident
tdhock Jul 26, 2022
ff44375
no tab
tdhock Jul 26, 2022
c5ad507
@master
tdhock Jul 26, 2022
261dafe
@main
tdhock Jul 26, 2022
f52b7f8
leak
tdhock Jul 26, 2022
c0aacd7
fail_ci_if_error:true
tdhock Jul 26, 2022
8b37de6
only pr not push
tdhock Jul 26, 2022
e9277ec
custom test harness
tdhock Aug 4, 2022
01881ed
10 inputs
tdhock Aug 16, 2022
5c0626a
fixed seed
tdhock Aug 16, 2022
16d3fe6
only leak for positive data
tdhock Aug 16, 2022
32d0969
tab ->space
tdhock Aug 17, 2022
4a57056
rm rcppdeepstate yaml action
tdhock Aug 24, 2022
977f385
rm rcppdeepstate yaml action
tdhock Aug 24, 2022
631d0d3
new yaml
tdhock Aug 24, 2022
9920c75
merge
tdhock Aug 24, 2022
3e17be9
edit readme another
tdhock Jan 25, 2023
e9f344d
tests filel
tdhock Jan 25, 2023
516581b
sleep depending on distn
tdhock Jan 26, 2023
cf7d891
distns for testign
tdhock Jan 26, 2023
637c88e
l1 quadratic mem
tdhock Jan 27, 2023
b7f889d
new name
tdhock Jan 27, 2023
b6f4e44
seconds.limit
tdhock Jan 27, 2023
5e5e134
each item in test list is a list of args
tdhock Jan 27, 2023
47027fc
un-exported fun arg docs
tdhock Sep 1, 2023
85ddeed
setDTthreads(1)
tdhock Sep 6, 2023
cc8ed4f
Unbreak the build: use std::isfinite
barracuda156 Sep 6, 2023
0591955
check v2
tdhock Sep 9, 2023
15becb3
Merge pull request #18 from barracuda156/fix_isfinite
tdhock Sep 10, 2023
e7f1343
fix docs
tdhock Sep 19, 2023
27a3e08
merge
tdhock Sep 19, 2023
485f071
test laplace asymmetric split for 1:8 data
tdhock Oct 25, 2023
9091d66
Merge branch 'master' of github.com:tdhock/binsegRcpp
tdhock Oct 25, 2023
1404308
add before
tdhock Dec 8, 2023
100de57
Merge branch 'master' of https://github.com/tdhock/binsegRcpp
tdhock Apr 23, 2024
7e9bc76
document total loss
tdhock Apr 23, 2024
9179c3a
Merge branch 'master' into another-branch
tdhock May 17, 2024
1109932
newer commit
tdhock May 23, 2024
f70739a
N.tests.preview <- 2
tdhock Sep 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 10 additions & 19 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Workflow derived from https://github.com/r-lib/actions/tree/master/examples
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
Expand All @@ -18,7 +18,7 @@ jobs:
fail-fast: false
matrix:
config:
- {os: macOS-latest, r: 'release'}
- {os: macos-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
Expand All @@ -29,30 +29,21 @@ jobs:
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v1
- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v1
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: rcmdcheck
extra-packages: any::rcmdcheck
needs: check

- uses: r-lib/actions/check-r-package@v1

- name: Show testthat output
if: always()
run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true
shell: bash

- name: Upload check results
if: failure()
uses: actions/upload-artifact@main
- uses: r-lib/actions/check-r-package@v2
with:
name: ${{ runner.os }}-r${{ matrix.config.r }}-results
path: check
upload-snapshots: true
24 changes: 24 additions & 0 deletions .github/workflows/RcppDeepState.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
on:
pull_request:
branches:
- '*'

name: 'RcppDeepState analysis'
jobs:
RcppDeepState:
runs-on: ubuntu-latest

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- uses: actions/checkout@v2

- uses: FabrizioSandri/RcppDeepState-action@main
with:
fail_ci_if_error: true
seed: 5
time_limit_seconds: 60
max_inputs: 100
comment: true
verbose: true
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: binsegRcpp
Type: Package
Title: Efficient Implementation of Binary Segmentation
Version: 2022.7.13
Version: 2023.10.24
Author: Toby Dylan Hocking
Maintainer: Toby Dylan Hocking <toby.hocking@r-project.org>
Description: Standard template library
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ useDynLib(binsegRcpp, .registration=TRUE)
import(data.table)
importFrom("graphics", "lines", "points", "text")
importFrom(Rcpp, evalCpp)
export(cum_median)
export(binseg)
export(binseg_normal)
export(depth_first_interface)
Expand Down
142 changes: 79 additions & 63 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,117 +1,133 @@
TODOS
Changes in version 2023.10.24

2022.7.13
- laplace test 1:8 yields asymmetric split.

binseg: constant factor speed improvements, mostly by going back to
storing params in class members rather than unordered_map.
Changes in version 2023.8.31

2022.4.14
- update un-exported function arg docs to avoid CRAN NOTE.

Function re-naming.
Changes in version 2022.7.21

New QP solver for tree viz.
- init max_zero_var=0 to avoid valgrind msg.

2022.4.13
Changes in version 2022.7.19

New R function get_best_optimal implements dynamic programming for
finding the tree with smallest number of splits for a given set of input sizes (N.data, min.segment.length,
- New log-linear cum_median function.
- New NEWS format to please CRAN.
- rm deprecated binary_function in PiecewiseFunction.h.

C++ depth_first method computes fast best case number of splits,
optimal when computing full path, heuristic when segments < data.
Changes in version 2022.7.13

2022.4.6
- binseg: constant factor speed improvements, mostly by going back to
storing params in class members rather than unordered_map.

complexity funs handle min segment length.
Changes in version 2022.4.14

operator< now correctly breaks ties: previously used segment size
(end-start), now use max distance from start and end, to encourage
equal splits and best case time complexity.
- Function re-naming.

2022.4.4
- New QP solver for tree viz.

new l1 and laplace loss functions.
Changes in version 2022.4.13

2022.3.30
- New R function get_best_optimal implements dynamic programming for
finding the tree with smallest number of splits for a given set of
input sizes (N.data, min.segment.length).

comparisons vignette.
- C++ depth_first method computes fast best case number of splits,
optimal when computing full path, heuristic when segments < data.

max_zero_var computed based on the max estimated variance of all
single data points, which should be zero, but are sometimes small
non-zero values (for example 1e-15 or 1e-13) due to numerical
issues. mean_zero_var used in meanvar_norm loss function to determine
if cost is finite. Segments with an infinite cost best split are not
stored in the container for later splitting.
Changes in version 2022.4.6

2022.3.29
- complexity funs handle min segment length.

meanvar_norm distribution: generalize C++ code to more than one
segment-specific parameter.
- operator< now correctly breaks ties: previously used segment size
(end-start), now use max distance from start and end, to encourage
equal splits and best case time complexity.

2022.3.24
Changes in version 2022.4.4

container.str can be list (slow) or multimap (fast).
- new l1 and laplace loss functions.

2022.3.22
Changes in version 2022.3.30

warning and suggestion to use weights for runs of identical data.
- comparisons vignette.

R code binseg(min.segment.length=3) etc uses min_segment_length
parameter in C++ code.
- max_zero_var computed based on the max estimated variance of all
single data points, which should be zero, but are sometimes small
non-zero values (for example 1e-15 or 1e-13) due to numerical
issues. max_zero_var used in meanvar_norm loss function to
determine if cost is finite. Segments with an infinite cost best
split are not stored in the container for later splitting.

2022.3.11
Changes in version 2022.3.29

new binseg function with distribution=poisson or mean_norm,
weight.vec.
- meanvar_norm distribution: generalize C++ code to more than one
segment-specific parameter.

2022.1.24
Changes in version 2022.3.24

remove random_set_vec test.rev example which failed on M1.
- container.str can be list (slow) or multimap (fast).

2021.11.3
Changes in version 2022.3.22

binseg_normal_cv does model selection via most frequent number of
- warning and suggestion to use weights for runs of identical data.

- R code binseg(min.segment.length=3) etc uses min_segment_length
parameter in C++ code.

Changes in version 2022.3.11

- new binseg function with distribution=poisson or mean_norm, weight.vec.

Changes in version 2022.1.24

- remove random_set_vec test.rev example which failed on M1.

Changes in version 2021.11.3

- binseg_normal_cv does model selection via most frequent number of
segments with minimum validation error (over several random splits).

2021.11.2
Changes in version 2021.11.2

break ties in Segment operator< by size (split larger segments first).
- break ties in Segment operator< by size (split larger segments first).

binseg_normal gains args is.validation.vec, position.vec in order to
- binseg_normal gains args is.validation.vec, position.vec in order to
support efficient cross-validation. it now returns list with new
component subtrain.borders (predicted changepoint positions).

get_splits* functions for comparing empirical to best/worst case.
- get_splits* functions for comparing empirical to best/worst case.

2021.1.6
Changes in version 2021.1.6

More comments in binseg_normal.cpp to help potential GSOC students.
- More comments in binseg_normal.cpp to help potential GSOC students.

2020.10.7
Changes in version 2020.10.7

Comment binseg_normal.cpp to explain optimal_cost computation.
- Comment binseg_normal.cpp to explain optimal_cost computation.

Use C++ multiset with operator< instead of multimap/vector.
- Use C++ multiset with operator< instead of multimap/vector.

Use cumsum C++ vector for constant time mean/cost computation for any
- Use cumsum C++ vector for constant time mean/cost computation for any
split.

Store cost of segments before/after split, pass the cost values to
- Store cost of segments before/after split, pass the cost values to
maybe_add to avoid having to recompute them.

Computation works for only one data point.
- Computation works for only one data point.

2020.9.15
Changes in version 2020.9.15

remove unused C++ errors.
- remove unused C++ errors.

test coef method.
- test coef method.

2020.9.3
Changes in version 2020.9.3

Bugfix for negative means, docs.
- Bugfix for negative means, docs.

predict/plot methods, copy code from example.
- predict/plot methods, copy code from example.

2019.9.20
Changes in version 2019.9.20

Initial implementation.
- Initial implementation.
23 changes: 14 additions & 9 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

### Use depth first search to compute a data.frame
### with one row for each segment, and columns
### splits and depth, number/depth of candidate
### splits that need to be
### computed after splitting that segment.
#' Efficient log-linear cumulative median.
#'
#' Thin R wrapper written by Rcpp::compileAttributes(): it forwards both
#' arguments, unchanged, to the native routine
#' `_binsegRcpp_cum_median_interface` via .Call and returns whatever
#' that routine returns. No validation is done on the R side.
#'
#' @param data_vec passed as-is to the C++ code; presumably the numeric
#'   data sequence (TODO confirm against the C++ source).
#' @param weight_vec passed as-is to the C++ code; presumably one weight
#'   per element of data_vec (TODO confirm against the C++ source).
cum_median_interface <- function(data_vec, weight_vec) {
.Call(`_binsegRcpp_cum_median_interface`, data_vec, weight_vec)
}

#' Use depth first search to compute a data.frame
#' with one row for each segment, and columns
#' splits and depth, number/depth of candidate
#' splits that need to be
#' computed after splitting that segment.
#'
#' Thin R wrapper written by Rcpp::compileAttributes(): it forwards both
#' arguments, unchanged, to the native routine
#' `_binsegRcpp_depth_first_interface` via .Call.
#'
#' @param n_data passed as-is to the C++ code; presumably the number of
#'   data points to segment (TODO confirm against the C++ source).
#' @param min_segment_length passed as-is to the C++ code; presumably
#'   the smallest allowed segment size (TODO confirm).
depth_first_interface <- function(n_data, min_segment_length) {
.Call(`_binsegRcpp_depth_first_interface`, n_data, min_segment_length)
}

### Compute a data.frame with one row for each distribution
### implemented in the C++ code, and columns distribution.str,
### parameters, description.
#' Compute a data.frame with one row for each distribution
#' implemented in the C++ code, and columns distribution.str,
#' parameters, description.
#'
#' Takes no arguments; thin R wrapper written by
#' Rcpp::compileAttributes() that calls the native routine
#' `_binsegRcpp_get_distribution_info` via .Call and returns its result.
get_distribution_info <- function() {
.Call(`_binsegRcpp_get_distribution_info`)
}

### Low-level interface to binary segmentation algorithm.
#' Low-level interface to binary segmentation algorithm.
#'
#' Thin R wrapper written by Rcpp::compileAttributes(): all eight
#' arguments are forwarded, unchanged and in the same order, to the
#' native routine `_binsegRcpp_binseg_interface` via .Call. No argument
#' checking or defaulting happens here.
#'
#' @param data_vec,weight_vec presumably the data sequence and its
#'   per-observation weights (TODO confirm against the C++ source).
#' @param max_segments,min_segment_length presumably the largest model
#'   size to compute and the smallest allowed segment (TODO confirm).
#' @param distribution_str,container_str presumably strings naming the
#'   loss/distribution and the C++ container to use (TODO confirm).
#' @param is_validation_vec,position_vec presumably the validation-set
#'   indicator and data positions used for cross-validation (TODO confirm).
binseg_interface <- function(data_vec, weight_vec, max_segments, min_segment_length, distribution_str, container_str, is_validation_vec, position_vec) {
.Call(`_binsegRcpp_binseg_interface`, data_vec, weight_vec, max_segments, min_segment_length, distribution_str, container_str, is_validation_vec, position_vec)
}
Expand Down
18 changes: 11 additions & 7 deletions R/binseg.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
binseg <- structure(function # Binary segmentation
### Efficient C++ implementation of the classic binary segmentation
### algorithm for finding changepoints in a sequence of N data. Output
### includes columns which can be used to compute parameters for a
### single model in log-linear time, using coef method.
### algorithm for finding changepoints in a sequence of N data, which
### attempt to minimize a given loss function. Output includes columns
### which can be used to compute parameters for a single model in
### log-linear time, using coef method.
(distribution.str,
### String indicating distribution, use get_distribution_info to see
### possible values.
### String indicating distribution/loss function, use
### get_distribution_info to see possible values.
data.vec,
### Vector of numeric data to segment.
max.segments=NULL,
Expand Down Expand Up @@ -82,6 +83,7 @@ binseg <- structure(function # Binary segmentation
## splits. For l1/laplace distributions the best case is O(N log N
## log K) time for equal splits and worst case is O(N log N K) time
## for unequal splits.
switch(distribution.str, l1=Sys.sleep(0.001), meanvar_norm=Sys.sleep(0.00001*length(data.vec)), mean_norm=matrix(NA, length(data.vec), length(data.vec)))
result <- binseg_interface(
data.vec, weight.vec, max.segments,
min.segment.length,
Expand All @@ -99,8 +101,8 @@ binseg <- structure(function # Binary segmentation
subtrain.borders=subtrain.borders,
splits=data.table(
segments=1:max.segments,##<< number of segments
loss,##<< subtrain loss
validation.loss,##<< validation loss
loss,##<< total subtrain loss
validation.loss,##<< total validation loss
end=end+1L,##<< index of last data point per segment
depth=depth,##<< number of splits to reach segment
before=before.param.mat,##<< params before changepoint
Expand All @@ -116,6 +118,8 @@ binseg <- structure(function # Binary segmentation
##end<<
}, ex=function(){

data.table::setDTthreads(1)

x <- c(0.1, 0, 1, 1.1, 0.1, 0)
## Compute full path of binary segmentation models from 1 to 6
## segments.
Expand Down
4 changes: 3 additions & 1 deletion R/binseg_normal.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ binseg_normal <- structure(function # Binary segmentation, normal change in mean
"mean_norm", data.vec, max.segments,
is.validation.vec, position.vec)
### List output from binseg which represents a binary segmentation
### model.
### model (loss is total square loss).
}, ex=function(){

data.table::setDTthreads(1)

x <- c(0.1, 0, 1, 1.1, 0.1, 0)
## Compute full path of binary segmentation models from 1 to 6
## segments.
Expand Down
Loading
Loading