diff --git a/doc/distributions/dist_reference.qbk b/doc/distributions/dist_reference.qbk index 3d5d82fe89..7231d39e32 100644 --- a/doc/distributions/dist_reference.qbk +++ b/doc/distributions/dist_reference.qbk @@ -37,6 +37,7 @@ [include triangular.qbk] [include uniform.qbk] [include weibull.qbk] +[include empirical_cdf.qbk] [endsect] [/section:dists Distributions] @@ -138,10 +139,3 @@ opportunity to integrate the statistical tests with this framework at some later (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt). ] - - - - - - - diff --git a/doc/distributions/empirical_cdf.qbk b/doc/distributions/empirical_cdf.qbk new file mode 100644 index 0000000000..ab7d60c320 --- /dev/null +++ b/doc/distributions/empirical_cdf.qbk @@ -0,0 +1,153 @@ +[/ +Copyright (c) 2019 Nick Thompson +Use, modification and distribution are subject to the +Boost Software License, Version 1.0. (See accompanying file +LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +] + +[section:empirical_cdf Empirical Cumulative Distribution Function] + +[heading Synopsis] + +``` +#include + +namespace boost{ namespace math{ + +template +class empirical_cumulative_distribution_function +{ +public: + using Real = typename RandomAccessContainer::value_type; + empirical_cumulative_distribution_function(RandomAccessContainer && v, bool sorted = false); + + auto operator()(Real t) const; + + RandomAccessContainer&& return_data(); +}; + +}} +``` + +[heading Empirical Cumulative Distribution Function] + +The empirical cumulative distribution function is a step function constructed from observed data which converges to the true cumulative distribution function in the limit of infinite data. +This function is a basic building block of hypothesis testing workflows that attempt to answer the question "does my data come from a given distribution?" +These tests require computing quadratures over some function of the empirical CDF and the supposed CDF to create a distance measurement, and hence it is occasionally useful to construct a continuous callable from the data. + +An example usage is demonstrated below: + +``` +#include +#include +#include +using boost::math::empirical_cumulative_distribution_function; +std::random_device rd; +std::mt19937 gen{rd()}; +std::normal_distribution dis(0, 1); +size_t n = 128; +std::vector v(n); +for (size_t i = 0; i < n; ++i) { + v[i] = dis(gen); +} + +auto ecdf = empirical_cumulative_distribution_function(std::move(v)); +std::cout << "ecdf(0.0) = " << ecdf(0.0) << "\n"; +// should print approximately 0.5 . . . +``` + +The empirical distribution function requires sorted data. +By default, the constructor sorts it for you at O(Nlog(N)) cost. + +If your data is already sorted, you can specify this and the constructor simply moves your data into the class: + +``` +std::sort(v.begin(), v.end()); +auto ecdf = empirical_cumulative_distribution_function(std::move(v), /* already sorted = */ true); +``` + +If you want your data back after being done with the object, use + +``` +v = ecdf.return_data(); +``` + +This operation invalidates `ecdf`; it can no longer be used. + +The call operator complexity is O(log(N)), as it requires a call to `std::upper_bound`. + +Works with both integer and floating point types. +If the input data consists of integers, the output of the call operator is a double. Requires C++17. + +[$../graphs/empiricial_cumulative_distribution_gauss.svg] + +[$../graphs/empiricial_cumulative_distribution_uniform.svg] + + +[heading Performance] + +``` +------------------------------------------------------ +Benchmark Time +------------------------------------------------------ +ECDFConstructorSorted/8 4.52 ns +ECDFConstructorSorted/16 5.20 ns +ECDFConstructorSorted/32 5.22 ns +ECDFConstructorSorted/64 7.37 ns +ECDFConstructorSorted/128 7.16 ns +ECDFConstructorSorted/256 8.97 ns +ECDFConstructorSorted/512 8.44 ns +ECDFConstructorSorted/1024 9.07 ns +ECDFConstructorSorted/2048 11.4 ns +ECDFConstructorSorted/4096 12.6 ns +ECDFConstructorSorted/8192 11.4 ns +ECDFConstructorSorted/16384 16.0 ns +ECDFConstructorSorted/32768 17.0 ns +ECDFConstructorSorted/65536 19.5 ns +ECDFConstructorSorted/131072 15.8 ns +ECDFConstructorSorted/262144 17.9 ns +ECDFConstructorSorted/524288 26.7 ns +ECDFConstructorSorted/1048576 29.5 ns +ECDFConstructorSorted/2097152 31.8 ns +ECDFConstructorSorted/4194304 32.8 ns +ECDFConstructorSorted/8388608 35.4 ns +ECDFConstructorSorted/16777216 30.4 ns +ECDFConstructorSorted_BigO 1.27 lgN +ECDFConstructorSorted_RMS 20 % +ECDFConstructorUnsorted/8 155 ns +ECDFConstructorUnsorted/64 2095 ns +ECDFConstructorUnsorted/512 22212 ns +ECDFConstructorUnsorted/4096 220821 ns +ECDFConstructorUnsorted/32768 1996380 ns +ECDFConstructorUnsorted/262144 18916039 ns +ECDFConstructorUnsorted/2097152 194250013 ns +ECDFConstructorUnsorted/16777216 2281469424 ns +ECDFConstructorUnsorted_BigO 5.65 NlgN +ECDFConstructorUnsorted_RMS 6 % +Shuffle/8 82.4 ns +Shuffle/64 731 ns +Shuffle/512 5876 ns +Shuffle/4096 46864 ns +Shuffle/32768 385265 ns +Shuffle/262144 4663866 ns +Shuffle/2097152 54686332 ns +Shuffle/16777216 875309099 ns +Shuffle_BigO 2.16 NlgN +Shuffle_RMS 12 % +ECDFEvaluation/8 48.6 ns +ECDFEvaluation/64 61.3 ns +ECDFEvaluation/512 85.1 ns +ECDFEvaluation/4096 105 ns +ECDFEvaluation/32768 131 ns +ECDFEvaluation/262144 196 ns +ECDFEvaluation/2097152 391 ns +ECDFEvaluation/16777216 715 ns +ECDFEvaluation_BigO 18.19 lgN +ECDFEvaluation_RMS 60 % +``` + +The call to the unsorted constructor is in fact a little faster than indicated, as the data must be shuffled after being sorted in the benchmark. +This is itself a fairly expensive operation. + +[endsect] +[/section:empirical_cdf] diff --git a/doc/graphs/empiricial_cumulative_distribution_gauss.svg b/doc/graphs/empiricial_cumulative_distribution_gauss.svg new file mode 100644 index 0000000000..f07a6b51c6 --- /dev/null +++ b/doc/graphs/empiricial_cumulative_distribution_gauss.svg @@ -0,0 +1,48 @@ + + + +Empirical (blue) and continuous CDF (orange) of an 𝓝(0,1) distribution on 128 samples + + + + +0.125 + +0.25 + +0.375 + +0.5 + +0.625 + +0.75 + +0.875 + +1 + +-2.4 + +-1.8 + +-1.2 + +-0.6 + +0.0 + +0.6 + +1.2 + +1.8 + +2.4 + +3 + + + + diff --git a/doc/graphs/empiricial_cumulative_distribution_uniform.svg b/doc/graphs/empiricial_cumulative_distribution_uniform.svg new file mode 100644 index 0000000000..46d3a7a151 --- /dev/null +++ b/doc/graphs/empiricial_cumulative_distribution_uniform.svg @@ -0,0 +1,48 @@ + + + +Empirical (blue) and theoretical CDF (orange) of the dice roll distribution on n = 128 samples + + + + +0.125 + +0.25 + +0.375 + +0.5 + +0.625 + +0.75 + +0.875 + +1 + +0.6 + +1.2 + +1.8 + +2.4 + +3 + +3.6 + +4.2 + +4.8 + +5.4 + +6 + + + + diff --git a/include/boost/math/distributions/empirical_cumulative_distribution_function.hpp b/include/boost/math/distributions/empirical_cumulative_distribution_function.hpp new file mode 100644 index 0000000000..451bf2f205 --- /dev/null +++ b/include/boost/math/distributions/empirical_cumulative_distribution_function.hpp @@ -0,0 +1,63 @@ +// Copyright Nick Thompson 2019. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP +#define BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP +#include +#include +#include + +namespace boost { namespace math{ + +template +class empirical_cumulative_distribution_function { + using Real = typename RandomAccessContainer::value_type; +public: + empirical_cumulative_distribution_function(RandomAccessContainer && v, bool sorted = false) + { + if (v.size() == 0) { + throw std::domain_error("At least one sample is required to compute an empirical CDF."); + } + m_v = std::move(v); + if (!sorted) { + std::sort(m_v.begin(), m_v.end()); + } + } + + auto operator()(Real x) const { + if constexpr (std::is_integral_v) + { + if (x < m_v[0]) { + return double(0); + } + if (x >= m_v[m_v.size()-1]) { + return double(1); + } + auto it = std::upper_bound(m_v.begin(), m_v.end(), x); + return static_cast(std::distance(m_v.begin(), it))/static_cast(m_v.size()); + } + else + { + if (x < m_v[0]) { + return Real(0); + } + if (x >= m_v[m_v.size()-1]) { + return Real(1); + } + auto it = std::upper_bound(m_v.begin(), m_v.end(), x); + return static_cast(std::distance(m_v.begin(), it))/static_cast(m_v.size()); + } + } + + RandomAccessContainer&& return_data() { + return std::move(m_v); + } + +private: + RandomAccessContainer m_v; +}; + +}} +#endif diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index a75340d889..5583bcfa25 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -951,6 +951,7 @@ test-suite misc : [ run compile_test/catmull_rom_concept_test.cpp compile_test_main : : : [ requires cxx11_hdr_array cxx11_hdr_initializer_list ] ] [ run ooura_fourier_integral_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run univariate_statistics_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] + [ run empirical_cumulative_distribution_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run norms_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run signal_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run bivariate_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] diff --git a/test/empirical_cumulative_distribution_test.cpp b/test/empirical_cumulative_distribution_test.cpp new file mode 100644 index 0000000000..632f3653bd --- /dev/null +++ b/test/empirical_cumulative_distribution_test.cpp @@ -0,0 +1,69 @@ +/* + * Copyright Nick Thompson, 2019 + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ + +#include "math_unit_test.hpp" +#include +#include +#include +#include +#include +#ifdef BOOST_HAS_FLOAT128 +#include +using boost::multiprecision::float128; +#endif + +using boost::math::empirical_cumulative_distribution_function; + +template +void test_uniform_z() +{ + std::vector v{6,3,4,1,2,5}; + + auto ecdf = empirical_cumulative_distribution_function(std::move(v)); + + CHECK_ULP_CLOSE(1.0/6.0, ecdf(1), 1); + CHECK_ULP_CLOSE(2.0/6.0, ecdf(2), 1); + CHECK_ULP_CLOSE(3.0/6.0, ecdf(3), 1); + CHECK_ULP_CLOSE(4.0/6.0, ecdf(4), 1); + CHECK_ULP_CLOSE(5.0/6.0, ecdf(5), 1); + CHECK_ULP_CLOSE(6.0/6.0, ecdf(6), 1); + + // Less trivial: + + v = {6,3,4,1,1,1,2,4}; + ecdf = empirical_cumulative_distribution_function(std::move(v)); + CHECK_ULP_CLOSE(3.0/8.0, ecdf(1), 1); + CHECK_ULP_CLOSE(4.0/8.0, ecdf(2), 1); + CHECK_ULP_CLOSE(5.0/8.0, ecdf(3), 1); + CHECK_ULP_CLOSE(7.0/8.0, ecdf(4), 1); + CHECK_ULP_CLOSE(7.0/8.0, ecdf(5), 1); + CHECK_ULP_CLOSE(8.0/8.0, ecdf(6), 1); +} + +template +void test_uniform() +{ + size_t n = 128; + std::vector v(n); + for (size_t i = 0; i < n; ++i) { + v[i] = Real(i+1)/Real(n); + } + + auto ecdf = empirical_cumulative_distribution_function(std::move(v)); + + for (size_t i = 0; i < n; ++i) { + CHECK_ULP_CLOSE(Real(i+1)/Real(n), ecdf(Real(i+1)/Real(n)), 1); + } +} + + +int main() +{ + test_uniform_z(); + test_uniform(); + return boost::math::test::report_errors(); +}