diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e9ba96a..2e74346b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,7 +117,7 @@ cmake_policy(SET CMP0097 NEW) include(ExternalProject) ExternalProject_Add(datasketches GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git - GIT_TAG 5.1.0 + GIT_TAG 5.2.0 GIT_SHALLOW true GIT_SUBMODULES "" INSTALL_DIR /tmp/datasketches diff --git a/NOTICE b/NOTICE index 13051683..1668e075 100644 --- a/NOTICE +++ b/NOTICE @@ -1,9 +1,9 @@ Apache DataSketches Python -Copyright 2024 The Apache Software Foundation +Copyright 2025 The Apache Software Foundation Copyright 2015-2018 Yahoo Inc. Copyright 2019-2020 Verizon Media -Copyright 2021 Yahoo Inc. +Copyright 2021- Yahoo Inc. This product includes software developed at The Apache Software Foundation (http://www.apache.org/). diff --git a/README.md b/README.md index 2641de88..99614966 100644 --- a/README.md +++ b/README.md @@ -75,10 +75,13 @@ The unit tests are mostly structured in a tutorial style and can be used as a re - `vector_of_kll_floats_sketches` - Kolmogorov-Smirnov Test - `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches -- Density +- Kernel Density - `density_sketch` - Count-min sketch - `count_min_sketch` +- t-digest + - tdigest_float + - tdigest_double ## Known Differences from C++ diff --git a/docs/source/quantiles/index.rst b/docs/source/quantiles/index.rst index b1928f67..bf53ea36 100644 --- a/docs/source/quantiles/index.rst +++ b/docs/source/quantiles/index.rst @@ -10,17 +10,21 @@ in the stream. These sketches may be used to compute approximate histograms, Probability Mass Functions (PMFs), or Cumulative Distribution Functions (CDFs). -The library provides three types of quantiles sketches, each of which has generic items as well as versions -specific to a given numeric type (e.g. integer or floating point values). All three types provide error -bounds on rank estimation with proven probabilistic error distributions. 
+The library provides four types of quantiles sketches, three of which have generic items as well as versions +specific to a given numeric type (e.g. integer or floating point values). Those three types provide error +bounds on rank estimation with proven probabilistic error distributions. t-digest is a heuristic-based sketch +that works only on numeric data, and while the error properties are not guaranteed, the sketch typically +does a good job with small storage. - * KLL: Provides uniform rank estimation error over the entire range + * KLL: Provides uniform rank estimation error over the entire range. * REQ: Provides relative rank error estimates, which decreases approaching either the high or low end values. + * t-digest: Relative rank error estimates, heuristic-based without guarantees but quite compact with generally very good error properties. * Classic quantiles: Largely deprecated in favor of KLL, also provides uniform rank estimation error. Included largely for backwards compatibility with historic data. .. toctree:: :maxdepth: 1 - + kll req + tdigest quantiles_depr \ No newline at end of file diff --git a/docs/source/quantiles/kll.rst b/docs/source/quantiles/kll.rst index 0e54b443..ab7f0c46 100644 --- a/docs/source/quantiles/kll.rst +++ b/docs/source/quantiles/kll.rst @@ -14,10 +14,6 @@ The analysis is obtained using `get_quantile()` function or the inverse functions `get_rank()`, `get_pmf()` (Probability Mass Function), and `get_cdf()` (Cumulative Distribution Function). -As of May 2020, this implementation produces serialized sketches which are binary-compatible -with the equivalent Java implementation only when template parameter `T = float` -(32-bit single precision values). - Given an input stream of `N` items, the `natural rank` of any specific item is defined as its index `(1 to N)` in inclusive mode or `(0 to N-1)` in exclusive mode @@ -168,4 +164,3 @@ Additionally, the interval may be quite large for certain distributions. .. 
rubric:: Non-static Methods: .. automethod:: __init__ - diff --git a/docs/source/quantiles/tdigest.rst b/docs/source/quantiles/tdigest.rst new file mode 100644 index 00000000..f697cbe9 --- /dev/null +++ b/docs/source/quantiles/tdigest.rst @@ -0,0 +1,52 @@ +t-digest +-------- + +.. currentmodule:: datasketches + +The implementation in this library is based on the MergingDigest described in +`Computing Extremely Accurate Quantiles Using t-Digests <https://arxiv.org/abs/1902.04023>`_ by Ted Dunning and Otmar Ertl. + +The implementation in this library has a few differences from the reference implementation associated with that paper: + +* Merge does not modify the input +* Serialization is similar to other sketches in this library, although reading the reference implementation format is supported + +Unlike all other algorithms in the library, t-digest is empirical and has no mathematical basis for estimating its error +and its results are dependent on the input data. However, for many common data distributions, it can produce excellent results. +t-digest also operates only on numeric data and, unlike the quantiles family algorithms in the library which return quantile +approximations from the input domain, t-digest interpolates values and will hold and return data points not seen in the input. + +The closest alternative to t-digest in this library is REQ sketch. It prioritizes one chosen side of the rank domain: +either low rank accuracy or high rank accuracy. t-digest (in this implementation) prioritizes both ends of the rank domain +and has lower accuracy towards the middle of the rank domain (median). + +Measurements show that t-digest is slightly biased (tends to underestimate low ranks and overestimate high ranks), while still +doing very well close to the extremes. The effect seems to be more pronounced with more input values. + +For more information on the performance characteristics, see `the Datasketches page on t-digest <https://datasketches.apache.org/docs/tdigest/tdigest.html>`_. + +.. 
autoclass:: tdigest_float + :members: + :undoc-members: + :exclude-members: deserialize + + .. rubric:: Static Methods: + + .. automethod:: deserialize + + .. rubric:: Non-static Methods: + + .. automethod:: __init__ + +.. autoclass:: tdigest_double + :members: + :undoc-members: + :exclude-members: deserialize + + .. rubric:: Static Methods: + + .. automethod:: deserialize + + .. rubric:: Non-static Methods: + + .. automethod:: __init__ diff --git a/src/tdigest_wrapper.cpp b/src/tdigest_wrapper.cpp index 059ee1d6..43248fb0 100644 --- a/src/tdigest_wrapper.cpp +++ b/src/tdigest_wrapper.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "tdigest.hpp" @@ -44,7 +45,7 @@ void bind_tdigest(nb::module_ &m, const char* name) { .def("__copy__", [](const tdigest& sk) { return tdigest(sk); }) .def("update", (void(tdigest::*)(T)) &tdigest::update, nb::arg("item"), "Updates the sketch with the given value") - .def("merge", (void(tdigest::*)(tdigest&)) &tdigest::merge, nb::arg("sketch"), + .def("merge", (void(tdigest::*)(const tdigest&)) &tdigest::merge, nb::arg("sketch"), "Merges the provided sketch into this one") .def("__str__", [](const tdigest& sk) { return sk.to_string(); }, "Produces a string summary of the sketch") @@ -71,6 +72,32 @@ void bind_tdigest(nb::module_ &m, const char* name) { .def("get_serialized_size_bytes", &tdigest::get_serialized_size_bytes, nb::arg("with_buffer")=false, "Returns the size of the serialized sketch, in bytes") + .def( + "get_pmf", + [](const tdigest& sk, const std::vector& split_points) { + return sk.get_PMF(split_points.data(), split_points.size()); + }, + nb::arg("split_points"), + "Returns an approximation to the Probability Mass Function (PMF) of the input stream " + "given a set of split points (values).\n" + "If the sketch is empty this returns an empty vector.\n" + "split_points is an array of m unique, monotonically increasing float values " + "that divide the real number line into m+1 consecutive disjoint 
intervals.\n" + "It is not necessary to include either the min or max values in these split points." + ) + .def( + "get_cdf", + [](const tdigest& sk, const std::vector& split_points) { + return sk.get_CDF(split_points.data(), split_points.size()); + }, + nb::arg("split_points"), + "Returns an approximation to the Cumulative Distribution Function (CDF), which is the " + "cumulative analog of the PMF, of the input stream given a set of split points (values).\n" + "If the sketch is empty this returns an empty vector.\n" + "split_points is an array of m unique, monotonically increasing float values " + "that divide the real number line into m+1 consecutive disjoint intervals.\n" + "It is not necessary to include either the min or max values in these split points." + ) ; add_serialization(tdigest_class); diff --git a/tests/tdigest_test.py b/tests/tdigest_test.py index fab32c49..b0c5550e 100644 --- a/tests/tdigest_test.py +++ b/tests/tdigest_test.py @@ -50,7 +50,16 @@ def test_tdigest_double_example(self): self.assertFalse(td.is_empty()) self.assertEqual(td.get_total_weight(), n) - # we can define a new tdiget with a different distribution, then merge them + # we can get the PMF and CDF + pmf = td.get_pmf([-0.5, 0.0, 0.5]) + self.assertEqual(len(pmf), 4) + self.assertAlmostEqual(sum(pmf), 1.0) + + cdf = td.get_cdf([0.0]) + self.assertEqual(len(cdf), 2) + self.assertAlmostEqual(cdf[0], 0.5, delta = 0.05) + + # we can define a new tdigest with a different distribution, then merge them td2 = tdigest_double() td2.update(np.random.normal(loc=2.0, size=n)) td.merge(td2) @@ -89,6 +98,14 @@ def test_tdigest_float_example(self): self.assertFalse(td.is_empty()) self.assertEqual(td.get_total_weight(), n) + pmf = td.get_pmf([-0.5, 0.0, 0.5]) + self.assertEqual(len(pmf), 4) + self.assertAlmostEqual(sum(pmf), 1.0) + + cdf = td.get_cdf([0.0]) + self.assertEqual(len(cdf), 2) + self.assertAlmostEqual(cdf[0], 0.5, delta = 0.05) + td2 = tdigest_float() td2.update(np.random.normal(loc=2.0, 
size=n)) td.merge(td2)