From f0287092a53c4a27f0e6fe3940768bd25a8835fc Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 11:57:47 -0600 Subject: [PATCH 001/151] Add naive implementation of stump_topk --- tests/naive.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tests/naive.py b/tests/naive.py index 4089e603e..8f3a05da8 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1716,3 +1716,74 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w ) return total_ndists + + +def stump_topk(T_A, m, T_B=None, exclusion_zone=None, k=1): + """ + Traverse distance matrix along the diagonals and update the top-k + nearest neigbors matrix profile and matrix profile indices + """ + if T_B is None: # self-join: + ignore_trivial = True + distance_matrix = np.array( + [distance_profile(Q, T_A, m) for Q in core.rolling_window(T_A, m)] + ) + T_B = T_A.copy() + else: + ignore_trivial = False + distance_matrix = np.array( + [distance_profile(Q, T_B, m) for Q in core.rolling_window(T_A, m)] + ) + + distance_matrix[np.isnan(distance_matrix)] = np.inf + + n_A = T_A.shape[0] + n_B = T_B.shape[0] + l = n_A - m + 1 + if exclusion_zone is None: + exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + + if ignore_trivial: + diags = np.arange(exclusion_zone + 1, n_A - m + 1) + else: + diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) + + # the last two columns in P and I are to keep track of right and left mp for 1NN + P = np.full((l, k + 2), np.inf) + I = np.full((l, k + 2), -1, dtype=np.int64) + + for g in diags: + if g >= 0: + iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) + else: + iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - g)) + + for i in iter_range: + D = distance_matrix[i, i + g] + if D < P[i, k - 1]: + idx = np.searchsorted(P[i, :k], D, side='right') + P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] + I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] + + if ignore_trivial: # Self-joins 
only + if D < P[i + g, k - 1]: + idx = np.searchsorted(P[i + g, :k], D, side='right') + P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] + I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] + + if i < i + g: + # Left matrix profile and left matrix profile index + if D < P[i + g, k]: + P[i + g, k] = D + I[i + g, k] = i + + if D < P[i, k + 1]: + # right matrix profile and right matrix profile index + P[i, k + 1] = D + I[i, k + 1] = i + g + + result = np.empty((l, 2 * k + 2), dtype=object) + result[:, :k] = P[:, :k] + result[:, k:] = I[:, :] + + return result From e893873fc763a944b3d7e414d23e116762ee6693 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 12:10:32 -0600 Subject: [PATCH 002/151] Copy test_stump code to test_stump_topk --- tests/test_stump_topk.py | 242 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 tests/test_stump_topk.py diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py new file mode 100644 index 000000000..d3475122f --- /dev/null +++ b/tests/test_stump_topk.py @@ -0,0 +1,242 @@ +import numpy as np +import numpy.testing as npt +import pandas as pd +from stumpy import stump, config +import pytest +import naive + + +test_data = [ + ( + np.array([9, 8100, -60, 7], dtype=np.float64), + np.array([584, -11, 23, 79, 1001, 0, -19], dtype=np.float64), + ), + ( + np.random.uniform(-1000, 1000, [8]).astype(np.float64), + np.random.uniform(-1000, 1000, [64]).astype(np.float64), + ), +] + +substitution_locations = [(slice(0, 0), 0, -1, slice(1, 3), [0, 3])] +substitution_values = [np.nan, np.inf] + + +def test_stump_int_input(): + with pytest.raises(TypeError): + stump(np.arange(10), 5) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_self_join(T_A, T_B): + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone) + comp_mp = stump(T_B, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + 
npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_A_B_join(T_A, T_B): + m = 3 + ref_mp = naive.stump(T_A, m, T_B=T_B) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +def test_stump_constant_subsequence_self_join(): + T_A = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_A, m, exclusion_zone=zone) + comp_mp = stump(T_A, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + +def test_stump_one_constant_subsequence_A_B_join(): + T_A = np.random.rand(20) + T_B = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) + m = 3 + ref_mp = naive.stamp(T_A, m, T_B=T_B) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + # Swap inputs + ref_mp = naive.stamp(T_B, m, T_B=T_A) + comp_mp = stump(T_B, m, T_A, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + 
+ +def test_stump_two_constant_subsequences_A_B_join(): + T_A = np.concatenate( + (np.zeros(10, dtype=np.float64), np.ones(10, dtype=np.float64)) + ) + T_B = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) + m = 3 + ref_mp = naive.stamp(T_A, m, T_B=T_B) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + # Swap inputs + ref_mp = naive.stamp(T_B, m, T_B=T_A) + comp_mp = stump(T_B, m, T_A, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + comp_mp = stump(pd.Series(T_B), m, pd.Series(T_A), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + +def test_stump_identical_subsequence_self_join(): + identical = np.random.rand(8) + T_A = np.random.rand(20) + T_A[1 : 1 + identical.shape[0]] = identical + T_A[11 : 11 + identical.shape[0]] = identical + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stamp(T_A, m, exclusion_zone=zone) + comp_mp = stump(T_A, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + +def test_stump_identical_subsequence_A_B_join(): + identical = np.random.rand(8) + T_A = np.random.rand(20) + T_B = np.random.rand(20) + T_A[1 : 1 + identical.shape[0]] = identical + T_B[11 : 11 + 
identical.shape[0]] = identical + m = 3 + ref_mp = naive.stamp(T_A, m, T_B=T_B) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + # Swap inputs + ref_mp = naive.stamp(T_B, m, T_B=T_A) + comp_mp = stump(T_B, m, T_A, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + +@pytest.mark.parametrize("T_A, T_B", test_data) +@pytest.mark.parametrize("substitute_B", substitution_values) +@pytest.mark.parametrize("substitution_locations", substitution_locations) +def test_stump_nan_inf_self_join(T_A, T_B, substitute_B, substitution_locations): + m = 3 + + T_B_sub = T_B.copy() + + for substitution_location_B in substitution_locations: + T_B_sub[:] = T_B[:] + T_B_sub[substitution_location_B] = substitute_B + + zone = int(np.ceil(m / 4)) + ref_mp = naive.stamp(T_B_sub, m, exclusion_zone=zone) + comp_mp = stump(T_B_sub, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_B_sub), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +@pytest.mark.parametrize("substitute_A", substitution_values) +@pytest.mark.parametrize("substitute_B", substitution_values) +@pytest.mark.parametrize("substitution_locations", substitution_locations) +def test_stump_nan_inf_A_B_join( + T_A, T_B, substitute_A, substitute_B, substitution_locations +): + m = 3 + + 
T_A_sub = T_A.copy() + T_B_sub = T_B.copy() + + for substitution_location_B in substitution_locations: + for substitution_location_A in substitution_locations: + T_A_sub[:] = T_A[:] + T_B_sub[:] = T_B[:] + T_A_sub[substitution_location_A] = substitute_A + T_B_sub[substitution_location_B] = substitute_B + + ref_mp = naive.stamp(T_A_sub, m, T_B=T_B_sub) + comp_mp = stump(T_A_sub, m, T_B_sub, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump( + pd.Series(T_A_sub), m, pd.Series(T_B_sub), ignore_trivial=False + ) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +def test_stump_nan_zero_mean_self_join(): + T = np.array([-1, 0, 1, np.inf, 1, 0, -1]) + m = 3 + + zone = int(np.ceil(m / 4)) + ref_mp = naive.stamp(T, m, exclusion_zone=zone) + comp_mp = stump(T, m, ignore_trivial=True) + + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 986311f78dae7ca90db29a793d43fa23b0a3afe4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 12:15:46 -0600 Subject: [PATCH 003/151] change replace naive.stump with naive.stump_topk --- tests/test_stump_topk.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py index d3475122f..290487460 100644 --- a/tests/test_stump_topk.py +++ b/tests/test_stump_topk.py @@ -28,9 +28,10 @@ def test_stump_int_input(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join(T_A, T_B): + k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone) + ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) comp_mp = stump(T_B, m, ignore_trivial=True) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) @@ -43,8 +44,9 @@ def test_stump_self_join(T_A, T_B): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_A_B_join(T_A, T_B): + k = 3 m = 3 - ref_mp 
= naive.stump(T_A, m, T_B=T_B) + ref_mp = naive.stump_topk(T_A, m, T_B=T_B, k=k) comp_mp = stump(T_A, m, T_B, ignore_trivial=False) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) @@ -57,9 +59,10 @@ def test_stump_A_B_join(T_A, T_B): def test_stump_constant_subsequence_self_join(): T_A = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) + k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_A, m, exclusion_zone=zone) + ref_mp = naive.stump_topk(T_A, m, exclusion_zone=zone, k=k) comp_mp = stump(T_A, m, ignore_trivial=True) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) From 9d8aafc3b75a051dee64aa72112dc8a3050b13b9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:04:36 -0600 Subject: [PATCH 004/151] Add self-join tests for 1NN and KNN --- tests/test_stump_topk.py | 202 ++------------------------------------- 1 file changed, 7 insertions(+), 195 deletions(-) diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py index 290487460..b3276b85b 100644 --- a/tests/test_stump_topk.py +++ b/tests/test_stump_topk.py @@ -27,8 +27,8 @@ def test_stump_int_input(): @pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_self_join(T_A, T_B): - k = 3 +def test_stump_self_join_1NN(T_A, T_B): + k = 1 m = 3 zone = int(np.ceil(m / 4)) ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) @@ -42,204 +42,16 @@ def test_stump_self_join(T_A, T_B): npt.assert_almost_equal(ref_mp, comp_mp) -@pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_A_B_join(T_A, T_B): - k = 3 - m = 3 - ref_mp = naive.stump_topk(T_A, m, T_B=T_B, k=k) - comp_mp = stump(T_A, m, T_B, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - -def test_stump_constant_subsequence_self_join(): - T_A = 
np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) +def test_stump_self_join_KNN(T_A, T_B): k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump_topk(T_A, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_A, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - -def test_stump_one_constant_subsequence_A_B_join(): - T_A = np.random.rand(20) - T_B = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) - m = 3 - ref_mp = naive.stamp(T_A, m, T_B=T_B) - comp_mp = stump(T_A, m, T_B, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - # Swap inputs - ref_mp = naive.stamp(T_B, m, T_B=T_A) - comp_mp = stump(T_B, m, T_A, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - -def test_stump_two_constant_subsequences_A_B_join(): - T_A = np.concatenate( - (np.zeros(10, dtype=np.float64), np.ones(10, dtype=np.float64)) - ) - T_B = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) - m = 3 - ref_mp = naive.stamp(T_A, m, T_B=T_B) - comp_mp = stump(T_A, m, T_B, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], 
comp_mp[:, 0]) # ignore indices - - # Swap inputs - ref_mp = naive.stamp(T_B, m, T_B=T_A) - comp_mp = stump(T_B, m, T_A, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - comp_mp = stump(pd.Series(T_B), m, pd.Series(T_A), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - -def test_stump_identical_subsequence_self_join(): - identical = np.random.rand(8) - T_A = np.random.rand(20) - T_A[1 : 1 + identical.shape[0]] = identical - T_A[11 : 11 + identical.shape[0]] = identical - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stamp(T_A, m, exclusion_zone=zone) - comp_mp = stump(T_A, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - -def test_stump_identical_subsequence_A_B_join(): - identical = np.random.rand(8) - T_A = np.random.rand(20) - T_B = np.random.rand(20) - T_A[1 : 1 + identical.shape[0]] = identical - T_B[11 : 11 + identical.shape[0]] = identical - m = 3 - ref_mp = naive.stamp(T_A, m, T_B=T_B) - comp_mp = stump(T_A, m, T_B, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - # Swap inputs - ref_mp = naive.stamp(T_B, m, T_B=T_A) - comp_mp = stump(T_B, m, T_A, 
ignore_trivial=False) + ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stump(T_B, m, ignore_trivial=True) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - -@pytest.mark.parametrize("T_A, T_B", test_data) -@pytest.mark.parametrize("substitute_B", substitution_values) -@pytest.mark.parametrize("substitution_locations", substitution_locations) -def test_stump_nan_inf_self_join(T_A, T_B, substitute_B, substitution_locations): - m = 3 - - T_B_sub = T_B.copy() - - for substitution_location_B in substitution_locations: - T_B_sub[:] = T_B[:] - T_B_sub[substitution_location_B] = substitute_B - - zone = int(np.ceil(m / 4)) - ref_mp = naive.stamp(T_B_sub, m, exclusion_zone=zone) - comp_mp = stump(T_B_sub, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_B_sub), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - -@pytest.mark.parametrize("T_A, T_B", test_data) -@pytest.mark.parametrize("substitute_A", substitution_values) -@pytest.mark.parametrize("substitute_B", substitution_values) -@pytest.mark.parametrize("substitution_locations", substitution_locations) -def test_stump_nan_inf_A_B_join( - T_A, T_B, substitute_A, substitute_B, substitution_locations -): - m = 3 - - T_A_sub = T_A.copy() - T_B_sub = T_B.copy() - - for substitution_location_B in substitution_locations: - for substitution_location_A in substitution_locations: - T_A_sub[:] = T_A[:] - T_B_sub[:] = T_B[:] - T_A_sub[substitution_location_A] = substitute_A - T_B_sub[substitution_location_B] = substitute_B - - ref_mp = naive.stamp(T_A_sub, m, T_B=T_B_sub) - comp_mp = stump(T_A_sub, m, T_B_sub, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - 
comp_mp = stump( - pd.Series(T_A_sub), m, pd.Series(T_B_sub), ignore_trivial=False - ) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - -def test_stump_nan_zero_mean_self_join(): - T = np.array([-1, 0, 1, np.inf, 1, 0, -1]) - m = 3 - - zone = int(np.ceil(m / 4)) - ref_mp = naive.stamp(T, m, exclusion_zone=zone) - comp_mp = stump(T, m, ignore_trivial=True) + npt.assert_almost_equal(ref_mp, comp_mp) - naive.replace_inf(ref_mp) + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) From 121686b43187f053f23c09f07f2cf88f0ab1c238 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:09:15 -0600 Subject: [PATCH 005/151] remove variable k in 1NN test --- tests/test_stump_topk.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py index b3276b85b..3f277a0ad 100644 --- a/tests/test_stump_topk.py +++ b/tests/test_stump_topk.py @@ -28,10 +28,9 @@ def test_stump_int_input(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_1NN(T_A, T_B): - k = 1 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) + ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=1) comp_mp = stump(T_B, m, ignore_trivial=True) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) From 730bfbbee7e867b2373e5060503492bab533efd8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:11:49 -0600 Subject: [PATCH 006/151] Fixed passing input to test function --- tests/test_stump_topk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py index 3f277a0ad..4b722fd8f 100644 --- a/tests/test_stump_topk.py +++ b/tests/test_stump_topk.py @@ -41,6 +41,7 @@ def test_stump_self_join_1NN(T_A, T_B): npt.assert_almost_equal(ref_mp, comp_mp) +@pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, 
T_B): k = 3 m = 3 From f78348f3fadaface820e558c909e19cb0803503c Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:19:43 -0600 Subject: [PATCH 007/151] Fixed minor bug --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 8f3a05da8..6dd4bcb99 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1756,11 +1756,11 @@ def stump_topk(T_A, m, T_B=None, exclusion_zone=None, k=1): if g >= 0: iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) else: - iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - g)) + iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) for i in iter_range: D = distance_matrix[i, i + g] - if D < P[i, k - 1]: + if D < P[i, k - 1]: #less than k-th smallest value of T[i:i+m] idx = np.searchsorted(P[i, :k], D, side='right') P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] From e09b5f05d16c4506ded15df432fcd27b2fc822df Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:31:18 -0600 Subject: [PATCH 008/151] Correct format --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 6dd4bcb99..91a88cea7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1760,14 +1760,14 @@ def stump_topk(T_A, m, T_B=None, exclusion_zone=None, k=1): for i in iter_range: D = distance_matrix[i, i + g] - if D < P[i, k - 1]: #less than k-th smallest value of T[i:i+m] - idx = np.searchsorted(P[i, :k], D, side='right') + if D < P[i, k - 1]: # less than k-th smallest value of T[i:i+m] + idx = np.searchsorted(P[i, :k], D, side="right") P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k - 1]: - idx = np.searchsorted(P[i + g, :k], D, side='right') + idx = np.searchsorted(P[i + g, :k], D, side="right") P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = 
np.insert(I[i + g, :k], idx, i)[:-1] From 95a8c081f745ea8781da5b4eaefceea936559471 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 01:15:01 -0600 Subject: [PATCH 009/151] Erase function stump_topk --- tests/naive.py | 71 -------------------------------------------------- 1 file changed, 71 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 91a88cea7..4089e603e 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1716,74 +1716,3 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w ) return total_ndists - - -def stump_topk(T_A, m, T_B=None, exclusion_zone=None, k=1): - """ - Traverse distance matrix along the diagonals and update the top-k - nearest neigbors matrix profile and matrix profile indices - """ - if T_B is None: # self-join: - ignore_trivial = True - distance_matrix = np.array( - [distance_profile(Q, T_A, m) for Q in core.rolling_window(T_A, m)] - ) - T_B = T_A.copy() - else: - ignore_trivial = False - distance_matrix = np.array( - [distance_profile(Q, T_B, m) for Q in core.rolling_window(T_A, m)] - ) - - distance_matrix[np.isnan(distance_matrix)] = np.inf - - n_A = T_A.shape[0] - n_B = T_B.shape[0] - l = n_A - m + 1 - if exclusion_zone is None: - exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - - if ignore_trivial: - diags = np.arange(exclusion_zone + 1, n_A - m + 1) - else: - diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) - - # the last two columns in P and I are to keep track of right and left mp for 1NN - P = np.full((l, k + 2), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) - - for g in diags: - if g >= 0: - iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) - else: - iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) - - for i in iter_range: - D = distance_matrix[i, i + g] - if D < P[i, k - 1]: # less than k-th smallest value of T[i:i+m] - idx = np.searchsorted(P[i, :k], D, side="right") - P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] - I[i, :k] = 
np.insert(I[i, :k], idx, i + g)[:-1] - - if ignore_trivial: # Self-joins only - if D < P[i + g, k - 1]: - idx = np.searchsorted(P[i + g, :k], D, side="right") - P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] - I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] - - if i < i + g: - # Left matrix profile and left matrix profile index - if D < P[i + g, k]: - P[i + g, k] = D - I[i + g, k] = i - - if D < P[i, k + 1]: - # right matrix profile and right matrix profile index - P[i, k + 1] = D - I[i, k + 1] = i + g - - result = np.empty((l, 2 * k + 2), dtype=object) - result[:, :k] = P[:, :k] - result[:, k:] = I[:, :] - - return result From d0701fedd3060dcb0b97a266ceaae4beacae52e8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 16:18:58 -0600 Subject: [PATCH 010/151] Revise naive.stump to return topk NN matrix profile --- tests/naive.py | 64 +++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 4089e603e..0c49c5746 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -156,7 +156,7 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): return result -def stump(T_A, m, T_B=None, exclusion_zone=None): +def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): """ Traverse distance matrix along the diagonals and update the matrix profile and matrix profile indices @@ -181,45 +181,35 @@ def stump(T_A, m, T_B=None, exclusion_zone=None): if exclusion_zone is None: exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + is_included = np.ones_like(distance_matrix, dtype=bool) if ignore_trivial: - diags = np.arange(exclusion_zone + 1, n_A - m + 1) - else: - diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) + for i in range(l): + apply_exclusion_zone(is_included[i], i, exclusion_zone, False) - P = np.full((l, 3), np.inf) - I = np.full((l, 3), -1, dtype=np.int64) + P = np.full((l, k), np.inf) + I = np.full((l, k + 2), -1, dtype=np.int64) - for k in diags: - if k 
>= 0: - iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - k)) - else: - iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - k)) - - for i in iter_range: - D = distance_matrix[i, i + k] - if D < P[i, 0]: - P[i, 0] = D - I[i, 0] = i + k - - if ignore_trivial: # Self-joins only - if D < P[i + k, 0]: - P[i + k, 0] = D - I[i + k, 0] = i - - if i < i + k: - # Left matrix profile and left matrix profile index - if D < P[i + k, 1]: - P[i + k, 1] = D - I[i + k, 1] = i - - if D < P[i, 2]: - # right matrix profile and right matrix profile index - P[i, 2] = D - I[i, 2] = i + k - - result = np.empty((l, 4), dtype=object) - result[:, 0] = P[:, 0] - result[:, 1:4] = I[:, :] + for i in range(l): + mask = is_included[i] + IDX = np.argsort(distance_matrix[i][mask]) + nn_indices_sorted = np.flatnonzero(mask)[IDX] + + topk_indices = nn_indices_sorted[:k] + P[i, :k] = distance_matrix[i][topk_indices] + I[i, :k] = topk_indices + + if ignore_trivial: + left_indices = nn_indices_sorted[nn_indices_sorted < i] + if len(left_indices) > 0: + I[i, k] = left_indices[0] + + right_indices = nn_indices_sorted[nn_indices_sorted > i] + if len(right_indices) > 0: + I[i, k + 1] = right_indices[0] + + result = np.empty((l, 2 * k + 2), dtype=object) + result[:, :k] = P[:, :] + result[:, k:] = I[:, :] return result From 54445994ac87bccecf2a4252044d7e5cd0434718 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 16:29:53 -0600 Subject: [PATCH 011/151] Added a few comments --- tests/naive.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 0c49c5746..f9c9226ef 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -185,9 +185,13 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): if ignore_trivial: for i in range(l): apply_exclusion_zone(is_included[i], i, exclusion_zone, False) + # replacing values of distanc matrix to np.inf in excluion zone + # can cause problem later if there is nan/np.inf in data. 
So, + # it is better to use mask. P = np.full((l, k), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns in I are + # to store left and right matrix profile indices. for i in range(l): mask = is_included[i] From 9ebb08a4f274cd7c4e1f5a5f11c5c92cb5839721 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 17:17:06 -0600 Subject: [PATCH 012/151] Add one new test case for topk matrix profile --- tests/test_stump.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_stump.py b/tests/test_stump.py index d3475122f..67a6ec704 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -240,3 +240,19 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_self_join_KNN(T_A, T_B): + k = 2 + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stump(T_B, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From d83e8e6355813c15dbfc111a1e853ce1879c3027 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 17:20:17 -0600 Subject: [PATCH 013/151] Removed unnecessary test file --- tests/test_stump_topk.py | 57 ---------------------------------------- 1 file changed, 57 deletions(-) delete mode 100644 tests/test_stump_topk.py diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py deleted file mode 100644 index 4b722fd8f..000000000 --- a/tests/test_stump_topk.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np -import numpy.testing as npt -import pandas as pd -from stumpy import stump, config -import pytest -import naive - - -test_data = [ - ( - 
np.array([9, 8100, -60, 7], dtype=np.float64), - np.array([584, -11, 23, 79, 1001, 0, -19], dtype=np.float64), - ), - ( - np.random.uniform(-1000, 1000, [8]).astype(np.float64), - np.random.uniform(-1000, 1000, [64]).astype(np.float64), - ), -] - -substitution_locations = [(slice(0, 0), 0, -1, slice(1, 3), [0, 3])] -substitution_values = [np.nan, np.inf] - - -def test_stump_int_input(): - with pytest.raises(TypeError): - stump(np.arange(10), 5) - - -@pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_self_join_1NN(T_A, T_B): - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=1) - comp_mp = stump(T_B, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - -@pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_self_join_KNN(T_A, T_B): - k = 3 - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_B, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) From 9c8f019353991898bd8ad248053353af19e7c288 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 20:58:31 -0600 Subject: [PATCH 014/151] Set I to -1 if its corresponding P is not finite --- tests/naive.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index f9c9226ef..d3640b66c 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -158,8 +158,8 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): """ - Traverse distance matrix along the 
diagonals and update the matrix profile and - matrix profile indices + Traverse distance matrix in a row-wise manner and store topk nearest neighbor + matrix profile and matrix profile indices """ if T_B is None: # self-join: ignore_trivial = True @@ -181,35 +181,36 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): if exclusion_zone is None: exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - is_included = np.ones_like(distance_matrix, dtype=bool) if ignore_trivial: for i in range(l): - apply_exclusion_zone(is_included[i], i, exclusion_zone, False) - # replacing values of distanc matrix to np.inf in excluion zone - # can cause problem later if there is nan/np.inf in data. So, - # it is better to use mask. + apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) P = np.full((l, k), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns in I are + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns in I are # to store left and right matrix profile indices. 
for i in range(l): - mask = is_included[i] - IDX = np.argsort(distance_matrix[i][mask]) - nn_indices_sorted = np.flatnonzero(mask)[IDX] - - topk_indices = nn_indices_sorted[:k] + indices = np.argsort(distance_matrix[i]) + topk_indices = indices[:k] P[i, :k] = distance_matrix[i][topk_indices] - I[i, :k] = topk_indices + I[i, :k] = np.where(distance_matrix[i][topk_indices] != np.inf, topk_indices, -1) if ignore_trivial: - left_indices = nn_indices_sorted[nn_indices_sorted < i] + IL = -1 + left_indices = indices[indices < i] if len(left_indices) > 0: - I[i, k] = left_indices[0] + IL = left_indices[0] + if distance_matrix[i][IL] == np.inf: + IL = -1 + I[i, k] = IL - right_indices = nn_indices_sorted[nn_indices_sorted > i] + IR = -1 + right_indices = indices[indices > i] if len(right_indices) > 0: - I[i, k + 1] = right_indices[0] + IR = right_indices[0] + if distance_matrix[i][IR] == np.inf: + IR = -1 + I[i, k + 1] = IR result = np.empty((l, 2 * k + 2), dtype=object) result[:, :k] = P[:, :] From 0ce959549502e8091d1d017da8c95df73ae45401 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 21:04:16 -0600 Subject: [PATCH 015/151] Removed new test function --- tests/test_stump.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_stump.py b/tests/test_stump.py index 67a6ec704..4d2bf312b 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -242,17 +242,17 @@ def test_stump_nan_zero_mean_self_join(): npt.assert_almost_equal(ref_mp, comp_mp) -@pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_self_join_KNN(T_A, T_B): - k = 2 - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_B, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, 
comp_mp) +#@pytest.mark.parametrize("T_A, T_B", test_data) +#def test_stump_self_join_KNN(T_A, T_B): +# k = 2 +# m = 3 +# zone = int(np.ceil(m / 4)) +# ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) +# comp_mp = stump(T_B, m, ignore_trivial=True) +# naive.replace_inf(ref_mp) +# naive.replace_inf(comp_mp) +# npt.assert_almost_equal(ref_mp, comp_mp) + +# comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) +# naive.replace_inf(comp_mp) +# npt.assert_almost_equal(ref_mp, comp_mp) From a9726984574deca4eb79c74b622581036604635c Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 21:06:59 -0600 Subject: [PATCH 016/151] Fixed format --- tests/naive.py | 4 +++- tests/test_stump.py | 16 ---------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index d3640b66c..98f639a08 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -193,7 +193,9 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): indices = np.argsort(distance_matrix[i]) topk_indices = indices[:k] P[i, :k] = distance_matrix[i][topk_indices] - I[i, :k] = np.where(distance_matrix[i][topk_indices] != np.inf, topk_indices, -1) + I[i, :k] = np.where( + distance_matrix[i][topk_indices] != np.inf, topk_indices, -1 + ) if ignore_trivial: IL = -1 diff --git a/tests/test_stump.py b/tests/test_stump.py index 4d2bf312b..d3475122f 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -240,19 +240,3 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) - - -#@pytest.mark.parametrize("T_A, T_B", test_data) -#def test_stump_self_join_KNN(T_A, T_B): -# k = 2 -# m = 3 -# zone = int(np.ceil(m / 4)) -# ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) -# comp_mp = stump(T_B, m, ignore_trivial=True) -# naive.replace_inf(ref_mp) -# naive.replace_inf(comp_mp) -# npt.assert_almost_equal(ref_mp, comp_mp) - -# comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) -# 
naive.replace_inf(comp_mp) -# npt.assert_almost_equal(ref_mp, comp_mp) From e2d3061e132316cad0e4bbb74d0ff8f5bf0e52ce Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 21:14:07 -0600 Subject: [PATCH 017/151] minor change --- tests/naive.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 98f639a08..429b2ac99 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -161,7 +161,10 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): Traverse distance matrix in a row-wise manner and store topk nearest neighbor matrix profile and matrix profile indices """ - if T_B is None: # self-join: + if exclusion_zone is None: + exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + + if T_B is None: # self-join: ignore_trivial = True distance_matrix = np.array( [distance_profile(Q, T_A, m) for Q in core.rolling_window(T_A, m)] @@ -175,12 +178,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): distance_matrix[np.isnan(distance_matrix)] = np.inf - n_A = T_A.shape[0] - n_B = T_B.shape[0] - l = n_A - m + 1 - if exclusion_zone is None: - exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - + l = T_A.shape[0] - m + 1 if ignore_trivial: for i in range(l): apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) From 1938f63363dc873a7c00300c66c54742ec9b0010 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 21:16:46 -0600 Subject: [PATCH 018/151] minor change --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 429b2ac99..ff50eecf7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -164,7 +164,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): if exclusion_zone is None: exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - if T_B is None: # self-join: + if T_B is None: # self-join: ignore_trivial = True distance_matrix = np.array( [distance_profile(Q, T_A, m) 
for Q in core.rolling_window(T_A, m)] From 0e25a347ad7a3fa50d63144e32df771d9ad57545 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 22:32:24 -0600 Subject: [PATCH 019/151] Add new test function for topk matrix profile --- tests/test_stump.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_stump.py b/tests/test_stump.py index d3475122f..ea4bae3c9 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -240,3 +240,18 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_self_join_KNN(T_A, T_B): + k = 2 + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stump(T_B, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From e3935851485cc4ecd9c097c915ab37c3946530fd Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 22:34:13 -0600 Subject: [PATCH 020/151] Fixed format --- tests/test_stump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_stump.py b/tests/test_stump.py index ea4bae3c9..67a6ec704 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -241,6 +241,7 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): k = 2 From 850a5946c88465a4fa93fd91b113015752860ff2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 12:53:45 -0600 Subject: [PATCH 021/151] Use diagonal traversal to get top-k matrix profile - change naive.stump from row-wise to traversal - add a note to docstring to inform reader of row-wise traversal - use 
numpy.searchsort(side='right') --- tests/naive.py | 83 ++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 554c6f9fd..552c85cee 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -158,12 +158,11 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): """ - Traverse distance matrix in a row-wise manner and store topk nearest neighbor - matrix profile and matrix profile indices - """ - if exclusion_zone is None: - exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + Traverse distance matrix along the diagonals and update the top-k nearest + neighbor matrix profile and matrix profile indices + NOTE: For row-wise traversal, please use function `stamp` + """ if T_B is None: # self-join: ignore_trivial = True distance_matrix = np.array( @@ -178,42 +177,54 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): distance_matrix[np.isnan(distance_matrix)] = np.inf - l = T_A.shape[0] - m + 1 + n_A = T_A.shape[0] + n_B = T_B.shape[0] + l = n_A - m + 1 + if exclusion_zone is None: + exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + if ignore_trivial: - for i in range(l): - apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) + diags = np.arange(exclusion_zone + 1, n_A - m + 1) + else: + diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) - P = np.full((l, k), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns in I are - # to store left and right matrix profile indices. + P = np.full((l, k + 2), np.inf) + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns are to store + # ... left and right top-1 matrix profile indices. 
- for i in range(l): - indices = np.argsort(distance_matrix[i]) - topk_indices = indices[:k] - P[i, :k] = distance_matrix[i][topk_indices] - I[i, :k] = np.where( - distance_matrix[i][topk_indices] != np.inf, topk_indices, -1 - ) + for g in diags: + if g >= 0: + iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) + else: + iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) - if ignore_trivial: - IL = -1 - left_indices = indices[indices < i] - if len(left_indices) > 0: - IL = left_indices[0] - if distance_matrix[i][IL] == np.inf: - IL = -1 - I[i, k] = IL + for i in iter_range: + D = distance_matrix[i, i + g] + if D < P[i, k-1]: + idx = np.searchsorted(P[i, :k], D, side='right') + # to keep the top-k, we need to the get rid of the last element. + P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] + I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] - IR = -1 - right_indices = indices[indices > i] - if len(right_indices) > 0: - IR = right_indices[0] - if distance_matrix[i][IR] == np.inf: - IR = -1 - I[i, k + 1] = IR - - result = np.empty((l, 2 * k + 2), dtype=object) - result[:, :k] = P[:, :] + if ignore_trivial: # Self-joins only + if D < P[i + g, k-1]: + idx = np.searchsorted(P[i + g, :k], D, side='right') + P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] + I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] + + if i < i + g: + # Left matrix profile and left matrix profile index + if D < P[i + g, k]: + P[i + g, k] = D + I[i + g, k] = i + + if D < P[i, k + 1]: + # right matrix profile and right matrix profile index + P[i, k + 1] = D + I[i, k + 1] = i + g + + result = np.empty((2 * k + 2, 4), dtype=object) + result[:, :k] = P[:, :k] result[:, k:] = I[:, :] return result From 278e76ca5e74c53276b1e20cc6d4ab3efd8bc078 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:00:21 -0600 Subject: [PATCH 022/151] Fixed shape of naive.stump output --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py 
index 552c85cee..871d52024 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -223,7 +223,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): P[i, k + 1] = D I[i, k + 1] = i + g - result = np.empty((2 * k + 2, 4), dtype=object) + result = np.empty((l, 2 * k + 2), dtype=object) result[:, :k] = P[:, :k] result[:, k:] = I[:, :] From a864662b41f8553df6fcc1f1b9b3b341beb5cc31 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:29:52 -0600 Subject: [PATCH 023/151] Add naive version of numpy.searchsorted --- tests/naive.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/naive.py b/tests/naive.py index 871d52024..010836639 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -156,6 +156,14 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): return result +def searchsorted(a, v): + indices = np.flatnonzero(v < a) + if len(indices): + return indices.min() + else: + return len(a) + + def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): """ Traverse distance matrix along the diagonals and update the top-k nearest From f0c022da2fb61b1c9840d59e3a2034222dae65c4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:30:41 -0600 Subject: [PATCH 024/151] Replace numpy.searchsorted with its naive version --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 010836639..24ca851c7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -209,14 +209,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): for i in iter_range: D = distance_matrix[i, i + g] if D < P[i, k-1]: - idx = np.searchsorted(P[i, :k], D, side='right') + idx = searchsorted(P[i, :k], D, side='right') # to keep the top-k, we need to the get rid of the last element. 
P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k-1]: - idx = np.searchsorted(P[i + g, :k], D, side='right') + idx = searchsorted(P[i + g, :k], D, side='right') P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From 81701ba3620abb480b3852909ffe6fd0b46874ec Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:33:35 -0600 Subject: [PATCH 025/151] Fixed calling function searchsorted --- tests/naive.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 24ca851c7..a282d49c0 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -157,6 +157,9 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): def searchsorted(a, v): + """ + naive version of numpy.searchsorted(..., side='right') + """ indices = np.flatnonzero(v < a) if len(indices): return indices.min() @@ -209,14 +212,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): for i in iter_range: D = distance_matrix[i, i + g] if D < P[i, k-1]: - idx = searchsorted(P[i, :k], D, side='right') + idx = searchsorted(P[i, :k], D) # to keep the top-k, we need to the get rid of the last element. 
P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k-1]: - idx = searchsorted(P[i + g, :k], D, side='right') + idx = searchsorted(P[i + g, :k], D) P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From e244341a9291119a6f3f48ca07f9b7a11203c545 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:36:51 -0600 Subject: [PATCH 026/151] Fixed format --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index a282d49c0..0f70ae7b4 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -200,7 +200,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) P = np.full((l, k + 2), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns are to store + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns are to store # ... left and right top-1 matrix profile indices. for g in diags: @@ -211,14 +211,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): for i in iter_range: D = distance_matrix[i, i + g] - if D < P[i, k-1]: + if D < P[i, k - 1]: idx = searchsorted(P[i, :k], D) # to keep the top-k, we need to the get rid of the last element. 
P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only - if D < P[i + g, k-1]: + if D < P[i + g, k - 1]: idx = searchsorted(P[i + g, :k], D) P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From 1806c66241547cbdd9ac02c0313d16157b5f700e Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 12:13:05 -0600 Subject: [PATCH 027/151] minor changes --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 5592af064..3028dd15c 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -240,7 +240,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): D = distance_matrix[i, i + g] # D: a single element if D < P[i, k - 1]: idx = searchsorted(P[i, :k], D) - # to keep the top-k, we need to the get rid of the last element. + # to keep the top-k, we must get rid of the last element. P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] From ad29c19cc83d6388a1caab1136fdb4fbf82596fb Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 12:14:10 -0600 Subject: [PATCH 028/151] Correct format --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 3028dd15c..849c8d080 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -203,11 +203,11 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): for i in range(l): apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) - for i, D in enumerate(distance_matrix): # D: distance profile + for i, D in enumerate(distance_matrix): # D: distance profile # self-join / AB-join: matrix proifle and indices indices = np.argsort(D)[:k] P[i, :k] = D[indices] - indices[P[i,:k] == np.inf] = -1 + indices[P[i, :k] == np.inf] = -1 I[i, :k] = indices # self-join: left matrix profile index (top-1) @@ 
-237,7 +237,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) for i in iter_range: - D = distance_matrix[i, i + g] # D: a single element + D = distance_matrix[i, i + g] # D: a single element if D < P[i, k - 1]: idx = searchsorted(P[i, :k], D) # to keep the top-k, we must get rid of the last element. From 448d65d69d10c03063c29062cf6c09124281eb78 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 12:35:49 -0600 Subject: [PATCH 029/151] Correct flake8 style --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 849c8d080..dacba3075 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -158,7 +158,7 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): # pragma: no cover def searchsorted(a, v): """ - naive version of numpy.searchsorted(..., side='right') + Naive version of numpy.searchsorted(..., side='right') """ indices = np.flatnonzero(v < a) if len(indices): From e3ebcb5885085ab25e58ddc98acd8a7bfb7afac0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 12:46:10 -0600 Subject: [PATCH 030/151] Avoid unnecessary slicing --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index dacba3075..67d1fb27c 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -239,14 +239,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): for i in iter_range: D = distance_matrix[i, i + g] # D: a single element if D < P[i, k - 1]: - idx = searchsorted(P[i, :k], D) + idx = searchsorted(P[i], D) # to keep the top-k, we must get rid of the last element. 
P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k - 1]: - idx = searchsorted(P[i + g, :k], D) + idx = searchsorted(P[i + g], D) P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From 3cee5d85749eaa0987697e10e937fe5db65c9604 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 19:28:08 -0600 Subject: [PATCH 031/151] pass parameter k to function stump --- tests/test_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_stump.py b/tests/test_stump.py index 783163453..1ce70acc5 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -248,7 +248,7 @@ def test_stump_self_join_KNN(T_A, T_B): m = 3 zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_B, m, ignore_trivial=True) + comp_mp = stump(T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) From a1bc6a4182207f68050da74511d78f46b469b778 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 19:38:44 -0600 Subject: [PATCH 032/151] Add parameter k to function stump --- stumpy/stump.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 97334eb5a..115752113 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -433,7 +433,7 @@ def _stump( @core.non_normalized(aamp) -def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): +def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): """ Compute the z-normalized matrix profile @@ -467,6 +467,10 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. 
+ k : int, default 1 + The number of smallest elements in distance profile that should be stored + for constructing top-k matrix profile + Returns ------- out : numpy.ndarray @@ -587,7 +591,6 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): l = n_A - m + 1 excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - out = np.empty((l, 4), dtype=object) if ignore_trivial: diags = np.arange(excl_zone + 1, n_A - m + 1, dtype=np.int64) @@ -612,8 +615,9 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): ignore_trivial, ) - out[:, 0] = P[:, 0] - out[:, 1:] = I + out = np.empty((l, 2 * k + 2), dtype=object) + out[:, :k] = P[:, :k] + out[:, k:] = I threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From 384690cc6492019d66d8b9104a9297c5a0fbcc11 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 20:21:19 -0600 Subject: [PATCH 033/151] Add parameter k to function _stump --- stumpy/stump.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 115752113..bedd5bf6b 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -235,6 +235,7 @@ def _stump( T_B_subseq_isconstant, diags, ignore_trivial, + k, ): """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel @@ -294,6 +295,10 @@ def _stump( Set to `True` if this is a self-join. Otherwise, for AB-join, set this to `False`. Default is `True`. + k : int + The number of smallest elements in distance profile that should be stored + for constructing top-k matrix profile. 
+ Returns ------- profile : numpy.ndarray @@ -353,8 +358,8 @@ def _stump( n_B = T_B.shape[0] l = n_A - m + 1 n_threads = numba.config.NUMBA_NUM_THREADS - ρ = np.full((n_threads, l, 3), -np.inf, dtype=np.float64) - I = np.full((n_threads, l, 3), -1, dtype=np.int64) + ρ = np.full((n_threads, l, k + 2), -np.inf, dtype=np.float64) + I = np.full((n_threads, l, k + 2), -1, dtype=np.int64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) diags_ranges = core._get_array_ranges(ndist_counts, n_threads, False) @@ -406,27 +411,18 @@ def _stump( # Reduction of results from all threads for thread_idx in range(1, n_threads): for i in prange(l): - if ρ[0, i, 0] < ρ[thread_idx, i, 0]: - ρ[0, i, 0] = ρ[thread_idx, i, 0] - I[0, i, 0] = I[thread_idx, i, 0] - # left pearson correlation and left matrix profile indices - if ρ[0, i, 1] < ρ[thread_idx, i, 1]: - ρ[0, i, 1] = ρ[thread_idx, i, 1] - I[0, i, 1] = I[thread_idx, i, 1] - # right pearson correlation and right matrix profile indices - if ρ[0, i, 2] < ρ[thread_idx, i, 2]: - ρ[0, i, 2] = ρ[thread_idx, i, 2] - I[0, i, 2] = I[thread_idx, i, 2] + for j in range(k + 2): # alternative: use mask + if ρ[0, i, j] < ρ[thread_idx, i, j]: + ρ[0, i, j] = ρ[thread_idx, i, j] + I[0, i, j] = I[thread_idx, i, j] # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) for i in prange(p_norm.shape[0]): - if p_norm[i, 0] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 0] = 0.0 - if p_norm[i, 1] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 1] = 0.0 - if p_norm[i, 2] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 2] = 0.0 + for j in range(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` + if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: + p_norm[i, j] = 0.0 + P = np.sqrt(p_norm) return P[:, :], I[0, :, :] @@ -469,7 +465,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k : int, default 1 The number of smallest elements in distance profile that should be stored - for constructing 
top-k matrix profile + for constructing top-k matrix profile. Returns ------- @@ -613,6 +609,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): T_B_subseq_isconstant, diags, ignore_trivial, + k, ) out = np.empty((l, 2 * k + 2), dtype=object) From d246736717bac279d87970a8627e3c222d8fefa9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 20:45:08 -0600 Subject: [PATCH 034/151] Fixed update of top-k rho and indices in _stump --- stumpy/stump.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index bedd5bf6b..cc70e76c4 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -411,15 +411,30 @@ def _stump( # Reduction of results from all threads for thread_idx in range(1, n_threads): for i in prange(l): - for j in range(k + 2): # alternative: use mask - if ρ[0, i, j] < ρ[thread_idx, i, j]: - ρ[0, i, j] = ρ[thread_idx, i, j] - I[0, i, j] = I[thread_idx, i, j] + # top-k + for j in range(k): + if ρ[0, i, k-1] < ρ[thread_idx, i, j]: + idx = k - np.searchsorted( + ρ[0, i, :k][::-1], ρ[thread_idx, i, j] + ) + ρ[0, i, idx + 1 : k] = ρ[0, i, idx : k - 1] + ρ[0, i, idx] = ρ[thread_idx, i, j] + + I[0, i, idx + 1 : k] = I[0, i, idx : k - 1] + I[0, i, idx] = I[thread_idx, i, j] + + if ρ[0, i, k] < ρ[thread_idx, i, k]: + ρ[0, i, k] = ρ[thread_idx, i, k] + I[0, i, k] = I[thread_idx, i, k] + + if ρ[0, i, k + 1] < ρ[thread_idx, i, k + 1]: + ρ[0, i, k + 1] = ρ[thread_idx, i, k + 1] + I[0, i, k + 1] = I[thread_idx, i, k + 1] # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) for i in prange(p_norm.shape[0]): - for j in range(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` + for j in prange(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 From fdff040c1324fb7c804862a02ee0cf207edad8b4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 20:59:11 -0600 Subject: 
[PATCH 035/151] Add parameter k to function _compute_diagonal --- stumpy/stump.py | 68 +++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index cc70e76c4..f0f09e083 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -125,6 +125,10 @@ def _compute_diagonal( Set to `True` if this is a self-join. Otherwise, for AB-join, set this to `False`. Default is `True`. + k : int + The number of smallest elements in distance profile that should be stored + for constructing top-k matrix profile. + Returns ------- None @@ -154,18 +158,18 @@ def _compute_diagonal( constant = (m - 1) * m_inverse * m_inverse # (m - 1)/(m * m) for diag_idx in range(diags_start_idx, diags_stop_idx): - k = diags[diag_idx] + g = diags[diag_idx] - if k >= 0: - iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - k)) + if g >= 0: + iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) else: - iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - k)) + iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) for i in iter_range: - if i == 0 or (k < 0 and i == -k): + if i == 0 or (g < 0 and i == -g): cov = ( np.dot( - (T_B[i + k : i + k + m] - M_T[i + k]), (T_A[i : i + m] - μ_Q[i]) + (T_B[i + g : i + g + m] - M_T[i + g]), (T_A[i : i + m] - μ_Q[i]) ) * m_inverse ) @@ -177,38 +181,51 @@ def _compute_diagonal( # - (T_B[i + k - 1] - M_T_m_1[i + k]) * (T_A[i - 1] - μ_Q_m_1[i]) # ) cov = cov + constant * ( - cov_a[i + k] * cov_b[i] - cov_c[i + k] * cov_d[i] + cov_a[i + g] * cov_b[i] - cov_c[i + g] * cov_d[i] ) - if T_B_subseq_isfinite[i + k] and T_A_subseq_isfinite[i]: + if T_B_subseq_isfinite[i + g] and T_A_subseq_isfinite[i]: # Neither subsequence contains NaNs - if T_B_subseq_isconstant[i + k] or T_A_subseq_isconstant[i]: + if T_B_subseq_isconstant[i + g] or T_A_subseq_isconstant[i]: pearson = 0.5 else: - pearson = cov * Σ_T_inverse[i + k] * σ_Q_inverse[i] + pearson = cov * Σ_T_inverse[i + g] * σ_Q_inverse[i] 
- if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]: + if T_B_subseq_isconstant[i + g] and T_A_subseq_isconstant[i]: pearson = 1.0 - if pearson > ρ[thread_idx, i, 0]: - ρ[thread_idx, i, 0] = pearson - I[thread_idx, i, 0] = i + k + if pearson > ρ[thread_idx, i, k - 1]: + idx = k - np.searchsorted( + ρ[thread_idx, i, :k][::-1], pearson + ) + ρ[thread_idx, i, idx + 1 : k] = ρ[thread_idx, i, idx : k - 1] + ρ[thread_idx, i, idx] = pearson + I[thread_idx, i, idx + 1 : k] = I[thread_idx, i, idx : k - 1] + I[thread_idx, i, idx] = i + g if ignore_trivial: # self-joins only - if pearson > ρ[thread_idx, i + k, 0]: - ρ[thread_idx, i + k, 0] = pearson - I[thread_idx, i + k, 0] = i - - if i < i + k: + if pearson > ρ[thread_idx, i + g, k - 1]: + idx = k - np.searchsorted( + ρ[thread_idx, i + g, :k][::-1], pearson + ) + ρ[thread_idx, i + g, idx + 1 : k] = ρ[thread_idx, i + g, idx : k - 1] + ρ[thread_idx, i + g, idx] = pearson + I[thread_idx, i + g, idx + 1 : k] = I[thread_idx, i + g, idx : k - 1] + I[thread_idx, i + g, idx] = i + # for top-1 case: + #ρ[thread_idx, i + g, 0] = pearson + #I[thread_idx, i + g, 0] = i + + if i < i + g: # left pearson correlation and left matrix profile index - if pearson > ρ[thread_idx, i + k, 1]: - ρ[thread_idx, i + k, 1] = pearson - I[thread_idx, i + k, 1] = i + if pearson > ρ[thread_idx, i + g, k]: + ρ[thread_idx, i + g, k] = pearson + I[thread_idx, i + g, k] = i # right pearson correlation and right matrix profile index - if pearson > ρ[thread_idx, i, 2]: - ρ[thread_idx, i, 2] = pearson - I[thread_idx, i, 2] = i + k + if pearson > ρ[thread_idx, i, k + 1]: + ρ[thread_idx, i, k + 1] = pearson + I[thread_idx, i, k + 1] = i + g return @@ -406,6 +423,7 @@ def _stump( ρ, I, ignore_trivial, + k, ) # Reduction of results from all threads From 9d721982f4a10d3e01dbe3fdf0403fb33372aec7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 21:08:13 -0600 Subject: [PATCH 036/151] consider parameter k in non normalized function, decorator --- 
stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index a2a30c043..391ce6b57 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,7 +121,7 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p"] + exclude = ["normalize", "p", "k"] @functools.wraps(non_norm) def outer_wrapper(norm): From 995559ffe6f49aa20ab71f3b33846b3717ce4e1d Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 21:11:37 -0600 Subject: [PATCH 037/151] Fixed missing input parameter k in function _compute_diagonal --- stumpy/stump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stumpy/stump.py b/stumpy/stump.py index f0f09e083..45c4e533c 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -42,6 +42,7 @@ def _compute_diagonal( ρ, I, ignore_trivial, + k ): """ Compute (Numba JIT-compiled) and update the Pearson correlation, ρ, and I From a047dd002a93b387f664189ca401405b19fdec4f Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 21:22:13 -0600 Subject: [PATCH 038/151] minor change --- stumpy/stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 45c4e533c..5f701b9a5 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -648,10 +648,10 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): out = np.empty((l, 2 * k + 2), dtype=object) out[:, :k] = P[:, :k] - out[:, k:] = I + out[:, k:] = I[:, :] threshold = 10e-6 - if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover + if core.are_distances_too_small(out[:, :k].ravel(), threshold=threshold): # pragma: no cover logger.warning(f"A large number of values are smaller than {threshold}.") logger.warning("For a self-join, try setting `ignore_trivial = True`.") From c6370b6da6e438bdd16e4eefffb8a3e4f71a8c93 Mon Sep 17 
00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 21:50:06 -0600 Subject: [PATCH 039/151] Add verbose --- stumpy/stump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 5f701b9a5..ce5988662 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -646,7 +646,8 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k, ) - out = np.empty((l, 2 * k + 2), dtype=object) + out = np.empty((l, (2 * k) + 2), dtype=object) + print(out.shape) out[:, :k] = P[:, :k] out[:, k:] = I[:, :] From 816441596cbc2d1d85454bdbcba939132d1677b2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 22:03:28 -0600 Subject: [PATCH 040/151] minor changes --- stumpy/stump.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index ce5988662..2ae3046be 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -42,7 +42,7 @@ def _compute_diagonal( ρ, I, ignore_trivial, - k + k, ): """ Compute (Numba JIT-compiled) and update the Pearson correlation, ρ, and I @@ -646,13 +646,12 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k, ) - out = np.empty((l, (2 * k) + 2), dtype=object) - print(out.shape) + out = np.empty((l, 2 * k + 2), dtype=object) out[:, :k] = P[:, :k] - out[:, k:] = I[:, :] + out[:, k:] = I threshold = 10e-6 - if core.are_distances_too_small(out[:, :k].ravel(), threshold=threshold): # pragma: no cover + if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover logger.warning(f"A large number of values are smaller than {threshold}.") logger.warning("For a self-join, try setting `ignore_trivial = True`.") From 7007953f700dd41cae95d1ea834d0e5850b245b7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 22:17:23 -0600 Subject: [PATCH 041/151] Fixed unit test for top-k matrix profile --- tests/test_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/test_stump.py b/tests/test_stump.py index 1ce70acc5..25b9c5283 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -253,6 +253,6 @@ def test_stump_self_join_KNN(T_A, T_B): naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) From 5b5f21ada054f9d26780199c34f248f034874fe2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 22:21:13 -0600 Subject: [PATCH 042/151] Remove parameter k in function non_normalized decorator --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 391ce6b57..a2a30c043 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,7 +121,7 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p", "k"] + exclude = ["normalize", "p"] @functools.wraps(non_norm) def outer_wrapper(norm): From f7ee854f733eba01412ed17f6a3cdf8f747d842a Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 22:56:39 -0600 Subject: [PATCH 043/151] Corret format by black --- stumpy/stump.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 2ae3046be..eb18b7e8a 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -196,9 +196,7 @@ def _compute_diagonal( pearson = 1.0 if pearson > ρ[thread_idx, i, k - 1]: - idx = k - np.searchsorted( - ρ[thread_idx, i, :k][::-1], pearson - ) + idx = k - np.searchsorted(ρ[thread_idx, i, :k][::-1], pearson) ρ[thread_idx, i, idx + 1 : k] = ρ[thread_idx, i, idx : k - 1] ρ[thread_idx, i, idx] = pearson I[thread_idx, i, idx + 1 : k] = I[thread_idx, i, idx : k - 1] @@ -207,15 +205,19 @@ def _compute_diagonal( if ignore_trivial: # 
self-joins only if pearson > ρ[thread_idx, i + g, k - 1]: idx = k - np.searchsorted( - ρ[thread_idx, i + g, :k][::-1], pearson + ρ[thread_idx, i + g, :k][::-1], pearson ) - ρ[thread_idx, i + g, idx + 1 : k] = ρ[thread_idx, i + g, idx : k - 1] + ρ[thread_idx, i + g, idx + 1 : k] = ρ[ + thread_idx, i + g, idx : k - 1 + ] ρ[thread_idx, i + g, idx] = pearson - I[thread_idx, i + g, idx + 1 : k] = I[thread_idx, i + g, idx : k - 1] + I[thread_idx, i + g, idx + 1 : k] = I[ + thread_idx, i + g, idx : k - 1 + ] I[thread_idx, i + g, idx] = i # for top-1 case: - #ρ[thread_idx, i + g, 0] = pearson - #I[thread_idx, i + g, 0] = i + # ρ[thread_idx, i + g, 0] = pearson + # I[thread_idx, i + g, 0] = i if i < i + g: # left pearson correlation and left matrix profile index @@ -432,10 +434,8 @@ def _stump( for i in prange(l): # top-k for j in range(k): - if ρ[0, i, k-1] < ρ[thread_idx, i, j]: - idx = k - np.searchsorted( - ρ[0, i, :k][::-1], ρ[thread_idx, i, j] - ) + if ρ[0, i, k - 1] < ρ[thread_idx, i, j]: + idx = k - np.searchsorted(ρ[0, i, :k][::-1], ρ[thread_idx, i, j]) ρ[0, i, idx + 1 : k] = ρ[0, i, idx : k - 1] ρ[0, i, idx] = ρ[thread_idx, i, j] @@ -453,7 +453,7 @@ def _stump( # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) for i in prange(p_norm.shape[0]): - for j in prange(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` + for j in prange(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 From 485dba3da38398f27b237142f29adebd870ac003 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 17:45:26 -0600 Subject: [PATCH 044/151] Use seperate variaboles for left and right profiles --- stumpy/stump.py | 87 +++++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index eb18b7e8a..9921a5e7c 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -41,6 +41,10 @@ def _compute_diagonal( 
thread_idx, ρ, I, + ρL, + IL, + ρR, + IR, ignore_trivial, k, ): @@ -221,14 +225,14 @@ def _compute_diagonal( if i < i + g: # left pearson correlation and left matrix profile index - if pearson > ρ[thread_idx, i + g, k]: - ρ[thread_idx, i + g, k] = pearson - I[thread_idx, i + g, k] = i + if pearson > ρL[thread_idx, i + g]: + ρL[thread_idx, i + g] = pearson + IL[thread_idx, i + g] = i # right pearson correlation and right matrix profile index - if pearson > ρ[thread_idx, i, k + 1]: - ρ[thread_idx, i, k + 1] = pearson - I[thread_idx, i, k + 1] = i + g + if pearson > ρR[thread_idx, i]: + ρR[thread_idx, i] = pearson + IR[thread_idx, i] = i + g return @@ -378,8 +382,15 @@ def _stump( n_B = T_B.shape[0] l = n_A - m + 1 n_threads = numba.config.NUMBA_NUM_THREADS - ρ = np.full((n_threads, l, k + 2), -np.inf, dtype=np.float64) - I = np.full((n_threads, l, k + 2), -1, dtype=np.int64) + + ρ = np.full((n_threads, l, k), -np.inf, dtype=np.float64) + I = np.full((n_threads, l, k), -1, dtype=np.int64) + + ρL = np.full((n_threads, l), -np.inf, dtype=np.float64) + IL = np.full((n_threads, l), -1, dtype=np.float64) + + ρR = np.full((n_threads, l), -np.inf, dtype=np.float64) + IR = np.full((n_threads, l), -1, dtype=np.float64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) diags_ranges = core._get_array_ranges(ndist_counts, n_threads, False) @@ -425,6 +436,10 @@ def _stump( thread_idx, ρ, I, + ρL, + IL, + ρR, + IR, ignore_trivial, k, ) @@ -434,7 +449,7 @@ def _stump( for i in prange(l): # top-k for j in range(k): - if ρ[0, i, k - 1] < ρ[thread_idx, i, j]: + if ρ[0, i, k-1] < ρ[thread_idx, i, j]: idx = k - np.searchsorted(ρ[0, i, :k][::-1], ρ[thread_idx, i, j]) ρ[0, i, idx + 1 : k] = ρ[0, i, idx : k - 1] ρ[0, i, idx] = ρ[thread_idx, i, j] @@ -442,24 +457,24 @@ def _stump( I[0, i, idx + 1 : k] = I[0, i, idx : k - 1] I[0, i, idx] = I[thread_idx, i, j] - if ρ[0, i, k] < ρ[thread_idx, i, k]: - ρ[0, i, k] = ρ[thread_idx, i, k] - I[0, i, k] = I[thread_idx, i, k] + if ρL[0, i] 
< ρL[thread_idx, i]: + ρL[0, i] = ρL[thread_idx, i] + IL[0, i] = IL[thread_idx, i] - if ρ[0, i, k + 1] < ρ[thread_idx, i, k + 1]: - ρ[0, i, k + 1] = ρ[thread_idx, i, k + 1] - I[0, i, k + 1] = I[thread_idx, i, k + 1] + if ρR[0, i] < ρR[thread_idx, i]: + ρR[0, i] = ρR[thread_idx, i] + IR[0, i] = IR[thread_idx, i] # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) for i in prange(p_norm.shape[0]): - for j in prange(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` + for j in prange(p_norm.shape[1]): if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 P = np.sqrt(p_norm) - return P[:, :], I[0, :, :] + return P, I[0, :, :], IL[0, :], IR[0, :] @core.non_normalized(aamp) @@ -627,28 +642,28 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): else: diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) - P, I = _stump( - T_A, - T_B, - m, - M_T, - μ_Q, - Σ_T_inverse, - σ_Q_inverse, - M_T_m_1, - μ_Q_m_1, - T_A_subseq_isfinite, - T_B_subseq_isfinite, - T_A_subseq_isconstant, - T_B_subseq_isconstant, - diags, - ignore_trivial, - k, + P, I, IL, IR = _stump( + T_A, + T_B, + m, + M_T, + μ_Q, + Σ_T_inverse, + σ_Q_inverse, + M_T_m_1, + μ_Q_m_1, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + T_A_subseq_isconstant, + T_B_subseq_isconstant, + diags, + ignore_trivial, + k, ) out = np.empty((l, 2 * k + 2), dtype=object) - out[:, :k] = P[:, :k] - out[:, k:] = I + out[:, :k] = P + out[:, k:] = np.c_[I, IL, IR] threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From bc133ca638df71c4542b2351e07297b04b8b6269 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:10:06 -0600 Subject: [PATCH 045/151] store top-k rho in ascending order --- stumpy/stump.py | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 9921a5e7c..56b2118ca 100644 --- 
a/stumpy/stump.py +++ b/stumpy/stump.py @@ -199,26 +199,26 @@ def _compute_diagonal( if T_B_subseq_isconstant[i + g] and T_A_subseq_isconstant[i]: pearson = 1.0 - if pearson > ρ[thread_idx, i, k - 1]: - idx = k - np.searchsorted(ρ[thread_idx, i, :k][::-1], pearson) - ρ[thread_idx, i, idx + 1 : k] = ρ[thread_idx, i, idx : k - 1] - ρ[thread_idx, i, idx] = pearson - I[thread_idx, i, idx + 1 : k] = I[thread_idx, i, idx : k - 1] - I[thread_idx, i, idx] = i + g + if pearson > ρ[thread_idx, i, 0]: + idx = np.searchsorted(ρ[thread_idx, i], pearson) + ρ[thread_idx, i, : idx - 1] = ρ[thread_idx, i, 1 : idx] + ρ[thread_idx, i, idx - 1] = pearson + + I[thread_idx, i, : idx - 1] = I[thread_idx, i, 1 : idx] + I[thread_idx, i, idx - 1] = i + g if ignore_trivial: # self-joins only - if pearson > ρ[thread_idx, i + g, k - 1]: - idx = k - np.searchsorted( - ρ[thread_idx, i + g, :k][::-1], pearson - ) - ρ[thread_idx, i + g, idx + 1 : k] = ρ[ - thread_idx, i + g, idx : k - 1 + if pearson > ρ[thread_idx, i + g, 0]: + idx = np.searchsorted(ρ[thread_idx, i + g], pearson) + ρ[thread_idx, i + g, : idx - 1] = ρ[ + thread_idx, i + g, 1 : idx ] - ρ[thread_idx, i + g, idx] = pearson - I[thread_idx, i + g, idx + 1 : k] = I[ - thread_idx, i + g, idx : k - 1 + ρ[thread_idx, i + g, idx - 1] = pearson + + I[thread_idx, i + g, : idx - 1] = I[ + thread_idx, i + g, 1 : idx ] - I[thread_idx, i + g, idx] = i + I[thread_idx, i + g, idx - 1] = i # for top-1 case: # ρ[thread_idx, i + g, 0] = pearson # I[thread_idx, i + g, 0] = i @@ -449,13 +449,14 @@ def _stump( for i in prange(l): # top-k for j in range(k): - if ρ[0, i, k-1] < ρ[thread_idx, i, j]: - idx = k - np.searchsorted(ρ[0, i, :k][::-1], ρ[thread_idx, i, j]) - ρ[0, i, idx + 1 : k] = ρ[0, i, idx : k - 1] - ρ[0, i, idx] = ρ[thread_idx, i, j] + j = k - 1 - j + if ρ[0, i, 0] < ρ[thread_idx, i, j]: + idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) + ρ[0, i, : idx - 1] = ρ[0, i, 1 : idx] + ρ[0, i, idx - 1] = ρ[thread_idx, i, j] - I[0, i, idx + 1 : k] = 
I[0, i, idx : k - 1] - I[0, i, idx] = I[thread_idx, i, j] + I[0, i, : idx - 1] = I[0, i, 1 : idx] + I[0, i, idx - 1] = I[thread_idx, i, j] if ρL[0, i] < ρL[thread_idx, i]: ρL[0, i] = ρL[thread_idx, i] @@ -474,7 +475,7 @@ def _stump( P = np.sqrt(p_norm) - return P, I[0, :, :], IL[0, :], IR[0, :] + return P[:, ::-1], I[0, :, ::-1], IL[0, :], IR[0, :] @core.non_normalized(aamp) From 47a61b2f202e3f2864460086ccf92100168b8f1e Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:23:08 -0600 Subject: [PATCH 046/151] Revise docstrings --- stumpy/stump.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 56b2118ca..bdf8c85b7 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -326,12 +326,16 @@ def _stump( Returns ------- profile : numpy.ndarray - Matrix profile + Top-k Matrix profile indices : numpy.ndarray - The first column consists of the matrix profile indices, the second - column consists of the left matrix profile indices, and the third - column consists of the right matrix profile indices. + The top-k matrix profile indices + + left indices : numpy.ndarray + The top-1 left matrix profile indices + + right indices : numpy.ndarray + The top-1 right matrix profile indices Notes ----- @@ -520,10 +524,10 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): Returns ------- out : numpy.ndarray - The first column consists of the matrix profile, the second column - consists of the matrix profile indices, the third column consists of - the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. + The first k columns consists of the top-k matrix profile, the next k columns + consists of their corresponding matrix profile indices, the one before + last column consists of the top-1 left matrix profile indices, and the + last column consists of the top-1 right matrix profile indices. 
See Also -------- From d4dc04a5caea088cd6a9a619830af7c517f5348d Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:37:27 -0600 Subject: [PATCH 047/151] Correct docstrings --- stumpy/stump.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index bdf8c85b7..d49296ac5 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -49,9 +49,9 @@ def _compute_diagonal( k, ): """ - Compute (Numba JIT-compiled) and update the Pearson correlation, ρ, and I - sequentially along individual diagonals using a single thread and avoiding race - conditions + Compute (Numba JIT-compiled) and update the (top-k) Pearson correlation, ρ, and I, + and, the left ρ and the left I, the right ρ and the right I sequentially along + individual diagonals using a single thread and avoiding race conditions. Parameters ---------- @@ -121,10 +121,22 @@ def _compute_diagonal( The thread index ρ : numpy.ndarray - The Pearson correlations + The top-k Pearson correlations, sorted in ascending order per row I : numpy.ndarray - The matrix profile indices + The top-k matrix profile indices + + ρL : numpy.ndarray + The top-1 left Pearson correlations + + IL : numpy.ndarray + The top-1 left matrix profile indices + + ρR : numpy.ndarray + The top-1 left Pearson correlations + + IR : numpy.ndarray + The top-1 right matrix profile indices ignore_trivial : bool Set to `True` if this is a self-join. Otherwise, for AB-join, set this to @@ -263,8 +275,8 @@ def _stump( ): """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel - computation of the matrix profile, matrix profile indices, left matrix profile - indices, and right matrix profile indices. + computation of the top-k matrix profile, top-k matrix profile indices, top-1 + left matrix profile indices, and top-1 right matrix profile indices. 
Parameters ---------- @@ -326,16 +338,16 @@ def _stump( Returns ------- profile : numpy.ndarray - Top-k Matrix profile + Top-k matrix profile indices : numpy.ndarray - The top-k matrix profile indices + Top-k matrix profile indices left indices : numpy.ndarray - The top-1 left matrix profile indices + Top-1 left matrix profile indices right indices : numpy.ndarray - The top-1 right matrix profile indices + Top-1 right matrix profile indices Notes ----- @@ -417,7 +429,8 @@ def _stump( cov_d[:] = cov_d - μ_Q_m_1 for thread_idx in prange(n_threads): - # Compute and update cov, I within a single thread to avoiding race conditions + # Compute and update pearson correlations and matrix profile indices + # within a single thread to avoid race conditions _compute_diagonal( T_A, T_B, From a123540664c93cacc5cf1b006422b42fba9c9069 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:38:47 -0600 Subject: [PATCH 048/151] Correct formats --- stumpy/stump.py | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index d49296ac5..f31d0c0f7 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -213,23 +213,19 @@ def _compute_diagonal( if pearson > ρ[thread_idx, i, 0]: idx = np.searchsorted(ρ[thread_idx, i], pearson) - ρ[thread_idx, i, : idx - 1] = ρ[thread_idx, i, 1 : idx] + ρ[thread_idx, i, : idx - 1] = ρ[thread_idx, i, 1:idx] ρ[thread_idx, i, idx - 1] = pearson - I[thread_idx, i, : idx - 1] = I[thread_idx, i, 1 : idx] + I[thread_idx, i, : idx - 1] = I[thread_idx, i, 1:idx] I[thread_idx, i, idx - 1] = i + g if ignore_trivial: # self-joins only if pearson > ρ[thread_idx, i + g, 0]: idx = np.searchsorted(ρ[thread_idx, i + g], pearson) - ρ[thread_idx, i + g, : idx - 1] = ρ[ - thread_idx, i + g, 1 : idx - ] + ρ[thread_idx, i + g, : idx - 1] = ρ[thread_idx, i + g, 1:idx] ρ[thread_idx, i + g, idx - 1] = pearson - I[thread_idx, i + g, : idx - 1] = I[ - thread_idx, i + g, 1 : idx - 
] + I[thread_idx, i + g, : idx - 1] = I[thread_idx, i + g, 1:idx] I[thread_idx, i + g, idx - 1] = i # for top-1 case: # ρ[thread_idx, i + g, 0] = pearson @@ -469,10 +465,10 @@ def _stump( j = k - 1 - j if ρ[0, i, 0] < ρ[thread_idx, i, j]: idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) - ρ[0, i, : idx - 1] = ρ[0, i, 1 : idx] + ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] ρ[0, i, idx - 1] = ρ[thread_idx, i, j] - I[0, i, : idx - 1] = I[0, i, 1 : idx] + I[0, i, : idx - 1] = I[0, i, 1:idx] I[0, i, idx - 1] = I[thread_idx, i, j] if ρL[0, i] < ρL[thread_idx, i]: @@ -661,22 +657,22 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) P, I, IL, IR = _stump( - T_A, - T_B, - m, - M_T, - μ_Q, - Σ_T_inverse, - σ_Q_inverse, - M_T_m_1, - μ_Q_m_1, - T_A_subseq_isfinite, - T_B_subseq_isfinite, - T_A_subseq_isconstant, - T_B_subseq_isconstant, - diags, - ignore_trivial, - k, + T_A, + T_B, + m, + M_T, + μ_Q, + Σ_T_inverse, + σ_Q_inverse, + M_T_m_1, + μ_Q_m_1, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + T_A_subseq_isconstant, + T_B_subseq_isconstant, + diags, + ignore_trivial, + k, ) out = np.empty((l, 2 * k + 2), dtype=object) From 1dff66f983346ae23430f76cf5c1f16b2c46ea98 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:40:00 -0600 Subject: [PATCH 049/151] Full coverage of test_stump unit test From cf48b6961eab3c01180a84a476dcd5e8fcd626ee Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:37:44 -0600 Subject: [PATCH 050/151] Change function considering new input/output structure --- stumpy/scrump.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 002847507..75790c70a 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -609,7 +609,7 @@ def update(self): if self._chunk_idx < self._n_chunks: start_idx, stop_idx = self._chunk_diags_ranges[self._chunk_idx] - P, I = _stump( + P, I, IL, IR = _stump( 
self._T_A, self._T_B, self._m, @@ -625,8 +625,11 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, + k=1, ) + I = np.c_[I, IL, IR] + # Update matrix profile and indices for i in range(self._P.shape[0]): if self._P[i, 0] > P[i, 0]: From 7d16ce6a883b38808a7e6f93c41c82755500465a Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:45:24 -0600 Subject: [PATCH 051/151] Add two more outputs returned by _stump --- stumpy/stump.py | 51 +++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index f31d0c0f7..348085a4e 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -481,14 +481,26 @@ def _stump( # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) + p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) + p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) + for i in prange(p_norm.shape[0]): for j in prange(p_norm.shape[1]): if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 + if p_norm_L[i] < config.STUMPY_P_NORM_THRESHOLD: + p_norm_L[i] = 0.0 + + if p_norm_R[i] < config.STUMPY_P_NORM_THRESHOLD: + p_norm_R[i] = 0.0 + P = np.sqrt(p_norm) + PL = np.sqrt(p_norm_L) + PR = np.sqrt(p_norm_R) + - return P[:, ::-1], I[0, :, ::-1], IL[0, :], IR[0, :] + return P[:, ::-1], I[0, :, ::-1], PL, IL[0, :], PR, IR[0, :] @core.non_normalized(aamp) @@ -656,26 +668,27 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): else: diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) - P, I, IL, IR = _stump( - T_A, - T_B, - m, - M_T, - μ_Q, - Σ_T_inverse, - σ_Q_inverse, - M_T_m_1, - μ_Q_m_1, - T_A_subseq_isfinite, - T_B_subseq_isfinite, - T_A_subseq_isconstant, - T_B_subseq_isconstant, - diags, - ignore_trivial, - k, + P, I, PL, IL, PR, IR = _stump( + T_A, + T_B, + m, + M_T, + μ_Q, + Σ_T_inverse, + σ_Q_inverse, + M_T_m_1, + μ_Q_m_1, + T_A_subseq_isfinite, + 
T_B_subseq_isfinite, + T_A_subseq_isconstant, + T_B_subseq_isconstant, + diags, + ignore_trivial, + k, ) - out = np.empty((l, 2 * k + 2), dtype=object) + out = np.empty((l, 2 * k + 2), dtype=object) # last two columns are to + # store left and right matrix profile indices out[:, :k] = P out[:, k:] = np.c_[I, IL, IR] From 61d38b6b747ff96820140335163b5d02c76f0eaf Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:50:48 -0600 Subject: [PATCH 052/151] Update/Correct docstrings --- stumpy/stump.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 348085a4e..b9743613b 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -133,7 +133,7 @@ def _compute_diagonal( The top-1 left matrix profile indices ρR : numpy.ndarray - The top-1 left Pearson correlations + The top-1 right Pearson correlations IR : numpy.ndarray The top-1 right matrix profile indices @@ -272,7 +272,8 @@ def _stump( """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel computation of the top-k matrix profile, top-k matrix profile indices, top-1 - left matrix profile indices, and top-1 right matrix profile indices. + left matrix profile and matrix profile indices, and top-1 right matrix profile + and matrix profile indices. 
Parameters ---------- @@ -339,9 +340,15 @@ def _stump( indices : numpy.ndarray Top-k matrix profile indices + left profile : numpy.ndarray + Top-1 left matrix profile + left indices : numpy.ndarray Top-1 left matrix profile indices + right profile : numpy.ndarray + Top-1 right matrix profile + right indices : numpy.ndarray Top-1 right matrix profile indices @@ -499,7 +506,6 @@ def _stump( PL = np.sqrt(p_norm_L) PR = np.sqrt(p_norm_R) - return P[:, ::-1], I[0, :, ::-1], PL, IL[0, :], PR, IR[0, :] @@ -546,9 +552,9 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): ------- out : numpy.ndarray The first k columns consists of the top-k matrix profile, the next k columns - consists of their corresponding matrix profile indices, the one before - last column consists of the top-1 left matrix profile indices, and the - last column consists of the top-1 right matrix profile indices. + consists of their corresponding matrix profile indices, the column at + numpy indexing 2k contains top-1 left matrix profile indices and the last + column, at numpy indexing 2k+1, contains top-1 right matrix profile indices. 
See Also -------- From 1a469a5230720bdc4d86287db174c0196fd9cf8d Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:53:16 -0600 Subject: [PATCH 053/151] Correct callee function _stump --- stumpy/scrump.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 75790c70a..df53d8244 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -609,7 +609,7 @@ def update(self): if self._chunk_idx < self._n_chunks: start_idx, stop_idx = self._chunk_diags_ranges[self._chunk_idx] - P, I, IL, IR = _stump( + P, I, PL, IL, PR, IR = _stump( self._T_A, self._T_B, self._m, @@ -628,8 +628,9 @@ def update(self): k=1, ) + P = np.c_[P, PL, PR] I = np.c_[I, IL, IR] - + # Update matrix profile and indices for i in range(self._P.shape[0]): if self._P[i, 0] > P[i, 0]: From 2149abf0f4d2b0f109246b1a90d1106fa4d76f89 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:53:58 -0600 Subject: [PATCH 054/151] Fix format --- stumpy/stump.py | 34 +++++++++++++++++----------------- stumpy/test_stump.py | 0 2 files changed, 17 insertions(+), 17 deletions(-) create mode 100644 stumpy/test_stump.py diff --git a/stumpy/stump.py b/stumpy/stump.py index b9743613b..cb10e65c4 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -675,25 +675,25 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) P, I, PL, IL, PR, IR = _stump( - T_A, - T_B, - m, - M_T, - μ_Q, - Σ_T_inverse, - σ_Q_inverse, - M_T_m_1, - μ_Q_m_1, - T_A_subseq_isfinite, - T_B_subseq_isfinite, - T_A_subseq_isconstant, - T_B_subseq_isconstant, - diags, - ignore_trivial, - k, + T_A, + T_B, + m, + M_T, + μ_Q, + Σ_T_inverse, + σ_Q_inverse, + M_T_m_1, + μ_Q_m_1, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + T_A_subseq_isconstant, + T_B_subseq_isconstant, + diags, + ignore_trivial, + k, ) - out = np.empty((l, 2 * k + 2), dtype=object) # last two columns are to + out = 
np.empty((l, 2 * k + 2), dtype=object) # last two columns are to # store left and right matrix profile indices out[:, :k] = P out[:, k:] = np.c_[I, IL, IR] diff --git a/stumpy/test_stump.py b/stumpy/test_stump.py new file mode 100644 index 000000000..e69de29bb From 364f280d7a4db08ede32151b201e856d344bdef6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 11:19:02 -0600 Subject: [PATCH 055/151] Fixed number of inputs passed to _stump --- stumpy/stumped.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 09557e318..7f1f67e51 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -248,6 +248,7 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, T_B_subseq_isconstant_future, diags_futures[i], ignore_trivial, + 1, ) ) From e983e1fbda3ca017d453a2acb97d997314ad9a70 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 11:29:00 -0600 Subject: [PATCH 056/151] Fixed number of outputs returned by the function --- stumpy/stumped.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 7f1f67e51..db30eea59 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -253,7 +253,11 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, ) results = dask_client.gather(futures) - profile, indices = results[0] + profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] + + profile = np.c_[profile, profile_L, profile_R] + indices = np.c_[indices, indices_L, indices_R] + for i in range(1, len(hosts)): P, I = results[i] for col in range(P.shape[1]): # pragma: no cover From ef2bc6578bfb4f7e04c74dcda3563d32fd76497a Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 11:36:56 -0600 Subject: [PATCH 057/151] Fixed number of returned outputs --- stumpy/stumped.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 
db30eea59..1fbd7be49 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -259,7 +259,9 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, indices = np.c_[indices, indices_L, indices_R] for i in range(1, len(hosts)): - P, I = results[i] + P, I, PL, IL, PR, IR = results[i] + P = np.c_[P, PL, PR] + I = np.c_[I, IL, IR] for col in range(P.shape[1]): # pragma: no cover cond = P[:, col] < profile[:, col] profile[:, col] = np.where(cond, P[:, col], profile[:, col]) From f7d4a8fcd298600c7a51fe8178020a675b349349 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 11:39:31 -0600 Subject: [PATCH 058/151] Correct format --- stumpy/stumped.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 1fbd7be49..6ca40707c 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -253,7 +253,7 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, ) results = dask_client.gather(futures) - profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] + profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] profile = np.c_[profile, profile_L, profile_R] indices = np.c_[indices, indices_L, indices_R] From 3dccc9a244797c3324cfef54a0b3e1d07c36d6e5 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 12:21:00 -0600 Subject: [PATCH 059/151] Exclude parameter 'k' in non-normalized decorator After updating non-normalized functions to return top-k matrix profile, the parameter "k" will be removed from such exclusion. 
--- stumpy/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index a2a30c043..9c4296ab9 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,7 +121,8 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p"] + exclude = ["normalize", "p", "k"] # remove "k" after updating + # non-normalized function to accept "k" for top-k matrix profile @functools.wraps(non_norm) def outer_wrapper(norm): From a430364aa2cfc77263f7328386dc5c9ea0048945 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 12:25:17 -0600 Subject: [PATCH 060/151] Correct format --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 9c4296ab9..f9a77a07f 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,8 +121,8 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p", "k"] # remove "k" after updating - # non-normalized function to accept "k" for top-k matrix profile + exclude = ["normalize", "p", "k"] # remove "k" after updating + # non-normalized function to accept "k" for top-k matrix profile @functools.wraps(non_norm) def outer_wrapper(norm): From 4f0194384b38e38a6b76e949d6aac0bd06fa441f Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 15:17:22 -0600 Subject: [PATCH 061/151] Fixed dtype of matrix profile indices --- stumpy/stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index cb10e65c4..683194e9b 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -406,10 +406,10 @@ def _stump( I = np.full((n_threads, l, k), -1, dtype=np.int64) ρL = np.full((n_threads, l), -np.inf, dtype=np.float64) - IL = np.full((n_threads, 
l), -1, dtype=np.float64) + IL = np.full((n_threads, l), -1, dtype=np.int64) ρR = np.full((n_threads, l), -np.inf, dtype=np.float64) - IR = np.full((n_threads, l), -1, dtype=np.float64) + IR = np.full((n_threads, l), -1, dtype=np.int64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) diags_ranges = core._get_array_ranges(ndist_counts, n_threads, False) From aebe5a31920fed46be8cac8f46c50cbc58315e0c Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 16:33:19 -0600 Subject: [PATCH 062/151] Add pagam no cover --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 67d1fb27c..b2d8894f7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -164,7 +164,7 @@ def searchsorted(a, v): if len(indices): return indices.min() else: - return len(a) + return len(a) # pragma: no cover def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): From de295af807c8b114cdce77ee254e62ed34bcf485 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 16:37:23 -0600 Subject: [PATCH 063/151] Minor change --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index b2d8894f7..4a5ed789a 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -163,8 +163,8 @@ def searchsorted(a, v): indices = np.flatnonzero(v < a) if len(indices): return indices.min() - else: - return len(a) # pragma: no cover + else: # pragma: no cover + return len(a) def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): From 1d35aea6326fab28d4d099d1b6e40db7d4fd037c Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 15 May 2022 21:48:13 -0600 Subject: [PATCH 064/151] Use range to move in reverse --- stumpy/stump.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 683194e9b..6f47fe698 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -468,8 +468,7 @@ def _stump( for thread_idx in 
range(1, n_threads): for i in prange(l): # top-k - for j in range(k): - j = k - 1 - j + for j in range(k - 1, -1, -1): if ρ[0, i, 0] < ρ[thread_idx, i, j]: idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] From 33374167498eee362655c984ae16cedb34552204 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 11:22:50 -0600 Subject: [PATCH 065/151] seperate variables for left and right --- stumpy/stump.py | 89 ++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 97334eb5a..ba8d3a958 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -41,6 +41,10 @@ def _compute_diagonal( thread_idx, ρ, I, + ρL, + IL, + ρR, + IR, ignore_trivial, ): """ @@ -190,25 +194,25 @@ def _compute_diagonal( if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]: pearson = 1.0 - if pearson > ρ[thread_idx, i, 0]: - ρ[thread_idx, i, 0] = pearson - I[thread_idx, i, 0] = i + k + if pearson > ρ[thread_idx, i]: + ρ[thread_idx, i] = pearson + I[thread_idx, i] = i + k if ignore_trivial: # self-joins only - if pearson > ρ[thread_idx, i + k, 0]: - ρ[thread_idx, i + k, 0] = pearson - I[thread_idx, i + k, 0] = i + if pearson > ρ[thread_idx, i + k]: + ρ[thread_idx, i + k] = pearson + I[thread_idx, i + k] = i if i < i + k: # left pearson correlation and left matrix profile index - if pearson > ρ[thread_idx, i + k, 1]: - ρ[thread_idx, i + k, 1] = pearson - I[thread_idx, i + k, 1] = i + if pearson > ρL[thread_idx, i + k]: + ρL[thread_idx, i + k] = pearson + IL[thread_idx, i + k] = i # right pearson correlation and right matrix profile index - if pearson > ρ[thread_idx, i, 2]: - ρ[thread_idx, i, 2] = pearson - I[thread_idx, i, 2] = i + k + if pearson > ρR[thread_idx, i]: + ρR[thread_idx, i] = pearson + IR[thread_idx, i] = i + k return @@ -353,8 +357,14 @@ def _stump( n_B = T_B.shape[0] l = n_A - m + 1 n_threads = numba.config.NUMBA_NUM_THREADS - ρ = 
np.full((n_threads, l, 3), -np.inf, dtype=np.float64) - I = np.full((n_threads, l, 3), -1, dtype=np.int64) + ρ = np.full((n_threads, l), -np.inf, dtype=np.float64) + I = np.full((n_threads, l), -1, dtype=np.int64) + + ρL = np.full((n_threads, l), -np.inf, dtype=np.float64) + IL = np.full((n_threads, l), -1, dtype=np.int64) + + ρR = np.full((n_threads, l), -np.inf, dtype=np.float64) + IR = np.full((n_threads, l), -1, dtype=np.int64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) diags_ranges = core._get_array_ranges(ndist_counts, n_threads, False) @@ -400,36 +410,47 @@ def _stump( thread_idx, ρ, I, + ρL, + IL, + ρR, + IR, ignore_trivial, ) # Reduction of results from all threads for thread_idx in range(1, n_threads): for i in prange(l): - if ρ[0, i, 0] < ρ[thread_idx, i, 0]: - ρ[0, i, 0] = ρ[thread_idx, i, 0] - I[0, i, 0] = I[thread_idx, i, 0] + if ρ[0, i] < ρ[thread_idx, i]: + ρ[0, i] = ρ[thread_idx, i] + I[0, i] = I[thread_idx, i] # left pearson correlation and left matrix profile indices - if ρ[0, i, 1] < ρ[thread_idx, i, 1]: - ρ[0, i, 1] = ρ[thread_idx, i, 1] - I[0, i, 1] = I[thread_idx, i, 1] + if ρL[0, i] < ρL[thread_idx, i]: + ρL[0, i] = ρL[thread_idx, i] + IL[0, i] = IL[thread_idx, i] # right pearson correlation and right matrix profile indices - if ρ[0, i, 2] < ρ[thread_idx, i, 2]: - ρ[0, i, 2] = ρ[thread_idx, i, 2] - I[0, i, 2] = I[thread_idx, i, 2] + if ρR[0, i] < ρR[thread_idx, i]: + ρR[0, i] = ρR[thread_idx, i] + IR[0, i] = IR[thread_idx, i] # Convert pearson correlations to distances - p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) + p_norm = np.abs(2 * m * (1 - ρ[0, :])) + p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) + p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) + for i in prange(p_norm.shape[0]): - if p_norm[i, 0] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 0] = 0.0 - if p_norm[i, 1] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 1] = 0.0 - if p_norm[i, 2] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 2] = 0.0 + if p_norm[i] < 
config.STUMPY_P_NORM_THRESHOLD: + p_norm[i] = 0.0 + if p_norm_L[i] < config.STUMPY_P_NORM_THRESHOLD: + p_norm_L[i] = 0.0 + if p_norm_R[i] < config.STUMPY_P_NORM_THRESHOLD: + p_norm_R[i] = 0.0 + P = np.sqrt(p_norm) + PL = np.sqrt(p_norm_L) + PR = np.sqrt(p_norm_R) + - return P[:, :], I[0, :, :] + return P, I, PL, IL, PR, IR @core.non_normalized(aamp) @@ -594,7 +615,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): else: diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) - P, I = _stump( + P, I, PL, IL, PR, IR = _stump( T_A, T_B, m, @@ -612,8 +633,8 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): ignore_trivial, ) - out[:, 0] = P[:, 0] - out[:, 1:] = I + out[:, 0] = P + out[:, 1:] = np.c_[I, IL, IR] threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From d09ea2359fcf5f355203abd34bc28a1a86ed5129 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 15:39:36 -0600 Subject: [PATCH 066/151] use seperate variables for left and right --- stumpy/stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index ba8d3a958..6972bff87 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -634,7 +634,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): ) out[:, 0] = P - out[:, 1:] = np.c_[I, IL, IR] + out[:, 1:] = np.column_stack((I, IL, IR)) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From dbd7b8c2ed36d92d6c9c88f7200f0799ce8e4ea6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 16:38:03 -0600 Subject: [PATCH 067/151] replace numpy column_stack with c_ --- stumpy/stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 6972bff87..ba8d3a958 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -634,7 +634,7 @@ def stump(T_A, m, T_B=None, 
ignore_trivial=True, normalize=True, p=2.0): ) out[:, 0] = P - out[:, 1:] = np.column_stack((I, IL, IR)) + out[:, 1:] = np.c_[I, IL, IR] threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From 56ab764228da8ee1b828d1771acc4c95fccaf239 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 16:51:29 -0600 Subject: [PATCH 068/151] Fix bug --- stumpy/stump.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index ba8d3a958..121988d25 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -357,6 +357,7 @@ def _stump( n_B = T_B.shape[0] l = n_A - m + 1 n_threads = numba.config.NUMBA_NUM_THREADS + ρ = np.full((n_threads, l), -np.inf, dtype=np.float64) I = np.full((n_threads, l), -1, dtype=np.int64) @@ -450,7 +451,7 @@ def _stump( PR = np.sqrt(p_norm_R) - return P, I, PL, IL, PR, IR + return P, I[0, :], PL, IL[0, :], PR, IR[0, :] @core.non_normalized(aamp) @@ -633,9 +634,11 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): ignore_trivial, ) + out[:, 0] = P out[:, 1:] = np.c_[I, IL, IR] + threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover logger.warning(f"A large number of values are smaller than {threshold}.") From e817e5f0dd1316105b93a96d9be28b659a58367d Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 21:02:08 -0600 Subject: [PATCH 069/151] Remove a wrongly created file --- stumpy/test_stump.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 stumpy/test_stump.py diff --git a/stumpy/test_stump.py b/stumpy/test_stump.py deleted file mode 100644 index e69de29bb..000000000 From c1e39256972a03f0ee1b014e1b8e20efa2d811ba Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 21:04:59 -0600 Subject: [PATCH 070/151] Remove parameter k in non normalized decorator --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/stumpy/core.py b/stumpy/core.py index f9a77a07f..753b0affa 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,9 +121,9 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p", "k"] # remove "k" after updating - # non-normalized function to accept "k" for top-k matrix profile + exclude = ["normalize", "p"] + @functools.wraps(non_norm) def outer_wrapper(norm): @functools.wraps(norm) From aa08176e4cc1ecd90dc47e3ef851103088136a11 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 21:09:58 -0600 Subject: [PATCH 071/151] Add parameter k to arguments of non normalized function Temporarily, the parameter k is added to the arguments of non-normalized function `aamp` so that the tests can be passed for now. This will be handled after completing the normalized version `stump`. --- stumpy/aamp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 201e4413b..b00c8cbf1 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -240,7 +240,8 @@ def _aamp( return np.power(P[0, :, :], 1.0 / p), I[0, :, :] -def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0): +def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): # k=1 is temporary + # and this function needs to be changed to return top-k """ Compute the non-normalized (i.e., without z-normalization) matrix profile From 37a9f2c91979fbd2db35d27d4c946eb1ca31c08f Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 22:12:34 -0600 Subject: [PATCH 072/151] Replace numpy c_ with column_stack --- stumpy/scrump.py | 4 ++-- stumpy/stump.py | 2 +- stumpy/stumped.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index df53d8244..25c4e4e3f 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -628,8 +628,8 @@ def update(self): k=1, ) - P = np.c_[P, PL, PR] - I 
= np.c_[I, IL, IR] + P = np.column_stack((P, PL, PR)) + I = np.column_stack((I, IL, IR)) # Update matrix profile and indices for i in range(self._P.shape[0]): diff --git a/stumpy/stump.py b/stumpy/stump.py index 6f47fe698..449c35200 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -695,7 +695,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): out = np.empty((l, 2 * k + 2), dtype=object) # last two columns are to # store left and right matrix profile indices out[:, :k] = P - out[:, k:] = np.c_[I, IL, IR] + out[:, k:] = np.column_stack((I, IL, IR)) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 6ca40707c..0c1c34e07 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -255,13 +255,13 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, results = dask_client.gather(futures) profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] - profile = np.c_[profile, profile_L, profile_R] - indices = np.c_[indices, indices_L, indices_R] + profile = np.column_stack((profile, profile_L, profile_R)) + indices = np.column_stack((indices, indices_L, indices_R)) for i in range(1, len(hosts)): P, I, PL, IL, PR, IR = results[i] - P = np.c_[P, PL, PR] - I = np.c_[I, IL, IR] + P = np.column_stack((P, PL, PR)) + I = np.column_stack((I, IL, IR)) for col in range(P.shape[1]): # pragma: no cover cond = P[:, col] < profile[:, col] profile[:, col] = np.where(cond, P[:, col], profile[:, col]) From 8c0e76ecd2eeea875fb7c80eb4cfe5703740a333 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 23:29:21 -0600 Subject: [PATCH 073/151] Minor changes - Improve docstrings - Reverse rho and I before return - Improve comments --- stumpy/stump.py | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 
449c35200..6fe2b7e41 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -49,9 +49,9 @@ def _compute_diagonal( k, ): """ - Compute (Numba JIT-compiled) and update the (top-k) Pearson correlation, ρ, and I, - and, the left ρ and the left I, the right ρ and the right I sequentially along - individual diagonals using a single thread and avoiding race conditions. + Compute (Numba JIT-compiled) and update the (top-k) Pearson correlation (ρ), + ρL, ρR, I, IL, and IR sequentially along individual diagonals using a single + thread and avoiding race conditions. Parameters ---------- @@ -121,10 +121,10 @@ def _compute_diagonal( The thread index ρ : numpy.ndarray - The top-k Pearson correlations, sorted in ascending order per row + The (top-k) Pearson correlations, sorted in ascending order per row I : numpy.ndarray - The top-k matrix profile indices + The (top-k) matrix profile indices ρL : numpy.ndarray The top-1 left Pearson correlations @@ -144,7 +144,7 @@ def _compute_diagonal( k : int The number of smallest elements in distance profile that should be stored - for constructing top-k matrix profile. + for constructing the top-k matrix profile. Returns ------- @@ -227,9 +227,6 @@ def _compute_diagonal( I[thread_idx, i + g, : idx - 1] = I[thread_idx, i + g, 1:idx] I[thread_idx, i + g, idx - 1] = i - # for top-1 case: - # ρ[thread_idx, i + g, 0] = pearson - # I[thread_idx, i + g, 0] = i if i < i + g: # left pearson correlation and left matrix profile index @@ -271,9 +268,9 @@ def _stump( ): """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel - computation of the top-k matrix profile, top-k matrix profile indices, top-1 - left matrix profile and matrix profile indices, and top-1 right matrix profile - and matrix profile indices. + computation of the (top-k) matrix profile, the (top-k) matrix profile indices, + the top-1 left matrix profile and matrix profile indices, and the top-1 right + matrix profile and matrix profile indices. 
Parameters ---------- @@ -468,7 +465,7 @@ def _stump( for thread_idx in range(1, n_threads): for i in prange(l): # top-k - for j in range(k - 1, -1, -1): + for j in range(k - 1, -1, -1): # reverse iteration to preserve order in ties if ρ[0, i, 0] < ρ[thread_idx, i, j]: idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] @@ -485,8 +482,12 @@ def _stump( ρR[0, i] = ρR[thread_idx, i] IR[0, i] = IR[thread_idx, i] - # Convert pearson correlations to distances - p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) + # The arrays ρ (and so I) should be reversed since ρ is in ascending order. + ρ = ρ[0, :, ::-1] + I = I[0, :, ::-1] + + # Convert pearson correlations to distances. + p_norm = np.abs(2 * m * (1 - ρ)) p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) @@ -505,7 +506,7 @@ def _stump( PL = np.sqrt(p_norm_L) PR = np.sqrt(p_norm_R) - return P[:, ::-1], I[0, :, ::-1], PL, IL[0, :], PR, IR[0, :] + return P, I, PL, IL[0, :], PR, IR[0, :] @core.non_normalized(aamp) @@ -514,8 +515,8 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): Compute the z-normalized matrix profile This is a convenience wrapper around the Numba JIT-compiled parallelized - `_stump` function which computes the matrix profile according to STOMPopt with - Pearson correlations. + `_stump` function which computes the (top-k) matrix profile according to + STOMPopt with Pearson correlations. Parameters ---------- @@ -545,15 +546,15 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k : int, default 1 The number of smallest elements in distance profile that should be stored - for constructing top-k matrix profile. + for constructing the top-k matrix profile. 
Returns ------- out : numpy.ndarray - The first k columns consists of the top-k matrix profile, the next k columns - consists of their corresponding matrix profile indices, the column at - numpy indexing 2k contains top-1 left matrix profile indices and the last - column, at numpy indexing 2k+1, contains top-1 right matrix profile indices. + The first k columns contain the top-k matrix profile, the next k columns + contain their corresponding matrix profile indices, the column at + numpy indexing 2k contains the top-1 left matrix profile indices and the last + column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. See Also -------- From df4c5d1ad8db3109eb8316c99314785cb02f5325 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 23:47:54 -0600 Subject: [PATCH 074/151] Correct Format --- stumpy/aamp.py | 2 +- stumpy/core.py | 1 - stumpy/stump.py | 6 ++++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index b00c8cbf1..807c3164b 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -240,7 +240,7 @@ def _aamp( return np.power(P[0, :, :], 1.0 / p), I[0, :, :] -def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): # k=1 is temporary +def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): # k=1 is temporary # and this function needs to be changed to return top-k """ Compute the non-normalized (i.e., without z-normalization) matrix profile diff --git a/stumpy/core.py b/stumpy/core.py index 753b0affa..a2a30c043 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -123,7 +123,6 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): if exclude is None: exclude = ["normalize", "p"] - @functools.wraps(non_norm) def outer_wrapper(norm): @functools.wraps(norm) diff --git a/stumpy/stump.py b/stumpy/stump.py index 6fe2b7e41..3e241a11e 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -465,7 +465,9 @@ def _stump( for thread_idx in range(1, n_threads): for i in 
prange(l): # top-k - for j in range(k - 1, -1, -1): # reverse iteration to preserve order in ties + for j in range( + k - 1, -1, -1 + ): # reverse iteration to preserve order in ties if ρ[0, i, 0] < ρ[thread_idx, i, j]: idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] @@ -487,7 +489,7 @@ def _stump( I = I[0, :, ::-1] # Convert pearson correlations to distances. - p_norm = np.abs(2 * m * (1 - ρ)) + p_norm = np.abs(2 * m * (1 - ρ)) p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) From c5c881bebc2ebffb9d55a1491ebff6f239b73553 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 00:22:26 -0600 Subject: [PATCH 075/151] minor improvement of docstring --- stumpy/stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 3e241a11e..ae6a21a15 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -553,8 +553,8 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): Returns ------- out : numpy.ndarray - The first k columns contain the top-k matrix profile, the next k columns - contain their corresponding matrix profile indices, the column at + The first k columns consist of the top-k matrix profile, the next k columns + consist of their corresponding matrix profile indices, the column at numpy indexing 2k contains the top-1 left matrix profile indices and the last column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. 
From d9dcdc037168ef4f7cd4a9ef4cda491a94f24495 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 00:24:49 -0600 Subject: [PATCH 076/151] Add parameter k to the arguments of function the function will be revised to return top-k matrix profile --- stumpy/aamped.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/aamped.py b/stumpy/aamped.py index d6bf6d97b..c158c9423 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -12,7 +12,8 @@ logger = logging.getLogger(__name__) -def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0): +def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): + # function needs to be revised to return top-k matix profile """ Compute the non-normalized (i.e., without z-normalization) matrix profile From c6b81f0410769cd700cf68dbbb8f473dd50bfabf Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 00:52:31 -0600 Subject: [PATCH 077/151] Add parameter k to arguments Temporarily add parameter k to avoid non-normalized decorator test failure --- stumpy/stumped.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 0c1c34e07..a48f6a957 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -14,7 +14,8 @@ @core.non_normalized(aamped) -def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): +def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): + # the function needs to be revisd to return top-k matrix profile """ Compute the z-normalized matrix profile with a distributed dask cluster From 4ffc7fca9733cccb6dddab528a1a5d2ca996089c Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 00:53:55 -0600 Subject: [PATCH 078/151] Correct format --- stumpy/stumped.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index a48f6a957..e922536f3 100644 --- a/stumpy/stumped.py +++ 
b/stumpy/stumped.py @@ -14,7 +14,9 @@ @core.non_normalized(aamped) -def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): +def stumped( + dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 +): # the function needs to be revisd to return top-k matrix profile """ Compute the z-normalized matrix profile with a distributed dask cluster From 102c627f64eb5736f528cc31bba8bb01f8645628 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 02:03:34 -0600 Subject: [PATCH 079/151] Remove parameter k from arguements --- stumpy/aamped.py | 3 +-- stumpy/stumped.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/stumpy/aamped.py b/stumpy/aamped.py index c158c9423..d6bf6d97b 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -12,8 +12,7 @@ logger = logging.getLogger(__name__) -def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): - # function needs to be revised to return top-k matix profile +def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0): """ Compute the non-normalized (i.e., without z-normalization) matrix profile diff --git a/stumpy/stumped.py b/stumpy/stumped.py index e922536f3..6cdfc5aed 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -15,9 +15,7 @@ @core.non_normalized(aamped) def stumped( - dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 -): - # the function needs to be revisd to return top-k matrix profile + dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): """ Compute the z-normalized matrix profile with a distributed dask cluster From a37f793306d54123af0660c428ec845a880b3930 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 02:24:53 -0600 Subject: [PATCH 080/151] Add one new unit test --- tests/test_stumped.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_stumped.py b/tests/test_stumped.py index 
ca53829fc..02e914436 100644 --- a/tests/test_stumped.py +++ b/tests/test_stumped.py @@ -608,3 +608,20 @@ def test_stumped_two_subsequences_nan_inf_A_B_join_swap( naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore:numpy.dtype size changed") +@pytest.mark.filterwarnings("ignore:numpy.ufunc size changed") +@pytest.mark.filterwarnings("ignore:numpy.ndarray size changed") +@pytest.mark.filterwarnings("ignore:\\s+Port 8787 is already in use:UserWarning") +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): + with Client(dask_cluster) as dask_client: + k = 3 + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 0755af4ddcdf5ad5a331a1b535af53f879dfc160 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 11:31:12 -0600 Subject: [PATCH 081/151] Add parameter k=1 to arguments This is to avoid unit test failure in non-normalized decorator. After finalizing the normalized function, the non normalized functions will be revised to return top-k matrix profile. 
--- stumpy/aamped.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/aamped.py b/stumpy/aamped.py index d6bf6d97b..d833ee8b3 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -12,7 +12,8 @@ logger = logging.getLogger(__name__) -def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0): +def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): + # function needs to be revised to return top-k matrix profile """ Compute the non-normalized (i.e., without z-normalization) matrix profile From ca9fdcffcf94d5f0541b74e845fc5e11ee9481ae Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 14:16:06 -0600 Subject: [PATCH 082/151] Revise stumped to return top-k matrix profile --- stumpy/stumped.py | 50 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 6cdfc5aed..2b826ba71 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -15,7 +15,7 @@ @core.non_normalized(aamped) def stumped( - dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): + dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): """ Compute the z-normalized matrix profile with a distributed dask cluster @@ -55,6 +55,10 @@ def stumped( The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int + The number of smallest elements in distance profile that should be stored + for constructing the top-k matrix profile. 
+ Returns ------- out : numpy.ndarray @@ -184,7 +188,6 @@ def stumped( l = n_A - m + 1 excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - out = np.empty((l, 4), dtype=object) hosts = list(dask_client.ncores().keys()) nworkers = len(hosts) @@ -249,27 +252,44 @@ def stumped( T_B_subseq_isconstant_future, diags_futures[i], ignore_trivial, - 1, + k, ) ) results = dask_client.gather(futures) profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] - profile = np.column_stack((profile, profile_L, profile_R)) - indices = np.column_stack((indices, indices_L, indices_R)) - for i in range(1, len(hosts)): P, I, PL, IL, PR, IR = results[i] - P = np.column_stack((P, PL, PR)) - I = np.column_stack((I, IL, IR)) - for col in range(P.shape[1]): # pragma: no cover - cond = P[:, col] < profile[:, col] - profile[:, col] = np.where(cond, P[:, col], profile[:, col]) - indices[:, col] = np.where(cond, I[:, col], indices[:, col]) - - out[:, 0] = profile[:, 0] - out[:, 1:4] = indices + # Update top-k matrix profile, alternative approach: + # np.argsort(np.concatenate(profile, P), kind='mergesort') + prof = profile.copy() + ind = indices.copy() + for j in range(l): + u, w = 0, 0 + for idx in range(k): + if prof[j, u] <= P[j, w]: + profile[j, idx] = prof[j, u] + indices[j, idx] = ind[j, u] + u += 1 + else: + profile[j, idx] = P[j, w] + indices[j, idx] = I[j, w] + w += 1 + + # Update top-1 left matrix profile and matrix profile index + cond = PL < profile_L + profile_L = np.where(cond, PL, profile_L) + indices_L = np.where(cond, IL, indices_L) + + # Update top-1 right matrix profile and matrix profile index + cond = PR < profile_R + profile_R = np.where(cond, PR, profile_R) + indices_R = np.where(cond, IR, indices_R) + + out = np.empty((l, 2 * k + 2), dtype=object) + out[:, :k] = profile + out[:, k:] = np.column_stack((indices, indices_L, indices_R)) # Delete data from Dask cluster dask_client.cancel(T_A_future) From 9408631f397ef6578dc2d21205e44bb5a45c38f6 Mon Sep 
17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 14:16:56 -0600 Subject: [PATCH 083/151] Correct format --- stumpy/stumped.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 2b826ba71..037c4ba52 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -15,7 +15,8 @@ @core.non_normalized(aamped) def stumped( - dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): + dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 +): """ Compute the z-normalized matrix profile with a distributed dask cluster From 435d9b88ed52bfd8800ff5055375661287b3871d Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:19:44 -0600 Subject: [PATCH 084/151] several minor changes --- stumpy/aamp.py | 4 ++-- stumpy/scrump.py | 2 +- stumpy/stump.py | 16 ++++++++-------- stumpy/stumped.py | 12 ++++++------ tests/test_stump.py | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 807c3164b..87568f365 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -240,8 +240,8 @@ def _aamp( return np.power(P[0, :, :], 1.0 / p), I[0, :, :] -def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): # k=1 is temporary - # and this function needs to be changed to return top-k +def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): + # function needs to be changed to return top-k matrix profile """ Compute the non-normalized (i.e., without z-normalization) matrix profile diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 25c4e4e3f..9b26478c2 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -454,6 +454,7 @@ def __init__( s=None, normalize=True, p=2.0, + k=1, # class needs to be revised to return (top-k) matrix profile ): """ Initialize the `scrump` object @@ -625,7 +626,6 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, - k=1, ) P = np.column_stack((P, 
PL, PR)) diff --git a/stumpy/stump.py b/stumpy/stump.py index ae6a21a15..60d965590 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -269,8 +269,8 @@ def _stump( """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel computation of the (top-k) matrix profile, the (top-k) matrix profile indices, - the top-1 left matrix profile and matrix profile indices, and the top-1 right - matrix profile and matrix profile indices. + the top-1 left matrix profile and its matrix profile index, and the top-1 right + matrix profile and its matrix profile index. Parameters ---------- @@ -327,7 +327,7 @@ def _stump( k : int The number of smallest elements in distance profile that should be stored - for constructing top-k matrix profile. + for constructing the top-k matrix profile. Returns ------- @@ -430,7 +430,7 @@ def _stump( for thread_idx in prange(n_threads): # Compute and update pearson correlations and matrix profile indices - # within a single thread to avoid race conditions + # within a single thread and avoiding race conditions _compute_diagonal( T_A, T_B, @@ -484,12 +484,12 @@ def _stump( ρR[0, i] = ρR[thread_idx, i] IR[0, i] = IR[thread_idx, i] - # The arrays ρ (and so I) should be reversed since ρ is in ascending order. - ρ = ρ[0, :, ::-1] + # Convert top-k pearson correlations to distances. The arrays ρ (and so I) should + # be reversed since ρ is in ascending order. + p_norm = np.abs(2 * m * (1 - ρ[0, :, ::-1])) I = I[0, :, ::-1] - # Convert pearson correlations to distances. - p_norm = np.abs(2 * m * (1 - ρ)) + # Convert top-1 left/right pearson correlations to distances. 
p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 037c4ba52..9aa815e6e 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -18,10 +18,10 @@ def stumped( dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 ): """ - Compute the z-normalized matrix profile with a distributed dask cluster + Compute the z-normalized (top-k) matrix profile with a distributed dask cluster This is a highly distributed implementation around the Numba JIT-compiled - parallelized `_stump` function which computes the matrix profile according + parallelized `_stump` function which computes the (top-k) matrix profile according to STOMPopt with Pearson correlations. Parameters @@ -63,10 +63,10 @@ def stumped( Returns ------- out : numpy.ndarray - The first column consists of the matrix profile, the second column - consists of the matrix profile indices, the third column consists of - the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. + The first k columns consist of the top-k matrix profile, the next k columns + consist of their corresponding matrix profile indices, the column at + numpy indexing 2k contains the top-1 left matrix profile indices and the last + column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. 
See Also -------- diff --git a/tests/test_stump.py b/tests/test_stump.py index 25b9c5283..af2a2315e 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -244,7 +244,7 @@ def test_stump_nan_zero_mean_self_join(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): - k = 2 + k = 3 m = 3 zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) From c6580c8a8dc1d2cdc49ca4724c16d0649ed95028 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:20:45 -0600 Subject: [PATCH 085/151] Correct Format --- stumpy/scrump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 9b26478c2..6a4f7b534 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -454,7 +454,7 @@ def __init__( s=None, normalize=True, p=2.0, - k=1, # class needs to be revised to return (top-k) matrix profile + k=1, # class needs to be revised to return (top-k) matrix profile ): """ Initialize the `scrump` object From e4b0473e0fa38f696a84aac3f2da9938eaeb198d Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:26:25 -0600 Subject: [PATCH 086/151] Remove k from arguments --- stumpy/scrump.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 6a4f7b534..e62658fc9 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -454,7 +454,6 @@ def __init__( s=None, normalize=True, p=2.0, - k=1, # class needs to be revised to return (top-k) matrix profile ): """ Initialize the `scrump` object From 8bf05ee3d7534488f0769ff2b4bf95eb1f818fc7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:33:27 -0600 Subject: [PATCH 087/151] Pass 1 as value of parameter k to a class method to avoid unit test failure --- stumpy/scrump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index e62658fc9..ea8808696 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -625,6 +625,7 @@ def update(self): 
self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, + 1 # revise module to accept parameter k for top-k matrix profile ) P = np.column_stack((P, PL, PR)) From f12261cafd9637a1253444d0c321f61c8ee59b23 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:34:42 -0600 Subject: [PATCH 088/151] Pass 1 as the value of parameter k to avoid unit test failure --- stumpy/scrump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index ea8808696..9fcb51e4b 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -625,7 +625,7 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, - 1 # revise module to accept parameter k for top-k matrix profile + 1, # revise module to accept parameter k for top-k matrix profile ) P = np.column_stack((P, PL, PR)) From 695343e4e7ff927b1793de418bc0b2d3dc45b5df Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 22:38:25 -0600 Subject: [PATCH 089/151] Use np searchsort to avoid copying arrays into new memory --- stumpy/stumped.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 9aa815e6e..f6932325b 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -262,21 +262,18 @@ def stumped( for i in range(1, len(hosts)): P, I, PL, IL, PR, IR = results[i] - # Update top-k matrix profile, alternative approach: - # np.argsort(np.concatenate(profile, P), kind='mergesort') - prof = profile.copy() - ind = indices.copy() for j in range(l): - u, w = 0, 0 - for idx in range(k): - if prof[j, u] <= P[j, w]: - profile[j, idx] = prof[j, u] - indices[j, idx] = ind[j, u] - u += 1 - else: - profile[j, idx] = P[j, w] - indices[j, idx] = I[j, w] - w += 1 + # Uodate profie[j] + for D, ind in zip(P[j], I[j]): + if D >= profile[j, -1]: + break # no need to update profile[j] from this point. 
+ idx = np.searchsorted(profile[j], D, side="right") # might be optimized + # with help of checkpoint idx from previous iteration. + profile[j, idx + 1 :] = profile[j, idx : k - 1] + profile[j, idx] = D + + indices[j, idx + 1 :] = indices[j, idx : k - 1] + indices[j, idx] = ind # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L From f4a37faa71b57cbd6258c981a513ec8a3d2e20b3 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 23:42:43 -0600 Subject: [PATCH 090/151] All tests passed From b6d376319b027706dd771693e63811524605b1be Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 20 May 2022 16:11:29 -0600 Subject: [PATCH 091/151] Replace nested for loops with numpy operations --- stumpy/stumped.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index f6932325b..34d665fc7 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -257,23 +257,27 @@ def stumped( ) ) + profile = np.empty((l, 2 * k)) + indices = np.empty((l, 2 * k)) + results = dask_client.gather(futures) - profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] + ( + profile[:, :k], + indices[:, :k], + profile_L, + indices_L, + profile_R, + indices_R, + ) = results[0] for i in range(1, len(hosts)): P, I, PL, IL, PR, IR = results[i] - for j in range(l): - # Uodate profie[j] - for D, ind in zip(P[j], I[j]): - if D >= profile[j, -1]: - break # no need to update profile[j] from this point. - idx = np.searchsorted(profile[j], D, side="right") # might be optimized - # with help of checkpoint idx from previous iteration. 
- profile[j, idx + 1 :] = profile[j, idx : k - 1] - profile[j, idx] = D - - indices[j, idx + 1 :] = indices[j, idx : k - 1] - indices[j, idx] = ind + + profile[:, k:] = P + indices[:, k:] = I + idx = np.argsort(profile, axis=1) + profile = np.take_along_axis(profile, idx, axis=1) + indices = np.take_along_axis(indices, idx, axis=1) # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L @@ -286,8 +290,8 @@ def stumped( indices_R = np.where(cond, IR, indices_R) out = np.empty((l, 2 * k + 2), dtype=object) - out[:, :k] = profile - out[:, k:] = np.column_stack((indices, indices_L, indices_R)) + out[:, :k] = profile[:, :k] + out[:, k:] = np.column_stack((indices[:, :k], indices_L, indices_R)) # Delete data from Dask cluster dask_client.cancel(T_A_future) From cc9c0769fde6e270ff903d69459e241207e57da2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 20 May 2022 18:28:46 -0600 Subject: [PATCH 092/151] Change the order of some variables in inputs and outputs --- stumpy/stump.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 60d965590..9f37edc8b 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -40,10 +40,10 @@ def _compute_diagonal( diags_stop_idx, thread_idx, ρ, - I, ρL, - IL, ρR, + I, + IL, IR, ignore_trivial, k, @@ -123,18 +123,18 @@ def _compute_diagonal( ρ : numpy.ndarray The (top-k) Pearson correlations, sorted in ascending order per row - I : numpy.ndarray - The (top-k) matrix profile indices - ρL : numpy.ndarray The top-1 left Pearson correlations - IL : numpy.ndarray - The top-1 left matrix profile indices - ρR : numpy.ndarray The top-1 right Pearson correlations + I : numpy.ndarray + The (top-k) matrix profile indices + + IL : numpy.ndarray + The top-1 left matrix profile indices + IR : numpy.ndarray The top-1 right matrix profile indices @@ -452,10 +452,10 @@ def _stump( diags_ranges[thread_idx, 1], thread_idx, ρ, - I, ρL, - IL, ρR, + I, + IL, 
IR, ignore_trivial, k, @@ -508,7 +508,7 @@ def _stump( PL = np.sqrt(p_norm_L) PR = np.sqrt(p_norm_R) - return P, I, PL, IL[0, :], PR, IR[0, :] + return P, PL, PR, I, IL[0, :], IR[0, :] @core.non_normalized(aamp) @@ -676,7 +676,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): else: diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) - P, I, PL, IL, PR, IR = _stump( + P, PL, PR, I, IL, IR = _stump( T_A, T_B, m, From a4d456691dacf788739db9dfdf3796ddc568f794 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 20 May 2022 18:47:50 -0600 Subject: [PATCH 093/151] Revise docstrings and comments --- stumpy/aamp.py | 4 ++++ stumpy/aamped.py | 4 ++++ stumpy/stump.py | 42 +++++++++++++++++++++++------------------- stumpy/stumped.py | 19 ++++++++++++------- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 87568f365..428c3d4bd 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -268,6 +268,10 @@ def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. + Returns ------- out : numpy.ndarray diff --git a/stumpy/aamped.py b/stumpy/aamped.py index d833ee8b3..ad147b42f 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -47,6 +47,10 @@ def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. 
+ Returns ------- out : numpy.ndarray diff --git a/stumpy/stump.py b/stumpy/stump.py index 9f37edc8b..bcf0d4103 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -143,8 +143,8 @@ def _compute_diagonal( `False`. Default is `True`. k : int - The number of smallest elements in distance profile that should be stored - for constructing the top-k matrix profile. + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. Returns ------- @@ -326,28 +326,28 @@ def _stump( `False`. Default is `True`. k : int - The number of smallest elements in distance profile that should be stored - for constructing the top-k matrix profile. + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. Returns ------- profile : numpy.ndarray - Top-k matrix profile + The (top-k) matrix profile indices : numpy.ndarray - Top-k matrix profile indices + The (top-k) matrix profile indices left profile : numpy.ndarray - Top-1 left matrix profile + The (top-1) left matrix profile left indices : numpy.ndarray - Top-1 left matrix profile indices + The (top-1) left matrix profile indices right profile : numpy.ndarray - Top-1 right matrix profile + The (top-1) right matrix profile right indices : numpy.ndarray - Top-1 right matrix profile indices + The (top-1) right matrix profile indices Notes ----- @@ -484,12 +484,11 @@ def _stump( ρR[0, i] = ρR[thread_idx, i] IR[0, i] = IR[thread_idx, i] - # Convert top-k pearson correlations to distances. The arrays ρ (and so I) should - # be reversed since ρ is in ascending order. 
+ # Reverse top-k rho (and its associated I) to be in descending order and + # then convert from Pearson correlations to Euclidean distances (ascending order) p_norm = np.abs(2 * m * (1 - ρ[0, :, ::-1])) I = I[0, :, ::-1] - # Convert top-1 left/right pearson correlations to distances. p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) @@ -547,16 +546,21 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): ignored when `normalize == True`. k : int, default 1 - The number of smallest elements in distance profile that should be stored - for constructing the top-k matrix profile. + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. Returns ------- out : numpy.ndarray - The first k columns consist of the top-k matrix profile, the next k columns - consist of their corresponding matrix profile indices, the column at - numpy indexing 2k contains the top-1 left matrix profile indices and the last - column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. + When k = 1 (default), the first column consists of the matrix profile, + the second column consists of the matrix profile indices, the third column + consists of the left matrix profile indices, and the fourth column consists of + the right matrix profile indices. However, when k > 1, the output array will + contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists + of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists + of the corresponding top-k matrix profile indices, and the last two columns + (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to + the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. 
See Also -------- diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 34d665fc7..99a1ba0b1 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -56,17 +56,22 @@ def stumped( The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. - k : int - The number of smallest elements in distance profile that should be stored - for constructing the top-k matrix profile. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. Returns ------- out : numpy.ndarray - The first k columns consist of the top-k matrix profile, the next k columns - consist of their corresponding matrix profile indices, the column at - numpy indexing 2k contains the top-1 left matrix profile indices and the last - column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. + When k = 1 (default), the first column consists of the matrix profile, + the second column consists of the matrix profile indices, the third column + consists of the left matrix profile indices, and the fourth column consists of + the right matrix profile indices. However, when k > 1, the output array will + contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists + of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists + of the corresponding top-k matrix profile indices, and the last two columns + (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to + the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. 
See Also -------- From 5ab2978f9c09589e7cbc6279d7c5fb27c07d9723 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 20 May 2022 22:17:51 -0600 Subject: [PATCH 094/151] Fixed order of outputs returned in _stump --- stumpy/scrump.py | 2 +- stumpy/stumped.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 9fcb51e4b..c547ab02b 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -609,7 +609,7 @@ def update(self): if self._chunk_idx < self._n_chunks: start_idx, stop_idx = self._chunk_diags_ranges[self._chunk_idx] - P, I, PL, IL, PR, IR = _stump( + P, PL, PR, I, IL, IR = _stump( self._T_A, self._T_B, self._m, diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 99a1ba0b1..1c8b2cd80 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -268,15 +268,15 @@ def stumped( results = dask_client.gather(futures) ( profile[:, :k], - indices[:, :k], profile_L, - indices_L, profile_R, + indices[:, :k], + indices_L, indices_R, ) = results[0] for i in range(1, len(hosts)): - P, I, PL, IL, PR, IR = results[i] + P, PL, PR, I, IL, IR = results[i] profile[:, k:] = P indices[:, k:] = I From 6460a5bc57a4a6ecc3beff8a45f6262cfd47807b Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 21 May 2022 11:23:00 -0600 Subject: [PATCH 095/151] Add new function to update TopK MatrixProfile --- stumpy/stumped.py | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 1c8b2cd80..e9784e28f 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -5,6 +5,7 @@ import logging import numpy as np +from numba import njit, prange from . 
import core, config from .stump import _stump @@ -13,6 +14,35 @@ logger = logging.getLogger(__name__) +@njit(parallel=True) +def _merge_topk_profiles_indices(PA, PB, IA, IB): + """ + Merge two top-k matrix profiles while prioritizing values of PA in ties + and update PA (and so IA) + + PA : numpy.ndarray + a (top-k) matrix profile + + PB : numpy.ndarray + a (top-k) matrix profile + + IA : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PA + + IB : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PB + """ + for i in prange(PA.shape[0]): + for j in range(PA.shape[1]): + if PB[i, j] < PA[i, -1]: + idx = np.searchsorted(PA[i], PB[i, j], side="right") + + PA[i, idx + 1 :] = PA[i, idx:-1] + PA[i, idx] = PB[i, j] + IA[i, idx + 1 :] = IA[i, idx:-1] + IA[i, idx] = IB[i, j] + + @core.non_normalized(aamped) def stumped( dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 @@ -266,23 +296,12 @@ def stumped( indices = np.empty((l, 2 * k)) results = dask_client.gather(futures) - ( - profile[:, :k], - profile_L, - profile_R, - indices[:, :k], - indices_L, - indices_R, - ) = results[0] + profile, profile_L, profile_R, indices, indices_L, indices_R = results[0] for i in range(1, len(hosts)): P, PL, PR, I, IL, IR = results[i] - - profile[:, k:] = P - indices[:, k:] = I - idx = np.argsort(profile, axis=1) - profile = np.take_along_axis(profile, idx, axis=1) - indices = np.take_along_axis(indices, idx, axis=1) + # Update top-k matrix profile and matrix profile indices + _merge_topk_profiles_indices(profile, P, indices, I) # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L From d94db722bb2b3150a38008a323ccd117f4bfc1c2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 21 May 2022 11:34:38 -0600 Subject: [PATCH 096/151] Add .copy() to update array properly --- stumpy/stumped.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 
e9784e28f..01606f5bf 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -37,9 +37,9 @@ def _merge_topk_profiles_indices(PA, PB, IA, IB): if PB[i, j] < PA[i, -1]: idx = np.searchsorted(PA[i], PB[i, j], side="right") - PA[i, idx + 1 :] = PA[i, idx:-1] + PA[i, idx + 1 :] = PA[i, idx:-1].copy() PA[i, idx] = PB[i, j] - IA[i, idx + 1 :] = IA[i, idx:-1] + IA[i, idx + 1 :] = IA[i, idx:-1].copy() IA[i, idx] = IB[i, j] From 72c3887b014f6be2fe89030013179bfd182bc1c1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 21 May 2022 12:31:08 -0600 Subject: [PATCH 097/151] Add new test function for TopK MatrixProfile with gpu_stump --- tests/test_gpu_stump.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 508b02a56..1a2662647 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -350,3 +350,20 @@ def test_gpu_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_gpu_stump_self_join_KNN(T_A, T_B): + k = 3 + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) + comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 0068358c8c771d950090d62779a9fd30336f2bfc Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 00:15:00 -0600 Subject: [PATCH 098/151] Enhance gpu_stump to return TopK MatrixProfile --- stumpy/gpu_stump.py | 247 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 198 insertions(+), 49 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 
667dd8b56..606bf7faf 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -7,7 +7,7 @@ import os import numpy as np -from numba import cuda +from numba import cuda, njit, prange from . import core, config from .gpu_aamp import gpu_aamp @@ -15,9 +15,38 @@ logger = logging.getLogger(__name__) +@njit(parallel=True) +def _merge_topk_profiles_indices(PA, PB, IA, IB): + """ + Merge two top-k matrix profiles while prioritizing values of PA in ties + and update PA (and so IA) + + PA : numpy.ndarray + a (top-k) matrix profile + + PB : numpy.ndarray + a (top-k) matrix profile + + IA : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PA + + IB : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PB + """ + for i in range(PA.shape[0]): + for j in range(PA.shape[1]): + if PB[i, j] < PA[i, -1]: + idx = np.searchsorted(PA[i], PB[i, j], side="right") + + PA[i, idx + 1 :] = PA[i, idx:-1].copy() + PA[i, idx] = PB[i, j] + IA[i, idx + 1 :] = IA[i, idx:-1].copy() + IA[i, idx] = IB[i, j] + + @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," - "f8[:], f8[:], i8, b1, i8, f8[:, :], i8[:, :], b1)" + "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i2)" ) def _compute_and_update_PI_kernel( i, @@ -31,12 +60,17 @@ def _compute_and_update_PI_kernel( Σ_T, μ_Q, σ_Q, - k, + profile_len, ignore_trivial, excl_zone, profile, + profile_L, + profile_R, indices, + indices_L, + indices_R, compute_QT, + k, ): """ A Numba CUDA kernel to update the matrix profile and matrix profile indices @@ -79,7 +113,7 @@ def _compute_and_update_PI_kernel( σ_Q : numpy.ndarray Standard deviation of the query sequence, `Q` - k : int + profile_len : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -91,18 +125,30 @@ def _compute_and_update_PI_kernel( sliding window profile : numpy.ndarray - Matrix profile. 
The first column consists of the global matrix profile, - the second column consists of the left matrix profile, and the third - column consists of the right matrix profile. + The (top-k) matrix profile, sorted in ascending order per row + + profile_L : numpy.ndarray + The (top-1) left matrix profile + + profile_R : numpy.ndarray + The (top-1) right matrix profile indices : numpy.ndarray - The first column consists of the matrix profile indices, the second - column consists of the left matrix profile indices, and the third - column consists of the right matrix profile indices. + The (top-k) matrix profile indices + + indices_L : numpy.ndarray + The (top-1) left matrix profile indices + + indices_R : numpy.ndarray + The (top-1) right matrix profile indices compute_QT : bool A boolean flag for whether or not to compute QT + k : int + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. 
+ Returns ------- None @@ -126,7 +172,7 @@ def _compute_and_update_PI_kernel( for j in range(start, QT_out.shape[0], stride): zone_start = max(0, j - excl_zone) - zone_stop = min(k, j + excl_zone) + zone_stop = min(profile_len, j + excl_zone) if compute_QT: QT_out[j] = ( @@ -157,16 +203,22 @@ def _compute_and_update_PI_kernel( if ignore_trivial: if i <= zone_stop and i >= zone_start: p_norm = np.inf - if p_norm < profile[j, 1] and i < j: - profile[j, 1] = p_norm - indices[j, 1] = i - if p_norm < profile[j, 2] and i > j: - profile[j, 2] = p_norm - indices[j, 2] = i - - if p_norm < profile[j, 0]: - profile[j, 0] = p_norm - indices[j, 0] = i + if p_norm < profile_L[j] and i < j: + profile_L[j] = p_norm + indices_L[j] = i + if p_norm < profile_R[j] and i > j: + profile_R[j] = p_norm + indices_R[j] = i + + for idx in range(k, -1, -1): + if (p_norm < profile[j, idx - 1]) and (idx > 0): + profile[j, idx - 1] = profile[j, idx - 2] + indices[j, idx - 1] = indices[j, idx - 2] + else: + break + if idx < k: + profile[j, idx] = p_norm + indices[j, idx] = i def _gpu_stump( @@ -181,10 +233,11 @@ def _gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + profile_len, ignore_trivial=True, range_start=1, device_id=0, + k=1, ): """ A Numba CUDA version of STOMP for parallel computation of the @@ -235,7 +288,7 @@ def _gpu_stump( The file name for the standard deviation of the query sequence, `Q`, relative to the current sliding window - k : int + profile_len : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -249,6 +302,10 @@ def _gpu_stump( device_id : int The (GPU) device number to use. The default value is `0`. + k : int + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. 
+ Returns ------- profile_fname : str @@ -316,11 +373,22 @@ def _gpu_stump( device_M_T = cuda.to_device(M_T) device_Σ_T = cuda.to_device(Σ_T) - profile = np.full((k, 3), np.inf, dtype=np.float64) - indices = np.full((k, 3), -1, dtype=np.int64) + profile = np.full((profile_len, k), np.inf, dtype=np.float64) + indices = np.full((profile_len, k), -1, dtype=np.int64) + + profile_L = np.full(profile_len, np.inf, dtype=np.float64) + indices_L = np.full(profile_len, -1, dtype=np.int64) + + profile_R = np.full(profile_len, np.inf, dtype=np.float64) + indices_R = np.full(profile_len, -1, dtype=np.int64) device_profile = cuda.to_device(profile) + device_profile_L = cuda.to_device(profile_L) + device_profile_R = cuda.to_device(profile_R) device_indices = cuda.to_device(indices) + device_indices_L = cuda.to_device(indices_L) + device_indices_R = cuda.to_device(indices_R) + _compute_and_update_PI_kernel[blocks_per_grid, threads_per_block]( range_start - 1, device_T_A, @@ -333,12 +401,17 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - k, + profile_len, ignore_trivial, excl_zone, device_profile, + device_profile_L, + device_profile_R, device_indices, + device_indices_L, + device_indices_R, False, + k, ) for i in range(range_start, range_stop): @@ -354,27 +427,50 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - k, + profile_len, ignore_trivial, excl_zone, device_profile, + device_profile_L, + device_profile_R, device_indices, + device_indices_L, + device_indices_R, True, + k, ) profile = device_profile.copy_to_host() + profile_L = device_profile_L.copy_to_host() + profile_R = device_profile_R.copy_to_host() indices = device_indices.copy_to_host() + indices_L = device_indices_L.copy_to_host() + indices_R = device_indices_R.copy_to_host() + profile = np.sqrt(profile) + profile_L = np.sqrt(profile_L) + profile_R = np.sqrt(profile_R) profile_fname = core.array_to_temp_file(profile) + profile_L_fname = core.array_to_temp_file(profile_L) + profile_R_fname = 
core.array_to_temp_file(profile_R) indices_fname = core.array_to_temp_file(indices) + indices_L_fname = core.array_to_temp_file(indices_L) + indices_R_fname = core.array_to_temp_file(indices_R) - return profile_fname, indices_fname + return ( + profile_fname, + profile_L_fname, + profile_R_fname, + indices_fname, + indices_L_fname, + indices_R_fname, + ) @core.non_normalized(gpu_aamp) def gpu_stump( - T_A, m, T_B=None, ignore_trivial=True, device_id=0, normalize=True, p=2.0 + T_A, m, T_B=None, ignore_trivial=True, device_id=0, normalize=True, p=2.0, k=1 ): """ Compute the z-normalized matrix profile with one or more GPU devices @@ -417,13 +513,22 @@ def gpu_stump( The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. + Returns ------- out : numpy.ndarray - The first column consists of the matrix profile, the second column - consists of the matrix profile indices, the third column consists of - the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. + When k = 1 (default), the first column consists of the matrix profile, + the second column consists of the matrix profile indices, the third column + consists of the left matrix profile indices, and the fourth column consists of + the right matrix profile indices. However, when k > 1, the output array will + contain exactly 2 * k + 2 columns. 
The first k columns (i.e., out[:, :k]) consists + of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists + of the corresponding top-k matrix profile indices, and the last two columns + (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to + the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- @@ -505,7 +610,7 @@ def gpu_stump( logger.warning("Try setting `ignore_trivial = False`.") n = T_B.shape[0] - k = T_A.shape[0] - m + 1 + profile_len = T_A.shape[0] - m + 1 l = n - m + 1 excl_zone = int( np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM) @@ -518,8 +623,6 @@ def gpu_stump( μ_Q_fname = core.array_to_temp_file(μ_Q) σ_Q_fname = core.array_to_temp_file(σ_Q) - out = np.empty((k, 4), dtype=object) - if isinstance(device_id, int): device_ids = [device_id] else: @@ -528,6 +631,12 @@ def gpu_stump( profile = [None] * len(device_ids) indices = [None] * len(device_ids) + profile_L = [None] * len(device_ids) + indices_L = [None] * len(device_ids) + + profile_R = [None] * len(device_ids) + indices_R = [None] * len(device_ids) + for _id in device_ids: with cuda.gpus[_id]: if ( @@ -571,16 +680,24 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + profile_len, ignore_trivial, start + 1, device_ids[idx], + k, ), ) else: # Execute last chunk in parent process # Only parent process is executed when a single GPU is requested - profile[idx], indices[idx] = _gpu_stump( + ( + profile[idx], + profile_L[idx], + profile_R[idx], + indices[idx], + indices_L[idx], + indices_R[idx], + ) = _gpu_stump( T_A_fname, T_B_fname, m, @@ -592,10 +709,11 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + profile_len, ignore_trivial, start + 1, device_ids[idx], + k, ) # Clean up process pool for multi-GPU request @@ -606,7 +724,14 @@ def gpu_stump( # Collect results from spawned child processes if they exist for idx, result in enumerate(results): if result is 
not None: - profile[idx], indices[idx] = result.get() + ( + profile[idx], + profile_L[idx], + profile_R[idx], + indices[idx], + indices_L[idx], + indices_R[idx], + ) = result.get() os.remove(T_A_fname) os.remove(T_B_fname) @@ -621,22 +746,46 @@ def gpu_stump( for idx in range(len(device_ids)): profile_fname = profile[idx] + profile_L_fname = profile_L[idx] + profile_R_fname = profile_R[idx] indices_fname = indices[idx] + indices_L_fname = indices_L[idx] + indices_R_fname = indices_R[idx] + profile[idx] = np.load(profile_fname, allow_pickle=False) + profile_L[idx] = np.load(profile_L_fname, allow_pickle=False) + profile_R[idx] = np.load(profile_R_fname, allow_pickle=False) indices[idx] = np.load(indices_fname, allow_pickle=False) + indices_L[idx] = np.load(indices_L_fname, allow_pickle=False) + indices_R[idx] = np.load(indices_R_fname, allow_pickle=False) + os.remove(profile_fname) + os.remove(profile_L_fname) + os.remove(profile_R_fname) os.remove(indices_fname) + os.remove(indices_L_fname) + os.remove(indices_R_fname) for i in range(1, len(device_ids)): - # Update all matrix profiles and matrix profile indices - # (global, left, right) and store in profile[0] and indices[0] - for col in range(profile[0].shape[1]): # pragma: no cover - cond = profile[0][:, col] < profile[i][:, col] - profile[0][:, col] = np.where(cond, profile[0][:, col], profile[i][:, col]) - indices[0][:, col] = np.where(cond, indices[0][:, col], indices[i][:, col]) - - out[:, 0] = profile[0][:, 0] - out[:, 1:4] = indices[0][:, :] + # Update (top-k) matrix profile and matrix profile indices + _merge_topk_profiles_indices(profile[0], profile[i], indices[0], indices[i]) + + # Update (top-1) left matrix profile and matrix profil indices + cond = profile_L[0] < profile_L[i] + profile_L[0] = np.where(cond, profile_L[0], profile_L[i]) + indices_L[0] = np.where(cond, indices_L[0], indices_L[i]) + + # Update (top-1) right matrix profile and matrix profil indices + cond = profile_R[0] < profile_R[i] + 
profile_R[0] = np.where(cond, profile_R[0], profile_R[i]) + indices_R[0] = np.where(cond, indices_R[0], indices_R[i]) + + out = np.empty( + (profile_len, 2 * k + 2), dtype=object + ) # last two columns are to store + # (top-1) left/right matrix profile indices + out[:, :k] = profile[0] + out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From 1e7c05e0dce914ca2fc8fbc39cb5411b4fd5fb03 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 14:16:40 -0600 Subject: [PATCH 099/151] Refactored function for merging two TopK MatrixProfile --- stumpy/core.py | 37 ++++++++++++++++++++++++++++++++++++- stumpy/gpu_stump.py | 41 +++++++---------------------------------- stumpy/stumped.py | 33 +-------------------------------- 3 files changed, 44 insertions(+), 67 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index a2a30c043..64dee293c 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -7,7 +7,7 @@ import inspect import numpy as np -from numba import njit +from numba import njit, prange from scipy.signal import convolve from scipy.ndimage import maximum_filter1d, minimum_filter1d from scipy import linalg @@ -2494,3 +2494,38 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): MPdist = partition[k] return MPdist + + +@njit(parallel=True) +def _merge_topk_profiles_indices(PA, PB, IA, IB): + """ + Merge two top-k matrix profiles PA and PB, and update PA (in place) while + prioritizing values of PA in ties. Also, update IA accordingly. 
+ + Parameters + ---------- + PA : numpy.ndarray + a (top-k) matrix profile + + PB : numpy.ndarray + a (top-k) matrix profile + + IA : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PA + + IB : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PB + + Returns + ------- + None + """ + for i in prange(PA.shape[0]): + for j in range(PA.shape[1]): + if PB[i, j] < PA[i, -1]: + idx = np.searchsorted(PA[i], PB[i, j], side="right") + + PA[i, idx + 1 :] = PA[i, idx:-1].copy() + PA[i, idx] = PB[i, j] + IA[i, idx + 1 :] = IA[i, idx:-1].copy() + IA[i, idx] = IB[i, j] diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 606bf7faf..2df5b14b1 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -7,7 +7,7 @@ import os import numpy as np -from numba import cuda, njit, prange +from numba import cuda from . import core, config from .gpu_aamp import gpu_aamp @@ -15,35 +15,6 @@ logger = logging.getLogger(__name__) -@njit(parallel=True) -def _merge_topk_profiles_indices(PA, PB, IA, IB): - """ - Merge two top-k matrix profiles while prioritizing values of PA in ties - and update PA (and so IA) - - PA : numpy.ndarray - a (top-k) matrix profile - - PB : numpy.ndarray - a (top-k) matrix profile - - IA : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PA - - IB : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PB - """ - for i in range(PA.shape[0]): - for j in range(PA.shape[1]): - if PB[i, j] < PA[i, -1]: - idx = np.searchsorted(PA[i], PB[i, j], side="right") - - PA[i, idx + 1 :] = PA[i, idx:-1].copy() - PA[i, idx] = PB[i, j] - IA[i, idx + 1 :] = IA[i, idx:-1].copy() - IA[i, idx] = IB[i, j] - - @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i2)" @@ -209,7 +180,7 @@ def _compute_and_update_PI_kernel( if p_norm < profile_R[j] and i > j: profile_R[j] = p_norm indices_R[j] = i - + for idx in range(k, -1, 
-1): if (p_norm < profile[j, idx - 1]) and (idx > 0): profile[j, idx - 1] = profile[j, idx - 2] @@ -766,9 +737,11 @@ def gpu_stump( os.remove(indices_L_fname) os.remove(indices_R_fname) + profile_0 = profile[0].copy() + indices_0 = indices[0].copy() for i in range(1, len(device_ids)): # Update (top-k) matrix profile and matrix profile indices - _merge_topk_profiles_indices(profile[0], profile[i], indices[0], indices[i]) + core._merge_topk_profiles_indices(profile_0, profile[i], indices_0, indices[i]) # Update (top-1) left matrix profile and matrix profil indices cond = profile_L[0] < profile_L[i] @@ -784,8 +757,8 @@ def gpu_stump( (profile_len, 2 * k + 2), dtype=object ) # last two columns are to store # (top-1) left/right matrix profile indices - out[:, :k] = profile[0] - out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) + out[:, :k] = profile_0 + out[:, k:] = np.column_stack((indices_0, indices_L[0], indices_R[0])) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 01606f5bf..0667713d3 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -5,7 +5,6 @@ import logging import numpy as np -from numba import njit, prange from . 
import core, config from .stump import _stump @@ -13,36 +12,6 @@ logger = logging.getLogger(__name__) - -@njit(parallel=True) -def _merge_topk_profiles_indices(PA, PB, IA, IB): - """ - Merge two top-k matrix profiles while prioritizing values of PA in ties - and update PA (and so IA) - - PA : numpy.ndarray - a (top-k) matrix profile - - PB : numpy.ndarray - a (top-k) matrix profile - - IA : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PA - - IB : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PB - """ - for i in prange(PA.shape[0]): - for j in range(PA.shape[1]): - if PB[i, j] < PA[i, -1]: - idx = np.searchsorted(PA[i], PB[i, j], side="right") - - PA[i, idx + 1 :] = PA[i, idx:-1].copy() - PA[i, idx] = PB[i, j] - IA[i, idx + 1 :] = IA[i, idx:-1].copy() - IA[i, idx] = IB[i, j] - - @core.non_normalized(aamped) def stumped( dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 @@ -301,7 +270,7 @@ def stumped( for i in range(1, len(hosts)): P, PL, PR, I, IL, IR = results[i] # Update top-k matrix profile and matrix profile indices - _merge_topk_profiles_indices(profile, P, indices, I) + core._merge_topk_profiles_indices(profile, P, indices, I) # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L From 2ebc276498eab50fb08c3f1f2ecf30db337eb80e Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 14:21:33 -0600 Subject: [PATCH 100/151] Clean up code --- stumpy/gpu_stump.py | 10 +++++----- stumpy/stumped.py | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 2df5b14b1..803b020f0 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -737,11 +737,11 @@ def gpu_stump( os.remove(indices_L_fname) os.remove(indices_R_fname) - profile_0 = profile[0].copy() - indices_0 = indices[0].copy() for i in range(1, len(device_ids)): # Update (top-k) matrix profile and matrix profile indices - 
core._merge_topk_profiles_indices(profile_0, profile[i], indices_0, indices[i]) + core._merge_topk_profiles_indices( + profile[0], profile[i], indices[0], indices[i] + ) # Update (top-1) left matrix profile and matrix profil indices cond = profile_L[0] < profile_L[i] @@ -757,8 +757,8 @@ def gpu_stump( (profile_len, 2 * k + 2), dtype=object ) # last two columns are to store # (top-1) left/right matrix profile indices - out[:, :k] = profile_0 - out[:, k:] = np.column_stack((indices_0, indices_L[0], indices_R[0])) + out[:, :k] = profile[0] + out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 0667713d3..17e0d556c 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) + @core.non_normalized(aamped) def stumped( dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 @@ -283,8 +284,8 @@ def stumped( indices_R = np.where(cond, IR, indices_R) out = np.empty((l, 2 * k + 2), dtype=object) - out[:, :k] = profile[:, :k] - out[:, k:] = np.column_stack((indices[:, :k], indices_L, indices_R)) + out[:, :k] = profile + out[:, k:] = np.column_stack((indices, indices_L, indices_R)) # Delete data from Dask cluster dask_client.cancel(T_A_future) From 1170f2ebd770ed4f70aa3048dd4e6778bb723c53 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 15:44:18 -0600 Subject: [PATCH 101/151] Add naive version of merge_topk_matrix_profile function --- tests/test_core.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 6ef78d230..c26dd449d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -82,6 +82,15 @@ def naive_bsf_indices(n): return np.array(out) +def naive_merge_topk_profiles_indices(PA, PB, IA, IB): + profile = np.column_stack((PA, PB)) + indices = 
np.column_stack((IA, IB)) + + idx = np.argsort(profile, axis=1) + PA[:, :] = np.take_along_axis(profile, idx, axis=1)[:, : PA.shape[1]] + IA[:, :] = np.take_along_axis(indices, idx, axis=1)[:, : PA.shape[1]] + + test_data = [ (np.array([-1, 1, 2], dtype=np.float64), np.array(range(5), dtype=np.float64)), ( From 2a827b450df582b95d68a0578ca82ced758fe7f1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 15:48:54 -0600 Subject: [PATCH 102/151] Rename function --- stumpy/core.py | 2 +- stumpy/gpu_stump.py | 2 +- stumpy/stumped.py | 2 +- tests/test_core.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 64dee293c..89b6266fc 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2497,7 +2497,7 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): @njit(parallel=True) -def _merge_topk_profiles_indices(PA, PB, IA, IB): +def _merge_topk_PI(PA, PB, IA, IB): """ Merge two top-k matrix profiles PA and PB, and update PA (in place) while prioritizing values of PA in ties. Also, update IA accordingly. 
diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 803b020f0..cc4537813 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -739,7 +739,7 @@ def gpu_stump( for i in range(1, len(device_ids)): # Update (top-k) matrix profile and matrix profile indices - core._merge_topk_profiles_indices( + core._merge_topk_PI( profile[0], profile[i], indices[0], indices[i] ) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 17e0d556c..0f6459db5 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -271,7 +271,7 @@ def stumped( for i in range(1, len(hosts)): P, PL, PR, I, IL, IR = results[i] # Update top-k matrix profile and matrix profile indices - core._merge_topk_profiles_indices(profile, P, indices, I) + core._merge_topk_PI(profile, P, indices, I) # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L diff --git a/tests/test_core.py b/tests/test_core.py index c26dd449d..95dc268d3 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -82,7 +82,7 @@ def naive_bsf_indices(n): return np.array(out) -def naive_merge_topk_profiles_indices(PA, PB, IA, IB): +def naive_merge_topk_PI(PA, PB, IA, IB): profile = np.column_stack((PA, PB)) indices = np.column_stack((IA, IB)) From cc62c74f11f229dcc7bd98aabba2759cda91260f Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 16:13:24 -0600 Subject: [PATCH 103/151] Revise naive function to make it more readable --- tests/test_core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 95dc268d3..4585de1af 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -87,9 +87,11 @@ def naive_merge_topk_PI(PA, PB, IA, IB): indices = np.column_stack((IA, IB)) idx = np.argsort(profile, axis=1) - PA[:, :] = np.take_along_axis(profile, idx, axis=1)[:, : PA.shape[1]] - IA[:, :] = np.take_along_axis(indices, idx, axis=1)[:, : PA.shape[1]] + profile = np.take_along_axis(profile, idx, axis=1) + indices = 
np.take_along_axis(indices, idx, axis=1) + PA[:, :] = profile[:, : PA.shape[1]] + IA[:, :] = indices[:, : PA.shape[1]] test_data = [ (np.array([-1, 1, 2], dtype=np.float64), np.array(range(5), dtype=np.float64)), From b6b74c4edaa2cb2bf5c3a45987b631cf1a76ab9e Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 17:32:44 -0600 Subject: [PATCH 104/151] Add test function for merge_topk_PI --- tests/test_core.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 4585de1af..8e29c2f1a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1039,3 +1039,51 @@ def test_select_P_ABBA_val_inf(): p_abba.sort() ref = p_abba[k - 1] npt.assert_almost_equal(ref, comp) + + +def test_merge_topk_PI(): + PA = np.array([ + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, np.inf, np.inf, np.inf], + [np.inf, np.inf, np.inf, np.inf, np.inf] + ]) + + PB = np.array([ + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.15, 0.25, 0.35, 0.45], + [0.15, 0.25, 0.35, 0.45, 0.55], + [0.01, 0.02, 0.03, 0.04, 0.05], + [0.6, 0.7, 0.8, 0.9, 1], + [0.1, 0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.0, 0.3, np.inf, np.inf, np.inf], + [np.inf, np.inf, np.inf, np.inf, np.inf], + ]) + + n, k = PA.shape + + IA = np.arange(n * k).reshape(n, k) + IB = IA.copy() + n * k + IA[7, 2:] = -1 + IA[8, :] = -1 + IB[7, 2:] = -1 + IB[8, :] = -1 + + ref_P = PA.copy() + ref_I = IA.copy() + + comp_P = PA.copy() + comp_I = IA.copy() + + naive_merge_topk_PI(ref_P, PB, ref_I, IB) + core._merge_topk_PI(comp_P, PB, comp_I, IB) + + ref = np.column_stack((ref_P, ref_I)) + comp = np.column_stack((comp_P, comp_I)) + npt.assert_array_equal(ref, comp) From b6d6450850453bfde5c932d129e79e063435b9f8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 17:37:51 
-0600 Subject: [PATCH 105/151] Moved naive function to naive.py --- tests/naive.py | 12 ++++++++++++ tests/test_core.py | 13 +------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 4a5ed789a..3074c2359 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1760,3 +1760,15 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w ) return total_ndists + + +def merge_topk_PI(PA, PB, IA, IB): + profile = np.column_stack((PA, PB)) + indices = np.column_stack((IA, IB)) + + idx = np.argsort(profile, axis=1) + profile = np.take_along_axis(profile, idx, axis=1) + indices = np.take_along_axis(indices, idx, axis=1) + + PA[:, :] = profile[:, : PA.shape[1]] + IA[:, :] = indices[:, : PA.shape[1]] diff --git a/tests/test_core.py b/tests/test_core.py index 8e29c2f1a..e45f8c600 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -82,17 +82,6 @@ def naive_bsf_indices(n): return np.array(out) -def naive_merge_topk_PI(PA, PB, IA, IB): - profile = np.column_stack((PA, PB)) - indices = np.column_stack((IA, IB)) - - idx = np.argsort(profile, axis=1) - profile = np.take_along_axis(profile, idx, axis=1) - indices = np.take_along_axis(indices, idx, axis=1) - - PA[:, :] = profile[:, : PA.shape[1]] - IA[:, :] = indices[:, : PA.shape[1]] - test_data = [ (np.array([-1, 1, 2], dtype=np.float64), np.array(range(5), dtype=np.float64)), ( @@ -1081,7 +1070,7 @@ def test_merge_topk_PI(): comp_P = PA.copy() comp_I = IA.copy() - naive_merge_topk_PI(ref_P, PB, ref_I, IB) + naive.merge_topk_PI(ref_P, PB, ref_I, IB) core._merge_topk_PI(comp_P, PB, comp_I, IB) ref = np.column_stack((ref_P, ref_I)) From 97a04f457ca7c7542b768e504652bf2a9b0d7abf Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 17:46:32 -0600 Subject: [PATCH 106/151] Correct Format --- stumpy/gpu_stump.py | 4 +--- tests/test_core.py | 50 ++++++++++++++++++++++++--------------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff 
--git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index cc4537813..26e49cbb2 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -739,9 +739,7 @@ def gpu_stump( for i in range(1, len(device_ids)): # Update (top-k) matrix profile and matrix profile indices - core._merge_topk_PI( - profile[0], profile[i], indices[0], indices[i] - ) + core._merge_topk_PI(profile[0], profile[i], indices[0], indices[i]) # Update (top-1) left matrix profile and matrix profil indices cond = profile_L[0] < profile_L[i] diff --git a/tests/test_core.py b/tests/test_core.py index e45f8c600..707893d14 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1031,29 +1031,33 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): - PA = np.array([ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, np.inf, np.inf, np.inf], - [np.inf, np.inf, np.inf, np.inf, np.inf] - ]) - - PB = np.array([ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.15, 0.25, 0.35, 0.45], - [0.15, 0.25, 0.35, 0.45, 0.55], - [0.01, 0.02, 0.03, 0.04, 0.05], - [0.6, 0.7, 0.8, 0.9, 1], - [0.1, 0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.0, 0.3, np.inf, np.inf, np.inf], - [np.inf, np.inf, np.inf, np.inf, np.inf], - ]) + PA = np.array( + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, np.inf, np.inf, np.inf], + [np.inf, np.inf, np.inf, np.inf, np.inf], + ] + ) + + PB = np.array( + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.15, 0.25, 0.35, 0.45], + [0.15, 0.25, 0.35, 0.45, 0.55], + [0.01, 0.02, 0.03, 0.04, 0.05], + [0.6, 0.7, 0.8, 0.9, 1], + [0.1, 0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.0, 0.3, np.inf, np.inf, np.inf], + [np.inf, np.inf, np.inf, np.inf, np.inf], + ] + 
) n, k = PA.shape From 50f4ee8cf84b6f5958b9691d23d961a26d5f06b5 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 17:58:52 -0600 Subject: [PATCH 107/151] Correct Style --- stumpy/aamp.py | 3 ++- stumpy/aamped.py | 3 ++- stumpy/gpu_stump.py | 24 ++++++++++++++---------- stumpy/stump.py | 24 ++++++++++++++---------- stumpy/stumped.py | 18 ++++++++++-------- 5 files changed, 42 insertions(+), 30 deletions(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 428c3d4bd..82eb41639 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -270,7 +270,8 @@ def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- diff --git a/stumpy/aamped.py b/stumpy/aamped.py index ad147b42f..4499c58b5 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -49,7 +49,8 @@ def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 26e49cbb2..15583c58e 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -118,7 +118,8 @@ def _compute_and_update_PI_kernel( k : int The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. 
Returns ------- @@ -275,7 +276,8 @@ def _gpu_stump( k : int The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- @@ -486,20 +488,22 @@ def gpu_stump( k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- out : numpy.ndarray When k = 1 (default), the first column consists of the matrix profile, the second column consists of the matrix profile indices, the third column - consists of the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. However, when k > 1, the output array will - contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists - of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists - of the corresponding top-k matrix profile indices, and the last two columns - (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to - the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. + consists of the left matrix profile indices, and the fourth column consists + of the right matrix profile indices. However, when k > 1, the output array + will contain exactly 2 * k + 2 columns. 
The first k columns (i.e., out[:, :k]) + consists of the top-k matrix profile, the next set of k columns + (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile + indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or, + equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left + matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- diff --git a/stumpy/stump.py b/stumpy/stump.py index bcf0d4103..f5a5fe811 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -144,7 +144,8 @@ def _compute_diagonal( k : int The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- @@ -327,7 +328,8 @@ def _stump( k : int The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- @@ -547,20 +549,22 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- out : numpy.ndarray When k = 1 (default), the first column consists of the matrix profile, the second column consists of the matrix profile indices, the third column - consists of the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. However, when k > 1, the output array will - contain exactly 2 * k + 2 columns. 
The first k columns (i.e., out[:, :k]) consists - of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists - of the corresponding top-k matrix profile indices, and the last two columns - (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to - the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. + consists of the left matrix profile indices, and the fourth column consists + of the right matrix profile indices. However, when k > 1, the output array + will contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) + consists of the top-k matrix profile, the next set of k columns + (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile + indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or, + equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left + matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 0f6459db5..f98338ce9 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -58,20 +58,22 @@ def stumped( k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- out : numpy.ndarray When k = 1 (default), the first column consists of the matrix profile, the second column consists of the matrix profile indices, the third column - consists of the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. However, when k > 1, the output array will - contain exactly 2 * k + 2 columns. 
The first k columns (i.e., out[:, :k]) consists - of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists - of the corresponding top-k matrix profile indices, and the last two columns - (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to - the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. + consists of the left matrix profile indices, and the fourth column consists + of the right matrix profile indices. However, when k > 1, the output array + will contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) + consists of the top-k matrix profile, the next set of k columns + (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile + indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or, + equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left + matrix profile indices and the top-1 right matrix profile indices, respectively. 
See Also -------- From 5b7da52bf1a936a147d47321e06653a67da1db29 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 18:02:04 -0600 Subject: [PATCH 108/151] Add parameter k to avoid failure in non-normalized decorater unit test --- stumpy/gpu_aamp.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/stumpy/gpu_aamp.py b/stumpy/gpu_aamp.py index e62be7b02..0c9a21a85 100644 --- a/stumpy/gpu_aamp.py +++ b/stumpy/gpu_aamp.py @@ -339,7 +339,9 @@ def _gpu_aamp( return profile_fname, indices_fname -def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0): +def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0, k=1): + # function needs to be revised to return (top-k) matrix profile and + # matrix profile indices """ Compute the non-normalized (i.e., without z-normalization) matrix profile with one or more GPU devices @@ -375,6 +377,11 @@ def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. 
+ Returns ------- out : numpy.ndarray From e983ef0997ac2e4bcf1c14387be5ec617ec66a4d Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 20:34:58 -0600 Subject: [PATCH 109/151] Skip a for-loop in unit test coverage --- stumpy/gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 15583c58e..0d76e19b6 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -741,7 +741,7 @@ def gpu_stump( os.remove(indices_L_fname) os.remove(indices_R_fname) - for i in range(1, len(device_ids)): + for i in range(1, len(device_ids)): # pragma: no cover # Update (top-k) matrix profile and matrix profile indices core._merge_topk_PI(profile[0], profile[i], indices[0], indices[i]) From b0c5cace4951f97b201f7b42ca0d9627c22bf890 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 20:37:07 -0600 Subject: [PATCH 110/151] All tests pass From 787e3f761162475e556c7fb4bbc252796fa2f9a6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 21:20:50 -0600 Subject: [PATCH 111/151] Use randomly generated arrays for test function --- tests/test_core.py | 77 ++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 707893d14..21e08fd76 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1031,42 +1031,53 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): - PA = np.array( - [ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, np.inf, np.inf, np.inf], - [np.inf, np.inf, np.inf, np.inf, np.inf], - ] - ) - - PB = np.array( - [ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.15, 0.25, 0.35, 0.45], - [0.15, 0.25, 0.35, 0.45, 0.55], - [0.01, 0.02, 0.03, 0.04, 0.05], - [0.6, 0.7, 0.8, 0.9, 1], - [0.1, 0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, 
0.3, 0.4, 0.5], - [0.0, 0.3, np.inf, np.inf, np.inf], - [np.inf, np.inf, np.inf, np.inf, np.inf], - ] - ) - - n, k = PA.shape + n=50 + k=5 + + PA = np.random.randint(0, 5, size=(n, k)) + PA = np.sort(PA) + + PB = np.random.randint(0, 5, size=(n, k)) + PB = np.sort(PB) + + #PA = np.array( + # [ + # [0.0, 0.0, 0.0, 0.0, 0.0], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.1, 0.2, 0.3, 0.4], + # [0.1, 0.2, np.inf, np.inf, np.inf], + # [np.inf, np.inf, np.inf, np.inf, np.inf], + # ] + #) + + #PB = np.array( + # [ + # [0.0, 0.0, 0.0, 0.0, 0.0], + # [0.0, 0.15, 0.25, 0.35, 0.45], + # [0.15, 0.25, 0.35, 0.45, 0.55], + # [0.01, 0.02, 0.03, 0.04, 0.05], + # [0.6, 0.7, 0.8, 0.9, 1], + # [0.1, 0.1, 0.2, 0.3, 0.4], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.0, 0.3, np.inf, np.inf, np.inf], + # [np.inf, np.inf, np.inf, np.inf, np.inf], + # ] + #) IA = np.arange(n * k).reshape(n, k) IB = IA.copy() + n * k - IA[7, 2:] = -1 - IA[8, :] = -1 - IB[7, 2:] = -1 - IB[8, :] = -1 + + #n, k = PA.shape + #IA = np.arange(n * k).reshape(n, k) + #IB = IA.copy() + n * k + #IA[7, 2:] = -1 + #IA[8, :] = -1 + #IB[7, 2:] = -1 + #IB[8, :] = -1 ref_P = PA.copy() ref_I = IA.copy() From 2ff2b85d7a4ec6f3bbea3dd21c05681a15e62dc7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 23:01:32 -0600 Subject: [PATCH 112/151] Add minor comment --- stumpy/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stumpy/core.py b/stumpy/core.py index 89b6266fc..7528d5f85 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2525,6 +2525,8 @@ def _merge_topk_PI(PA, PB, IA, IB): if PB[i, j] < PA[i, -1]: idx = np.searchsorted(PA[i], PB[i, j], side="right") + # .copy() operation is needed to resolve wrong result that is + # caused by "prange" PA[i, idx + 1 :] = PA[i, idx:-1].copy() PA[i, idx] = PB[i, j] IA[i, idx + 1 :] = IA[i, idx:-1].copy() From c3060278426d583fa4a35c41b0c8758f8aa857a8 Mon 
Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 23:04:40 -0600 Subject: [PATCH 113/151] Erase unnecessary comments --- tests/test_core.py | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 21e08fd76..a1efbf681 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1031,8 +1031,8 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): - n=50 - k=5 + n = 50 + k = 5 PA = np.random.randint(0, 5, size=(n, k)) PA = np.sort(PA) @@ -1040,45 +1040,9 @@ def test_merge_topk_PI(): PB = np.random.randint(0, 5, size=(n, k)) PB = np.sort(PB) - #PA = np.array( - # [ - # [0.0, 0.0, 0.0, 0.0, 0.0], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.1, 0.2, 0.3, 0.4], - # [0.1, 0.2, np.inf, np.inf, np.inf], - # [np.inf, np.inf, np.inf, np.inf, np.inf], - # ] - #) - - #PB = np.array( - # [ - # [0.0, 0.0, 0.0, 0.0, 0.0], - # [0.0, 0.15, 0.25, 0.35, 0.45], - # [0.15, 0.25, 0.35, 0.45, 0.55], - # [0.01, 0.02, 0.03, 0.04, 0.05], - # [0.6, 0.7, 0.8, 0.9, 1], - # [0.1, 0.1, 0.2, 0.3, 0.4], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.0, 0.3, np.inf, np.inf, np.inf], - # [np.inf, np.inf, np.inf, np.inf, np.inf], - # ] - #) - IA = np.arange(n * k).reshape(n, k) IB = IA.copy() + n * k - #n, k = PA.shape - #IA = np.arange(n * k).reshape(n, k) - #IB = IA.copy() + n * k - #IA[7, 2:] = -1 - #IA[8, :] = -1 - #IB[7, 2:] = -1 - #IB[8, :] = -1 - ref_P = PA.copy() ref_I = IA.copy() From 898e9f366d3d4a0cebc1bfdddd20c722a2594f26 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 23 May 2022 11:56:42 -0600 Subject: [PATCH 114/151] Remove unnecessary copy operation --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index a1efbf681..3fa1447bd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py 
@@ -1041,7 +1041,7 @@ def test_merge_topk_PI(): PB = np.sort(PB) IA = np.arange(n * k).reshape(n, k) - IB = IA.copy() + n * k + IB = IA + n * k ref_P = PA.copy() ref_I = IA.copy() From 3541faec462fc0869af9bcb3b6eafc93469ebc21 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 24 May 2022 11:49:05 -0600 Subject: [PATCH 115/151] Major revision in function _merge_topk_PI - use PB to get number of iterations for the two most outer for-loops - improve Docstring - use start and stop to narrow down the search space - use for-loop instead of .copy() operation. --- stumpy/core.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index bce98964d..0cc858f93 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2505,10 +2505,12 @@ def _merge_topk_PI(PA, PB, IA, IB): Parameters ---------- PA : numpy.ndarray - a (top-k) matrix profile + a (top-k) matrix profile, with ndim of 2, where values in each row are + sorted in ascending order. Also, it needs to be the same shape as PB. PB : numpy.ndarray - a (top-k) matrix profile + a (top-k) matrix profile, with ndim of 2, where values in each row are + sorted in ascending order. Also, it needs to be the same shape as PA. 
IA : numpy.ndarray a (top-k) matrix profile indices, corresponding to PA @@ -2520,14 +2522,20 @@ def _merge_topk_PI(PA, PB, IA, IB): ------- None """ - for i in prange(PA.shape[0]): - for j in range(PA.shape[1]): + for i in prange(PB.shape[0]): + start = 0 + stop = np.searchsorted(PA[i], PB[i, -1], side="right") + + for j in range(PB.shape[1]): if PB[i, j] < PA[i, -1]: - idx = np.searchsorted(PA[i], PB[i, j], side="right") + idx = np.searchsorted(PA[i, start:stop], PB[i, j], side="right") + start + + for g in range(PB.shape[1] - 1, idx, -1): + PA[i, g] = PA[i, g - 1] + IA[i, g] = IA[i, g - 1] - # .copy() operation is needed to resolve wrong result that is - # caused by "prange" - PA[i, idx + 1 :] = PA[i, idx:-1].copy() PA[i, idx] = PB[i, j] - IA[i, idx + 1 :] = IA[i, idx:-1].copy() IA[i, idx] = IB[i, j] + + start = idx + stop += 1 # because of shifting elements to the right by one From ce8cd4c599b8763519b483a4c9c3f695dc445350 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 00:54:00 -0600 Subject: [PATCH 116/151] Add device function to find insertion index into sorted array --- stumpy/core.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/stumpy/core.py b/stumpy/core.py index 535471761..200980648 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2604,3 +2604,48 @@ def _merge_topk_PI(PA, PB, IA, IB): start = idx stop += 1 # because of shifting elements to the right by one + + +@cuda.jit("i8(f8[:], f8, i8[:], i8)", device=True) +def _gpu_searchsorted_right(a, v, bfs, nlevel): + """ + a device function in replace of numpy.searchsorted(a, v, side='right') + + Parameters + ---------- + a : numpy.ndarray + 1-dim array sorted in ascending order. + + v : float + value to insert into array `a` + + bfs : numpy.ndarray + the level order indices from the implicit construction of a binary + search tree followed by a breadth first (level order) search. 
+ + nlevel : int + the number of levels in the binary search tree based from which the array + `bfs` is obtained. + + Returns + ------- + idx : int + the index of the insertion point + """ + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v < a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel-1 or bfs[next_idx]<0: + if v < a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx From 09bbe7fb689e47330aacac6737e56d5d0d416356 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 01:11:01 -0600 Subject: [PATCH 117/151] Add test function for gpu_searchsorted --- tests/test_core.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 4437149d8..7423718ab 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1086,3 +1086,37 @@ def test_merge_topk_PI(): ref = np.column_stack((ref_P, ref_I)) comp = np.column_stack((comp_P, comp_I)) npt.assert_array_equal(ref, comp) + + +def test_gpu_searchsorted(): + # define a function the same as `core._gpu_searchsorted_right` but + # without cuda.jit decorator. 
+ def gpu_searchsorted_right(a, v, bfs, nlevel): + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v < a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel-1 or bfs[next_idx]<0: + if v < a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx + + for n in range(1, 100): + a = np.sort(np.random.rand(n)) + bfs = core._bfs_indices(n, fill_value=-1) + nlevel = np.floor(np.log2(n) + 1).astype(np.int64) + for i in range(n): + v = a[i] + npt.assert_almost_equal(gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + + v = a[i] + 0.001 + npt.assert_almost_equal(gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) From 4948667e38c3b76c1421f4ccf0aedf05c9d82f96 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 01:13:04 -0600 Subject: [PATCH 118/151] Correct format --- stumpy/core.py | 2 +- tests/test_core.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 200980648..3245bd216 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2640,7 +2640,7 @@ def _gpu_searchsorted_right(a, v, bfs, nlevel): else: next_idx = 2 * idx + 2 - if level == nlevel-1 or bfs[next_idx]<0: + if level == nlevel - 1 or bfs[next_idx] < 0: if v < a[bfs[idx]]: idx = max(bfs[idx], 0) else: diff --git a/tests/test_core.py b/tests/test_core.py index 7423718ab..152a58a01 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1100,7 +1100,7 @@ def gpu_searchsorted_right(a, v, bfs, nlevel): else: next_idx = 2 * idx + 2 - if level == nlevel-1 or bfs[next_idx]<0: + if level == nlevel - 1 or bfs[next_idx] < 0: if v < a[bfs[idx]]: idx = max(bfs[idx], 0) else: @@ -1116,7 +1116,13 @@ def gpu_searchsorted_right(a, v, bfs, nlevel): nlevel = np.floor(np.log2(n) + 1).astype(np.int64) for i in range(n): v = a[i] - npt.assert_almost_equal(gpu_searchsorted_right(a, v, bfs, 
nlevel), np.searchsorted(a, v, side="right")) + npt.assert_almost_equal( + gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) v = a[i] + 0.001 - npt.assert_almost_equal(gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + npt.assert_almost_equal( + gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) From cdd7a334ac69408a2ba6810f521b5419afc9ed02 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 01:25:55 -0600 Subject: [PATCH 119/151] Fixed minor bug --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 3245bd216..e2688459e 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -7,7 +7,7 @@ import inspect import numpy as np -from numba import njit, prange +from numba import cuda, njit, prange from scipy.signal import convolve from scipy.ndimage import maximum_filter1d, minimum_filter1d from scipy import linalg From 71ade4772dce47a9765c8f5081a02b523d8501fd Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 11:52:57 -0600 Subject: [PATCH 120/151] Fixed the name of a variable --- stumpy/gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 0d76e19b6..1b82707fb 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -319,7 +319,7 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. 
""" threads_per_block = config.STUMPY_THREADS_PER_BLOCK - blocks_per_grid = math.ceil(k / threads_per_block) + blocks_per_grid = math.ceil(profile_len / threads_per_block) T_A = np.load(T_A_fname, allow_pickle=False) T_B = np.load(T_B_fname, allow_pickle=False) From ac472fc2331f6ef03e2fb5b08fb0c05090d15341 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 15:02:19 -0600 Subject: [PATCH 121/151] Fixed grammatical error in docstring --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index e2688459e..9a7b1012b 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2624,7 +2624,7 @@ def _gpu_searchsorted_right(a, v, bfs, nlevel): search tree followed by a breadth first (level order) search. nlevel : int - the number of levels in the binary search tree based from which the array + the number of levels in the binary search tree from which the array `bfs` is obtained. Returns From fc149e688ce2f1bdac409ec66b7376a881edcd21 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 15:09:01 -0600 Subject: [PATCH 122/151] Use device function for searchsorting --- stumpy/gpu_stump.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 1b82707fb..d8ad43fe8 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -17,7 +17,7 @@ @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," - "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i2)" + "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i8[:], i8, i2)" ) def _compute_and_update_PI_kernel( i, @@ -41,6 +41,8 @@ def _compute_and_update_PI_kernel( indices_L, indices_R, compute_QT, + bfs, + nlevel, k, ): """ @@ -116,6 +118,14 @@ def _compute_and_update_PI_kernel( compute_QT : bool A boolean flag for whether or not to compute QT + bfs : numpy.ndarray + the level order indices from 
the implicit construction of a binary + search tree followed by a breadth first (level order) search. + + nlevel : int + the number of levels in the binary search tree from which the array + `bfs` is obtained. + k : int The number of top `k` smallest distances used to construct the matrix profile. Note that this will increase the total computational time and memory usage @@ -182,13 +192,12 @@ def _compute_and_update_PI_kernel( profile_R[j] = p_norm indices_R[j] = i - for idx in range(k, -1, -1): - if (p_norm < profile[j, idx - 1]) and (idx > 0): - profile[j, idx - 1] = profile[j, idx - 2] - indices[j, idx - 1] = indices[j, idx - 2] - else: - break - if idx < k: + if p_norm < profile[j, -1]: + idx = core._gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) + for g in range(k - 1, idx, -1): + profile[j, g] = profile[j, g - 1] + indices[j, g] = indices[j, g - 1] + profile[j, idx] = p_norm indices[j, idx] = i @@ -318,6 +327,10 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. """ + bfs = core._bfs_indices(k, fill_value=-1) + nlevel = np.floor(np.log2(k) + 1).astype(np.int64) # number of levels in + # binary seearch tree from which `bfs` is constructed. 
+ threads_per_block = config.STUMPY_THREADS_PER_BLOCK blocks_per_grid = math.ceil(profile_len / threads_per_block) @@ -384,6 +397,8 @@ def _gpu_stump( device_indices_L, device_indices_R, False, + bfs, + nlevel, k, ) @@ -410,6 +425,8 @@ def _gpu_stump( device_indices_L, device_indices_R, True, + bfs, + nlevel, k, ) From 7ac67a8302ddcbd0d3affc0538b891fe19a92b17 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 15:16:31 -0600 Subject: [PATCH 123/151] Correct style --- stumpy/core.py | 2 +- stumpy/gpu_stump.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 9a7b1012b..101813759 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2609,7 +2609,7 @@ def _merge_topk_PI(PA, PB, IA, IB): @cuda.jit("i8(f8[:], f8, i8[:], i8)", device=True) def _gpu_searchsorted_right(a, v, bfs, nlevel): """ - a device function in replace of numpy.searchsorted(a, v, side='right') + Device function to replace numpy.searchsorted(a, v, side='right') Parameters ---------- diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index d8ad43fe8..1a379eda0 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -17,7 +17,8 @@ @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," - "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i8[:], i8, i2)" + "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:]," + "b1, i8[:], i8, i2)" ) def _compute_and_update_PI_kernel( i, From 92467e24387e490b0289a37738261904ce3148d7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 17:27:06 -0600 Subject: [PATCH 124/151] Remove signature from cuda device function --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 101813759..f342d888e 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2606,7 +2606,7 @@ def _merge_topk_PI(PA, PB, IA, IB): stop += 1 # because of shifting elements to the right by one 
-@cuda.jit("i8(f8[:], f8, i8[:], i8)", device=True) +@cuda.jit(device=True) def _gpu_searchsorted_right(a, v, bfs, nlevel): """ Device function to replace numpy.searchsorted(a, v, side='right') From bdfb258ea5e516f1c064141fe3d1d15dc895b858 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 20:37:23 -0600 Subject: [PATCH 125/151] Full Coverage confirmed From bb5de99711bd580b77cc407cc5091ace97839c5c Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 20:45:12 -0600 Subject: [PATCH 126/151] revising the definiton of parameter bfs in docstring --- stumpy/core.py | 4 ++-- stumpy/gpu_stump.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index f342d888e..54eb29e4c 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2620,8 +2620,8 @@ def _gpu_searchsorted_right(a, v, bfs, nlevel): value to insert into array `a` bfs : numpy.ndarray - the level order indices from the implicit construction of a binary - search tree followed by a breadth first (level order) search. + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. nlevel : int the number of levels in the binary search tree from which the array diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 1a379eda0..d8d877078 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -120,8 +120,8 @@ def _compute_and_update_PI_kernel( A boolean flag for whether or not to compute QT bfs : numpy.ndarray - the level order indices from the implicit construction of a binary - search tree followed by a breadth first (level order) search. + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. 
nlevel : int the number of levels in the binary search tree from which the array From a005a415482dbce75a6030a5e0a3e98118cad333 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 30 May 2022 01:56:24 -0600 Subject: [PATCH 127/151] Copy array into device memory before passing it to kernel function --- stumpy/gpu_stump.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index d8d877078..a7682f52f 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -328,10 +328,6 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. """ - bfs = core._bfs_indices(k, fill_value=-1) - nlevel = np.floor(np.log2(k) + 1).astype(np.int64) # number of levels in - # binary seearch tree from which `bfs` is constructed. - threads_per_block = config.STUMPY_THREADS_PER_BLOCK blocks_per_grid = math.ceil(profile_len / threads_per_block) @@ -344,6 +340,11 @@ def _gpu_stump( μ_Q = np.load(μ_Q_fname, allow_pickle=False) σ_Q = np.load(σ_Q_fname, allow_pickle=False) + + device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) + nlevel = np.floor(np.log2(k) + 1).astype(np.int64) + # number of levels in # binary seearch tree from which `bfs` is constructed. 
+ with cuda.gpus[device_id]: device_T_A = cuda.to_device(T_A) device_QT_odd = cuda.to_device(QT) @@ -398,7 +399,7 @@ def _gpu_stump( device_indices_L, device_indices_R, False, - bfs, + device_bfs, nlevel, k, ) @@ -426,7 +427,7 @@ def _gpu_stump( device_indices_L, device_indices_R, True, - bfs, + device_bfs, nlevel, k, ) From ade9bb4f37295d1c0ae831672356a27ce625ff31 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 30 May 2022 16:58:35 -0600 Subject: [PATCH 128/151] use float values for generating arrays --- tests/test_core.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 152a58a01..e25d6a664 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1065,11 +1065,14 @@ def test_merge_topk_PI(): n = 50 k = 5 - PA = np.random.randint(0, 5, size=(n, k)) - PA = np.sort(PA) + PA = np.random.rand(n * k).reshape(n, k) + PA = np.sort(PA, axis=1) - PB = np.random.randint(0, 5, size=(n, k)) - PB = np.sort(PB) + PB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) + for i in range(n): + PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) + PB = np.sort(PB, axis=1) IA = np.arange(n * k).reshape(n, k) IB = IA + n * k @@ -1083,9 +1086,8 @@ def test_merge_topk_PI(): naive.merge_topk_PI(ref_P, PB, ref_I, IB) core._merge_topk_PI(comp_P, PB, comp_I, IB) - ref = np.column_stack((ref_P, ref_I)) - comp = np.column_stack((comp_P, comp_I)) - npt.assert_array_equal(ref, comp) + npt.assert_array_equal(ref_P, comp_P) + npt.assert_array_equal(ref_I, comp_I) def test_gpu_searchsorted(): From 853c2ec805e37b7839983856f9ba0e882da3730a Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 06:27:54 -0600 Subject: [PATCH 129/151] move device function to gpu_stump module --- stumpy/core.py | 45 --------------------------------------------- stumpy/gpu_stump.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 45 deletions(-) 
diff --git a/stumpy/core.py b/stumpy/core.py index 54eb29e4c..0ebb5ae50 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2604,48 +2604,3 @@ def _merge_topk_PI(PA, PB, IA, IB): start = idx stop += 1 # because of shifting elements to the right by one - - -@cuda.jit(device=True) -def _gpu_searchsorted_right(a, v, bfs, nlevel): - """ - Device function to replace numpy.searchsorted(a, v, side='right') - - Parameters - ---------- - a : numpy.ndarray - 1-dim array sorted in ascending order. - - v : float - value to insert into array `a` - - bfs : numpy.ndarray - The breadth-first-search indices where the missing leaves of its corresponding - binary search tree are filled with -1. - - nlevel : int - the number of levels in the binary search tree from which the array - `bfs` is obtained. - - Returns - ------- - idx : int - the index of the insertion point - """ - n = a.shape[0] - idx = 0 - for level in range(nlevel): - if v < a[bfs[idx]]: - next_idx = 2 * idx + 1 - else: - next_idx = 2 * idx + 2 - - if level == nlevel - 1 or bfs[next_idx] < 0: - if v < a[bfs[idx]]: - idx = max(bfs[idx], 0) - else: - idx = min(bfs[idx] + 1, n) - break - idx = next_idx - - return idx diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index a7682f52f..ec6db99d3 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -15,6 +15,51 @@ logger = logging.getLogger(__name__) +@cuda.jit(device=True) +def _gpu_searchsorted_right(a, v, bfs, nlevel): + """ + Device function to replace numpy.searchsorted(a, v, side='right') + + Parameters + ---------- + a : numpy.ndarray + 1-dim array sorted in ascending order. + + v : float + value to insert into array `a` + + bfs : numpy.ndarray + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. + + nlevel : int + the number of levels in the binary search tree from which the array + `bfs` is obtained. 
+ + Returns + ------- + idx : int + the index of the insertion point + """ + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v < a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel - 1 or bfs[next_idx] < 0: + if v < a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx + + @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:]," From e3b5119246a964bb46e560a525832fe68b397bf4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 06:29:33 -0600 Subject: [PATCH 130/151] Add gpu_searchsorted_left for the sake completeness --- stumpy/gpu_stump.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index ec6db99d3..c7f7aec16 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -15,6 +15,51 @@ logger = logging.getLogger(__name__) +@cuda.jit(device=True) +def _gpu_searchsorted_left(a, v, bfs, nlevel): + """ + Device function to replace numpy.searchsorted(a, v, side='left') + + Parameters + ---------- + a : numpy.ndarray + 1-dim array sorted in ascending order. + + v : float + value to insert into array `a` + + bfs : numpy.ndarray + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. + + nlevel : int + the number of levels in the binary search tree from which the array + `bfs` is obtained. 
+ + Returns + ------- + idx : int + the index of the insertion point + """ + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v <= a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel - 1 or bfs[next_idx] < 0: + if v <= a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx + + @cuda.jit(device=True) def _gpu_searchsorted_right(a, v, bfs, nlevel): """ From c5779e551e2288f3db60ea93d9293cf60a70c2bd Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 06:46:04 -0600 Subject: [PATCH 131/151] Move test function to test_gpu_stump --- tests/test_core.py | 40 ---------------------------------------- tests/test_gpu_stump.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 40 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index e25d6a664..528286061 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1088,43 +1088,3 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_P, comp_P) npt.assert_array_equal(ref_I, comp_I) - - -def test_gpu_searchsorted(): - # define a function the same as `core._gpu_searchsorted_right` but - # without cuda.jit decorator. 
- def gpu_searchsorted_right(a, v, bfs, nlevel): - n = a.shape[0] - idx = 0 - for level in range(nlevel): - if v < a[bfs[idx]]: - next_idx = 2 * idx + 1 - else: - next_idx = 2 * idx + 2 - - if level == nlevel - 1 or bfs[next_idx] < 0: - if v < a[bfs[idx]]: - idx = max(bfs[idx], 0) - else: - idx = min(bfs[idx] + 1, n) - break - idx = next_idx - - return idx - - for n in range(1, 100): - a = np.sort(np.random.rand(n)) - bfs = core._bfs_indices(n, fill_value=-1) - nlevel = np.floor(np.log2(n) + 1).astype(np.int64) - for i in range(n): - v = a[i] - npt.assert_almost_equal( - gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) - - v = a[i] + 0.001 - npt.assert_almost_equal( - gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 1a2662647..dfbf5e405 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -38,6 +38,24 @@ def test_gpu_stump_int_input(): with pytest.raises(TypeError): gpu_stump(np.arange(10), 5, ignore_trivial=True) +def test_gpu_searchsorted(): + for n in range(1, 100): + a = np.sort(np.random.rand(n)) + bfs = core._bfs_indices(n, fill_value=-1) + nlevel = np.floor(np.log2(n) + 1).astype(np.int64) + for i in range(n): + v = a[i] - 0.001 + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + + v = a[i] + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + + v = a[i] + 0.001 + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, 
nlevel), np.searchsorted(a, v, side="right")) + @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) @pytest.mark.parametrize("T_A, T_B", test_data) From 38e531c34a63e3f4a98f476c9501b705de2a2b29 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 06:50:47 -0600 Subject: [PATCH 132/151] correct format --- stumpy/core.py | 2 +- stumpy/gpu_stump.py | 1 - tests/test_gpu_stump.py | 35 +++++++++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 0ebb5ae50..535471761 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -7,7 +7,7 @@ import inspect import numpy as np -from numba import cuda, njit, prange +from numba import njit, prange from scipy.signal import convolve from scipy.ndimage import maximum_filter1d, minimum_filter1d from scipy import linalg diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index c7f7aec16..22748e089 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -430,7 +430,6 @@ def _gpu_stump( μ_Q = np.load(μ_Q_fname, allow_pickle=False) σ_Q = np.load(σ_Q_fname, allow_pickle=False) - device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) nlevel = np.floor(np.log2(k) + 1).astype(np.int64) # number of levels in # binary seearch tree from which `bfs` is constructed. 
diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index dfbf5e405..1e79fb577 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -1,7 +1,7 @@ import numpy as np import numpy.testing as npt import pandas as pd -from stumpy import gpu_stump +from stumpy import core, gpu_stump from stumpy import config from numba import cuda @@ -38,23 +38,42 @@ def test_gpu_stump_int_input(): with pytest.raises(TypeError): gpu_stump(np.arange(10), 5, ignore_trivial=True) + def test_gpu_searchsorted(): for n in range(1, 100): a = np.sort(np.random.rand(n)) bfs = core._bfs_indices(n, fill_value=-1) nlevel = np.floor(np.log2(n) + 1).astype(np.int64) for i in range(n): - v = a[i] - 0.001 - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + v = a[i] - 0.001 + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + np.searchsorted(a, v, side="left"), + ) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) v = a[i] - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + np.searchsorted(a, v, side="left"), + ) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) v = a[i] + 0.001 - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + npt.assert_almost_equal( + 
gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + np.searchsorted(a, v, side="left"), + ) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) From 5a7b3c099419de1a09368f2930eabce410730693 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 08:26:58 -0600 Subject: [PATCH 133/151] Fixed calling function --- stumpy/gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 22748e089..bf7e3b57d 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -284,7 +284,7 @@ def _compute_and_update_PI_kernel( indices_R[j] = i if p_norm < profile[j, -1]: - idx = core._gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) + idx = _gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) for g in range(k - 1, idx, -1): profile[j, g] = profile[j, g - 1] indices[j, g] = indices[j, g - 1] From e1b0d205e463fd2e02a906ab349ca492d303be27 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 08:32:25 -0600 Subject: [PATCH 134/151] Make function callable from both CPU and GPU to avoid duplication for unit testing. --- stumpy/gpu_stump.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index bf7e3b57d..99a3ba839 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -7,7 +7,7 @@ import os import numpy as np -from numba import cuda +from numba import cuda, jit from . 
import core, config from .gpu_aamp import gpu_aamp @@ -15,10 +15,11 @@ logger = logging.getLogger(__name__) -@cuda.jit(device=True) +@jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_left(a, v, bfs, nlevel): """ - Device function to replace numpy.searchsorted(a, v, side='left') + A function equivalent to numpy.searchsorted(a, v, side='left'), designed + to be used mainly as device function Parameters ---------- @@ -60,7 +61,7 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): return idx -@cuda.jit(device=True) +@jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_right(a, v, bfs, nlevel): """ Device function to replace numpy.searchsorted(a, v, side='right') From 922544c3ae21d018db7600d4b466a2ae40d107fd Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 08:46:07 -0600 Subject: [PATCH 135/151] Fixed calling function --- tests/test_gpu_stump.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 1e79fb577..108ac0d91 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -2,6 +2,7 @@ import numpy.testing as npt import pandas as pd from stumpy import core, gpu_stump +from stumpy.gpu_stump import _gpu_searchsorted_left, _gpu_searchsorted_right from stumpy import config from numba import cuda @@ -47,31 +48,31 @@ def test_gpu_searchsorted(): for i in range(n): v = a[i] - 0.001 npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + _gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left"), ) npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + _gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right"), ) v = a[i] npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + _gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left"), ) npt.assert_almost_equal( - 
gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + _gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right"), ) v = a[i] + 0.001 npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + _gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left"), ) npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + _gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right"), ) From 102979b1235e00744567484b97a774658d3b2e1d Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 08:51:31 -0600 Subject: [PATCH 136/151] Revised the test function for merge_topk_PI --- tests/test_core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 528286061..a297dd3fa 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1066,11 +1066,12 @@ def test_merge_topk_PI(): k = 5 PA = np.random.rand(n * k).reshape(n, k) - PA = np.sort(PA, axis=1) + PA = np.sort(PA, axis=1) # sorting each row separately PB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) - for i in range(n): + for i in range(n): # creating ties between values of PA and PB PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) PB = np.sort(PB, axis=1) From a8aecf6679a9dbb02be80cdf75cf55ce99ae6aae Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 09:18:04 -0600 Subject: [PATCH 137/151] Revise docstrings --- stumpy/gpu_stump.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 99a3ba839..35bf3f12f 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -18,7 +18,7 @@ @jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_left(a, v, bfs, nlevel): """ - A function equivalent to numpy.searchsorted(a, v, side='left'), designed + Equivalent to numpy.searchsorted(a, v, side='left'), designed to be 
used mainly as device function Parameters @@ -64,7 +64,8 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): @jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_right(a, v, bfs, nlevel): """ - Device function to replace numpy.searchsorted(a, v, side='right') + Equivalent to numpy.searchsorted(a, v, side='left'), designed + to be used mainly as device function Parameters ---------- From 38318ecdb8ab602d8ceb9d8afe4d1abc1b6ed9ed Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 09:19:31 -0600 Subject: [PATCH 138/151] Rename variable --- stumpy/gpu_stump.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 35bf3f12f..9fb657668 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -124,7 +124,7 @@ def _compute_and_update_PI_kernel( Σ_T, μ_Q, σ_Q, - profile_len, + w, ignore_trivial, excl_zone, profile, @@ -179,7 +179,7 @@ def _compute_and_update_PI_kernel( σ_Q : numpy.ndarray Standard deviation of the query sequence, `Q` - profile_len : int + w : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -247,7 +247,7 @@ def _compute_and_update_PI_kernel( for j in range(start, QT_out.shape[0], stride): zone_start = max(0, j - excl_zone) - zone_stop = min(profile_len, j + excl_zone) + zone_stop = min(w, j + excl_zone) if compute_QT: QT_out[j] = ( @@ -307,7 +307,7 @@ def _gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - profile_len, + w, ignore_trivial=True, range_start=1, device_id=0, @@ -362,7 +362,7 @@ def _gpu_stump( The file name for the standard deviation of the query sequence, `Q`, relative to the current sliding window - profile_len : int + w : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -421,7 +421,7 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. 
""" threads_per_block = config.STUMPY_THREADS_PER_BLOCK - blocks_per_grid = math.ceil(profile_len / threads_per_block) + blocks_per_grid = math.ceil(w / threads_per_block) T_A = np.load(T_A_fname, allow_pickle=False) T_B = np.load(T_B_fname, allow_pickle=False) @@ -452,14 +452,14 @@ def _gpu_stump( device_M_T = cuda.to_device(M_T) device_Σ_T = cuda.to_device(Σ_T) - profile = np.full((profile_len, k), np.inf, dtype=np.float64) - indices = np.full((profile_len, k), -1, dtype=np.int64) + profile = np.full((w, k), np.inf, dtype=np.float64) + indices = np.full((w, k), -1, dtype=np.int64) - profile_L = np.full(profile_len, np.inf, dtype=np.float64) - indices_L = np.full(profile_len, -1, dtype=np.int64) + profile_L = np.full(w, np.inf, dtype=np.float64) + indices_L = np.full(w, -1, dtype=np.int64) - profile_R = np.full(profile_len, np.inf, dtype=np.float64) - indices_R = np.full(profile_len, -1, dtype=np.int64) + profile_R = np.full(w, np.inf, dtype=np.float64) + indices_R = np.full(w, -1, dtype=np.int64) device_profile = cuda.to_device(profile) device_profile_L = cuda.to_device(profile_L) @@ -480,7 +480,7 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - profile_len, + w, ignore_trivial, excl_zone, device_profile, @@ -508,7 +508,7 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - profile_len, + w, ignore_trivial, excl_zone, device_profile, @@ -695,7 +695,7 @@ def gpu_stump( logger.warning("Try setting `ignore_trivial = False`.") n = T_B.shape[0] - profile_len = T_A.shape[0] - m + 1 + w = T_A.shape[0] - m + 1 l = n - m + 1 excl_zone = int( np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM) @@ -765,7 +765,7 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - profile_len, + w, ignore_trivial, start + 1, device_ids[idx], @@ -794,7 +794,7 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - profile_len, + w, ignore_trivial, start + 1, device_ids[idx], @@ -866,7 +866,7 @@ def gpu_stump( indices_R[0] = np.where(cond, indices_R[0], indices_R[i]) out = np.empty( 
- (profile_len, 2 * k + 2), dtype=object + (w, 2 * k + 2), dtype=object ) # last two columns are to store # (top-1) left/right matrix profile indices out[:, :k] = profile[0] From 76f97cbb896f0d66819022cb0acfc43e011d67c0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 14:38:11 -0600 Subject: [PATCH 139/151] Corrected format --- stumpy/gpu_stump.py | 8 +++----- tests/test_core.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 9fb657668..371bbeaa4 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) -@jit # equivalent to `__host__ __device__` in C++ CUDA +@jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_left(a, v, bfs, nlevel): """ Equivalent to numpy.searchsorted(a, v, side='left'), designed @@ -61,7 +61,7 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): return idx -@jit # equivalent to `__host__ __device__` in C++ CUDA +@jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_right(a, v, bfs, nlevel): """ Equivalent to numpy.searchsorted(a, v, side='left'), designed @@ -865,9 +865,7 @@ def gpu_stump( profile_R[0] = np.where(cond, profile_R[0], profile_R[i]) indices_R[0] = np.where(cond, indices_R[0], indices_R[i]) - out = np.empty( - (w, 2 * k + 2), dtype=object - ) # last two columns are to store + out = np.empty((w, 2 * k + 2), dtype=object) # last two columns are to store # (top-1) left/right matrix profile indices out[:, :k] = profile[0] out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) diff --git a/tests/test_core.py b/tests/test_core.py index a297dd3fa..63a33d1d0 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1071,7 +1071,7 @@ def test_merge_topk_PI(): PB = np.random.rand(n * k).reshape(n, k) col_idx = np.random.randint(0, k, size=n) - for i in range(n): # creating ties between values of PA and PB + for i in range(n): # creating ties 
between values of PA and PB PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) PB = np.sort(PB, axis=1) From d45733cc1d2b9bc59d31714860869e8516038c77 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 01:40:17 -0600 Subject: [PATCH 140/151] Fixed typo --- stumpy/gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 371bbeaa4..d6e02f669 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -64,7 +64,7 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): @jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_right(a, v, bfs, nlevel): """ - Equivalent to numpy.searchsorted(a, v, side='left'), designed + Equivalent to numpy.searchsorted(a, v, side='right'), designed to be used mainly as device function Parameters From b2db50585f64eb0d0a9ddc5fb6bdbdc9fb4e0011 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 03:03:09 -0600 Subject: [PATCH 141/151] change decorator so that the function can be used ONLY as device function --- stumpy/gpu_stump.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index d6e02f669..e67a0a362 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -7,7 +7,7 @@ import os import numpy as np -from numba import cuda, jit +from numba import cuda from . 
import core, config from .gpu_aamp import gpu_aamp @@ -15,11 +15,11 @@ logger = logging.getLogger(__name__) -@jit # equivalent to `__host__ __device__` in C++ CUDA +@cuda.jit(device=True) def _gpu_searchsorted_left(a, v, bfs, nlevel): """ Equivalent to numpy.searchsorted(a, v, side='left'), designed - to be used mainly as device function + to be used as device function Parameters ---------- @@ -61,11 +61,11 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): return idx -@jit # equivalent to `__host__ __device__` in C++ CUDA +@cuda.jit(device=True) def _gpu_searchsorted_right(a, v, bfs, nlevel): """ Equivalent to numpy.searchsorted(a, v, side='right'), designed - to be used mainly as device function + to be used as device function Parameters ---------- From b023b20dc0934d821a6cbfdb192597eb5f6ce361 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 14:08:39 -0600 Subject: [PATCH 142/151] considered more than one value for parameter k in unit testing --- tests/test_gpu_stump.py | 18 +++++++++--------- tests/test_stump.py | 18 +++++++++--------- tests/test_stumped.py | 12 ++++++------ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 108ac0d91..02a415541 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -393,15 +393,15 @@ def test_gpu_stump_nan_zero_mean_self_join(): @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) @pytest.mark.parametrize("T_A, T_B", test_data) def test_gpu_stump_self_join_KNN(T_A, T_B): - k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) - comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) + for k in range(1, 4): + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) + comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + 
naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) - comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) + comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_stump.py b/tests/test_stump.py index af2a2315e..d8f0983ee 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -244,15 +244,15 @@ def test_stump_nan_zero_mean_self_join(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): - k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_B, m, ignore_trivial=True, k=k) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) + for k in range(1, 4): + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stump(T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_stumped.py b/tests/test_stumped.py index 02e914436..168a5f570 100644 --- a/tests/test_stumped.py +++ b/tests/test_stumped.py @@ -617,11 +617,11 @@ def test_stumped_two_subsequences_nan_inf_A_B_join_swap( @pytest.mark.parametrize("T_A, T_B", test_data) def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): with Client(dask_cluster) as dask_client: - k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - 
npt.assert_almost_equal(ref_mp, comp_mp) + for k in range(1, 4): + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 2e3483461b31441e33bbfca74c17aca04e100cf3 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 16:12:05 -0600 Subject: [PATCH 143/151] add test for A_B_join_KNN --- tests/test_gpu_stump.py | 12 ++++++++++++ tests/test_stump.py | 15 +++++++++++++++ tests/test_stumped.py | 16 ++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 02a415541..688592a22 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -405,3 +405,15 @@ def test_gpu_stump_self_join_KNN(T_A, T_B): comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_gpu_stump_A_B_join_KNN(T_A, T_B): + m = 3 + for k in range(1, 4): + ref_mp = naive.stump(T_B, m, T_B=T_A, row_wise=True, k=k) + comp_mp = gpu_stump(T_B, m, T_A, ignore_trivial=False, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_stump.py b/tests/test_stump.py index d8f0983ee..3e0b34299 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -256,3 +256,18 @@ def test_stump_self_join_KNN(T_A, T_B): comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_A_B_join_KNN(T_A, T_B): + m = 3 + for k in range(1, 4): + ref_mp = naive.stump(T_A, m, T_B=T_B, k=k) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False, k=k) + naive.replace_inf(ref_mp) + 
naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_stumped.py b/tests/test_stumped.py index 168a5f570..7e8b053d3 100644 --- a/tests/test_stumped.py +++ b/tests/test_stumped.py @@ -625,3 +625,19 @@ def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore:numpy.dtype size changed") +@pytest.mark.filterwarnings("ignore:numpy.ufunc size changed") +@pytest.mark.filterwarnings("ignore:numpy.ndarray size changed") +@pytest.mark.filterwarnings("ignore:\\s+Port 8787 is already in use:UserWarning") +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stumped_A_B_join_KNN(T_A, T_B, dask_cluster): + with Client(dask_cluster) as dask_client: + m = 3 + for k in range(1, 4): + ref_mp = naive.stump(T_A, m, T_B=T_B, k=k) + comp_mp = stumped(dask_client, T_A, m, T_B, ignore_trivial=False, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 4da736624b6310dfe2c41c90af2b053883abe0b4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 16:15:50 -0600 Subject: [PATCH 144/151] swap TA and TB in test function so that the value of k becomes valid --- tests/test_gpu_stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 688592a22..178effc61 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -412,8 +412,8 @@ def test_gpu_stump_self_join_KNN(T_A, T_B): def test_gpu_stump_A_B_join_KNN(T_A, T_B): m = 3 for k in range(1, 4): - ref_mp = naive.stump(T_B, m, T_B=T_A, row_wise=True, k=k) - comp_mp = gpu_stump(T_B, m, T_A, ignore_trivial=False, k=k) + ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True, k=k) 
+ comp_mp = gpu_stump(T_A, m, T_B, ignore_trivial=False, k=k) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) From dcee0f9c240d765dd384a22832660380a0a4f6d0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 16:24:32 -0600 Subject: [PATCH 145/151] Replicating commits 5565904 and 10878fd From 7d5e7fcd46e820a405e4c347b59e4930df902b97 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 20:18:09 -0600 Subject: [PATCH 146/151] Add wrapper kernel function for testing a device function Replicating commits bf6edcc, 1b7d971, 7f65b94, d282dfd, 6faa6453 --- tests/test_gpu_stump.py | 84 ++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 178effc61..0a65de68f 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -40,42 +40,58 @@ def test_gpu_stump_int_input(): gpu_stump(np.arange(10), 5, ignore_trivial=True) +@cuda.jit("(f8[:, :], f8[:], i8[:], i8, b1, i8[:])") +def _gpu_searchsorted_kernel(A, V, bfs, nlevel, is_left, IDX): + # A wrapper kernel for calling device function _gpu_searchsorted_left/right. 
+ i = cuda.grid(1) + if i < A.shape[0]: + if is_left: + IDX[i] = _gpu_searchsorted_left(A[i], V[i], bfs, nlevel) + else: + IDX[i] = _gpu_searchsorted_right(A[i], V[i], bfs, nlevel) + + def test_gpu_searchsorted(): - for n in range(1, 100): - a = np.sort(np.random.rand(n)) - bfs = core._bfs_indices(n, fill_value=-1) - nlevel = np.floor(np.log2(n) + 1).astype(np.int64) - for i in range(n): - v = a[i] - 0.001 - npt.assert_almost_equal( - _gpu_searchsorted_left(a, v, bfs, nlevel), - np.searchsorted(a, v, side="left"), - ) - npt.assert_almost_equal( - _gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) - - v = a[i] - npt.assert_almost_equal( - _gpu_searchsorted_left(a, v, bfs, nlevel), - np.searchsorted(a, v, side="left"), - ) - npt.assert_almost_equal( - _gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) - - v = a[i] + 0.001 - npt.assert_almost_equal( - _gpu_searchsorted_left(a, v, bfs, nlevel), - np.searchsorted(a, v, side="left"), - ) - npt.assert_almost_equal( - _gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) + n = 5000 + threads_per_block = config.STUMPY_THREADS_PER_BLOCK + blocks_per_grid = math.ceil(n / threads_per_block) + + for k in range(1, 32): + device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) + nlevel = np.floor(np.log2(k) + 1).astype(np.int64) + + A = np.sort(np.random.rand(n, k), axis=1) + device_A = cuda.to_device(A) + + V = np.random.rand(n) + for i, idx in enumerate(np.random.choice(np.arange(n), size=k, replace=False)): + V[idx] = A[idx, i] # create ties + device_V = cuda.to_device(V) + + is_left = True # test case + ref_IDX = [np.searchsorted(A[i], V[i], side="left") for i in range(n)] + ref_IDX = np.asarray(ref_IDX, dtype=np.int64) + + comp_IDX = np.full(n, -1, dtype=np.int64) + device_comp_IDX = cuda.to_device(comp_IDX) + _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( + device_A, device_V, device_bfs, nlevel, 
is_left, device_comp_IDX + ) + comp_IDX = device_comp_IDX.copy_to_host() + npt.assert_array_equal(ref_IDX, comp_IDX) + is_left = False # test case + ref_IDX = [np.searchsorted(A[i], V[i], side="right") for i in range(n)] + ref_IDX = np.asarray(ref_IDX, dtype=np.int64) + + comp_IDX = np.full(n, -1, dtype=np.int64) + device_comp_IDX = cuda.to_device(comp_IDX) + _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( + device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX + ) + comp_IDX = device_comp_IDX.copy_to_host() + npt.assert_array_equal(ref_IDX, comp_IDX) + @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) @pytest.mark.parametrize("T_A, T_B", test_data) From 862a3e810a6dd3caddd482af893bc9a08163fc5c Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 20:23:57 -0600 Subject: [PATCH 147/151] Correct format --- tests/test_gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 0a65de68f..9ac2ae484 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -91,7 +91,7 @@ def test_gpu_searchsorted(): ) comp_IDX = device_comp_IDX.copy_to_host() npt.assert_array_equal(ref_IDX, comp_IDX) - + @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) @pytest.mark.parametrize("T_A, T_B", test_data) From 90e66becfa219a10047306c07553e38fa5fe7771 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 20:26:30 -0600 Subject: [PATCH 148/151] import missing module --- tests/test_gpu_stump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 9ac2ae484..071337cd5 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -1,3 +1,4 @@ +import math import numpy as np import numpy.testing as npt import pandas as pd From 461e1eb901b4db0f4f181616e7c428184bf529e6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 22:19:01 -0600 Subject: [PATCH 149/151] 
testing function for more than one value for parameter k Replicated commit 9789cd9 --- tests/test_core.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 63a33d1d0..1087f999d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1063,29 +1063,27 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): n = 50 - k = 5 + for k in range(1, 6): + PA = np.random.rand(n * k).reshape(n, k) + PA = np.sort(PA, axis=1) # sorting each row separately - PA = np.random.rand(n * k).reshape(n, k) - PA = np.sort(PA, axis=1) # sorting each row separately + PB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) + for i in range(n): # creating ties between values of PA and PB + PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) + PB = np.sort(PB, axis=1) - PB = np.random.rand(n * k).reshape(n, k) + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k - col_idx = np.random.randint(0, k, size=n) - for i in range(n): # creating ties between values of PA and PB - PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) - PB = np.sort(PB, axis=1) + ref_P = PA.copy() + ref_I = IA.copy() - IA = np.arange(n * k).reshape(n, k) - IB = IA + n * k + comp_P = PA.copy() + comp_I = IA.copy() - ref_P = PA.copy() - ref_I = IA.copy() + naive.merge_topk_PI(ref_P, PB, ref_I, IB) + core._merge_topk_PI(comp_P, PB, comp_I, IB) - comp_P = PA.copy() - comp_I = IA.copy() - - naive.merge_topk_PI(ref_P, PB, ref_I, IB) - core._merge_topk_PI(comp_P, PB, comp_I, IB) - - npt.assert_array_equal(ref_P, comp_P) - npt.assert_array_equal(ref_I, comp_I) + npt.assert_array_equal(ref_P, comp_P) + npt.assert_array_equal(ref_I, comp_I) From 1de7532d7faa34752563eaf1cf166524639989f3 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 6 Jun 2022 23:55:33 -0600 Subject: [PATCH 150/151] Renamed function to improve readability replicated commits 4f2ea6c and 
(partially) 1b7d971 --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 75c69876b..fabe3d922 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -156,7 +156,7 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): # pragma: no cover return result -def searchsorted(a, v): +def searchsorted_right(a, v): """ Naive version of numpy.searchsorted(..., side='right') """ @@ -239,14 +239,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): for i in iter_range: D = distance_matrix[i, i + g] # D: a single element if D < P[i, k - 1]: - idx = searchsorted(P[i], D) + idx = searchsorted_right(P[i], D) # to keep the top-k, we must get rid of the last element. P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k - 1]: - idx = searchsorted(P[i + g], D) + idx = searchsorted_right(P[i + g], D) P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From 08c75f76cbe50e90a049e9cd612e12bf39f0352b Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 7 Jun 2022 01:17:04 -0600 Subject: [PATCH 151/151] Fixed typos --- stumpy/gpu_stump.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index e67a0a362..8f4683388 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -434,7 +434,7 @@ def _gpu_stump( device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) nlevel = np.floor(np.log2(k) + 1).astype(np.int64) - # number of levels in # binary seearch tree from which `bfs` is constructed. + # number of levels in binary search tree from which `bfs` is constructed.
with cuda.gpus[device_id]: device_T_A = cuda.to_device(T_A) @@ -855,12 +855,12 @@ def gpu_stump( # Update (top-k) matrix profile and matrix profile indices core._merge_topk_PI(profile[0], profile[i], indices[0], indices[i]) - # Update (top-1) left matrix profile and matrix profil indices + # Update (top-1) left matrix profile and matrix profile indices cond = profile_L[0] < profile_L[i] profile_L[0] = np.where(cond, profile_L[0], profile_L[i]) indices_L[0] = np.where(cond, indices_L[0], indices_L[i]) - # Update (top-1) right matrix profile and matrix profil indices + # Update (top-1) right matrix profile and matrix profile indices cond = profile_R[0] < profile_R[i] profile_R[0] = np.where(cond, profile_R[0], profile_R[i]) indices_R[0] = np.where(cond, indices_R[0], indices_R[i])