From 223b68b656591fa48fe951c727010e8624e25187 Mon Sep 17 00:00:00 2001 From: "Martin D. Weinberg" Date: Tue, 25 Feb 2025 15:31:35 -0500 Subject: [PATCH 1/3] An updated k-means analysis that separates printing from analysis. The primary routine is now the full channel analysis, not the per-channel analysis in pyEXP. --- expui/expMSSA.H | 10 ++- expui/expMSSA.cc | 194 +++++++++++++++++++++++++++++------------- pyEXP/MSSAWrappers.cc | 51 ++++++++--- 3 files changed, 185 insertions(+), 70 deletions(-) diff --git a/expui/expMSSA.H b/expui/expMSSA.H index 3c1d180c6..60fe5e6b2 100644 --- a/expui/expMSSA.H +++ b/expui/expMSSA.H @@ -249,7 +249,15 @@ namespace MSSA @param toTerm write to stdout if true @param toFile write to file if true */ - void kmeans(int clusters, bool toTerm=true, bool toFile=false); + void kmeansPrint(int clusters, bool toTerm=true, bool toFile=false); + + //! Get Kmeans analysis per channel + std::tuple, std::vector, double> + kmeansChannel(int clusters, Key key); + + //! Get Kmeans analysis for all channels + std::tuple, std::vector, double> + kmeans(int clusters); //! Save current MSSA state to an HDF5 file with the given prefix void saveState(const std::string& prefix); diff --git a/expui/expMSSA.cc b/expui/expMSSA.cc index 2ff5079dd..d4e23b8ab 100644 --- a/expui/expMSSA.cc +++ b/expui/expMSSA.cc @@ -1063,10 +1063,10 @@ namespace MSSA { #endif } - void expMSSA::kmeans(int clusters, bool toTerm, bool toFile) + void expMSSA::kmeansPrint(int clusters, bool toTerm, bool toFile) { if (clusters==0) { - std::cout << "expMSSA::kmeans: you need clusters>0" << std::endl; + std::cout << "expMSSA::kmeansPrint: you need clusters>0" << std::endl; return; } @@ -1087,41 +1087,22 @@ namespace MSSA { std::cerr << "Error opening file <" << filename << ">" << std::endl; } - // W-correlation-based distance functor - // - KMeans::WcorrDistance dist(numT, numW); - for (auto u : mean) { - // Pack point array - // - std::vector data; - for (int j=0; j(numT)); - for (int i=0; ix[i] = RC[u.first](i, j); - } - - // Initialize k-means routine - // - KMeans::kMeansClustering kMeans(data); - - // Run 100 iterations - // - kMeans.iterate(dist, 100, clusters, 2, false); - // Retrieve cluster associations - // - auto results = kMeans.get_results(); + auto [id, dd, tol] = kmeansChannel(clusters, u.first); // Write to file // if (out) { out << std::string(60, '-') << std::endl << " *** n=" << u.first << std::endl + << " *** tol=" << tol << std::endl << std::string(60, '-') << std::endl; - for (int j=0; j(results[j]) + << std::setw(12) << id[j] + << std::setw(16) << dd[j] << std::endl; } } @@ -1133,9 +1114,10 @@ namespace MSSA { << " *** n=" << u.first << std::endl << std::string(60, '-') << std::endl; - for (int j=0; j(results[j]) + for (int j=0; j data; - int sz = mean.size(); - for (int j=0; j(numT*sz)); - int c = 0; - for (auto u : mean) { - for (int i=0; ix[c++] = RC[u.first](i, j); - } - } - - // Initialize k-means routine - // - KMeans::kMeansClustering kMeans(data); - - // Run 100 iterations - // - KMeans::WcorrDistMulti dist2(numT, numW, sz); - kMeans.iterate(dist2, 100, clusters, 2, false); - - // Retrieve cluster associations - // - auto results = kMeans.get_results(); + auto [id, dd, tol] = kmeans(clusters); // Write to file // if (out) { out << std::string(60, '-') << std::endl << " *** total" << std::endl + << " *** tol=" << tol << std::endl << std::string(60, '-') << std::endl; - for (int j=0; j(results[j]) + for (int j=0; j(results[j]) + for (int j=0; j, std::vector, double> + expMSSA::kmeansChannel(int clusters, Key key) + { + if (clusters==0) { + throw std::invalid_argument("expMSSA::kmeansChannel: clusters==0"); + } + + if (mean.find(key) == mean.end()) { + std::ostringstream sout; + sout << "expMSSA::kmeansKey: key <" << key << "> not found"; + throw std::invalid_argument(sout.str()); + } + + KMeans::WcorrDistance dist(numT, numW); + + // Pack point array + // + std::vector data; + for (int j=0; j(numT)); + for (int i=0; ix[i] = RC[key](i, j); + } + + // Initialize k-means routine + // + KMeans::kMeansClustering kMeans(data); + + // Run 100 iterations + // + kMeans.iterate(dist, 1000, clusters, 1); + + // Retrieve cluster associations + // + auto results = kMeans.get_results(); + auto centers = kMeans.get_cen(); + + // Compute inertia + // + auto inertia = [&](int j, int id) -> double { + auto & cen = centers[id]; + double d = 0.0; + for (int i=0; ix[i])*(cen[i] - data[j]->x[i]); + return sqrt(d); + }; + + // Pack return vector + // + std::vector retI; + std::vector retD; + for (int j=0; j(results[j])); + retD.push_back(inertia(j, std::get<1>(results[j]))); + } + + return {retI, retD, kMeans.getTol()}; + } + + std::tuple, std::vector, double> + expMSSA::kmeans(int clusters) + { + if (clusters==0) { + throw std::invalid_argument("expMSSA::kmeans: you need clusters>0"); + } + + // Pack point array + // + std::vector data; + int sz = mean.size(); + for (int j=0; j(numT*sz)); + int c = 0; + for (auto u : mean) { + for (int i=0; ix[c++] = RC[u.first](i, j); + } + } + + // Initialize k-means routine + // + KMeans::kMeansClustering kMeans(data); + + // Run 100 iterations + // + KMeans::WcorrDistMulti dist(numT, numW, sz); + kMeans.iterate(dist, 1000, clusters, 1); + + // Retrieve cluster associations + // + auto results = kMeans.get_results(); + auto centers = kMeans.get_cen(); + + // Compute inertia + // + auto inertia = [&](int j, int id) -> double { + auto & cen = centers[id]; + double d = 0.0; + for (int i=0; ix[i])*(cen[i] - data[j]->x[i]); + return sqrt(d); + }; + + // Pack return vector + // + std::vector retI; + std::vector retD; + for (int j=0; j(results[j])); + retD.push_back(inertia(j, std::get<1>(results[j]))); + } + + return {retI, retD, kMeans.getTol()}; + } + std::map expMSSA::getReconstructed(bool reconstructmean) { if (not reconstructed) { diff --git a/pyEXP/MSSAWrappers.cc b/pyEXP/MSSAWrappers.cc index aa5ef5c57..082bd9270 100644 --- a/pyEXP/MSSAWrappers.cc +++ b/pyEXP/MSSAWrappers.cc @@ -436,28 +436,59 @@ void MSSAtoolkitClasses(py::module &m) { f.def("kmeans", &expMSSA::kmeans, py::arg("clusters") = 4, - py::arg("toTerm") = true, - py::arg("toFile") = false, R"( - Do a k-means analysis on the reconstructed trajectory matrices to provide grouping insight + Do a k-means analysis on the reconstructed trajectory matrices for a single channel (specified key value) to + provide grouping insight. A vector of channel indices that identify clusters is return in a vector ordered by PC + index. Parameters ---------- clusters : int, default=4 number of clusters for the k-means analysis - toTerm : bool, default=True - flag indicating whether to output to the terminal (standard output) - toFile : bool - flag indicating whether to write the output to a file Returns ------- - None + tuple : (numpy.niarray, numpy.ndarray, double) + The PC indices of the k-means clusters, distance from the centroid, and the + final update error + + Notes + ----- + The k-means partitions n vector observations into k clusters in which each observation belongs to the cluster with + the nearest centers while minimizing the variance within each cluster. In this case, the vectors are the full + trajectory matrices and the distance is the distance between the trajectory matricies reconstructed from each + eigentriple from mSSA. The distance used here is the Frobenius distance or matrix norm distance: the square root + of the sum of squares of all elements in the difference between two matrices. + + This version does the analysis for all channels together, the most useful for estimating groups. For individual + contributions by channel, use kmeansChannel. + )"); + + f.def("kmeansChannel", &expMSSA::kmeansChannel, + py::arg("clusters") = 4, + py::arg("key"), + R"( + Do a k-means analysis on the reconstructed trajectory matrices for a single channel (specified key value) to + provide grouping insight. In most cases, you will want to use the kmeans() version which analyzes all channels + together. + + Parameters + ---------- + clusters : int, default=4 + number of clusters for the k-means analysis + key : list(int) + identifier indices of the selected data channel + + Returns + ------- + tuple : (numpy.niarray, numpy.ndarray, double) + The PC indices of the k-means clusters, distance from the centroid, and the + final update error Notes ----- - By default, results are written to the standard output. Set `toFile=True` to write - the output to a file. The file name will be derived from the 'output' parameter. + This version does the analysis channel-by-channel. You may wish to see all channels together using + kmeansTotal. See kemans() for more details. )"); f.def("contrib", &expMSSA::contributions, From 3074340788b8086ab08193d5d9b2d9a7a1f30852 Mon Sep 17 00:00:00 2001 From: "Martin D. Weinberg" Date: Tue, 25 Feb 2025 18:26:45 -0500 Subject: [PATCH 2/3] Make the random shuffle seed for KMeans the default in expMSSA --- expui/KMeans.cc | 18 ++++++++++++++---- expui/expMSSA.H | 16 ++++++++++++++-- expui/expMSSA.cc | 6 ++++-- pyEXP/MSSAWrappers.cc | 6 ++++-- 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/expui/KMeans.cc b/expui/KMeans.cc index 9e220e10f..9a1a4f65e 100644 --- a/expui/KMeans.cc +++ b/expui/KMeans.cc @@ -13,18 +13,28 @@ namespace MSSA // cen.clear(); - if (s>0) { // Seed centers by stride + if (s>0) { + // Seed centers by stride for (int i=0; i=k) break; cen.push_back(classes.at(i)->x); } k = cen.size(); - } else { // obtain a seed from the system clock + } else { + // Obtain a seed from the system clock unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + + // Make a random generator std::mt19937 gen(seed); - // Seed centers randomly + + // Randomly shuffle a list of indexes + std::vector indx; + for (int i=0; ix); + cen.push_back(classes.at(indx[i])->x); } } diff --git a/expui/expMSSA.H b/expui/expMSSA.H index 60fe5e6b2..7857d6146 100644 --- a/expui/expMSSA.H +++ b/expui/expMSSA.H @@ -243,19 +243,31 @@ namespace MSSA //! Create wcorrlation matricies and output PNG void wcorrPNG(); + //@{ /** Kmean analysis of the trajectories by PC with fixed cluster size + */ + + /** Perform Kmeans analysis for a given number of clusters and + print the results @param toTerm write to stdout if true @param toFile write to file if true */ void kmeansPrint(int clusters, bool toTerm=true, bool toFile=false); - //! Get Kmeans analysis per channel + /** Get Kmeans analysis per channel + + @param clusters is the number of clusters to seed + @param key is the channel id vector + */ std::tuple, std::vector, double> kmeansChannel(int clusters, Key key); - //! Get Kmeans analysis for all channels + /** Get Kmeans analysis for all channels + + @param clusters is the number of clusters to seed + */ std::tuple, std::vector, double> kmeans(int clusters); diff --git a/expui/expMSSA.cc b/expui/expMSSA.cc index d4e23b8ab..c6341470b 100644 --- a/expui/expMSSA.cc +++ b/expui/expMSSA.cc @@ -1198,7 +1198,8 @@ namespace MSSA { // Run 100 iterations // - kMeans.iterate(dist, 1000, clusters, 1); + // kMeans.iterate(dist, 1000, clusters, 1); + kMeans.iterate(dist, 1000, clusters); // Retrieve cluster associations // @@ -1253,7 +1254,8 @@ namespace MSSA { // Run 100 iterations // KMeans::WcorrDistMulti dist(numT, numW, sz); - kMeans.iterate(dist, 1000, clusters, 1); + // kMeans.iterate(dist, 1000, clusters, 1); + kMeans.iterate(dist, 1000, clusters); // Retrieve cluster associations // diff --git a/pyEXP/MSSAWrappers.cc b/pyEXP/MSSAWrappers.cc index 082bd9270..2eb5e46ab 100644 --- a/pyEXP/MSSAWrappers.cc +++ b/pyEXP/MSSAWrappers.cc @@ -450,7 +450,8 @@ void MSSAtoolkitClasses(py::module &m) { ------- tuple : (numpy.niarray, numpy.ndarray, double) The PC indices of the k-means clusters, distance from the centroid, and the - final update error + final update error. A zero update error implies that the k-means algorithm + converged. Notes ----- @@ -483,7 +484,8 @@ void MSSAtoolkitClasses(py::module &m) { ------- tuple : (numpy.niarray, numpy.ndarray, double) The PC indices of the k-means clusters, distance from the centroid, and the - final update error + final update error. A zero update error implies that the k-means algorithm + converged. Notes ----- From 4f40e7c2ff4e1b29b9403cc9e424728cc387b833 Mon Sep 17 00:00:00 2001 From: "Martin D. Weinberg" Date: Tue, 25 Feb 2025 18:44:43 -0500 Subject: [PATCH 3/3] Allow the user to specify the seeding strategy for k-means. 2-strided by default, a good choice for quasi-periodic systems --- expui/expMSSA.H | 12 +++++++++--- expui/expMSSA.cc | 25 +++++++++++++++---------- pyEXP/MSSAWrappers.cc | 7 ++++++- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/expui/expMSSA.H b/expui/expMSSA.H index 7857d6146..2fe4d0c3a 100644 --- a/expui/expMSSA.H +++ b/expui/expMSSA.H @@ -251,10 +251,15 @@ namespace MSSA /** Perform Kmeans analysis for a given number of clusters and print the results + @param clusters is the maximum number of clusters considered + @param stride is the seed strategy. If positive, it is used to + select initial cluster centers by stride from the PC list. If + it is zero, centers are selected randomly from the PC list @param toTerm write to stdout if true @param toFile write to file if true */ - void kmeansPrint(int clusters, bool toTerm=true, bool toFile=false); + void kmeansPrint(int clusters, int stride, + bool toTerm=true, bool toFile=false); /** Get Kmeans analysis per channel @@ -262,14 +267,15 @@ namespace MSSA @param key is the channel id vector */ std::tuple, std::vector, double> - kmeansChannel(int clusters, Key key); + kmeansChannel(Key key, int clusters, int stride); /** Get Kmeans analysis for all channels @param clusters is the number of clusters to seed + @param stride is the seeded strategy for initial centers */ std::tuple, std::vector, double> - kmeans(int clusters); + kmeans(int clusters, int stride); //! Save current MSSA state to an HDF5 file with the given prefix void saveState(const std::string& prefix); diff --git a/expui/expMSSA.cc b/expui/expMSSA.cc index c6341470b..106857ee2 100644 --- a/expui/expMSSA.cc +++ b/expui/expMSSA.cc @@ -1063,7 +1063,7 @@ namespace MSSA { #endif } - void expMSSA::kmeansPrint(int clusters, bool toTerm, bool toFile) + void expMSSA::kmeansPrint(int clusters, int stride, bool toTerm, bool toFile) { if (clusters==0) { std::cout << "expMSSA::kmeansPrint: you need clusters>0" << std::endl; @@ -1089,7 +1089,7 @@ namespace MSSA { for (auto u : mean) { - auto [id, dd, tol] = kmeansChannel(clusters, u.first); + auto [id, dd, tol] = kmeansChannel(u.first, clusters, stride); // Write to file // @@ -1125,7 +1125,7 @@ namespace MSSA { if (params["allchan"]) { - auto [id, dd, tol] = kmeans(clusters); + auto [id, dd, tol] = kmeans(clusters, stride); // Write to file // @@ -1165,17 +1165,20 @@ namespace MSSA { std::cout << "Bad output stream for <" << filename << ">" << std::endl; } out.close(); - } std::tuple, std::vector, double> - expMSSA::kmeansChannel(int clusters, Key key) + expMSSA::kmeansChannel(Key key, int clusters, int stride) { if (clusters==0) { throw std::invalid_argument("expMSSA::kmeansChannel: clusters==0"); } + if (stride<0) { + throw std::invalid_argument("expMSSA::kmeansChannel: stride must be >= 0"); + } + if (mean.find(key) == mean.end()) { std::ostringstream sout; sout << "expMSSA::kmeansKey: key <" << key << "> not found"; @@ -1198,8 +1201,7 @@ namespace MSSA { // Run 100 iterations // - // kMeans.iterate(dist, 1000, clusters, 1); - kMeans.iterate(dist, 1000, clusters); + kMeans.iterate(dist, 1000, clusters, stride); // Retrieve cluster associations // @@ -1229,12 +1231,16 @@ namespace MSSA { } std::tuple, std::vector, double> - expMSSA::kmeans(int clusters) + expMSSA::kmeans(int clusters, int stride) { if (clusters==0) { throw std::invalid_argument("expMSSA::kmeans: you need clusters>0"); } + if (stride<0) { + throw std::invalid_argument("expMSSA::kmeans: stride must be >= 0"); + } + // Pack point array // std::vector data; @@ -1254,8 +1260,7 @@ namespace MSSA { // Run 100 iterations // KMeans::WcorrDistMulti dist(numT, numW, sz); - // kMeans.iterate(dist, 1000, clusters, 1); - kMeans.iterate(dist, 1000, clusters); + kMeans.iterate(dist, 1000, clusters, stride); // Retrieve cluster associations // diff --git a/pyEXP/MSSAWrappers.cc b/pyEXP/MSSAWrappers.cc index 2eb5e46ab..b6e556993 100644 --- a/pyEXP/MSSAWrappers.cc +++ b/pyEXP/MSSAWrappers.cc @@ -436,6 +436,7 @@ void MSSAtoolkitClasses(py::module &m) { f.def("kmeans", &expMSSA::kmeans, py::arg("clusters") = 4, + py::arg("stride") = 2, R"( Do a k-means analysis on the reconstructed trajectory matrices for a single channel (specified key value) to provide grouping insight. A vector of channel indices that identify clusters is return in a vector ordered by PC @@ -445,6 +446,9 @@ void MSSAtoolkitClasses(py::module &m) { ---------- clusters : int, default=4 number of clusters for the k-means analysis + stride : int, default=2 + if positive, the initial cluster centers are stride selected from the PC list. If zero, the centers are + selected randomly from the PC list Returns ------- @@ -466,8 +470,9 @@ void MSSAtoolkitClasses(py::module &m) { )"); f.def("kmeansChannel", &expMSSA::kmeansChannel, - py::arg("clusters") = 4, py::arg("key"), + py::arg("clusters") = 4, + py::arg("stride") = 2, R"( Do a k-means analysis on the reconstructed trajectory matrices for a single channel (specified key value) to provide grouping insight. In most cases, you will want to use the kmeans() version which analyzes all channels