diff --git a/expui/KMeans.cc b/expui/KMeans.cc index 9e220e10f..9a1a4f65e 100644 --- a/expui/KMeans.cc +++ b/expui/KMeans.cc @@ -13,18 +13,28 @@ namespace MSSA // cen.clear(); - if (s>0) { // Seed centers by stride + if (s>0) { + // Seed centers by stride for (int i=0; i=k) break; cen.push_back(classes.at(i)->x); } k = cen.size(); - } else { // obtain a seed from the system clock + } else { + // Obtain a seed from the system clock unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + + // Make a random generator std::mt19937 gen(seed); - // Seed centers randomly + + // Randomly shuffle a list of indexes + std::vector indx; + for (int i=0; ix); + cen.push_back(classes.at(indx[i])->x); } } diff --git a/expui/expMSSA.H b/expui/expMSSA.H index 3c1d180c6..2fe4d0c3a 100644 --- a/expui/expMSSA.H +++ b/expui/expMSSA.H @@ -243,13 +243,39 @@ namespace MSSA //! Create wcorrlation matricies and output PNG void wcorrPNG(); + //@{ /** Kmean analysis of the trajectories by PC with fixed cluster size + */ + + /** Perform Kmeans analysis for a given number of clusters and + print the results + @param clusters is the maximum number of clusters considered + @param stride is the seed strategy. If positive, it is used to + select initial cluster centers by stride from the PC list. If + it is zero, centers are selected randomly from the PC list @param toTerm write to stdout if true @param toFile write to file if true */ - void kmeans(int clusters, bool toTerm=true, bool toFile=false); + void kmeansPrint(int clusters, int stride, + bool toTerm=true, bool toFile=false); + + /** Get Kmeans analysis per channel + + @param clusters is the number of clusters to seed + @param key is the channel id vector + */ + std::tuple, std::vector, double> + kmeansChannel(Key key, int clusters, int stride); + + /** Get Kmeans analysis for all channels + + @param clusters is the number of clusters to seed + @param stride is the seeded strategy for initial centers + */ + std::tuple, std::vector, double> + kmeans(int clusters, int stride); //! Save current MSSA state to an HDF5 file with the given prefix void saveState(const std::string& prefix); diff --git a/expui/expMSSA.cc b/expui/expMSSA.cc index 2ff5079dd..106857ee2 100644 --- a/expui/expMSSA.cc +++ b/expui/expMSSA.cc @@ -1063,10 +1063,10 @@ namespace MSSA { #endif } - void expMSSA::kmeans(int clusters, bool toTerm, bool toFile) + void expMSSA::kmeansPrint(int clusters, int stride, bool toTerm, bool toFile) { if (clusters==0) { - std::cout << "expMSSA::kmeans: you need clusters>0" << std::endl; + std::cout << "expMSSA::kmeansPrint: you need clusters>0" << std::endl; return; } @@ -1087,41 +1087,22 @@ namespace MSSA { std::cerr << "Error opening file <" << filename << ">" << std::endl; } - // W-correlation-based distance functor - // - KMeans::WcorrDistance dist(numT, numW); - for (auto u : mean) { - // Pack point array - // - std::vector data; - for (int j=0; j(numT)); - for (int i=0; ix[i] = RC[u.first](i, j); - } - // Initialize k-means routine - // - KMeans::kMeansClustering kMeans(data); - - // Run 100 iterations - // - kMeans.iterate(dist, 100, clusters, 2, false); - - // Retrieve cluster associations - // - auto results = kMeans.get_results(); + auto [id, dd, tol] = kmeansChannel(u.first, clusters, stride); // Write to file // if (out) { out << std::string(60, '-') << std::endl << " *** n=" << u.first << std::endl + << " *** tol=" << tol << std::endl << std::string(60, '-') << std::endl; - for (int j=0; j(results[j]) + << std::setw(12) << id[j] + << std::setw(16) << dd[j] << std::endl; } } @@ -1133,9 +1114,10 @@ namespace MSSA { << " *** n=" << u.first << std::endl << std::string(60, '-') << std::endl; - for (int j=0; j(results[j]) + for (int j=0; j data; - int sz = mean.size(); - for (int j=0; j(numT*sz)); - int c = 0; - for (auto u : mean) { - for (int i=0; ix[c++] = RC[u.first](i, j); - } - } - - // Initialize k-means routine - // - KMeans::kMeansClustering kMeans(data); - - // Run 100 iterations - // - KMeans::WcorrDistMulti dist2(numT, numW, sz); - kMeans.iterate(dist2, 100, clusters, 2, false); - - // Retrieve cluster associations - // - auto results = kMeans.get_results(); + auto [id, dd, tol] = kmeans(clusters, stride); // Write to file // if (out) { out << std::string(60, '-') << std::endl << " *** total" << std::endl + << " *** tol=" << tol << std::endl << std::string(60, '-') << std::endl; - for (int j=0; j(results[j]) + for (int j=0; j(results[j]) + for (int j=0; j" << std::endl; } out.close(); + } + + + std::tuple, std::vector, double> + expMSSA::kmeansChannel(Key key, int clusters, int stride) + { + if (clusters==0) { + throw std::invalid_argument("expMSSA::kmeansChannel: clusters==0"); + } + + if (stride<0) { + throw std::invalid_argument("expMSSA::kmeansChannel: stride must be >= 0"); + } + + if (mean.find(key) == mean.end()) { + std::ostringstream sout; + sout << "expMSSA::kmeansKey: key <" << key << "> not found"; + throw std::invalid_argument(sout.str()); + } + + KMeans::WcorrDistance dist(numT, numW); + + // Pack point array + // + std::vector data; + for (int j=0; j(numT)); + for (int i=0; ix[i] = RC[key](i, j); + } + + // Initialize k-means routine + // + KMeans::kMeansClustering kMeans(data); + + // Run 100 iterations + // + kMeans.iterate(dist, 1000, clusters, stride); + + // Retrieve cluster associations + // + auto results = kMeans.get_results(); + auto centers = kMeans.get_cen(); + + // Compute inertia + // + auto inertia = [&](int j, int id) -> double { + auto & cen = centers[id]; + double d = 0.0; + for (int i=0; ix[i])*(cen[i] - data[j]->x[i]); + return sqrt(d); + }; + + // Pack return vector + // + std::vector retI; + std::vector retD; + for (int j=0; j(results[j])); + retD.push_back(inertia(j, std::get<1>(results[j]))); + } + + return {retI, retD, kMeans.getTol()}; + } + + std::tuple, std::vector, double> + expMSSA::kmeans(int clusters, int stride) + { + if (clusters==0) { + throw std::invalid_argument("expMSSA::kmeans: you need clusters>0"); + } + + if (stride<0) { + throw std::invalid_argument("expMSSA::kmeans: stride must be >= 0"); + } + + // Pack point array + // + std::vector data; + int sz = mean.size(); + for (int j=0; j(numT*sz)); + int c = 0; + for (auto u : mean) { + for (int i=0; ix[c++] = RC[u.first](i, j); + } + } + + // Initialize k-means routine + // + KMeans::kMeansClustering kMeans(data); + + // Run 100 iterations + // + KMeans::WcorrDistMulti dist(numT, numW, sz); + kMeans.iterate(dist, 1000, clusters, stride); + + // Retrieve cluster associations + // + auto results = kMeans.get_results(); + auto centers = kMeans.get_cen(); + + // Compute inertia + // + auto inertia = [&](int j, int id) -> double { + auto & cen = centers[id]; + double d = 0.0; + for (int i=0; ix[i])*(cen[i] - data[j]->x[i]); + return sqrt(d); + }; + + // Pack return vector + // + std::vector retI; + std::vector retD; + for (int j=0; j(results[j])); + retD.push_back(inertia(j, std::get<1>(results[j]))); + } + return {retI, retD, kMeans.getTol()}; } std::map expMSSA::getReconstructed(bool reconstructmean) diff --git a/pyEXP/MSSAWrappers.cc b/pyEXP/MSSAWrappers.cc index aa5ef5c57..b6e556993 100644 --- a/pyEXP/MSSAWrappers.cc +++ b/pyEXP/MSSAWrappers.cc @@ -436,28 +436,66 @@ void MSSAtoolkitClasses(py::module &m) { f.def("kmeans", &expMSSA::kmeans, py::arg("clusters") = 4, - py::arg("toTerm") = true, - py::arg("toFile") = false, + py::arg("stride") = 2, R"( - Do a k-means analysis on the reconstructed trajectory matrices to provide grouping insight + Do a k-means analysis on the reconstructed trajectory matrices for a single channel (specified key value) to + provide grouping insight. A vector of channel indices that identify clusters is return in a vector ordered by PC + index. Parameters ---------- clusters : int, default=4 number of clusters for the k-means analysis - toTerm : bool, default=True - flag indicating whether to output to the terminal (standard output) - toFile : bool - flag indicating whether to write the output to a file + stride : int, default=2 + if positive, the initial cluster centers are stride selected from the PC list. If zero, the centers are + selected randomly from the PC list Returns ------- - None + tuple : (numpy.niarray, numpy.ndarray, double) + The PC indices of the k-means clusters, distance from the centroid, and the + final update error. A zero update error implies that the k-means algorithm + converged. + + Notes + ----- + The k-means partitions n vector observations into k clusters in which each observation belongs to the cluster with + the nearest centers while minimizing the variance within each cluster. In this case, the vectors are the full + trajectory matrices and the distance is the distance between the trajectory matricies reconstructed from each + eigentriple from mSSA. The distance used here is the Frobenius distance or matrix norm distance: the square root + of the sum of squares of all elements in the difference between two matrices. + + This version does the analysis for all channels together, the most useful for estimating groups. For individual + contributions by channel, use kmeansChannel. + )"); + + f.def("kmeansChannel", &expMSSA::kmeansChannel, + py::arg("key"), + py::arg("clusters") = 4, + py::arg("stride") = 2, + R"( + Do a k-means analysis on the reconstructed trajectory matrices for a single channel (specified key value) to + provide grouping insight. In most cases, you will want to use the kmeans() version which analyzes all channels + together. + + Parameters + ---------- + clusters : int, default=4 + number of clusters for the k-means analysis + key : list(int) + identifier indices of the selected data channel + + Returns + ------- + tuple : (numpy.niarray, numpy.ndarray, double) + The PC indices of the k-means clusters, distance from the centroid, and the + final update error. A zero update error implies that the k-means algorithm + converged. Notes ----- - By default, results are written to the standard output. Set `toFile=True` to write - the output to a file. The file name will be derived from the 'output' parameter. + This version does the analysis channel-by-channel. You may wish to see all channels together using + kmeansTotal. See kemans() for more details. )"); f.def("contrib", &expMSSA::contributions,