From 7acada05dff24f55b5edcc9cb62141390f14560e Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 19 Sep 2019 14:08:36 -0700 Subject: [PATCH 1/4] [SPARK-29142][PYTHON][ML] Pyspark clustering models support column setters/getters/predict --- python/pyspark/ml/clustering.py | 570 ++++++++++++++++++-------------- 1 file changed, 323 insertions(+), 247 deletions(-) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 9b21aacacd710..52597a15c5ba3 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -95,7 +95,28 @@ def numIter(self): return self._call_java("numIter") -class GaussianMixtureModel(JavaModel, JavaMLWritable, JavaMLReadable, HasTrainingSummary): +@inherit_doc +class GaussianMixtureParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, + HasProbabilityCol, HasTol): + """ + (Private) Params for GaussianMixture. + + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " + + "Must be > 1.", typeConverter=TypeConverters.toInt) + + @since("2.0.0") + def getK(self): + """ + Gets the value of `k` + """ + return self.getOrDefault(self.k) + + +class GaussianMixtureModel(JavaModel, GaussianMixtureParams, JavaMLWritable, JavaMLReadable, + HasTrainingSummary): """ Model fitted by GaussianMixture. @@ -135,10 +156,16 @@ def summary(self): raise RuntimeError("No training summary available for this %s" % self.__class__.__name__) + @since("3.0.0") + def predict(self, value): + """ + Predict label for the given features. + """ + return self._call_java("predict", value) + @inherit_doc -class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed, - HasProbabilityCol, JavaMLWritable, JavaMLReadable): +class GaussianMixture(JavaEstimator, GaussianMixtureParams, JavaMLWritable, JavaMLReadable): """ GaussianMixture clustering. This class performs expectation maximization for multivariate Gaussian @@ -169,6 +196,12 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte >>> gm = GaussianMixture(k=3, tol=0.0001, ... maxIter=10, seed=10) >>> model = gm.fit(df) + >>> model.getFeaturesCol() + 'features' + >>> model.setPredictionCol("newPrediction") + GaussianMixture... + >>> model.predict(df.head().features) + 2 >>> model.hasSummary True >>> summary = model.summary @@ -185,11 +218,11 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte Row(mean=DenseVector([0.825, 0.8675])) >>> model.gaussiansDF.select("cov").head() Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False)) - >>> transformed = model.transform(df).select("features", "prediction") + >>> transformed = model.transform(df).select("features", "newPrediction") >>> rows = transformed.collect() - >>> rows[4].prediction == rows[5].prediction + >>> rows[4].newPrediction == rows[5].newPrediction True - >>> rows[2].prediction == rows[3].prediction + >>> rows[2].newPrediction == rows[3].newPrediction True >>> gmm_path = temp_path + "/gmm" >>> gm.save(gmm_path) @@ -211,9 +244,6 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte .. versionadded:: 2.0.0 """ - k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " + - "Must be > 1.", typeConverter=TypeConverters.toInt) - @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", k=2, probabilityCol="probability", tol=0.01, maxIter=100, seed=None): @@ -251,13 +281,6 @@ def setK(self, value): """ return self._set(k=value) - @since("2.0.0") - def getK(self): - """ - Gets the value of `k` - """ - return self.getOrDefault(self.k) - class GaussianMixtureSummary(ClusteringSummary): """ @@ -308,7 +331,49 @@ def trainingCost(self): return self._call_java("trainingCost") -class KMeansModel(JavaModel, GeneralJavaMLWritable, JavaMLReadable, HasTrainingSummary): +@inherit_doc +class KMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, HasTol, + HasDistanceMeasure): + """ + (Private) Params for KMeans. + + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", "The number of clusters to create. Must be > 1.", + typeConverter=TypeConverters.toInt) + initMode = Param(Params._dummy(), "initMode", + "The initialization algorithm. This can be either \"random\" to " + + "choose random points as initial cluster centers, or \"k-means||\" " + + "to use a parallel variant of k-means++", + typeConverter=TypeConverters.toString) + initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " + + "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt) + + @since("1.5.0") + def getK(self): + """ + Gets the value of `k` + """ + return self.getOrDefault(self.k) + + @since("1.5.0") + def getInitMode(self): + """ + Gets the value of `initMode` + """ + return self.getOrDefault(self.initMode) + + @since("1.5.0") + def getInitSteps(self): + """ + Gets the value of `initSteps` + """ + return self.getOrDefault(self.initSteps) + + +class KMeansModel(JavaModel, KMeansParams, GeneralJavaMLWritable, JavaMLReadable, + HasTrainingSummary): """ Model fitted by KMeans. @@ -333,10 +398,16 @@ def summary(self): raise RuntimeError("No training summary available for this %s" % self.__class__.__name__) + @since("3.0.0") + def predict(self, value): + """ + Predict label for the given features. + """ + return self._call_java("predict", value) + @inherit_doc -class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, HasMaxIter, - HasTol, HasSeed, JavaMLWritable, JavaMLReadable): +class KMeans(JavaEstimator, KMeansParams, JavaMLWritable, JavaMLReadable): """ K-means clustering with a k-means++ like initialization mode (the k-means|| algorithm by Bahmani et al). @@ -347,14 +418,20 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol >>> df = spark.createDataFrame(data, ["features"]) >>> kmeans = KMeans(k=2, seed=1) >>> model = kmeans.fit(df) + >>> model.getDistanceMeasure() + 'euclidean' + >>> model.setPredictionCol("newPrediction") + KMeans... + >>> model.predict(df.head().features) + 0 >>> centers = model.clusterCenters() >>> len(centers) 2 - >>> transformed = model.transform(df).select("features", "prediction") + >>> transformed = model.transform(df).select("features", "newPrediction") >>> rows = transformed.collect() - >>> rows[0].prediction == rows[1].prediction + >>> rows[0].newPrediction == rows[1].newPrediction True - >>> rows[2].prediction == rows[3].prediction + >>> rows[2].newPrediction == rows[3].newPrediction True >>> model.hasSummary True @@ -383,16 +460,6 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol .. versionadded:: 1.5.0 """ - k = Param(Params._dummy(), "k", "The number of clusters to create. Must be > 1.", - typeConverter=TypeConverters.toInt) - initMode = Param(Params._dummy(), "initMode", - "The initialization algorithm. This can be either \"random\" to " + - "choose random points as initial cluster centers, or \"k-means||\" " + - "to use a parallel variant of k-means++", - typeConverter=TypeConverters.toString) - initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " + - "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt) - @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, @@ -434,13 +501,6 @@ def setK(self, value): """ return self._set(k=value) - @since("1.5.0") - def getK(self): - """ - Gets the value of `k` - """ - return self.getOrDefault(self.k) - @since("1.5.0") def setInitMode(self, value): """ @@ -448,13 +508,6 @@ def setInitMode(self, value): """ return self._set(initMode=value) - @since("1.5.0") - def getInitMode(self): - """ - Gets the value of `initMode` - """ - return self.getOrDefault(self.initMode) - @since("1.5.0") def setInitSteps(self, value): """ @@ -462,13 +515,6 @@ def setInitSteps(self, value): """ return self._set(initSteps=value) - @since("1.5.0") - def getInitSteps(self): - """ - Gets the value of `initSteps` - """ - return self.getOrDefault(self.initSteps) - @since("2.4.0") def setDistanceMeasure(self, value): """ @@ -476,15 +522,40 @@ def setDistanceMeasure(self, value): """ return self._set(distanceMeasure=value) - @since("2.4.0") - def getDistanceMeasure(self): + +@inherit_doc +class BisectingKMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, + HasDistanceMeasure): + """ + (Private) Params for BisectingKMeans. + + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", "The desired number of leaf clusters. Must be > 1.", + typeConverter=TypeConverters.toInt) + minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize", + "The minimum number of points (if >= 1.0) or the minimum " + + "proportion of points (if < 1.0) of a divisible cluster.", + typeConverter=TypeConverters.toFloat) + + @since("2.0.0") + def getK(self): """ - Gets the value of `distanceMeasure` + Gets the value of `k` or its default value. """ - return self.getOrDefault(self.distanceMeasure) + return self.getOrDefault(self.k) + @since("2.0.0") + def getMinDivisibleClusterSize(self): + """ + Gets the value of `minDivisibleClusterSize` or its default value. + """ + return self.getOrDefault(self.minDivisibleClusterSize) -class BisectingKMeansModel(JavaModel, JavaMLWritable, JavaMLReadable, HasTrainingSummary): + +class BisectingKMeansModel(JavaModel, BisectingKMeansParams, JavaMLWritable, JavaMLReadable, + HasTrainingSummary): """ Model fitted by BisectingKMeans. @@ -524,10 +595,16 @@ def summary(self): raise RuntimeError("No training summary available for this %s" % self.__class__.__name__) + @since("3.0.0") + def predict(self, value): + """ + Predict label for the given features. + """ + return self._call_java("predict", value) + @inherit_doc -class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, - HasMaxIter, HasSeed, JavaMLWritable, JavaMLReadable): +class BisectingKMeans(JavaEstimator, BisectingKMeansParams, JavaMLWritable, JavaMLReadable): """ A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" by Steinbach, Karypis, and Kumar, with modification to fit Spark. @@ -544,6 +621,12 @@ class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPred >>> df = spark.createDataFrame(data, ["features"]) >>> bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0) >>> model = bkm.fit(df) + >>> model.getMaxIter() + 20 + >>> model.setPredictionCol("newPrediction") + BisectingKMeans... + >>> model.predict(df.head().features) + 0 >>> centers = model.clusterCenters() >>> len(centers) 2 @@ -558,11 +641,11 @@ class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPred [2, 2] >>> summary.trainingCost 2.000... - >>> transformed = model.transform(df).select("features", "prediction") + >>> transformed = model.transform(df).select("features", "newPrediction") >>> rows = transformed.collect() - >>> rows[0].prediction == rows[1].prediction + >>> rows[0].newPrediction == rows[1].newPrediction True - >>> rows[2].prediction == rows[3].prediction + >>> rows[2].newPrediction == rows[3].newPrediction True >>> bkm_path = temp_path + "/bkm" >>> bkm.save(bkm_path) @@ -584,13 +667,6 @@ class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPred .. versionadded:: 2.0.0 """ - k = Param(Params._dummy(), "k", "The desired number of leaf clusters. Must be > 1.", - typeConverter=TypeConverters.toInt) - minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize", - "The minimum number of points (if >= 1.0) or the minimum " + - "proportion of points (if < 1.0) of a divisible cluster.", - typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=20, seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"): @@ -624,13 +700,6 @@ def setK(self, value): """ return self._set(k=value) - @since("2.0.0") - def getK(self): - """ - Gets the value of `k` or its default value. - """ - return self.getOrDefault(self.k) - @since("2.0.0") def setMinDivisibleClusterSize(self, value): """ @@ -638,13 +707,6 @@ def setMinDivisibleClusterSize(self, value): """ return self._set(minDivisibleClusterSize=value) - @since("2.0.0") - def getMinDivisibleClusterSize(self): - """ - Gets the value of `minDivisibleClusterSize` or its default value. - """ - return self.getOrDefault(self.minDivisibleClusterSize) - @since("2.4.0") def setDistanceMeasure(self, value): """ @@ -652,13 +714,6 @@ def setDistanceMeasure(self, value): """ return self._set(distanceMeasure=value) - @since("2.4.0") - def getDistanceMeasure(self): - """ - Gets the value of `distanceMeasure` or its default value. - """ - return self.getOrDefault(self.distanceMeasure) - def _create_model(self, java_model): return BisectingKMeansModel(java_model) @@ -681,7 +736,126 @@ def trainingCost(self): @inherit_doc -class LDAModel(JavaModel): +class LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval): + """ + (Private) Params for LDA. + + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. Must be > 1.", + typeConverter=TypeConverters.toInt) + optimizer = Param(Params._dummy(), "optimizer", + "Optimizer or inference algorithm used to estimate the LDA model. " + "Supported: online, em", typeConverter=TypeConverters.toString) + learningOffset = Param(Params._dummy(), "learningOffset", + "A (positive) learning parameter that downweights early iterations." + " Larger values make early iterations count less", + typeConverter=TypeConverters.toFloat) + learningDecay = Param(Params._dummy(), "learningDecay", "Learning rate, set as an" + "exponential decay rate. This should be between (0.5, 1.0] to " + "guarantee asymptotic convergence.", typeConverter=TypeConverters.toFloat) + subsamplingRate = Param(Params._dummy(), "subsamplingRate", + "Fraction of the corpus to be sampled and used in each iteration " + "of mini-batch gradient descent, in range (0, 1].", + typeConverter=TypeConverters.toFloat) + optimizeDocConcentration = Param(Params._dummy(), "optimizeDocConcentration", + "Indicates whether the docConcentration (Dirichlet parameter " + "for document-topic distribution) will be optimized during " + "training.", typeConverter=TypeConverters.toBoolean) + docConcentration = Param(Params._dummy(), "docConcentration", + "Concentration parameter (commonly named \"alpha\") for the " + "prior placed on documents' distributions over topics (\"theta\").", + typeConverter=TypeConverters.toListFloat) + topicConcentration = Param(Params._dummy(), "topicConcentration", + "Concentration parameter (commonly named \"beta\" or \"eta\") for " + "the prior placed on topic' distributions over terms.", + typeConverter=TypeConverters.toFloat) + topicDistributionCol = Param(Params._dummy(), "topicDistributionCol", + "Output column with estimates of the topic mixture distribution " + "for each document (often called \"theta\" in the literature). " + "Returns a vector of zeros for an empty document.", + typeConverter=TypeConverters.toString) + keepLastCheckpoint = Param(Params._dummy(), "keepLastCheckpoint", + "(For EM optimizer) If using checkpointing, this indicates whether" + " to keep the last checkpoint. If false, then the checkpoint will be" + " deleted. Deleting the checkpoint can cause failures if a data" + " partition is lost, so set this bit with care.", + TypeConverters.toBoolean) + + @since("2.0.0") + def getK(self): + """ + Gets the value of :py:attr:`k` or its default value. + """ + return self.getOrDefault(self.k) + + @since("2.0.0") + def getOptimizer(self): + """ + Gets the value of :py:attr:`optimizer` or its default value. + """ + return self.getOrDefault(self.optimizer) + + @since("2.0.0") + def getLearningOffset(self): + """ + Gets the value of :py:attr:`learningOffset` or its default value. + """ + return self.getOrDefault(self.learningOffset) + + @since("2.0.0") + def getLearningDecay(self): + """ + Gets the value of :py:attr:`learningDecay` or its default value. + """ + return self.getOrDefault(self.learningDecay) + + @since("2.0.0") + def getSubsamplingRate(self): + """ + Gets the value of :py:attr:`subsamplingRate` or its default value. + """ + return self.getOrDefault(self.subsamplingRate) + + @since("2.0.0") + def getOptimizeDocConcentration(self): + """ + Gets the value of :py:attr:`optimizeDocConcentration` or its default value. + """ + return self.getOrDefault(self.optimizeDocConcentration) + + @since("2.0.0") + def getDocConcentration(self): + """ + Gets the value of :py:attr:`docConcentration` or its default value. + """ + return self.getOrDefault(self.docConcentration) + + @since("2.0.0") + def getTopicConcentration(self): + """ + Gets the value of :py:attr:`topicConcentration` or its default value. + """ + return self.getOrDefault(self.topicConcentration) + + @since("2.0.0") + def getTopicDistributionCol(self): + """ + Gets the value of :py:attr:`topicDistributionCol` or its default value. + """ + return self.getOrDefault(self.topicDistributionCol) + + @since("2.0.0") + def getKeepLastCheckpoint(self): + """ + Gets the value of :py:attr:`keepLastCheckpoint` or its default value. + """ + return self.getOrDefault(self.keepLastCheckpoint) + + +@inherit_doc +class LDAModel(JavaModel, LDAParams): """ Latent Dirichlet Allocation (LDA) model. This abstraction permits for different underlying representations, @@ -836,8 +1010,7 @@ class LocalLDAModel(LDAModel, JavaMLReadable, JavaMLWritable): @inherit_doc -class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval, - JavaMLReadable, JavaMLWritable): +class LDA(JavaEstimator, LDAParams, JavaMLReadable, JavaMLWritable): """ Latent Dirichlet Allocation (LDA), a topic model designed for text documents. @@ -864,6 +1037,8 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter ... [2, SparseVector(2, {0: 1.0})],], ["id", "features"]) >>> lda = LDA(k=2, seed=1, optimizer="em") >>> model = lda.fit(df) + >>> model.getTopicDistributionCol() + 'topicDistribution' >>> model.isDistributed() True >>> localModel = model.toLocal() @@ -894,46 +1069,6 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter .. versionadded:: 2.0.0 """ - k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. Must be > 1.", - typeConverter=TypeConverters.toInt) - optimizer = Param(Params._dummy(), "optimizer", - "Optimizer or inference algorithm used to estimate the LDA model. " - "Supported: online, em", typeConverter=TypeConverters.toString) - learningOffset = Param(Params._dummy(), "learningOffset", - "A (positive) learning parameter that downweights early iterations." - " Larger values make early iterations count less", - typeConverter=TypeConverters.toFloat) - learningDecay = Param(Params._dummy(), "learningDecay", "Learning rate, set as an" - "exponential decay rate. This should be between (0.5, 1.0] to " - "guarantee asymptotic convergence.", typeConverter=TypeConverters.toFloat) - subsamplingRate = Param(Params._dummy(), "subsamplingRate", - "Fraction of the corpus to be sampled and used in each iteration " - "of mini-batch gradient descent, in range (0, 1].", - typeConverter=TypeConverters.toFloat) - optimizeDocConcentration = Param(Params._dummy(), "optimizeDocConcentration", - "Indicates whether the docConcentration (Dirichlet parameter " - "for document-topic distribution) will be optimized during " - "training.", typeConverter=TypeConverters.toBoolean) - docConcentration = Param(Params._dummy(), "docConcentration", - "Concentration parameter (commonly named \"alpha\") for the " - "prior placed on documents' distributions over topics (\"theta\").", - typeConverter=TypeConverters.toListFloat) - topicConcentration = Param(Params._dummy(), "topicConcentration", - "Concentration parameter (commonly named \"beta\" or \"eta\") for " - "the prior placed on topic' distributions over terms.", - typeConverter=TypeConverters.toFloat) - topicDistributionCol = Param(Params._dummy(), "topicDistributionCol", - "Output column with estimates of the topic mixture distribution " - "for each document (often called \"theta\" in the literature). " - "Returns a vector of zeros for an empty document.", - typeConverter=TypeConverters.toString) - keepLastCheckpoint = Param(Params._dummy(), "keepLastCheckpoint", - "(For EM optimizer) If using checkpointing, this indicates whether" - " to keep the last checkpoint. If false, then the checkpoint will be" - " deleted. Deleting the checkpoint can cause failures if a data" - " partition is lost, so set this bit with care.", - TypeConverters.toBoolean) - @keyword_only def __init__(self, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10, k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51, @@ -992,13 +1127,6 @@ def setK(self, value): """ return self._set(k=value) - @since("2.0.0") - def getK(self): - """ - Gets the value of :py:attr:`k` or its default value. - """ - return self.getOrDefault(self.k) - @since("2.0.0") def setOptimizer(self, value): """ @@ -1011,13 +1139,6 @@ def setOptimizer(self, value): """ return self._set(optimizer=value) - @since("2.0.0") - def getOptimizer(self): - """ - Gets the value of :py:attr:`optimizer` or its default value. - """ - return self.getOrDefault(self.optimizer) - @since("2.0.0") def setLearningOffset(self, value): """ @@ -1029,13 +1150,6 @@ def setLearningOffset(self, value): """ return self._set(learningOffset=value) - @since("2.0.0") - def getLearningOffset(self): - """ - Gets the value of :py:attr:`learningOffset` or its default value. - """ - return self.getOrDefault(self.learningOffset) - @since("2.0.0") def setLearningDecay(self, value): """ @@ -1047,13 +1161,6 @@ def setLearningDecay(self, value): """ return self._set(learningDecay=value) - @since("2.0.0") - def getLearningDecay(self): - """ - Gets the value of :py:attr:`learningDecay` or its default value. - """ - return self.getOrDefault(self.learningDecay) - @since("2.0.0") def setSubsamplingRate(self, value): """ @@ -1065,13 +1172,6 @@ def setSubsamplingRate(self, value): """ return self._set(subsamplingRate=value) - @since("2.0.0") - def getSubsamplingRate(self): - """ - Gets the value of :py:attr:`subsamplingRate` or its default value. - """ - return self.getOrDefault(self.subsamplingRate) - @since("2.0.0") def setOptimizeDocConcentration(self, value): """ @@ -1083,13 +1183,6 @@ def setOptimizeDocConcentration(self, value): """ return self._set(optimizeDocConcentration=value) - @since("2.0.0") - def getOptimizeDocConcentration(self): - """ - Gets the value of :py:attr:`optimizeDocConcentration` or its default value. - """ - return self.getOrDefault(self.optimizeDocConcentration) - @since("2.0.0") def setDocConcentration(self, value): """ @@ -1101,13 +1194,6 @@ def setDocConcentration(self, value): """ return self._set(docConcentration=value) - @since("2.0.0") - def getDocConcentration(self): - """ - Gets the value of :py:attr:`docConcentration` or its default value. - """ - return self.getOrDefault(self.docConcentration) - @since("2.0.0") def setTopicConcentration(self, value): """ @@ -1119,13 +1205,6 @@ def setTopicConcentration(self, value): """ return self._set(topicConcentration=value) - @since("2.0.0") - def getTopicConcentration(self): - """ - Gets the value of :py:attr:`topicConcentration` or its default value. - """ - return self.getOrDefault(self.topicConcentration) - @since("2.0.0") def setTopicDistributionCol(self, value): """ @@ -1137,13 +1216,6 @@ def setTopicDistributionCol(self, value): """ return self._set(topicDistributionCol=value) - @since("2.0.0") - def getTopicDistributionCol(self): - """ - Gets the value of :py:attr:`topicDistributionCol` or its default value. - """ - return self.getOrDefault(self.topicDistributionCol) - @since("2.0.0") def setKeepLastCheckpoint(self, value): """ @@ -1155,16 +1227,62 @@ def setKeepLastCheckpoint(self, value): """ return self._set(keepLastCheckpoint=value) - @since("2.0.0") - def getKeepLastCheckpoint(self): + +@inherit_doc +class PowerIterationClusteringParams(HasMaxIter, HasWeightCol): + """ + (Private) Params for PowerIterationClustering. + + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", + "The number of clusters to create. Must be > 1.", + typeConverter=TypeConverters.toInt) + initMode = Param(Params._dummy(), "initMode", + "The initialization algorithm. This can be either " + + "'random' to use a random vector as vertex properties, or 'degree' to use " + + "a normalized sum of similarities with other vertices. Supported options: " + + "'random' and 'degree'.", + typeConverter=TypeConverters.toString) + srcCol = Param(Params._dummy(), "srcCol", + "Name of the input column for source vertex IDs.", + typeConverter=TypeConverters.toString) + dstCol = Param(Params._dummy(), "dstCol", + "Name of the input column for destination vertex IDs.", + typeConverter=TypeConverters.toString) + + @since("2.4.0") + def getK(self): """ - Gets the value of :py:attr:`keepLastCheckpoint` or its default value. + Gets the value of :py:attr:`k` or its default value. """ - return self.getOrDefault(self.keepLastCheckpoint) + return self.getOrDefault(self.k) + + @since("2.4.0") + def getInitMode(self): + """ + Gets the value of :py:attr:`initMode` or its default value. + """ + return self.getOrDefault(self.initMode) + + @since("2.4.0") + def getSrcCol(self): + """ + Gets the value of :py:attr:`srcCol` or its default value. + """ + return self.getOrDefault(self.srcCol) + + @since("2.4.0") + def getDstCol(self): + """ + Gets the value of :py:attr:`dstCol` or its default value. + """ + return self.getOrDefault(self.dstCol) @inherit_doc -class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReadable, +class PowerIterationClustering(PowerIterationClusteringParams, JavaParams, JavaMLReadable, JavaMLWritable): """ Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by @@ -1184,7 +1302,9 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada ... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), ... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)] >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight").repartition(1) - >>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight") + >>> pic = PowerIterationClustering(k=2, weightCol="weight") + >>> pic.setMaxIter(40) + PowerIterationClustering... >>> assignments = pic.assignClusters(df) >>> assignments.sort(assignments.id).show(truncate=False) +---+-------+ @@ -1209,22 +1329,6 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada .. versionadded:: 2.4.0 """ - k = Param(Params._dummy(), "k", - "The number of clusters to create. Must be > 1.", - typeConverter=TypeConverters.toInt) - initMode = Param(Params._dummy(), "initMode", - "The initialization algorithm. This can be either " + - "'random' to use a random vector as vertex properties, or 'degree' to use " + - "a normalized sum of similarities with other vertices. Supported options: " + - "'random' and 'degree'.", - typeConverter=TypeConverters.toString) - srcCol = Param(Params._dummy(), "srcCol", - "Name of the input column for source vertex IDs.", - typeConverter=TypeConverters.toString) - dstCol = Param(Params._dummy(), "dstCol", - "Name of the input column for destination vertex IDs.", - typeConverter=TypeConverters.toString) - @keyword_only def __init__(self, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst", weightCol=None): @@ -1258,13 +1362,6 @@ def setK(self, value): """ return self._set(k=value) - @since("2.4.0") - def getK(self): - """ - Gets the value of :py:attr:`k` or its default value. - """ - return self.getOrDefault(self.k) - @since("2.4.0") def setInitMode(self, value): """ @@ -1272,13 +1369,6 @@ def setInitMode(self, value): """ return self._set(initMode=value) - @since("2.4.0") - def getInitMode(self): - """ - Gets the value of :py:attr:`initMode` or its default value. - """ - return self.getOrDefault(self.initMode) - @since("2.4.0") def setSrcCol(self, value): """ @@ -1286,13 +1376,6 @@ def setSrcCol(self, value): """ return self._set(srcCol=value) - @since("2.4.0") - def getSrcCol(self): - """ - Gets the value of :py:attr:`srcCol` or its default value. - """ - return self.getOrDefault(self.srcCol) - @since("2.4.0") def setDstCol(self, value): """ @@ -1300,13 +1383,6 @@ def setDstCol(self, value): """ return self._set(dstCol=value) - @since("2.4.0") - def getDstCol(self): - """ - Gets the value of :py:attr:`dstCol` or its default value. - """ - return self.getOrDefault(self.dstCol) - @since("2.4.0") def assignClusters(self, dataset): """ From 4af0dc6f2c3e2df515c800af4399024c43ddb36f Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 24 Sep 2019 09:49:18 -0700 Subject: [PATCH 2/4] address comments --- python/pyspark/ml/clustering.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 52597a15c5ba3..67d8eb5a7c3f1 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -163,6 +163,13 @@ def predict(self, value): """ return self._call_java("predict", value) + @since("3.0.0") + def predictProbability(self, value): + """ + Predict probability for the given features. + """ + return self._call_java("predictProbability", value) + @inherit_doc class GaussianMixture(JavaEstimator, GaussianMixtureParams, JavaMLWritable, JavaMLReadable): @@ -202,6 +209,8 @@ class GaussianMixture(JavaEstimator, GaussianMixtureParams, JavaMLWritable, Java GaussianMixture... >>> model.predict(df.head().features) 2 + >>> model.predictProbability(df.head().features) + DenseVector([0.0, 0.4736, 0.5264]) >>> model.hasSummary True >>> summary = model.summary From 6e6a5d5e2df2c2c1d185ad39dc191e8b51ad0763 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 26 Sep 2019 10:12:29 -0700 Subject: [PATCH 3/4] add _ in front of xxxParams to indicate internal use (PEP8) --- python/pyspark/ml/clustering.py | 44 ++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 67d8eb5a7c3f1..9b50245402ae3 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -96,10 +96,10 @@ def numIter(self): @inherit_doc -class GaussianMixtureParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, - HasProbabilityCol, HasTol): +class _GaussianMixtureParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, + HasProbabilityCol, HasTol): """ - (Private) Params for GaussianMixture. + Params for :py:attr:`GaussianMixture` and :py:attr:`GaussianMixtureModel`. .. versionadded:: 3.0.0 """ @@ -115,7 +115,7 @@ def getK(self): return self.getOrDefault(self.k) -class GaussianMixtureModel(JavaModel, GaussianMixtureParams, JavaMLWritable, JavaMLReadable, +class GaussianMixtureModel(JavaModel, _GaussianMixtureParams, JavaMLWritable, JavaMLReadable, HasTrainingSummary): """ Model fitted by GaussianMixture. @@ -172,7 +172,7 @@ def predictProbability(self, value): @inherit_doc -class GaussianMixture(JavaEstimator, GaussianMixtureParams, JavaMLWritable, JavaMLReadable): +class GaussianMixture(JavaEstimator, _GaussianMixtureParams, JavaMLWritable, JavaMLReadable): """ GaussianMixture clustering. This class performs expectation maximization for multivariate Gaussian @@ -341,10 +341,10 @@ def trainingCost(self): @inherit_doc -class KMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, HasTol, - HasDistanceMeasure): +class _KMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, HasTol, + HasDistanceMeasure): """ - (Private) Params for KMeans. + Params for :py:attr:`KMeans` and :py:attr:`KMeansModel`. .. versionadded:: 3.0.0 """ @@ -381,7 +381,7 @@ def getInitSteps(self): return self.getOrDefault(self.initSteps) -class KMeansModel(JavaModel, KMeansParams, GeneralJavaMLWritable, JavaMLReadable, +class KMeansModel(JavaModel, _KMeansParams, GeneralJavaMLWritable, JavaMLReadable, HasTrainingSummary): """ Model fitted by KMeans. @@ -416,7 +416,7 @@ def predict(self, value): @inherit_doc -class KMeans(JavaEstimator, KMeansParams, JavaMLWritable, JavaMLReadable): +class KMeans(JavaEstimator, _KMeansParams, JavaMLWritable, JavaMLReadable): """ K-means clustering with a k-means++ like initialization mode (the k-means|| algorithm by Bahmani et al). @@ -533,10 +533,10 @@ def setDistanceMeasure(self, value): @inherit_doc -class BisectingKMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, - HasDistanceMeasure): +class _BisectingKMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, + HasDistanceMeasure): """ - (Private) Params for BisectingKMeans. + Params for :py:attr:`BisectingKMeans` and :py:attr:`BisectingKMeansModel`. .. versionadded:: 3.0.0 """ @@ -563,7 +563,7 @@ def getMinDivisibleClusterSize(self): return self.getOrDefault(self.minDivisibleClusterSize) -class BisectingKMeansModel(JavaModel, BisectingKMeansParams, JavaMLWritable, JavaMLReadable, +class BisectingKMeansModel(JavaModel, _BisectingKMeansParams, JavaMLWritable, JavaMLReadable, HasTrainingSummary): """ Model fitted by BisectingKMeans. @@ -613,7 +613,7 @@ def predict(self, value): @inherit_doc -class BisectingKMeans(JavaEstimator, BisectingKMeansParams, JavaMLWritable, JavaMLReadable): +class BisectingKMeans(JavaEstimator, _BisectingKMeansParams, JavaMLWritable, JavaMLReadable): """ A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" by Steinbach, Karypis, and Kumar, with modification to fit Spark. @@ -745,9 +745,9 @@ def trainingCost(self): @inherit_doc -class LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval): +class _LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval): """ - (Private) Params for LDA. + Params for :py:attr:`LDA` and :py:attr:`LDAModel`. .. versionadded:: 3.0.0 """ @@ -864,7 +864,7 @@ def getKeepLastCheckpoint(self): @inherit_doc -class LDAModel(JavaModel, LDAParams): +class LDAModel(JavaModel, _LDAParams): """ Latent Dirichlet Allocation (LDA) model. This abstraction permits for different underlying representations, @@ -1019,7 +1019,7 @@ class LocalLDAModel(LDAModel, JavaMLReadable, JavaMLWritable): @inherit_doc -class LDA(JavaEstimator, LDAParams, JavaMLReadable, JavaMLWritable): +class LDA(JavaEstimator, _LDAParams, JavaMLReadable, JavaMLWritable): """ Latent Dirichlet Allocation (LDA), a topic model designed for text documents. @@ -1238,9 +1238,9 @@ def setKeepLastCheckpoint(self, value): @inherit_doc -class PowerIterationClusteringParams(HasMaxIter, HasWeightCol): +class _PowerIterationClusteringParams(HasMaxIter, HasWeightCol): """ - (Private) Params for PowerIterationClustering. + Params for :py:attr:`PowerIterationClustering` and :py:attr:`PowerIterationClusteringModel`. .. versionadded:: 3.0.0 """ @@ -1291,7 +1291,7 @@ def getDstCol(self): @inherit_doc -class PowerIterationClustering(PowerIterationClusteringParams, JavaParams, JavaMLReadable, +class PowerIterationClustering(_PowerIterationClusteringParams, JavaParams, JavaMLReadable, JavaMLWritable): """ Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by From 712ef78ca4c01c7c93d5512ccdbe042f52bb00cd Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 26 Sep 2019 10:17:25 -0700 Subject: [PATCH 4/4] fix a problem --- python/pyspark/ml/clustering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 9b50245402ae3..02219d35c3484 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -1240,7 +1240,7 @@ def setKeepLastCheckpoint(self, value): @inherit_doc class _PowerIterationClusteringParams(HasMaxIter, HasWeightCol): """ - Params for :py:attr:`PowerIterationClustering` and :py:attr:`PowerIterationClusteringModel`. + Params for :py:attr:`PowerIterationClustering`. .. versionadded:: 3.0.0 """