diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 9b21aacacd710..02219d35c3484 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -95,7 +95,28 @@ def numIter(self):
         return self._call_java("numIter")
 
 
-class GaussianMixtureModel(JavaModel, JavaMLWritable, JavaMLReadable, HasTrainingSummary):
+@inherit_doc
+class _GaussianMixtureParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol,
+                             HasProbabilityCol, HasTol):
+    """
+    Params for :py:attr:`GaussianMixture` and :py:attr:`GaussianMixtureModel`.
+
+    .. versionadded:: 3.0.0
+    """
+
+    k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " +
+              "Must be > 1.", typeConverter=TypeConverters.toInt)
+
+    @since("2.0.0")
+    def getK(self):
+        """
+        Gets the value of `k`
+        """
+        return self.getOrDefault(self.k)
+
+
+class GaussianMixtureModel(JavaModel, _GaussianMixtureParams, JavaMLWritable, JavaMLReadable,
+                           HasTrainingSummary):
     """
     Model fitted by GaussianMixture.
 
@@ -135,10 +156,23 @@ def summary(self):
         raise RuntimeError("No training summary available for this %s" %
                            self.__class__.__name__)
 
+    @since("3.0.0")
+    def predict(self, value):
+        """
+        Predict label for the given features.
+        """
+        return self._call_java("predict", value)
+
+    @since("3.0.0")
+    def predictProbability(self, value):
+        """
+        Predict probability for the given features.
+        """
+        return self._call_java("predictProbability", value)
+
 
 @inherit_doc
-class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed,
-                      HasProbabilityCol, JavaMLWritable, JavaMLReadable):
+class GaussianMixture(JavaEstimator, _GaussianMixtureParams, JavaMLWritable, JavaMLReadable):
     """
     GaussianMixture clustering.
     This class performs expectation maximization for multivariate Gaussian
@@ -169,6 +203,14 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     >>> gm = GaussianMixture(k=3, tol=0.0001,
     ...                      maxIter=10, seed=10)
     >>> model = gm.fit(df)
+    >>> model.getFeaturesCol()
+    'features'
+    >>> model.setPredictionCol("newPrediction")
+    GaussianMixture...
+    >>> model.predict(df.head().features)
+    2
+    >>> model.predictProbability(df.head().features)
+    DenseVector([0.0, 0.4736, 0.5264])
     >>> model.hasSummary
     True
     >>> summary = model.summary
@@ -185,11 +227,11 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     Row(mean=DenseVector([0.825, 0.8675]))
     >>> model.gaussiansDF.select("cov").head()
     Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False))
-    >>> transformed = model.transform(df).select("features", "prediction")
+    >>> transformed = model.transform(df).select("features", "newPrediction")
     >>> rows = transformed.collect()
-    >>> rows[4].prediction == rows[5].prediction
+    >>> rows[4].newPrediction == rows[5].newPrediction
     True
-    >>> rows[2].prediction == rows[3].prediction
+    >>> rows[2].newPrediction == rows[3].newPrediction
     True
     >>> gmm_path = temp_path + "/gmm"
     >>> gm.save(gmm_path)
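Note (sketch, not part of the patch): the point of `_GaussianMixtureParams` is that the estimator
and the model now share one set of getters. Reusing the doctest `df` above, with values following
directly from `k=3`:

    >>> gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10)
    >>> model = gm.fit(df)
    >>> gm.getK()     # defined once, in _GaussianMixtureParams
    3
    >>> model.getK()  # previously not available on the model
    3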
" + - "Must be > 1.", typeConverter=TypeConverters.toInt) - @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", k=2, probabilityCol="probability", tol=0.01, maxIter=100, seed=None): @@ -251,13 +290,6 @@ def setK(self, value): """ return self._set(k=value) - @since("2.0.0") - def getK(self): - """ - Gets the value of `k` - """ - return self.getOrDefault(self.k) - class GaussianMixtureSummary(ClusteringSummary): """ @@ -308,7 +340,49 @@ def trainingCost(self): return self._call_java("trainingCost") -class KMeansModel(JavaModel, GeneralJavaMLWritable, JavaMLReadable, HasTrainingSummary): +@inherit_doc +class _KMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, HasTol, + HasDistanceMeasure): + """ + Params for :py:attr:`KMeans` and :py:attr:`KMeansModel`. + + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", "The number of clusters to create. Must be > 1.", + typeConverter=TypeConverters.toInt) + initMode = Param(Params._dummy(), "initMode", + "The initialization algorithm. This can be either \"random\" to " + + "choose random points as initial cluster centers, or \"k-means||\" " + + "to use a parallel variant of k-means++", + typeConverter=TypeConverters.toString) + initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " + + "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt) + + @since("1.5.0") + def getK(self): + """ + Gets the value of `k` + """ + return self.getOrDefault(self.k) + + @since("1.5.0") + def getInitMode(self): + """ + Gets the value of `initMode` + """ + return self.getOrDefault(self.initMode) + + @since("1.5.0") + def getInitSteps(self): + """ + Gets the value of `initSteps` + """ + return self.getOrDefault(self.initSteps) + + +class KMeansModel(JavaModel, _KMeansParams, GeneralJavaMLWritable, JavaMLReadable, + HasTrainingSummary): """ Model fitted by KMeans. @@ -333,10 +407,16 @@ def summary(self): raise RuntimeError("No training summary available for this %s" % self.__class__.__name__) + @since("3.0.0") + def predict(self, value): + """ + Predict label for the given features. + """ + return self._call_java("predict", value) + @inherit_doc -class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, HasMaxIter, - HasTol, HasSeed, JavaMLWritable, JavaMLReadable): +class KMeans(JavaEstimator, _KMeansParams, JavaMLWritable, JavaMLReadable): """ K-means clustering with a k-means++ like initialization mode (the k-means|| algorithm by Bahmani et al). @@ -347,14 +427,20 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol >>> df = spark.createDataFrame(data, ["features"]) >>> kmeans = KMeans(k=2, seed=1) >>> model = kmeans.fit(df) + >>> model.getDistanceMeasure() + 'euclidean' + >>> model.setPredictionCol("newPrediction") + KMeans... + >>> model.predict(df.head().features) + 0 >>> centers = model.clusterCenters() >>> len(centers) 2 - >>> transformed = model.transform(df).select("features", "prediction") + >>> transformed = model.transform(df).select("features", "newPrediction") >>> rows = transformed.collect() - >>> rows[0].prediction == rows[1].prediction + >>> rows[0].newPrediction == rows[1].newPrediction True - >>> rows[2].prediction == rows[3].prediction + >>> rows[2].newPrediction == rows[3].newPrediction True >>> model.hasSummary True @@ -383,16 +469,6 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol .. 
@@ -383,16 +469,6 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol
     .. versionadded:: 1.5.0
     """
 
-    k = Param(Params._dummy(), "k", "The number of clusters to create. Must be > 1.",
-              typeConverter=TypeConverters.toInt)
-    initMode = Param(Params._dummy(), "initMode",
-                     "The initialization algorithm. This can be either \"random\" to " +
-                     "choose random points as initial cluster centers, or \"k-means||\" " +
-                     "to use a parallel variant of k-means++",
-                     typeConverter=TypeConverters.toString)
-    initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " +
-                      "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt)
-
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
                  initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None,
@@ -434,13 +510,6 @@ def setK(self, value):
         """
         return self._set(k=value)
 
-    @since("1.5.0")
-    def getK(self):
-        """
-        Gets the value of `k`
-        """
-        return self.getOrDefault(self.k)
-
     @since("1.5.0")
     def setInitMode(self, value):
         """
@@ -448,13 +517,6 @@ def setInitMode(self, value):
         """
         return self._set(initMode=value)
 
-    @since("1.5.0")
-    def getInitMode(self):
-        """
-        Gets the value of `initMode`
-        """
-        return self.getOrDefault(self.initMode)
-
     @since("1.5.0")
     def setInitSteps(self, value):
         """
@@ -462,13 +524,6 @@ def setInitSteps(self, value):
         """
         return self._set(initSteps=value)
 
-    @since("1.5.0")
-    def getInitSteps(self):
-        """
-        Gets the value of `initSteps`
-        """
-        return self.getOrDefault(self.initSteps)
-
     @since("2.4.0")
     def setDistanceMeasure(self, value):
         """
@@ -476,15 +531,40 @@ def setDistanceMeasure(self, value):
         """
         return self._set(distanceMeasure=value)
 
-    @since("2.4.0")
-    def getDistanceMeasure(self):
+
+@inherit_doc
+class _BisectingKMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol,
+                             HasDistanceMeasure):
+    """
+    Params for :py:attr:`BisectingKMeans` and :py:attr:`BisectingKMeansModel`.
+
+    .. versionadded:: 3.0.0
+    """
+
+    k = Param(Params._dummy(), "k", "The desired number of leaf clusters. Must be > 1.",
+              typeConverter=TypeConverters.toInt)
+    minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize",
+                                    "The minimum number of points (if >= 1.0) or the minimum " +
+                                    "proportion of points (if < 1.0) of a divisible cluster.",
+                                    typeConverter=TypeConverters.toFloat)
+
+    @since("2.0.0")
+    def getK(self):
         """
-        Gets the value of `distanceMeasure`
+        Gets the value of `k` or its default value.
         """
-        return self.getOrDefault(self.distanceMeasure)
+        return self.getOrDefault(self.k)
+
+    @since("2.0.0")
+    def getMinDivisibleClusterSize(self):
+        """
+        Gets the value of `minDivisibleClusterSize` or its default value.
+        """
+        return self.getOrDefault(self.minDivisibleClusterSize)
 
 
-class BisectingKMeansModel(JavaModel, JavaMLWritable, JavaMLReadable, HasTrainingSummary):
+class BisectingKMeansModel(JavaModel, _BisectingKMeansParams, JavaMLWritable, JavaMLReadable,
+                           HasTrainingSummary):
     """
     Model fitted by BisectingKMeans.
 
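Note (sketch, not part of the patch): the refactoring pattern applied throughout this file,
reduced to a toy with hypothetical names — params and their getters are declared once in a
private mixin, and both the estimator and its model inherit them, so the two APIs cannot drift
apart:

    class _ToyParams(Params):
        k = Param(Params._dummy(), "k", "number of clusters",
                  typeConverter=TypeConverters.toInt)

        def getK(self):                      # getter lives once, in the mixin
            return self.getOrDefault(self.k)

    class ToyEstimator(JavaEstimator, _ToyParams):
        def setK(self, value):               # setters stay on the estimator
            return self._set(k=value)

    class ToyModel(JavaModel, _ToyParams):   # the model gets the getters for free
        pass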
+ """ + return self._call_java("predict", value) + @inherit_doc -class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, - HasMaxIter, HasSeed, JavaMLWritable, JavaMLReadable): +class BisectingKMeans(JavaEstimator, _BisectingKMeansParams, JavaMLWritable, JavaMLReadable): """ A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" by Steinbach, Karypis, and Kumar, with modification to fit Spark. @@ -544,6 +630,12 @@ class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPred >>> df = spark.createDataFrame(data, ["features"]) >>> bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0) >>> model = bkm.fit(df) + >>> model.getMaxIter() + 20 + >>> model.setPredictionCol("newPrediction") + BisectingKMeans... + >>> model.predict(df.head().features) + 0 >>> centers = model.clusterCenters() >>> len(centers) 2 @@ -558,11 +650,11 @@ class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPred [2, 2] >>> summary.trainingCost 2.000... - >>> transformed = model.transform(df).select("features", "prediction") + >>> transformed = model.transform(df).select("features", "newPrediction") >>> rows = transformed.collect() - >>> rows[0].prediction == rows[1].prediction + >>> rows[0].newPrediction == rows[1].newPrediction True - >>> rows[2].prediction == rows[3].prediction + >>> rows[2].newPrediction == rows[3].newPrediction True >>> bkm_path = temp_path + "/bkm" >>> bkm.save(bkm_path) @@ -584,13 +676,6 @@ class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPred .. versionadded:: 2.0.0 """ - k = Param(Params._dummy(), "k", "The desired number of leaf clusters. Must be > 1.", - typeConverter=TypeConverters.toInt) - minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize", - "The minimum number of points (if >= 1.0) or the minimum " + - "proportion of points (if < 1.0) of a divisible cluster.", - typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=20, seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"): @@ -624,13 +709,6 @@ def setK(self, value): """ return self._set(k=value) - @since("2.0.0") - def getK(self): - """ - Gets the value of `k` or its default value. - """ - return self.getOrDefault(self.k) - @since("2.0.0") def setMinDivisibleClusterSize(self, value): """ @@ -638,13 +716,6 @@ def setMinDivisibleClusterSize(self, value): """ return self._set(minDivisibleClusterSize=value) - @since("2.0.0") - def getMinDivisibleClusterSize(self): - """ - Gets the value of `minDivisibleClusterSize` or its default value. - """ - return self.getOrDefault(self.minDivisibleClusterSize) - @since("2.4.0") def setDistanceMeasure(self, value): """ @@ -652,13 +723,6 @@ def setDistanceMeasure(self, value): """ return self._set(distanceMeasure=value) - @since("2.4.0") - def getDistanceMeasure(self): - """ - Gets the value of `distanceMeasure` or its default value. - """ - return self.getOrDefault(self.distanceMeasure) - def _create_model(self, java_model): return BisectingKMeansModel(java_model) @@ -681,7 +745,126 @@ def trainingCost(self): @inherit_doc -class LDAModel(JavaModel): +class _LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval): + """ + Params for :py:attr:`LDA` and :py:attr:`LDAModel`. + + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. 
@@ -681,7 +745,126 @@ def trainingCost(self):
 
 
 @inherit_doc
-class LDAModel(JavaModel):
+class _LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval):
+    """
+    Params for :py:attr:`LDA` and :py:attr:`LDAModel`.
+
+    .. versionadded:: 3.0.0
+    """
+
+    k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. Must be > 1.",
+              typeConverter=TypeConverters.toInt)
+    optimizer = Param(Params._dummy(), "optimizer",
+                      "Optimizer or inference algorithm used to estimate the LDA model. "
+                      "Supported: online, em", typeConverter=TypeConverters.toString)
+    learningOffset = Param(Params._dummy(), "learningOffset",
+                           "A (positive) learning parameter that downweights early iterations."
+                           " Larger values make early iterations count less",
+                           typeConverter=TypeConverters.toFloat)
+    learningDecay = Param(Params._dummy(), "learningDecay", "Learning rate, set as an"
+                          "exponential decay rate. This should be between (0.5, 1.0] to "
+                          "guarantee asymptotic convergence.", typeConverter=TypeConverters.toFloat)
+    subsamplingRate = Param(Params._dummy(), "subsamplingRate",
+                            "Fraction of the corpus to be sampled and used in each iteration "
+                            "of mini-batch gradient descent, in range (0, 1].",
+                            typeConverter=TypeConverters.toFloat)
+    optimizeDocConcentration = Param(Params._dummy(), "optimizeDocConcentration",
+                                     "Indicates whether the docConcentration (Dirichlet parameter "
+                                     "for document-topic distribution) will be optimized during "
+                                     "training.", typeConverter=TypeConverters.toBoolean)
+    docConcentration = Param(Params._dummy(), "docConcentration",
+                             "Concentration parameter (commonly named \"alpha\") for the "
+                             "prior placed on documents' distributions over topics (\"theta\").",
+                             typeConverter=TypeConverters.toListFloat)
+    topicConcentration = Param(Params._dummy(), "topicConcentration",
+                               "Concentration parameter (commonly named \"beta\" or \"eta\") for "
+                               "the prior placed on topic' distributions over terms.",
+                               typeConverter=TypeConverters.toFloat)
+    topicDistributionCol = Param(Params._dummy(), "topicDistributionCol",
+                                 "Output column with estimates of the topic mixture distribution "
+                                 "for each document (often called \"theta\" in the literature). "
+                                 "Returns a vector of zeros for an empty document.",
+                                 typeConverter=TypeConverters.toString)
+    keepLastCheckpoint = Param(Params._dummy(), "keepLastCheckpoint",
+                               "(For EM optimizer) If using checkpointing, this indicates whether"
+                               " to keep the last checkpoint. If false, then the checkpoint will be"
+                               " deleted. Deleting the checkpoint can cause failures if a data"
+                               " partition is lost, so set this bit with care.",
+                               TypeConverters.toBoolean)
+
+    @since("2.0.0")
+    def getK(self):
+        """
+        Gets the value of :py:attr:`k` or its default value.
+        """
+        return self.getOrDefault(self.k)
+
+    @since("2.0.0")
+    def getOptimizer(self):
+        """
+        Gets the value of :py:attr:`optimizer` or its default value.
+        """
+        return self.getOrDefault(self.optimizer)
+
+    @since("2.0.0")
+    def getLearningOffset(self):
+        """
+        Gets the value of :py:attr:`learningOffset` or its default value.
+        """
+        return self.getOrDefault(self.learningOffset)
+
+    @since("2.0.0")
+    def getLearningDecay(self):
+        """
+        Gets the value of :py:attr:`learningDecay` or its default value.
+        """
+        return self.getOrDefault(self.learningDecay)
+
+    @since("2.0.0")
+    def getSubsamplingRate(self):
+        """
+        Gets the value of :py:attr:`subsamplingRate` or its default value.
+        """
+        return self.getOrDefault(self.subsamplingRate)
+
+    @since("2.0.0")
+    def getOptimizeDocConcentration(self):
+        """
+        Gets the value of :py:attr:`optimizeDocConcentration` or its default value.
+        """
+        return self.getOrDefault(self.optimizeDocConcentration)
+
+    @since("2.0.0")
+    def getDocConcentration(self):
+        """
+        Gets the value of :py:attr:`docConcentration` or its default value.
+ """ + return self.getOrDefault(self.docConcentration) + + @since("2.0.0") + def getTopicConcentration(self): + """ + Gets the value of :py:attr:`topicConcentration` or its default value. + """ + return self.getOrDefault(self.topicConcentration) + + @since("2.0.0") + def getTopicDistributionCol(self): + """ + Gets the value of :py:attr:`topicDistributionCol` or its default value. + """ + return self.getOrDefault(self.topicDistributionCol) + + @since("2.0.0") + def getKeepLastCheckpoint(self): + """ + Gets the value of :py:attr:`keepLastCheckpoint` or its default value. + """ + return self.getOrDefault(self.keepLastCheckpoint) + + +@inherit_doc +class LDAModel(JavaModel, _LDAParams): """ Latent Dirichlet Allocation (LDA) model. This abstraction permits for different underlying representations, @@ -836,8 +1019,7 @@ class LocalLDAModel(LDAModel, JavaMLReadable, JavaMLWritable): @inherit_doc -class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval, - JavaMLReadable, JavaMLWritable): +class LDA(JavaEstimator, _LDAParams, JavaMLReadable, JavaMLWritable): """ Latent Dirichlet Allocation (LDA), a topic model designed for text documents. @@ -864,6 +1046,8 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter ... [2, SparseVector(2, {0: 1.0})],], ["id", "features"]) >>> lda = LDA(k=2, seed=1, optimizer="em") >>> model = lda.fit(df) + >>> model.getTopicDistributionCol() + 'topicDistribution' >>> model.isDistributed() True >>> localModel = model.toLocal() @@ -894,46 +1078,6 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter .. versionadded:: 2.0.0 """ - k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. Must be > 1.", - typeConverter=TypeConverters.toInt) - optimizer = Param(Params._dummy(), "optimizer", - "Optimizer or inference algorithm used to estimate the LDA model. " - "Supported: online, em", typeConverter=TypeConverters.toString) - learningOffset = Param(Params._dummy(), "learningOffset", - "A (positive) learning parameter that downweights early iterations." - " Larger values make early iterations count less", - typeConverter=TypeConverters.toFloat) - learningDecay = Param(Params._dummy(), "learningDecay", "Learning rate, set as an" - "exponential decay rate. 
@@ -894,46 +1078,6 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
     .. versionadded:: 2.0.0
     """
 
-    k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. Must be > 1.",
-              typeConverter=TypeConverters.toInt)
-    optimizer = Param(Params._dummy(), "optimizer",
-                      "Optimizer or inference algorithm used to estimate the LDA model. "
-                      "Supported: online, em", typeConverter=TypeConverters.toString)
-    learningOffset = Param(Params._dummy(), "learningOffset",
-                           "A (positive) learning parameter that downweights early iterations."
-                           " Larger values make early iterations count less",
-                           typeConverter=TypeConverters.toFloat)
-    learningDecay = Param(Params._dummy(), "learningDecay", "Learning rate, set as an"
-                          "exponential decay rate. This should be between (0.5, 1.0] to "
-                          "guarantee asymptotic convergence.", typeConverter=TypeConverters.toFloat)
-    subsamplingRate = Param(Params._dummy(), "subsamplingRate",
-                            "Fraction of the corpus to be sampled and used in each iteration "
-                            "of mini-batch gradient descent, in range (0, 1].",
-                            typeConverter=TypeConverters.toFloat)
-    optimizeDocConcentration = Param(Params._dummy(), "optimizeDocConcentration",
-                                     "Indicates whether the docConcentration (Dirichlet parameter "
-                                     "for document-topic distribution) will be optimized during "
-                                     "training.", typeConverter=TypeConverters.toBoolean)
-    docConcentration = Param(Params._dummy(), "docConcentration",
-                             "Concentration parameter (commonly named \"alpha\") for the "
-                             "prior placed on documents' distributions over topics (\"theta\").",
-                             typeConverter=TypeConverters.toListFloat)
-    topicConcentration = Param(Params._dummy(), "topicConcentration",
-                               "Concentration parameter (commonly named \"beta\" or \"eta\") for "
-                               "the prior placed on topic' distributions over terms.",
-                               typeConverter=TypeConverters.toFloat)
-    topicDistributionCol = Param(Params._dummy(), "topicDistributionCol",
-                                 "Output column with estimates of the topic mixture distribution "
-                                 "for each document (often called \"theta\" in the literature). "
-                                 "Returns a vector of zeros for an empty document.",
-                                 typeConverter=TypeConverters.toString)
-    keepLastCheckpoint = Param(Params._dummy(), "keepLastCheckpoint",
-                               "(For EM optimizer) If using checkpointing, this indicates whether"
-                               " to keep the last checkpoint. If false, then the checkpoint will be"
-                               " deleted. Deleting the checkpoint can cause failures if a data"
-                               " partition is lost, so set this bit with care.",
-                               TypeConverters.toBoolean)
-
     @keyword_only
     def __init__(self, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10,
                  k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,
@@ -992,13 +1136,6 @@ def setK(self, value):
         """
         return self._set(k=value)
 
-    @since("2.0.0")
-    def getK(self):
-        """
-        Gets the value of :py:attr:`k` or its default value.
-        """
-        return self.getOrDefault(self.k)
-
     @since("2.0.0")
     def setOptimizer(self, value):
         """
@@ -1011,13 +1148,6 @@ def setOptimizer(self, value):
         """
         return self._set(optimizer=value)
 
-    @since("2.0.0")
-    def getOptimizer(self):
-        """
-        Gets the value of :py:attr:`optimizer` or its default value.
-        """
-        return self.getOrDefault(self.optimizer)
-
     @since("2.0.0")
     def setLearningOffset(self, value):
         """
@@ -1029,13 +1159,6 @@ def setLearningOffset(self, value):
         """
         return self._set(learningOffset=value)
 
-    @since("2.0.0")
-    def getLearningOffset(self):
-        """
-        Gets the value of :py:attr:`learningOffset` or its default value.
-        """
-        return self.getOrDefault(self.learningOffset)
-
     @since("2.0.0")
     def setLearningDecay(self, value):
         """
@@ -1047,13 +1170,6 @@ def setLearningDecay(self, value):
         """
         return self._set(learningDecay=value)
 
-    @since("2.0.0")
-    def getLearningDecay(self):
-        """
-        Gets the value of :py:attr:`learningDecay` or its default value.
-        """
-        return self.getOrDefault(self.learningDecay)
-
     @since("2.0.0")
     def setSubsamplingRate(self, value):
         """
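Note (sketch, not part of the patch): only the duplicated getters move out of `LDA`; the setters
stay behind and still return `self` via `_set`, so configuration continues to chain:

    >>> lda = LDA()
    >>> lda.setLearningDecay(0.6).getLearningDecay()
    0.6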
- """ - return self.getOrDefault(self.subsamplingRate) - @since("2.0.0") def setOptimizeDocConcentration(self, value): """ @@ -1083,13 +1192,6 @@ def setOptimizeDocConcentration(self, value): """ return self._set(optimizeDocConcentration=value) - @since("2.0.0") - def getOptimizeDocConcentration(self): - """ - Gets the value of :py:attr:`optimizeDocConcentration` or its default value. - """ - return self.getOrDefault(self.optimizeDocConcentration) - @since("2.0.0") def setDocConcentration(self, value): """ @@ -1101,13 +1203,6 @@ def setDocConcentration(self, value): """ return self._set(docConcentration=value) - @since("2.0.0") - def getDocConcentration(self): - """ - Gets the value of :py:attr:`docConcentration` or its default value. - """ - return self.getOrDefault(self.docConcentration) - @since("2.0.0") def setTopicConcentration(self, value): """ @@ -1119,13 +1214,6 @@ def setTopicConcentration(self, value): """ return self._set(topicConcentration=value) - @since("2.0.0") - def getTopicConcentration(self): - """ - Gets the value of :py:attr:`topicConcentration` or its default value. - """ - return self.getOrDefault(self.topicConcentration) - @since("2.0.0") def setTopicDistributionCol(self, value): """ @@ -1137,13 +1225,6 @@ def setTopicDistributionCol(self, value): """ return self._set(topicDistributionCol=value) - @since("2.0.0") - def getTopicDistributionCol(self): - """ - Gets the value of :py:attr:`topicDistributionCol` or its default value. - """ - return self.getOrDefault(self.topicDistributionCol) - @since("2.0.0") def setKeepLastCheckpoint(self, value): """ @@ -1155,16 +1236,62 @@ def setKeepLastCheckpoint(self, value): """ return self._set(keepLastCheckpoint=value) - @since("2.0.0") - def getKeepLastCheckpoint(self): + +@inherit_doc +class _PowerIterationClusteringParams(HasMaxIter, HasWeightCol): + """ + Params for :py:attr:`PowerIterationClustering`. + + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", + "The number of clusters to create. Must be > 1.", + typeConverter=TypeConverters.toInt) + initMode = Param(Params._dummy(), "initMode", + "The initialization algorithm. This can be either " + + "'random' to use a random vector as vertex properties, or 'degree' to use " + + "a normalized sum of similarities with other vertices. Supported options: " + + "'random' and 'degree'.", + typeConverter=TypeConverters.toString) + srcCol = Param(Params._dummy(), "srcCol", + "Name of the input column for source vertex IDs.", + typeConverter=TypeConverters.toString) + dstCol = Param(Params._dummy(), "dstCol", + "Name of the input column for destination vertex IDs.", + typeConverter=TypeConverters.toString) + + @since("2.4.0") + def getK(self): """ - Gets the value of :py:attr:`keepLastCheckpoint` or its default value. + Gets the value of :py:attr:`k` or its default value. """ - return self.getOrDefault(self.keepLastCheckpoint) + return self.getOrDefault(self.k) + + @since("2.4.0") + def getInitMode(self): + """ + Gets the value of :py:attr:`initMode` or its default value. + """ + return self.getOrDefault(self.initMode) + + @since("2.4.0") + def getSrcCol(self): + """ + Gets the value of :py:attr:`srcCol` or its default value. + """ + return self.getOrDefault(self.srcCol) + + @since("2.4.0") + def getDstCol(self): + """ + Gets the value of :py:attr:`dstCol` or its default value. 
+ """ + return self.getOrDefault(self.dstCol) @inherit_doc -class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReadable, +class PowerIterationClustering(_PowerIterationClusteringParams, JavaParams, JavaMLReadable, JavaMLWritable): """ Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by @@ -1184,7 +1311,9 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada ... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), ... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)] >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight").repartition(1) - >>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight") + >>> pic = PowerIterationClustering(k=2, weightCol="weight") + >>> pic.setMaxIter(40) + PowerIterationClustering... >>> assignments = pic.assignClusters(df) >>> assignments.sort(assignments.id).show(truncate=False) +---+-------+ @@ -1209,22 +1338,6 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada .. versionadded:: 2.4.0 """ - k = Param(Params._dummy(), "k", - "The number of clusters to create. Must be > 1.", - typeConverter=TypeConverters.toInt) - initMode = Param(Params._dummy(), "initMode", - "The initialization algorithm. This can be either " + - "'random' to use a random vector as vertex properties, or 'degree' to use " + - "a normalized sum of similarities with other vertices. Supported options: " + - "'random' and 'degree'.", - typeConverter=TypeConverters.toString) - srcCol = Param(Params._dummy(), "srcCol", - "Name of the input column for source vertex IDs.", - typeConverter=TypeConverters.toString) - dstCol = Param(Params._dummy(), "dstCol", - "Name of the input column for destination vertex IDs.", - typeConverter=TypeConverters.toString) - @keyword_only def __init__(self, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst", weightCol=None): @@ -1258,13 +1371,6 @@ def setK(self, value): """ return self._set(k=value) - @since("2.4.0") - def getK(self): - """ - Gets the value of :py:attr:`k` or its default value. - """ - return self.getOrDefault(self.k) - @since("2.4.0") def setInitMode(self, value): """ @@ -1272,13 +1378,6 @@ def setInitMode(self, value): """ return self._set(initMode=value) - @since("2.4.0") - def getInitMode(self): - """ - Gets the value of :py:attr:`initMode` or its default value. - """ - return self.getOrDefault(self.initMode) - @since("2.4.0") def setSrcCol(self, value): """ @@ -1286,13 +1385,6 @@ def setSrcCol(self, value): """ return self._set(srcCol=value) - @since("2.4.0") - def getSrcCol(self): - """ - Gets the value of :py:attr:`srcCol` or its default value. - """ - return self.getOrDefault(self.srcCol) - @since("2.4.0") def setDstCol(self, value): """ @@ -1300,13 +1392,6 @@ def setDstCol(self, value): """ return self._set(dstCol=value) - @since("2.4.0") - def getDstCol(self): - """ - Gets the value of :py:attr:`dstCol` or its default value. - """ - return self.getOrDefault(self.dstCol) - @since("2.4.0") def assignClusters(self, dataset): """