From 2ced6ea904f8df15d9c2dc6f06227a3ae71bc08d Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 23 Sep 2019 17:17:10 -0700 Subject: [PATCH 1/7] [SPARK-29143][PYTHON][ML] Pyspark feature models support column setters/getters --- python/pyspark/ml/feature.py | 860 +++++++++++++++----------- python/pyspark/ml/tests/test_param.py | 2 +- 2 files changed, 483 insertions(+), 379 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 94bdd24f0f887..4dcdb3a56ac37 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -129,7 +129,7 @@ def getThreshold(self): return self.getOrDefault(self.threshold) -class LSHParams(Params): +class LSHParams(JavaParams, HasInputCol, HasOutputCol): """ Mixin for Locality Sensitive Hashing (LSH) algorithm parameters. """ @@ -139,8 +139,17 @@ class LSHParams(Params): "and decreasing it improves the running performance.", typeConverter=TypeConverters.toInt) - def __init__(self): - super(LSHParams, self).__init__() + def getNumHashTables(self): + """ + Gets the value of numHashTables or its default value. + """ + return self.getOrDefault(self.numHashTables) + + +class LSH(JavaEstimator, LSHParams, JavaMLReadable, JavaMLWritable): + """ + Mixin for Locality Sensitive Hashing (LSH). + """ def setNumHashTables(self, value): """ @@ -148,14 +157,8 @@ def setNumHashTables(self, value): """ return self._set(numHashTables=value) - def getNumHashTables(self): - """ - Gets the value of numHashTables or its default value. - """ - return self.getOrDefault(self.numHashTables) - -class LSHModel(JavaModel): +class LSHModel(JavaModel, LSHParams): """ Mixin for Locality Sensitive Hashing (LSH) models. """ @@ -200,9 +203,27 @@ def approxSimilarityJoin(self, datasetA, datasetB, threshold, distCol="distCol") return self._call_java("approxSimilarityJoin", datasetA, datasetB, threshold, distCol) +class BucketedRandomProjectionLSHParams(JavaParams): + """ + (Private) Params for BucketedRandomProjectionParams. + .. versionadded:: 3.0.0 + """ + + bucketLength = Param(Params._dummy(), "bucketLength", "the length of each hash bucket, " + + "a larger bucket lowers the false negative rate.", + typeConverter=TypeConverters.toFloat) + + @since("2.2.0") + def getBucketLength(self): + """ + Gets the value of bucketLength or its default value. + """ + return self.getOrDefault(self.bucketLength) + + @inherit_doc -class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed, - JavaMLReadable, JavaMLWritable): +class BucketedRandomProjectionLSH(LSH, BucketedRandomProjectionLSHParams, + HasSeed, JavaMLReadable, JavaMLWritable): """ LSH class for Euclidean distance metrics. The input is dense or sparse vectors, each of which represents a point in the Euclidean @@ -223,6 +244,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp >>> brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", ... seed=12345, bucketLength=1.0) >>> model = brp.fit(df) + >>> model.getBucketLength() + 1.0 >>> model.transform(df).head() Row(id=0, features=DenseVector([-1.0, -1.0]), hashes=[DenseVector([-1.0])]) >>> data2 = [(4, Vectors.dense([2.0, 2.0 ]),), @@ -266,10 +289,6 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp .. versionadded:: 2.2.0 """ - bucketLength = Param(Params._dummy(), "bucketLength", "the length of each hash bucket, " + - "a larger bucket lowers the false negative rate.", - typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1, bucketLength=None): @@ -303,18 +322,12 @@ def setBucketLength(self, value): """ return self._set(bucketLength=value) - @since("2.2.0") - def getBucketLength(self): - """ - Gets the value of bucketLength or its default value. - """ - return self.getOrDefault(self.bucketLength) - def _create_model(self, java_model): return BucketedRandomProjectionLSHModel(java_model) -class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): +class BucketedRandomProjectionLSHModel(LSHModel, BucketedRandomProjectionLSHParams, JavaMLReadable, + JavaMLWritable): r""" Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are stored. The vectors are normalized to be unit vectors and each vector is used in a hash @@ -1130,8 +1143,39 @@ def numDocs(self): return self._call_java("numDocs") +class ImputerParams(JavaParams, HasInputCols, HasOutputCols): + """ + (Private) Params for ImputerParams. + .. versionadded:: 3.0.0 + """ + + strategy = Param(Params._dummy(), "strategy", + "strategy for imputation. If mean, then replace missing values using the mean " + "value of the feature. If median, then replace missing values using the " + "median value of the feature.", + typeConverter=TypeConverters.toString) + + missingValue = Param(Params._dummy(), "missingValue", + "The placeholder for the missing values. All occurrences of missingValue " + "will be imputed.", typeConverter=TypeConverters.toFloat) + + @since("2.2.0") + def getStrategy(self): + """ + Gets the value of :py:attr:`strategy` or its default value. + """ + return self.getOrDefault(self.strategy) + + @since("2.2.0") + def getMissingValue(self): + """ + Gets the value of :py:attr:`missingValue` or its default value. + """ + return self.getOrDefault(self.missingValue) + + @inherit_doc -class Imputer(JavaEstimator, HasInputCols, JavaMLReadable, JavaMLWritable): +class Imputer(JavaEstimator, ImputerParams, JavaMLReadable, JavaMLWritable): """ Imputation estimator for completing missing values, either using the mean or the median of the columns in which the missing values are located. The input columns should be of @@ -1147,6 +1191,8 @@ class Imputer(JavaEstimator, HasInputCols, JavaMLReadable, JavaMLWritable): ... (4.0, 4.0), (5.0, 5.0)], ["a", "b"]) >>> imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"]) >>> model = imputer.fit(df) + >>> model.getStrategy() + 'mean' >>> model.surrogateDF.show() +---+---+ | a| b| @@ -1184,19 +1230,6 @@ class Imputer(JavaEstimator, HasInputCols, JavaMLReadable, JavaMLWritable): .. versionadded:: 2.2.0 """ - outputCols = Param(Params._dummy(), "outputCols", - "output column names.", typeConverter=TypeConverters.toListString) - - strategy = Param(Params._dummy(), "strategy", - "strategy for imputation. If mean, then replace missing values using the mean " - "value of the feature. If median, then replace missing values using the " - "median value of the feature.", - typeConverter=TypeConverters.toString) - - missingValue = Param(Params._dummy(), "missingValue", - "The placeholder for the missing values. All occurrences of missingValue " - "will be imputed.", typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, strategy="mean", missingValue=float("nan"), inputCols=None, outputCols=None): @@ -1222,20 +1255,6 @@ def setParams(self, strategy="mean", missingValue=float("nan"), inputCols=None, kwargs = self._input_kwargs return self._set(**kwargs) - @since("2.2.0") - def setOutputCols(self, value): - """ - Sets the value of :py:attr:`outputCols`. - """ - return self._set(outputCols=value) - - @since("2.2.0") - def getOutputCols(self): - """ - Gets the value of :py:attr:`outputCols` or its default value. - """ - return self.getOrDefault(self.outputCols) - @since("2.2.0") def setStrategy(self, value): """ @@ -1243,13 +1262,6 @@ def setStrategy(self, value): """ return self._set(strategy=value) - @since("2.2.0") - def getStrategy(self): - """ - Gets the value of :py:attr:`strategy` or its default value. - """ - return self.getOrDefault(self.strategy) - @since("2.2.0") def setMissingValue(self, value): """ @@ -1257,18 +1269,11 @@ def setMissingValue(self, value): """ return self._set(missingValue=value) - @since("2.2.0") - def getMissingValue(self): - """ - Gets the value of :py:attr:`missingValue` or its default value. - """ - return self.getOrDefault(self.missingValue) - def _create_model(self, java_model): return ImputerModel(java_model) -class ImputerModel(JavaModel, JavaMLReadable, JavaMLWritable): +class ImputerModel(JavaModel, ImputerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`Imputer`. @@ -1338,8 +1343,16 @@ def setParams(self, inputCols=None, outputCol=None): return self._set(**kwargs) +class MaxAbsScalerParams(JavaParams, HasInputCol, HasOutputCol): + """ + (Private) Params for MaxAbsScalerParams. + .. versionadded:: 3.0.0 + """ + pass + + @inherit_doc -class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class MaxAbsScaler(JavaEstimator, MaxAbsScalerParams, JavaMLReadable, JavaMLWritable): """ Rescale each feature individually to range [-1, 1] by dividing through the largest maximum absolute value in each feature. It does not shift/center the data, and thus does not destroy @@ -1349,6 +1362,8 @@ class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav >>> df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"]) >>> maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled") >>> model = maScaler.fit(df) + >>> model.getOutputCol() + 'scaled' >>> model.transform(df).show() +-----+------+ | a|scaled| @@ -1398,7 +1413,7 @@ def _create_model(self, java_model): return MaxAbsScalerModel(java_model) -class MaxAbsScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): +class MaxAbsScalerModel(JavaModel, MaxAbsScalerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`MaxAbsScaler`. @@ -1509,8 +1524,34 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): """ +class MinMaxScalerParams(JavaParams, HasInputCol, HasOutputCol): + """ + (Private) Params for MinMaxScalerParams. + .. versionadded:: 3.0.0 + """ + + min = Param(Params._dummy(), "min", "Lower bound of the output feature range", + typeConverter=TypeConverters.toFloat) + max = Param(Params._dummy(), "max", "Upper bound of the output feature range", + typeConverter=TypeConverters.toFloat) + + @since("1.6.0") + def getMin(self): + """ + Gets the value of min or its default value. + """ + return self.getOrDefault(self.min) + + @since("1.6.0") + def getMax(self): + """ + Gets the value of max or its default value. + """ + return self.getOrDefault(self.max) + + @inherit_doc -class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class MinMaxScaler(JavaEstimator, MinMaxScalerParams, JavaMLReadable, JavaMLWritable): """ Rescale each feature individually to a common range [min, max] linearly using column summary statistics, which is also known as min-max normalization or Rescaling. The rescaled value for @@ -1527,17 +1568,19 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled") >>> model = mmScaler.fit(df) + >>> model.setOutputCol("scaledOutput") + MinMaxScaler... >>> model.originalMin DenseVector([0.0]) >>> model.originalMax DenseVector([2.0]) >>> model.transform(df).show() - +-----+------+ - | a|scaled| - +-----+------+ - |[0.0]| [0.0]| - |[2.0]| [1.0]| - +-----+------+ + +-----+------------+ + | a|scaledOutput| + +-----+------------+ + |[0.0]| [0.0]| + |[2.0]| [1.0]| + +-----+------------+ ... >>> minMaxScalerPath = temp_path + "/min-max-scaler" >>> mmScaler.save(minMaxScalerPath) @@ -1557,11 +1600,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav .. versionadded:: 1.6.0 """ - min = Param(Params._dummy(), "min", "Lower bound of the output feature range", - typeConverter=TypeConverters.toFloat) - max = Param(Params._dummy(), "max", "Upper bound of the output feature range", - typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): """ @@ -1590,13 +1628,6 @@ def setMin(self, value): """ return self._set(min=value) - @since("1.6.0") - def getMin(self): - """ - Gets the value of min or its default value. - """ - return self.getOrDefault(self.min) - @since("1.6.0") def setMax(self, value): """ @@ -1604,18 +1635,11 @@ def setMax(self, value): """ return self._set(max=value) - @since("1.6.0") - def getMax(self): - """ - Gets the value of max or its default value. - """ - return self.getOrDefault(self.max) - def _create_model(self, java_model): return MinMaxScalerModel(java_model) -class MinMaxScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): +class MinMaxScalerModel(JavaModel, MinMaxScalerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`MinMaxScaler`. @@ -1780,9 +1804,32 @@ def getP(self): return self.getOrDefault(self.p) +class OneHotEncoderParams(JavaParams, HasInputCols, HasOutputCols, HasHandleInvalid): + """ + (Private) Params for OneHotEncoderParams. + .. versionadded:: 3.0.0 + """ + + handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data during " + + "transform(). Options are 'keep' (invalid data presented as an extra " + + "categorical feature) or error (throw an error). Note that this Param " + + "is only used during transform; during fitting, invalid data will " + + "result in an error.", + typeConverter=TypeConverters.toString) + + dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", + typeConverter=TypeConverters.toBoolean) + + @since("2.3.0") + def getDropLast(self): + """ + Gets the value of dropLast or its default value. + """ + return self.getOrDefault(self.dropLast) + + @inherit_doc -class OneHotEncoder(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid, - JavaMLReadable, JavaMLWritable): +class OneHotEncoder(JavaEstimator, OneHotEncoderParams, JavaMLReadable, JavaMLWritable): """ A one-hot encoder that maps a column of category indices to a column of binary vectors, with at most a single one-value per row that indicates the input category index. @@ -1809,6 +1856,8 @@ class OneHotEncoder(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"]) >>> ohe = OneHotEncoder(inputCols=["input"], outputCols=["output"]) >>> model = ohe.fit(df) + >>> model.getHandleInvalid() + 'error' >>> model.transform(df).head().output SparseVector(2, {0: 1.0}) >>> ohePath = temp_path + "/ohe" @@ -1825,16 +1874,6 @@ class OneHotEncoder(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid .. versionadded:: 2.3.0 """ - handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data during " + - "transform(). Options are 'keep' (invalid data presented as an extra " + - "categorical feature) or error (throw an error). Note that this Param " + - "is only used during transform; during fitting, invalid data will " + - "result in an error.", - typeConverter=TypeConverters.toString) - - dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", - typeConverter=TypeConverters.toBoolean) - @keyword_only def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): """ @@ -1864,18 +1903,11 @@ def setDropLast(self, value): """ return self._set(dropLast=value) - @since("2.3.0") - def getDropLast(self): - """ - Gets the value of dropLast or its default value. - """ - return self.getOrDefault(self.dropLast) - def _create_model(self, java_model): return OneHotEncoderModel(java_model) -class OneHotEncoderModel(JavaModel, JavaMLReadable, JavaMLWritable): +class OneHotEncoderModel(JavaModel, OneHotEncoderParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`OneHotEncoder`. @@ -2157,8 +2189,52 @@ def _create_model(self, java_model): handleInvalid=self.getHandleInvalid()) +class RobustScalerParams(JavaParams, HasInputCol, HasOutputCol): + """ + (Private) Params for RobustScalerParams. + .. versionadded:: 3.0.0 + """ + + lower = Param(Params._dummy(), "lower", "Lower quantile to calculate quantile range", + typeConverter=TypeConverters.toFloat) + upper = Param(Params._dummy(), "upper", "Upper quantile to calculate quantile range", + typeConverter=TypeConverters.toFloat) + withCentering = Param(Params._dummy(), "withCentering", "Whether to center data with median", + typeConverter=TypeConverters.toBoolean) + withScaling = Param(Params._dummy(), "withScaling", "Whether to scale the data to " + "quantile range", typeConverter=TypeConverters.toBoolean) + + @since("3.0.0") + def getLower(self): + """ + Gets the value of lower or its default value. + """ + return self.getOrDefault(self.lower) + + @since("3.0.0") + def getUpper(self): + """ + Gets the value of upper or its default value. + """ + return self.getOrDefault(self.upper) + + @since("3.0.0") + def getWithCentering(self): + """ + Gets the value of withCentering or its default value. + """ + return self.getOrDefault(self.withCentering) + + @since("3.0.0") + def getWithScaling(self): + """ + Gets the value of withScaling or its default value. + """ + return self.getOrDefault(self.withScaling) + + @inherit_doc -class RobustScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class RobustScaler(JavaEstimator, RobustScalerParams, JavaMLReadable, JavaMLWritable): """ RobustScaler removes the median and scales the data according to the quantile range. The quantile range is by default IQR (Interquartile Range, quantile range between the @@ -2176,11 +2252,13 @@ class RobustScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav >>> df = spark.createDataFrame(data, ["id", "features"]) >>> scaler = RobustScaler(inputCol="features", outputCol="scaled") >>> model = scaler.fit(df) + >>> model.setOutputCol("output") + RobustScaler... >>> model.median DenseVector([2.0, -2.0]) >>> model.range DenseVector([2.0, 2.0]) - >>> model.transform(df).collect()[1].scaled + >>> model.transform(df).collect()[1].output DenseVector([0.5, -0.5]) >>> scalerPath = temp_path + "/robust-scaler" >>> scaler.save(scalerPath) @@ -2200,15 +2278,6 @@ class RobustScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav .. versionadded:: 3.0.0 """ - lower = Param(Params._dummy(), "lower", "Lower quantile to calculate quantile range", - typeConverter=TypeConverters.toFloat) - upper = Param(Params._dummy(), "upper", "Upper quantile to calculate quantile range", - typeConverter=TypeConverters.toFloat) - withCentering = Param(Params._dummy(), "withCentering", "Whether to center data with median", - typeConverter=TypeConverters.toBoolean) - withScaling = Param(Params._dummy(), "withScaling", "Whether to scale the data to " - "quantile range", typeConverter=TypeConverters.toBoolean) - @keyword_only def __init__(self, lower=0.25, upper=0.75, withCentering=False, withScaling=True, inputCol=None, outputCol=None): @@ -2241,13 +2310,6 @@ def setLower(self, value): """ return self._set(lower=value) - @since("3.0.0") - def getLower(self): - """ - Gets the value of lower or its default value. - """ - return self.getOrDefault(self.lower) - @since("3.0.0") def setUpper(self, value): """ @@ -2255,13 +2317,6 @@ def setUpper(self, value): """ return self._set(upper=value) - @since("3.0.0") - def getUpper(self): - """ - Gets the value of upper or its default value. - """ - return self.getOrDefault(self.upper) - @since("3.0.0") def setWithCentering(self, value): """ @@ -2269,13 +2324,6 @@ def setWithCentering(self, value): """ return self._set(withCentering=value) - @since("3.0.0") - def getWithCentering(self): - """ - Gets the value of withCentering or its default value. - """ - return self.getOrDefault(self.withCentering) - @since("3.0.0") def setWithScaling(self, value): """ @@ -2283,18 +2331,11 @@ def setWithScaling(self, value): """ return self._set(withScaling=value) - @since("3.0.0") - def getWithScaling(self): - """ - Gets the value of withScaling or its default value. - """ - return self.getOrDefault(self.withScaling) - def _create_model(self, java_model): return RobustScalerModel(java_model) -class RobustScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): +class RobustScalerModel(JavaModel, RobustScalerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`RobustScaler`. @@ -2507,8 +2548,34 @@ def getStatement(self): return self.getOrDefault(self.statement) +class StandardScalerParams(JavaParams, HasInputCol, HasOutputCol): + """ + (Private) Params for StandardScalerParams. + .. versionadded:: 3.0.0 + """ + + withMean = Param(Params._dummy(), "withMean", "Center data with mean", + typeConverter=TypeConverters.toBoolean) + withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation", + typeConverter=TypeConverters.toBoolean) + + @since("1.4.0") + def getWithMean(self): + """ + Gets the value of withMean or its default value. + """ + return self.getOrDefault(self.withMean) + + @since("1.4.0") + def getWithStd(self): + """ + Gets the value of withStd or its default value. + """ + return self.getOrDefault(self.withStd) + + @inherit_doc -class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class StandardScaler(JavaEstimator, StandardScalerParams, JavaMLReadable, JavaMLWritable): """ Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set. @@ -2521,11 +2588,15 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled") >>> model = standardScaler.fit(df) + >>> model.getInputCol() + 'a' + >>> model.setOutputCol("output") + StandardScaler... >>> model.mean DenseVector([1.0]) >>> model.std DenseVector([1.4142]) - >>> model.transform(df).collect()[1].scaled + >>> model.transform(df).collect()[1].output DenseVector([1.4142]) >>> standardScalerPath = temp_path + "/standard-scaler" >>> standardScaler.save(standardScalerPath) @@ -2545,11 +2616,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J .. versionadded:: 1.4.0 """ - withMean = Param(Params._dummy(), "withMean", "Center data with mean", - typeConverter=TypeConverters.toBoolean) - withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation", - typeConverter=TypeConverters.toBoolean) - @keyword_only def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): """ @@ -2578,13 +2644,6 @@ def setWithMean(self, value): """ return self._set(withMean=value) - @since("1.4.0") - def getWithMean(self): - """ - Gets the value of withMean or its default value. - """ - return self.getOrDefault(self.withMean) - @since("1.4.0") def setWithStd(self, value): """ @@ -2592,18 +2651,11 @@ def setWithStd(self, value): """ return self._set(withStd=value) - @since("1.4.0") - def getWithStd(self): - """ - Gets the value of withStd or its default value. - """ - return self.getOrDefault(self.withStd) - def _create_model(self, java_model): return StandardScalerModel(java_model) -class StandardScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): +class StandardScalerModel(JavaModel, StandardScalerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`StandardScaler`. @@ -3126,9 +3178,34 @@ def setParams(self, inputCols=None, outputCol=None, handleInvalid="error"): return self._set(**kwargs) +class VectorIndexerParams(JavaParams, HasInputCol, HasOutputCol, HasHandleInvalid): + """ + (Private) Params for VectorIndexerParams. + .. versionadded:: 3.0.0 + """ + + maxCategories = Param(Params._dummy(), "maxCategories", + "Threshold for the number of values a categorical feature can take " + + "(>= 2). If a feature is found to have > maxCategories values, then " + + "it is declared continuous.", typeConverter=TypeConverters.toInt) + + handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data " + + "(unseen labels or NULL values). Options are 'skip' (filter out " + + "rows with invalid data), 'error' (throw an error), or 'keep' (put " + + "invalid data in a special additional bucket, at index of the number " + + "of categories of the feature).", + typeConverter=TypeConverters.toString) + + @since("1.4.0") + def getMaxCategories(self): + """ + Gets the value of maxCategories or its default value. + """ + return self.getOrDefault(self.maxCategories) + + @inherit_doc -class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, JavaMLReadable, - JavaMLWritable): +class VectorIndexer(JavaEstimator, VectorIndexerParams, JavaMLReadable, JavaMLWritable): """ Class for indexing categorical feature columns in a dataset of `Vector`. @@ -3169,7 +3246,11 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, ... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"]) >>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed") >>> model = indexer.fit(df) - >>> model.transform(df).head().indexed + >>> indexer.getHandleInvalid() + 'error' + >>> model.setOutputCol("output") + VectorIndexer... + >>> model.transform(df).head().output DenseVector([1.0, 0.0]) >>> model.numFeatures 2 @@ -3206,18 +3287,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, .. versionadded:: 1.4.0 """ - maxCategories = Param(Params._dummy(), "maxCategories", - "Threshold for the number of values a categorical feature can take " + - "(>= 2). If a feature is found to have > maxCategories values, then " + - "it is declared continuous.", typeConverter=TypeConverters.toInt) - - handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data " + - "(unseen labels or NULL values). Options are 'skip' (filter out " + - "rows with invalid data), 'error' (throw an error), or 'keep' (put " + - "invalid data in a special additional bucket, at index of the number " + - "of categories of the feature).", - typeConverter=TypeConverters.toString) - @keyword_only def __init__(self, maxCategories=20, inputCol=None, outputCol=None, handleInvalid="error"): """ @@ -3246,18 +3315,11 @@ def setMaxCategories(self, value): """ return self._set(maxCategories=value) - @since("1.4.0") - def getMaxCategories(self): - """ - Gets the value of maxCategories or its default value. - """ - return self.getOrDefault(self.maxCategories) - def _create_model(self, java_model): return VectorIndexerModel(java_model) -class VectorIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable): +class VectorIndexerModel(JavaModel, VectorIndexerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`VectorIndexer`. @@ -3383,10 +3445,69 @@ def getNames(self): return self.getOrDefault(self.names) +class Word2VecParams(JavaParams, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): + """ + (Private) Params for Word2VecParams. + .. versionadded:: 3.0.0 + """ + + vectorSize = Param(Params._dummy(), "vectorSize", + "the dimension of codes after transforming from words", + typeConverter=TypeConverters.toInt) + numPartitions = Param(Params._dummy(), "numPartitions", + "number of partitions for sentences of words", + typeConverter=TypeConverters.toInt) + minCount = Param(Params._dummy(), "minCount", + "the minimum number of times a token must appear to be included in the " + + "word2vec model's vocabulary", typeConverter=TypeConverters.toInt) + windowSize = Param(Params._dummy(), "windowSize", + "the window size (context words from [-window, window]). Default value is 5", + typeConverter=TypeConverters.toInt) + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", + "Maximum length (in words) of each sentence in the input data. " + + "Any sentence longer than this threshold will " + + "be divided into chunks up to the size.", + typeConverter=TypeConverters.toInt) + + @since("1.4.0") + def getVectorSize(self): + """ + Gets the value of vectorSize or its default value. + """ + return self.getOrDefault(self.vectorSize) + + @since("1.4.0") + def getNumPartitions(self): + """ + Gets the value of numPartitions or its default value. + """ + return self.getOrDefault(self.numPartitions) + + @since("1.4.0") + def getMinCount(self): + """ + Gets the value of minCount or its default value. + """ + return self.getOrDefault(self.minCount) + + @since("2.0.0") + def getWindowSize(self): + """ + Gets the value of windowSize or its default value. + """ + return self.getOrDefault(self.windowSize) + + @since("2.0.0") + def getMaxSentenceLength(self): + """ + Gets the value of maxSentenceLength or its default value. + """ + return self.getOrDefault(self.maxSentenceLength) + + @inherit_doc @ignore_unicode_prefix -class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol, - JavaMLReadable, JavaMLWritable): +class Word2Vec(JavaEstimator, Word2VecParams, JavaMLReadable, JavaMLWritable): """ Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further natural language processing or machine learning process. @@ -3395,6 +3516,8 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has >>> doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"]) >>> word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model") >>> model = word2Vec.fit(doc) + >>> model.getMinCount() + 5 >>> model.getVectors().show() +----+--------------------+ |word| vector| @@ -3437,24 +3560,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has .. versionadded:: 1.4.0 """ - vectorSize = Param(Params._dummy(), "vectorSize", - "the dimension of codes after transforming from words", - typeConverter=TypeConverters.toInt) - numPartitions = Param(Params._dummy(), "numPartitions", - "number of partitions for sentences of words", - typeConverter=TypeConverters.toInt) - minCount = Param(Params._dummy(), "minCount", - "the minimum number of times a token must appear to be included in the " + - "word2vec model's vocabulary", typeConverter=TypeConverters.toInt) - windowSize = Param(Params._dummy(), "windowSize", - "the window size (context words from [-window, window]). Default value is 5", - typeConverter=TypeConverters.toInt) - maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", - "Maximum length (in words) of each sentence in the input data. " + - "Any sentence longer than this threshold will " + - "be divided into chunks up to the size.", - typeConverter=TypeConverters.toInt) - @keyword_only def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000): @@ -3488,13 +3593,6 @@ def setVectorSize(self, value): """ return self._set(vectorSize=value) - @since("1.4.0") - def getVectorSize(self): - """ - Gets the value of vectorSize or its default value. - """ - return self.getOrDefault(self.vectorSize) - @since("1.4.0") def setNumPartitions(self, value): """ @@ -3502,13 +3600,6 @@ def setNumPartitions(self, value): """ return self._set(numPartitions=value) - @since("1.4.0") - def getNumPartitions(self): - """ - Gets the value of numPartitions or its default value. - """ - return self.getOrDefault(self.numPartitions) - @since("1.4.0") def setMinCount(self, value): """ @@ -3516,13 +3607,6 @@ def setMinCount(self, value): """ return self._set(minCount=value) - @since("1.4.0") - def getMinCount(self): - """ - Gets the value of minCount or its default value. - """ - return self.getOrDefault(self.minCount) - @since("2.0.0") def setWindowSize(self, value): """ @@ -3530,13 +3614,6 @@ def setWindowSize(self, value): """ return self._set(windowSize=value) - @since("2.0.0") - def getWindowSize(self): - """ - Gets the value of windowSize or its default value. - """ - return self.getOrDefault(self.windowSize) - @since("2.0.0") def setMaxSentenceLength(self, value): """ @@ -3555,7 +3632,7 @@ def _create_model(self, java_model): return Word2VecModel(java_model) -class Word2VecModel(JavaModel, JavaMLReadable, JavaMLWritable): +class Word2VecModel(JavaModel, Word2VecParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`Word2Vec`. @@ -3596,8 +3673,25 @@ def findSynonymsArray(self, word, num): return list(map(lambda st: (st._1(), st._2()), list(tuples))) +class PCAParams(JavaParams, HasInputCol, HasOutputCol): + """ + (Private) Params for PCAParams. + .. versionadded:: 3.0.0 + """ + + k = Param(Params._dummy(), "k", "the number of principal components", + typeConverter=TypeConverters.toInt) + + @since("1.5.0") + def getK(self): + """ + Gets the value of k or its default value. + """ + return self.getOrDefault(self.k) + + @inherit_doc -class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class PCA(JavaEstimator, PCAParams, JavaMLReadable, JavaMLWritable): """ PCA trains a model to project vectors to a lower dimensional space of the top :py:attr:`k` principal components. @@ -3609,7 +3703,11 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab >>> df = spark.createDataFrame(data,["features"]) >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features") >>> model = pca.fit(df) - >>> model.transform(df).collect()[0].pca_features + >>> model.getK() + 2 + >>> model.setOutputCol("output") + PCA... + >>> model.transform(df).collect()[0].output DenseVector([1.648..., -4.013...]) >>> model.explainedVariance DenseVector([0.794..., 0.205...]) @@ -3629,9 +3727,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab .. versionadded:: 1.5.0 """ - k = Param(Params._dummy(), "k", "the number of principal components", - typeConverter=TypeConverters.toInt) - @keyword_only def __init__(self, k=None, inputCol=None, outputCol=None): """ @@ -3659,18 +3754,11 @@ def setK(self, value): """ return self._set(k=value) - @since("1.5.0") - def getK(self): - """ - Gets the value of k or its default value. - """ - return self.getOrDefault(self.k) - def _create_model(self, java_model): return PCAModel(java_model) -class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable): +class PCAModel(JavaModel, PCAParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space. @@ -3696,9 +3784,58 @@ def explainedVariance(self): return self._call_java("explainedVariance") +class RFormulaParams(JavaParams, HasFeaturesCol, HasLabelCol, HasHandleInvalid): + """ + (Private) Params for RFormulaParams. + .. versionadded:: 3.0.0 + """ + + formula = Param(Params._dummy(), "formula", "R model formula", + typeConverter=TypeConverters.toString) + + forceIndexLabel = Param(Params._dummy(), "forceIndexLabel", + "Force to index label whether it is numeric or string", + typeConverter=TypeConverters.toBoolean) + + stringIndexerOrderType = Param(Params._dummy(), "stringIndexerOrderType", + "How to order categories of a string feature column used by " + + "StringIndexer. The last category after ordering is dropped " + + "when encoding strings. Supported options: frequencyDesc, " + + "frequencyAsc, alphabetDesc, alphabetAsc. The default value " + + "is frequencyDesc. When the ordering is set to alphabetDesc, " + + "RFormula drops the same category as R when encoding strings.", + typeConverter=TypeConverters.toString) + + handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " + + "Options are 'skip' (filter out rows with invalid values), " + + "'error' (throw an error), or 'keep' (put invalid data in a special " + + "additional bucket, at index numLabels).", + typeConverter=TypeConverters.toString) + + @since("1.5.0") + def getFormula(self): + """ + Gets the value of :py:attr:`formula`. + """ + return self.getOrDefault(self.formula) + + @since("2.1.0") + def getForceIndexLabel(self): + """ + Gets the value of :py:attr:`forceIndexLabel`. + """ + return self.getOrDefault(self.forceIndexLabel) + + @since("2.3.0") + def getStringIndexerOrderType(self): + """ + Gets the value of :py:attr:`stringIndexerOrderType` or its default value 'frequencyDesc'. + """ + return self.getOrDefault(self.stringIndexerOrderType) + + @inherit_doc -class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, HasHandleInvalid, - JavaMLReadable, JavaMLWritable): +class RFormula(JavaEstimator, RFormulaParams, JavaMLReadable, JavaMLWritable): """ Implements the transforms required for fitting a dataset against an R model formula. Currently we support a limited subset of the R @@ -3713,6 +3850,8 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, HasHandleInvalid, ... ], ["y", "x", "s"]) >>> rf = RFormula(formula="y ~ x + s") >>> model = rf.fit(df) + >>> model.getLabelCol() + 'label' >>> model.transform(df).show() +---+---+---+---------+-----+ | y| x| s| features|label| @@ -3764,28 +3903,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, HasHandleInvalid, .. versionadded:: 1.5.0 """ - formula = Param(Params._dummy(), "formula", "R model formula", - typeConverter=TypeConverters.toString) - - forceIndexLabel = Param(Params._dummy(), "forceIndexLabel", - "Force to index label whether it is numeric or string", - typeConverter=TypeConverters.toBoolean) - - stringIndexerOrderType = Param(Params._dummy(), "stringIndexerOrderType", - "How to order categories of a string feature column used by " + - "StringIndexer. The last category after ordering is dropped " + - "when encoding strings. Supported options: frequencyDesc, " + - "frequencyAsc, alphabetDesc, alphabetAsc. The default value " + - "is frequencyDesc. When the ordering is set to alphabetDesc, " + - "RFormula drops the same category as R when encoding strings.", - typeConverter=TypeConverters.toString) - - handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " + - "Options are 'skip' (filter out rows with invalid values), " + - "'error' (throw an error), or 'keep' (put invalid data in a special " + - "additional bucket, at index numLabels).", - typeConverter=TypeConverters.toString) - @keyword_only def __init__(self, formula=None, featuresCol="features", labelCol="label", forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", @@ -3823,13 +3940,6 @@ def setFormula(self, value): """ return self._set(formula=value) - @since("1.5.0") - def getFormula(self): - """ - Gets the value of :py:attr:`formula`. - """ - return self.getOrDefault(self.formula) - @since("2.1.0") def setForceIndexLabel(self, value): """ @@ -3837,13 +3947,6 @@ def setForceIndexLabel(self, value): """ return self._set(forceIndexLabel=value) - @since("2.1.0") - def getForceIndexLabel(self): - """ - Gets the value of :py:attr:`forceIndexLabel`. - """ - return self.getOrDefault(self.forceIndexLabel) - @since("2.3.0") def setStringIndexerOrderType(self, value): """ @@ -3851,13 +3954,6 @@ def setStringIndexerOrderType(self, value): """ return self._set(stringIndexerOrderType=value) - @since("2.3.0") - def getStringIndexerOrderType(self): - """ - Gets the value of :py:attr:`stringIndexerOrderType` or its default value 'frequencyDesc'. - """ - return self.getOrDefault(self.stringIndexerOrderType) - def _create_model(self, java_model): return RFormulaModel(java_model) @@ -3866,7 +3962,7 @@ def __str__(self): return "RFormula(%s) (uid=%s)" % (formulaStr, self.uid) -class RFormulaModel(JavaModel, JavaMLReadable, JavaMLWritable): +class RFormulaModel(JavaModel, RFormulaParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`RFormula`. Fitting is required to determine the factor levels of formula terms. @@ -3879,9 +3975,81 @@ def __str__(self): return "RFormulaModel(%s) (uid=%s)" % (resolvedFormula, self.uid) +class ChiSqSelectorParams(JavaParams, HasFeaturesCol, HasOutputCol, HasLabelCol): + """ + (Private) Params for ChiSqSelectorParams. + .. versionadded:: 3.0.0 + """ + + selectorType = Param(Params._dummy(), "selectorType", + "The selector type of the ChisqSelector. " + + "Supported options: numTopFeatures (default), percentile, fpr, fdr, fwe.", + typeConverter=TypeConverters.toString) + + numTopFeatures = \ + Param(Params._dummy(), "numTopFeatures", + "Number of features that selector will select, ordered by ascending p-value. " + + "If the number of features is < numTopFeatures, then this will select " + + "all features.", typeConverter=TypeConverters.toInt) + + percentile = Param(Params._dummy(), "percentile", "Percentile of features that selector " + + "will select, ordered by ascending p-value.", + typeConverter=TypeConverters.toFloat) + + fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.", + typeConverter=TypeConverters.toFloat) + + fdr = Param(Params._dummy(), "fdr", "The upper bound of the expected false discovery rate.", + typeConverter=TypeConverters.toFloat) + + fwe = Param(Params._dummy(), "fwe", "The upper bound of the expected family-wise error rate.", + typeConverter=TypeConverters.toFloat) + + @since("2.1.0") + def getSelectorType(self): + """ + Gets the value of selectorType or its default value. + """ + return self.getOrDefault(self.selectorType) + + @since("2.0.0") + def getNumTopFeatures(self): + """ + Gets the value of numTopFeatures or its default value. + """ + return self.getOrDefault(self.numTopFeatures) + + @since("2.1.0") + def getPercentile(self): + """ + Gets the value of percentile or its default value. + """ + return self.getOrDefault(self.percentile) + + @since("2.1.0") + def getFpr(self): + """ + Gets the value of fpr or its default value. + """ + return self.getOrDefault(self.fpr) + + @since("2.2.0") + def getFdr(self): + """ + Gets the value of fdr or its default value. + """ + return self.getOrDefault(self.fdr) + + @since("2.2.0") + def getFwe(self): + """ + Gets the value of fwe or its default value. + """ + return self.getOrDefault(self.fwe) + + @inherit_doc -class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, JavaMLReadable, - JavaMLWritable): +class ChiSqSelector(JavaEstimator, ChiSqSelectorParams, JavaMLReadable, JavaMLWritable): """ Chi-Squared feature selection, which selects categorical features to use for predicting a categorical label. @@ -3915,6 +4083,8 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja ... ["features", "label"]) >>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures") >>> model = selector.fit(df) + >>> model.getFeaturesCol() + 'features' >>> model.transform(df).head().selectedFeatures DenseVector([18.0]) >>> model.selectedFeatures @@ -3933,30 +4103,6 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja .. versionadded:: 2.0.0 """ - selectorType = Param(Params._dummy(), "selectorType", - "The selector type of the ChisqSelector. " + - "Supported options: numTopFeatures (default), percentile, fpr, fdr, fwe.", - typeConverter=TypeConverters.toString) - - numTopFeatures = \ - Param(Params._dummy(), "numTopFeatures", - "Number of features that selector will select, ordered by ascending p-value. " + - "If the number of features is < numTopFeatures, then this will select " + - "all features.", typeConverter=TypeConverters.toInt) - - percentile = Param(Params._dummy(), "percentile", "Percentile of features that selector " + - "will select, ordered by ascending p-value.", - typeConverter=TypeConverters.toFloat) - - fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) - - fdr = Param(Params._dummy(), "fdr", "The upper bound of the expected false discovery rate.", - typeConverter=TypeConverters.toFloat) - - fwe = Param(Params._dummy(), "fwe", "The upper bound of the expected family-wise error rate.", - typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05, @@ -3994,13 +4140,6 @@ def setSelectorType(self, value): """ return self._set(selectorType=value) - @since("2.1.0") - def getSelectorType(self): - """ - Gets the value of selectorType or its default value. - """ - return self.getOrDefault(self.selectorType) - @since("2.0.0") def setNumTopFeatures(self, value): """ @@ -4009,13 +4148,6 @@ def setNumTopFeatures(self, value): """ return self._set(numTopFeatures=value) - @since("2.0.0") - def getNumTopFeatures(self): - """ - Gets the value of numTopFeatures or its default value. - """ - return self.getOrDefault(self.numTopFeatures) - @since("2.1.0") def setPercentile(self, value): """ @@ -4024,13 +4156,6 @@ def setPercentile(self, value): """ return self._set(percentile=value) - @since("2.1.0") - def getPercentile(self): - """ - Gets the value of percentile or its default value. - """ - return self.getOrDefault(self.percentile) - @since("2.1.0") def setFpr(self, value): """ @@ -4039,13 +4164,6 @@ def setFpr(self, value): """ return self._set(fpr=value) - @since("2.1.0") - def getFpr(self): - """ - Gets the value of fpr or its default value. - """ - return self.getOrDefault(self.fpr) - @since("2.2.0") def setFdr(self, value): """ @@ -4054,13 +4172,6 @@ def setFdr(self, value): """ return self._set(fdr=value) - @since("2.2.0") - def getFdr(self): - """ - Gets the value of fdr or its default value. - """ - return self.getOrDefault(self.fdr) - @since("2.2.0") def setFwe(self, value): """ @@ -4069,18 +4180,11 @@ def setFwe(self, value): """ return self._set(fwe=value) - @since("2.2.0") - def getFwe(self): - """ - Gets the value of fwe or its default value. - """ - return self.getOrDefault(self.fwe) - def _create_model(self, java_model): return ChiSqSelectorModel(java_model) -class ChiSqSelectorModel(JavaModel, JavaMLReadable, JavaMLWritable): +class ChiSqSelectorModel(JavaModel, ChiSqSelectorParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`ChiSqSelector`. diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index f12310c7d8eba..2f6eeeef756e1 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -353,7 +353,7 @@ def test_java_params(self): for name, cls in inspect.getmembers(module, inspect.isclass): if not name.endswith('Model') and not name.endswith('Params') \ and issubclass(cls, JavaParams) and not inspect.isabstract(cls) \ - and not name.startswith('Java'): + and not name.startswith('Java') and name != 'LSH': # NOTE: disable check_params_exist until there is parity with Scala API check_params(self, cls(), check_params_exist=False) From 74de7a49c9690fa2f0557a3bd3a82c8ff328f02f Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 24 Sep 2019 11:13:00 -0700 Subject: [PATCH 2/7] address comments --- python/pyspark/ml/feature.py | 77 ++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 4dcdb3a56ac37..f1569b93e7c21 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -129,7 +129,7 @@ def getThreshold(self): return self.getOrDefault(self.threshold) -class LSHParams(JavaParams, HasInputCol, HasOutputCol): +class LSHParams(HasInputCol, HasOutputCol): """ Mixin for Locality Sensitive Hashing (LSH) algorithm parameters. """ @@ -203,7 +203,7 @@ def approxSimilarityJoin(self, datasetA, datasetB, threshold, distCol="distCol") return self._call_java("approxSimilarityJoin", datasetA, datasetB, threshold, distCol) -class BucketedRandomProjectionLSHParams(JavaParams): +class BucketedRandomProjectionLSHParams(): """ (Private) Params for BucketedRandomProjectionParams. .. versionadded:: 3.0.0 @@ -1031,8 +1031,26 @@ def indexOf(self, term): return self._java_obj.indexOf(term) +class IDFParams(HasInputCol, HasOutputCol): + """ + (Private) Params for IDFParams. + .. versionadded:: 3.0.0 + """ + + minDocFreq = Param(Params._dummy(), "minDocFreq", + "minimum number of documents in which a term should appear for filtering", + typeConverter=TypeConverters.toInt) + + @since("1.4.0") + def getMinDocFreq(self): + """ + Gets the value of minDocFreq or its default value. + """ + return self.getOrDefault(self.minDocFreq) + + @inherit_doc -class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class IDF(JavaEstimator, IDFParams, JavaMLReadable, JavaMLWritable): """ Compute the Inverse Document Frequency (IDF) given a collection of documents. @@ -1041,6 +1059,8 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab ... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"]) >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf") >>> model = idf.fit(df) + >>> model.getMinDocFreq() + 3 >>> model.idf DenseVector([0.0, 0.0]) >>> model.docFreq @@ -1068,10 +1088,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab .. versionadded:: 1.4.0 """ - minDocFreq = Param(Params._dummy(), "minDocFreq", - "minimum number of documents in which a term should appear for filtering", - typeConverter=TypeConverters.toInt) - @keyword_only def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): """ @@ -1100,18 +1116,11 @@ def setMinDocFreq(self, value): """ return self._set(minDocFreq=value) - @since("1.4.0") - def getMinDocFreq(self): - """ - Gets the value of minDocFreq or its default value. - """ - return self.getOrDefault(self.minDocFreq) - def _create_model(self, java_model): return IDFModel(java_model) -class IDFModel(JavaModel, JavaMLReadable, JavaMLWritable): +class IDFModel(JavaModel, IDFParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`IDF`. @@ -1143,7 +1152,7 @@ def numDocs(self): return self._call_java("numDocs") -class ImputerParams(JavaParams, HasInputCols, HasOutputCols): +class ImputerParams(HasInputCols, HasOutputCols): """ (Private) Params for ImputerParams. .. versionadded:: 3.0.0 @@ -1343,7 +1352,7 @@ def setParams(self, inputCols=None, outputCol=None): return self._set(**kwargs) -class MaxAbsScalerParams(JavaParams, HasInputCol, HasOutputCol): +class MaxAbsScalerParams(HasInputCol, HasOutputCol): """ (Private) Params for MaxAbsScalerParams. .. versionadded:: 3.0.0 @@ -1362,15 +1371,15 @@ class MaxAbsScaler(JavaEstimator, MaxAbsScalerParams, JavaMLReadable, JavaMLWrit >>> df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"]) >>> maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled") >>> model = maScaler.fit(df) - >>> model.getOutputCol() - 'scaled' + >>> model.setOutputCol("scaledOutput") + MaxAbsScaler... >>> model.transform(df).show() - +-----+------+ - | a|scaled| - +-----+------+ - |[1.0]| [0.5]| - |[2.0]| [1.0]| - +-----+------+ + +-----+------------+ + | a|scaledOutput| + +-----+------------+ + |[1.0]| [0.5]| + |[2.0]| [1.0]| + +-----+------------+ ... >>> scalerPath = temp_path + "/max-abs-scaler" >>> maScaler.save(scalerPath) @@ -1524,7 +1533,7 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): """ -class MinMaxScalerParams(JavaParams, HasInputCol, HasOutputCol): +class MinMaxScalerParams(HasInputCol, HasOutputCol): """ (Private) Params for MinMaxScalerParams. .. versionadded:: 3.0.0 @@ -1804,7 +1813,7 @@ def getP(self): return self.getOrDefault(self.p) -class OneHotEncoderParams(JavaParams, HasInputCols, HasOutputCols, HasHandleInvalid): +class OneHotEncoderParams(HasInputCols, HasOutputCols, HasHandleInvalid): """ (Private) Params for OneHotEncoderParams. .. versionadded:: 3.0.0 @@ -2189,7 +2198,7 @@ def _create_model(self, java_model): handleInvalid=self.getHandleInvalid()) -class RobustScalerParams(JavaParams, HasInputCol, HasOutputCol): +class RobustScalerParams(HasInputCol, HasOutputCol): """ (Private) Params for RobustScalerParams. .. versionadded:: 3.0.0 @@ -2548,7 +2557,7 @@ def getStatement(self): return self.getOrDefault(self.statement) -class StandardScalerParams(JavaParams, HasInputCol, HasOutputCol): +class StandardScalerParams(HasInputCol, HasOutputCol): """ (Private) Params for StandardScalerParams. .. versionadded:: 3.0.0 @@ -3178,7 +3187,7 @@ def setParams(self, inputCols=None, outputCol=None, handleInvalid="error"): return self._set(**kwargs) -class VectorIndexerParams(JavaParams, HasInputCol, HasOutputCol, HasHandleInvalid): +class VectorIndexerParams(HasInputCol, HasOutputCol, HasHandleInvalid): """ (Private) Params for VectorIndexerParams. .. versionadded:: 3.0.0 @@ -3445,7 +3454,7 @@ def getNames(self): return self.getOrDefault(self.names) -class Word2VecParams(JavaParams, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): +class Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): """ (Private) Params for Word2VecParams. .. versionadded:: 3.0.0 @@ -3673,7 +3682,7 @@ def findSynonymsArray(self, word, num): return list(map(lambda st: (st._1(), st._2()), list(tuples))) -class PCAParams(JavaParams, HasInputCol, HasOutputCol): +class PCAParams(HasInputCol, HasOutputCol): """ (Private) Params for PCAParams. .. versionadded:: 3.0.0 @@ -3784,7 +3793,7 @@ def explainedVariance(self): return self._call_java("explainedVariance") -class RFormulaParams(JavaParams, HasFeaturesCol, HasLabelCol, HasHandleInvalid): +class RFormulaParams(HasFeaturesCol, HasLabelCol, HasHandleInvalid): """ (Private) Params for RFormulaParams. .. versionadded:: 3.0.0 @@ -3975,7 +3984,7 @@ def __str__(self): return "RFormulaModel(%s) (uid=%s)" % (resolvedFormula, self.uid) -class ChiSqSelectorParams(JavaParams, HasFeaturesCol, HasOutputCol, HasLabelCol): +class ChiSqSelectorParams(HasFeaturesCol, HasOutputCol, HasLabelCol): """ (Private) Params for ChiSqSelectorParams. .. versionadded:: 3.0.0 From 425959e49655bbb3c8138a0a2f234eba5a09c20d Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 26 Sep 2019 09:56:12 -0700 Subject: [PATCH 3/7] add _ in front of xxxParams to indicate internal use (PEP8) --- python/pyspark/ml/feature.py | 113 ++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index f1569b93e7c21..d0b02ce95e94e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -129,7 +129,7 @@ def getThreshold(self): return self.getOrDefault(self.threshold) -class LSHParams(HasInputCol, HasOutputCol): +class _LSHParams(HasInputCol, HasOutputCol): """ Mixin for Locality Sensitive Hashing (LSH) algorithm parameters. """ @@ -146,7 +146,7 @@ def getNumHashTables(self): return self.getOrDefault(self.numHashTables) -class LSH(JavaEstimator, LSHParams, JavaMLReadable, JavaMLWritable): +class LSH(JavaEstimator, _LSHParams, JavaMLReadable, JavaMLWritable): """ Mixin for Locality Sensitive Hashing (LSH). """ @@ -158,7 +158,7 @@ def setNumHashTables(self, value): return self._set(numHashTables=value) -class LSHModel(JavaModel, LSHParams): +class LSHModel(JavaModel, _LSHParams): """ Mixin for Locality Sensitive Hashing (LSH) models. """ @@ -203,9 +203,10 @@ def approxSimilarityJoin(self, datasetA, datasetB, threshold, distCol="distCol") return self._call_java("approxSimilarityJoin", datasetA, datasetB, threshold, distCol) -class BucketedRandomProjectionLSHParams(): +class _BucketedRandomProjectionLSHParams(): """ - (Private) Params for BucketedRandomProjectionParams. + Params for :py:attr:`BucketedRandomProjectionLSH` and + :py:attr:`BucketedRandomProjectionLSHModel`. .. versionadded:: 3.0.0 """ @@ -222,7 +223,7 @@ def getBucketLength(self): @inherit_doc -class BucketedRandomProjectionLSH(LSH, BucketedRandomProjectionLSHParams, +class BucketedRandomProjectionLSH(LSH, _BucketedRandomProjectionLSHParams, HasSeed, JavaMLReadable, JavaMLWritable): """ LSH class for Euclidean distance metrics. @@ -326,7 +327,7 @@ def _create_model(self, java_model): return BucketedRandomProjectionLSHModel(java_model) -class BucketedRandomProjectionLSHModel(LSHModel, BucketedRandomProjectionLSHParams, JavaMLReadable, +class BucketedRandomProjectionLSHModel(LSHModel, _BucketedRandomProjectionLSHParams, JavaMLReadable, JavaMLWritable): r""" Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are @@ -1031,9 +1032,9 @@ def indexOf(self, term): return self._java_obj.indexOf(term) -class IDFParams(HasInputCol, HasOutputCol): +class _IDFParams(HasInputCol, HasOutputCol): """ - (Private) Params for IDFParams. + Params for :py:attr:`IDF` and :py:attr:`IDFModel`. .. versionadded:: 3.0.0 """ @@ -1050,7 +1051,7 @@ def getMinDocFreq(self): @inherit_doc -class IDF(JavaEstimator, IDFParams, JavaMLReadable, JavaMLWritable): +class IDF(JavaEstimator, _IDFParams, JavaMLReadable, JavaMLWritable): """ Compute the Inverse Document Frequency (IDF) given a collection of documents. @@ -1120,7 +1121,7 @@ def _create_model(self, java_model): return IDFModel(java_model) -class IDFModel(JavaModel, IDFParams, JavaMLReadable, JavaMLWritable): +class IDFModel(JavaModel, _IDFParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`IDF`. @@ -1152,9 +1153,9 @@ def numDocs(self): return self._call_java("numDocs") -class ImputerParams(HasInputCols, HasOutputCols): +class _ImputerParams(HasInputCols, HasOutputCols): """ - (Private) Params for ImputerParams. + Params for :py:attr:`Imputer` and :py:attr:`ImputerModel`. .. versionadded:: 3.0.0 """ @@ -1184,7 +1185,7 @@ def getMissingValue(self): @inherit_doc -class Imputer(JavaEstimator, ImputerParams, JavaMLReadable, JavaMLWritable): +class Imputer(JavaEstimator, _ImputerParams, JavaMLReadable, JavaMLWritable): """ Imputation estimator for completing missing values, either using the mean or the median of the columns in which the missing values are located. The input columns should be of @@ -1282,7 +1283,7 @@ def _create_model(self, java_model): return ImputerModel(java_model) -class ImputerModel(JavaModel, ImputerParams, JavaMLReadable, JavaMLWritable): +class ImputerModel(JavaModel, _ImputerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`Imputer`. @@ -1352,16 +1353,16 @@ def setParams(self, inputCols=None, outputCol=None): return self._set(**kwargs) -class MaxAbsScalerParams(HasInputCol, HasOutputCol): +class _MaxAbsScalerParams(HasInputCol, HasOutputCol): """ - (Private) Params for MaxAbsScalerParams. + Params for :py:attr:`MaxAbsScaler` and :py:attr:`MaxAbsScalerModel`. .. versionadded:: 3.0.0 """ pass @inherit_doc -class MaxAbsScaler(JavaEstimator, MaxAbsScalerParams, JavaMLReadable, JavaMLWritable): +class MaxAbsScaler(JavaEstimator, _MaxAbsScalerParams, JavaMLReadable, JavaMLWritable): """ Rescale each feature individually to range [-1, 1] by dividing through the largest maximum absolute value in each feature. It does not shift/center the data, and thus does not destroy @@ -1422,7 +1423,7 @@ def _create_model(self, java_model): return MaxAbsScalerModel(java_model) -class MaxAbsScalerModel(JavaModel, MaxAbsScalerParams, JavaMLReadable, JavaMLWritable): +class MaxAbsScalerModel(JavaModel, _MaxAbsScalerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`MaxAbsScaler`. @@ -1439,7 +1440,7 @@ def maxAbs(self): @inherit_doc -class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed, +class MinHashLSH(JavaEstimator, _LSHParams, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable, JavaMLWritable): """ @@ -1533,9 +1534,9 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): """ -class MinMaxScalerParams(HasInputCol, HasOutputCol): +class _MinMaxScalerParams(HasInputCol, HasOutputCol): """ - (Private) Params for MinMaxScalerParams. + Params for :py:attr:`MinMaxScaler` and :py:attr:`MinMaxScalerModel`. .. versionadded:: 3.0.0 """ @@ -1560,7 +1561,7 @@ def getMax(self): @inherit_doc -class MinMaxScaler(JavaEstimator, MinMaxScalerParams, JavaMLReadable, JavaMLWritable): +class MinMaxScaler(JavaEstimator, _MinMaxScalerParams, JavaMLReadable, JavaMLWritable): """ Rescale each feature individually to a common range [min, max] linearly using column summary statistics, which is also known as min-max normalization or Rescaling. The rescaled value for @@ -1648,7 +1649,7 @@ def _create_model(self, java_model): return MinMaxScalerModel(java_model) -class MinMaxScalerModel(JavaModel, MinMaxScalerParams, JavaMLReadable, JavaMLWritable): +class MinMaxScalerModel(JavaModel, _MinMaxScalerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`MinMaxScaler`. @@ -1813,9 +1814,9 @@ def getP(self): return self.getOrDefault(self.p) -class OneHotEncoderParams(HasInputCols, HasOutputCols, HasHandleInvalid): +class _OneHotEncoderParams(HasInputCols, HasOutputCols, HasHandleInvalid): """ - (Private) Params for OneHotEncoderParams. + Params for :py:attr:`OneHotEncoder` and :py:attr:`OneHotEncoderModel`. .. versionadded:: 3.0.0 """ @@ -1838,7 +1839,7 @@ def getDropLast(self): @inherit_doc -class OneHotEncoder(JavaEstimator, OneHotEncoderParams, JavaMLReadable, JavaMLWritable): +class OneHotEncoder(JavaEstimator, _OneHotEncoderParams, JavaMLReadable, JavaMLWritable): """ A one-hot encoder that maps a column of category indices to a column of binary vectors, with at most a single one-value per row that indicates the input category index. @@ -1916,7 +1917,7 @@ def _create_model(self, java_model): return OneHotEncoderModel(java_model) -class OneHotEncoderModel(JavaModel, OneHotEncoderParams, JavaMLReadable, JavaMLWritable): +class OneHotEncoderModel(JavaModel, _OneHotEncoderParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`OneHotEncoder`. @@ -2198,9 +2199,9 @@ def _create_model(self, java_model): handleInvalid=self.getHandleInvalid()) -class RobustScalerParams(HasInputCol, HasOutputCol): +class _RobustScalerParams(HasInputCol, HasOutputCol): """ - (Private) Params for RobustScalerParams. + Params for :py:attr:`RobustScaler` and :py:attr:`RobustScalerModel`. .. versionadded:: 3.0.0 """ @@ -2243,7 +2244,7 @@ def getWithScaling(self): @inherit_doc -class RobustScaler(JavaEstimator, RobustScalerParams, JavaMLReadable, JavaMLWritable): +class RobustScaler(JavaEstimator, _RobustScalerParams, JavaMLReadable, JavaMLWritable): """ RobustScaler removes the median and scales the data according to the quantile range. The quantile range is by default IQR (Interquartile Range, quantile range between the @@ -2344,7 +2345,7 @@ def _create_model(self, java_model): return RobustScalerModel(java_model) -class RobustScalerModel(JavaModel, RobustScalerParams, JavaMLReadable, JavaMLWritable): +class RobustScalerModel(JavaModel, _RobustScalerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`RobustScaler`. @@ -2557,9 +2558,9 @@ def getStatement(self): return self.getOrDefault(self.statement) -class StandardScalerParams(HasInputCol, HasOutputCol): +class _StandardScalerParams(HasInputCol, HasOutputCol): """ - (Private) Params for StandardScalerParams. + Params for :py:attr:`StandardScaler` and :py:attr:`StandardScalerModel`. .. versionadded:: 3.0.0 """ @@ -2584,7 +2585,7 @@ def getWithStd(self): @inherit_doc -class StandardScaler(JavaEstimator, StandardScalerParams, JavaMLReadable, JavaMLWritable): +class StandardScaler(JavaEstimator, _StandardScalerParams, JavaMLReadable, JavaMLWritable): """ Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set. @@ -2664,7 +2665,7 @@ def _create_model(self, java_model): return StandardScalerModel(java_model) -class StandardScalerModel(JavaModel, StandardScalerParams, JavaMLReadable, JavaMLWritable): +class StandardScalerModel(JavaModel, _StandardScalerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`StandardScaler`. @@ -3187,9 +3188,9 @@ def setParams(self, inputCols=None, outputCol=None, handleInvalid="error"): return self._set(**kwargs) -class VectorIndexerParams(HasInputCol, HasOutputCol, HasHandleInvalid): +class _VectorIndexerParams(HasInputCol, HasOutputCol, HasHandleInvalid): """ - (Private) Params for VectorIndexerParams. + Params for :py:attr:`VectorIndexer` and :py:attr:`VectorIndexerModel`. .. versionadded:: 3.0.0 """ @@ -3214,7 +3215,7 @@ def getMaxCategories(self): @inherit_doc -class VectorIndexer(JavaEstimator, VectorIndexerParams, JavaMLReadable, JavaMLWritable): +class VectorIndexer(JavaEstimator, _VectorIndexerParams, JavaMLReadable, JavaMLWritable): """ Class for indexing categorical feature columns in a dataset of `Vector`. @@ -3328,7 +3329,7 @@ def _create_model(self, java_model): return VectorIndexerModel(java_model) -class VectorIndexerModel(JavaModel, VectorIndexerParams, JavaMLReadable, JavaMLWritable): +class VectorIndexerModel(JavaModel, _VectorIndexerParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`VectorIndexer`. @@ -3454,9 +3455,9 @@ def getNames(self): return self.getOrDefault(self.names) -class Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): +class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): """ - (Private) Params for Word2VecParams. + Params for :py:attr:`Word2Vec` and :py:attr:`Word2VecModel`. .. versionadded:: 3.0.0 """ @@ -3516,7 +3517,7 @@ def getMaxSentenceLength(self): @inherit_doc @ignore_unicode_prefix -class Word2Vec(JavaEstimator, Word2VecParams, JavaMLReadable, JavaMLWritable): +class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable): """ Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further natural language processing or machine learning process. @@ -3641,7 +3642,7 @@ def _create_model(self, java_model): return Word2VecModel(java_model) -class Word2VecModel(JavaModel, Word2VecParams, JavaMLReadable, JavaMLWritable): +class Word2VecModel(JavaModel, _Word2VecParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`Word2Vec`. @@ -3682,9 +3683,9 @@ def findSynonymsArray(self, word, num): return list(map(lambda st: (st._1(), st._2()), list(tuples))) -class PCAParams(HasInputCol, HasOutputCol): +class _PCAParams(HasInputCol, HasOutputCol): """ - (Private) Params for PCAParams. + Params for :py:attr:`PCA` and :py:attr:`PCAModel`. .. versionadded:: 3.0.0 """ @@ -3700,7 +3701,7 @@ def getK(self): @inherit_doc -class PCA(JavaEstimator, PCAParams, JavaMLReadable, JavaMLWritable): +class PCA(JavaEstimator, _PCAParams, JavaMLReadable, JavaMLWritable): """ PCA trains a model to project vectors to a lower dimensional space of the top :py:attr:`k` principal components. @@ -3767,7 +3768,7 @@ def _create_model(self, java_model): return PCAModel(java_model) -class PCAModel(JavaModel, PCAParams, JavaMLReadable, JavaMLWritable): +class PCAModel(JavaModel, _PCAParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space. @@ -3793,9 +3794,9 @@ def explainedVariance(self): return self._call_java("explainedVariance") -class RFormulaParams(HasFeaturesCol, HasLabelCol, HasHandleInvalid): +class _RFormulaParams(HasFeaturesCol, HasLabelCol, HasHandleInvalid): """ - (Private) Params for RFormulaParams. + Params for :py:attr:`RFormula` and :py:attr:`RFormula`. .. versionadded:: 3.0.0 """ @@ -3844,7 +3845,7 @@ def getStringIndexerOrderType(self): @inherit_doc -class RFormula(JavaEstimator, RFormulaParams, JavaMLReadable, JavaMLWritable): +class RFormula(JavaEstimator, _RFormulaParams, JavaMLReadable, JavaMLWritable): """ Implements the transforms required for fitting a dataset against an R model formula. Currently we support a limited subset of the R @@ -3971,7 +3972,7 @@ def __str__(self): return "RFormula(%s) (uid=%s)" % (formulaStr, self.uid) -class RFormulaModel(JavaModel, RFormulaParams, JavaMLReadable, JavaMLWritable): +class RFormulaModel(JavaModel, _RFormulaParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`RFormula`. Fitting is required to determine the factor levels of formula terms. @@ -3984,9 +3985,9 @@ def __str__(self): return "RFormulaModel(%s) (uid=%s)" % (resolvedFormula, self.uid) -class ChiSqSelectorParams(HasFeaturesCol, HasOutputCol, HasLabelCol): +class _ChiSqSelectorParams(HasFeaturesCol, HasOutputCol, HasLabelCol): """ - (Private) Params for ChiSqSelectorParams. + Params for :py:attr:`ChiSqSelector` and :py:attr:`ChiSqSelectorModel`. .. versionadded:: 3.0.0 """ @@ -4058,7 +4059,7 @@ def getFwe(self): @inherit_doc -class ChiSqSelector(JavaEstimator, ChiSqSelectorParams, JavaMLReadable, JavaMLWritable): +class ChiSqSelector(JavaEstimator, _ChiSqSelectorParams, JavaMLReadable, JavaMLWritable): """ Chi-Squared feature selection, which selects categorical features to use for predicting a categorical label. @@ -4193,7 +4194,7 @@ def _create_model(self, java_model): return ChiSqSelectorModel(java_model) -class ChiSqSelectorModel(JavaModel, ChiSqSelectorParams, JavaMLReadable, JavaMLWritable): +class ChiSqSelectorModel(JavaModel, _ChiSqSelectorParams, JavaMLReadable, JavaMLWritable): """ Model fitted by :py:class:`ChiSqSelector`. From 50e44677b624792fd8a087498aff4053cafa0268 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Fri, 27 Sep 2019 11:11:24 -0700 Subject: [PATCH 4/7] address comments --- python/pyspark/ml/feature.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index d0b02ce95e94e..e7c5ed1969256 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1440,8 +1440,7 @@ def maxAbs(self): @inherit_doc -class MinHashLSH(JavaEstimator, _LSHParams, HasInputCol, HasOutputCol, HasSeed, - JavaMLReadable, JavaMLWritable): +class MinHashLSH(LSH, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable, JavaMLWritable): """ LSH class for Jaccard distance. From bc25799e0ec7dfce32b2a9d956a30a7a97819b70 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 3 Oct 2019 14:16:28 -0700 Subject: [PATCH 5/7] change :py:attr: to :py:class for classes --- python/pyspark/ml/feature.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index e7c5ed1969256..417b0c6bf63cc 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -205,8 +205,8 @@ def approxSimilarityJoin(self, datasetA, datasetB, threshold, distCol="distCol") class _BucketedRandomProjectionLSHParams(): """ - Params for :py:attr:`BucketedRandomProjectionLSH` and - :py:attr:`BucketedRandomProjectionLSHModel`. + Params for :py:class:`BucketedRandomProjectionLSH` and + :py:class:`BucketedRandomProjectionLSHModel`. .. versionadded:: 3.0.0 """ @@ -486,7 +486,7 @@ def getSplitsArray(self): class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol): """ - Params for :py:attr:`CountVectorizer` and :py:attr:`CountVectorizerModel`. + Params for :py:class:`CountVectorizer` and :py:class:`CountVectorizerModel`. """ minTF = Param( @@ -1034,7 +1034,7 @@ def indexOf(self, term): class _IDFParams(HasInputCol, HasOutputCol): """ - Params for :py:attr:`IDF` and :py:attr:`IDFModel`. + Params for :py:class:`IDF` and :py:class:`IDFModel`. .. versionadded:: 3.0.0 """ @@ -1155,7 +1155,7 @@ def numDocs(self): class _ImputerParams(HasInputCols, HasOutputCols): """ - Params for :py:attr:`Imputer` and :py:attr:`ImputerModel`. + Params for :py:class:`Imputer` and :py:class:`ImputerModel`. .. versionadded:: 3.0.0 """ @@ -1355,7 +1355,7 @@ def setParams(self, inputCols=None, outputCol=None): class _MaxAbsScalerParams(HasInputCol, HasOutputCol): """ - Params for :py:attr:`MaxAbsScaler` and :py:attr:`MaxAbsScalerModel`. + Params for :py:class:`MaxAbsScaler` and :py:class:`MaxAbsScalerModel`. .. versionadded:: 3.0.0 """ pass @@ -1535,7 +1535,7 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): class _MinMaxScalerParams(HasInputCol, HasOutputCol): """ - Params for :py:attr:`MinMaxScaler` and :py:attr:`MinMaxScalerModel`. + Params for :py:class:`MinMaxScaler` and :py:class:`MinMaxScalerModel`. .. versionadded:: 3.0.0 """ @@ -1815,7 +1815,7 @@ def getP(self): class _OneHotEncoderParams(HasInputCols, HasOutputCols, HasHandleInvalid): """ - Params for :py:attr:`OneHotEncoder` and :py:attr:`OneHotEncoderModel`. + Params for :py:class:`OneHotEncoder` and :py:class:`OneHotEncoderModel`. .. versionadded:: 3.0.0 """ @@ -2200,7 +2200,7 @@ def _create_model(self, java_model): class _RobustScalerParams(HasInputCol, HasOutputCol): """ - Params for :py:attr:`RobustScaler` and :py:attr:`RobustScalerModel`. + Params for :py:class:`RobustScaler` and :py:class:`RobustScalerModel`. .. versionadded:: 3.0.0 """ @@ -2559,7 +2559,7 @@ def getStatement(self): class _StandardScalerParams(HasInputCol, HasOutputCol): """ - Params for :py:attr:`StandardScaler` and :py:attr:`StandardScalerModel`. + Params for :py:class:`StandardScaler` and :py:class:`StandardScalerModel`. .. versionadded:: 3.0.0 """ @@ -2691,7 +2691,7 @@ def mean(self): class _StringIndexerParams(JavaParams, HasHandleInvalid, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols): """ - Params for :py:attr:`StringIndexer` and :py:attr:`StringIndexerModel`. + Params for :py:class:`StringIndexer` and :py:class:`StringIndexerModel`. """ stringOrderType = Param(Params._dummy(), "stringOrderType", @@ -3189,7 +3189,7 @@ def setParams(self, inputCols=None, outputCol=None, handleInvalid="error"): class _VectorIndexerParams(HasInputCol, HasOutputCol, HasHandleInvalid): """ - Params for :py:attr:`VectorIndexer` and :py:attr:`VectorIndexerModel`. + Params for :py:class:`VectorIndexer` and :py:class:`VectorIndexerModel`. .. versionadded:: 3.0.0 """ @@ -3456,7 +3456,7 @@ def getNames(self): class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): """ - Params for :py:attr:`Word2Vec` and :py:attr:`Word2VecModel`. + Params for :py:class:`Word2Vec` and :py:class:`Word2VecModel`. .. versionadded:: 3.0.0 """ @@ -3684,7 +3684,7 @@ def findSynonymsArray(self, word, num): class _PCAParams(HasInputCol, HasOutputCol): """ - Params for :py:attr:`PCA` and :py:attr:`PCAModel`. + Params for :py:class:`PCA` and :py:class:`PCAModel`. .. versionadded:: 3.0.0 """ @@ -3795,7 +3795,7 @@ def explainedVariance(self): class _RFormulaParams(HasFeaturesCol, HasLabelCol, HasHandleInvalid): """ - Params for :py:attr:`RFormula` and :py:attr:`RFormula`. + Params for :py:class:`RFormula` and :py:class:`RFormula`. .. versionadded:: 3.0.0 """ @@ -3986,7 +3986,7 @@ def __str__(self): class _ChiSqSelectorParams(HasFeaturesCol, HasOutputCol, HasLabelCol): """ - Params for :py:attr:`ChiSqSelector` and :py:attr:`ChiSqSelectorModel`. + Params for :py:class:`ChiSqSelector` and :py:class:`ChiSqSelectorModel`. .. versionadded:: 3.0.0 """ From c305a433a4f8258abc113149155ce6749a78f5f0 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 3 Oct 2019 22:05:57 -0700 Subject: [PATCH 6/7] address comments --- python/pyspark/ml/feature.py | 25 +++++++++++++++++++------ python/pyspark/ml/tests/test_param.py | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 417b0c6bf63cc..0284607801ec9 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -146,7 +146,7 @@ def getNumHashTables(self): return self.getOrDefault(self.numHashTables) -class LSH(JavaEstimator, _LSHParams, JavaMLReadable, JavaMLWritable): +class _LSH(JavaEstimator, _LSHParams, JavaMLReadable, JavaMLWritable): """ Mixin for Locality Sensitive Hashing (LSH). """ @@ -158,7 +158,7 @@ def setNumHashTables(self, value): return self._set(numHashTables=value) -class LSHModel(JavaModel, _LSHParams): +class _LSHModel(JavaModel, _LSHParams): """ Mixin for Locality Sensitive Hashing (LSH) models. """ @@ -207,6 +207,7 @@ class _BucketedRandomProjectionLSHParams(): """ Params for :py:class:`BucketedRandomProjectionLSH` and :py:class:`BucketedRandomProjectionLSHModel`. + .. versionadded:: 3.0.0 """ @@ -223,7 +224,7 @@ def getBucketLength(self): @inherit_doc -class BucketedRandomProjectionLSH(LSH, _BucketedRandomProjectionLSHParams, +class BucketedRandomProjectionLSH(_LSH, _BucketedRandomProjectionLSHParams, HasSeed, JavaMLReadable, JavaMLWritable): """ LSH class for Euclidean distance metrics. @@ -327,7 +328,7 @@ def _create_model(self, java_model): return BucketedRandomProjectionLSHModel(java_model) -class BucketedRandomProjectionLSHModel(LSHModel, _BucketedRandomProjectionLSHParams, JavaMLReadable, +class BucketedRandomProjectionLSHModel(_LSHModel, _BucketedRandomProjectionLSHParams, JavaMLReadable, JavaMLWritable): r""" Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are @@ -1035,6 +1036,7 @@ def indexOf(self, term): class _IDFParams(HasInputCol, HasOutputCol): """ Params for :py:class:`IDF` and :py:class:`IDFModel`. + .. versionadded:: 3.0.0 """ @@ -1156,6 +1158,7 @@ def numDocs(self): class _ImputerParams(HasInputCols, HasOutputCols): """ Params for :py:class:`Imputer` and :py:class:`ImputerModel`. + .. versionadded:: 3.0.0 """ @@ -1356,6 +1359,7 @@ def setParams(self, inputCols=None, outputCol=None): class _MaxAbsScalerParams(HasInputCol, HasOutputCol): """ Params for :py:class:`MaxAbsScaler` and :py:class:`MaxAbsScalerModel`. + .. versionadded:: 3.0.0 """ pass @@ -1440,7 +1444,7 @@ def maxAbs(self): @inherit_doc -class MinHashLSH(LSH, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable, JavaMLWritable): +class MinHashLSH(_LSH, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable, JavaMLWritable): """ LSH class for Jaccard distance. @@ -1518,7 +1522,7 @@ def _create_model(self, java_model): return MinHashLSHModel(java_model) -class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): +class MinHashLSHModel(_LSHModel, JavaMLReadable, JavaMLWritable): r""" Model produced by :py:class:`MinHashLSH`, where where multiple hash functions are stored. Each hash function is picked from the following family of hash functions, where :math:`a_i` and @@ -1536,6 +1540,7 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): class _MinMaxScalerParams(HasInputCol, HasOutputCol): """ Params for :py:class:`MinMaxScaler` and :py:class:`MinMaxScalerModel`. + .. versionadded:: 3.0.0 """ @@ -1816,6 +1821,7 @@ def getP(self): class _OneHotEncoderParams(HasInputCols, HasOutputCols, HasHandleInvalid): """ Params for :py:class:`OneHotEncoder` and :py:class:`OneHotEncoderModel`. + .. versionadded:: 3.0.0 """ @@ -2201,6 +2207,7 @@ def _create_model(self, java_model): class _RobustScalerParams(HasInputCol, HasOutputCol): """ Params for :py:class:`RobustScaler` and :py:class:`RobustScalerModel`. + .. versionadded:: 3.0.0 """ @@ -2560,6 +2567,7 @@ def getStatement(self): class _StandardScalerParams(HasInputCol, HasOutputCol): """ Params for :py:class:`StandardScaler` and :py:class:`StandardScalerModel`. + .. versionadded:: 3.0.0 """ @@ -3190,6 +3198,7 @@ def setParams(self, inputCols=None, outputCol=None, handleInvalid="error"): class _VectorIndexerParams(HasInputCol, HasOutputCol, HasHandleInvalid): """ Params for :py:class:`VectorIndexer` and :py:class:`VectorIndexerModel`. + .. versionadded:: 3.0.0 """ @@ -3457,6 +3466,7 @@ def getNames(self): class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): """ Params for :py:class:`Word2Vec` and :py:class:`Word2VecModel`. + .. versionadded:: 3.0.0 """ @@ -3685,6 +3695,7 @@ def findSynonymsArray(self, word, num): class _PCAParams(HasInputCol, HasOutputCol): """ Params for :py:class:`PCA` and :py:class:`PCAModel`. + .. versionadded:: 3.0.0 """ @@ -3796,6 +3807,7 @@ def explainedVariance(self): class _RFormulaParams(HasFeaturesCol, HasLabelCol, HasHandleInvalid): """ Params for :py:class:`RFormula` and :py:class:`RFormula`. + .. versionadded:: 3.0.0 """ @@ -3987,6 +3999,7 @@ def __str__(self): class _ChiSqSelectorParams(HasFeaturesCol, HasOutputCol, HasLabelCol): """ Params for :py:class:`ChiSqSelector` and :py:class:`ChiSqSelectorModel`. + .. versionadded:: 3.0.0 """ diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 2f6eeeef756e1..ba6f483699706 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -353,7 +353,7 @@ def test_java_params(self): for name, cls in inspect.getmembers(module, inspect.isclass): if not name.endswith('Model') and not name.endswith('Params') \ and issubclass(cls, JavaParams) and not inspect.isabstract(cls) \ - and not name.startswith('Java') and name != 'LSH': + and not name.startswith('Java') and name != '_LSH': # NOTE: disable check_params_exist until there is parity with Scala API check_params(self, cls(), check_params_exist=False) From 64fca95ac9ef8aedd7f00705540ed788a39f3ed0 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 3 Oct 2019 23:09:09 -0700 Subject: [PATCH 7/7] fix pycodestyle error --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 0284607801ec9..dc717165b7a4d 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -328,8 +328,8 @@ def _create_model(self, java_model): return BucketedRandomProjectionLSHModel(java_model) -class BucketedRandomProjectionLSHModel(_LSHModel, _BucketedRandomProjectionLSHParams, JavaMLReadable, - JavaMLWritable): +class BucketedRandomProjectionLSHModel(_LSHModel, _BucketedRandomProjectionLSHParams, + JavaMLReadable, JavaMLWritable): r""" Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are stored. The vectors are normalized to be unit vectors and each vector is used in a hash