From b6fb1e556c9777909771ab8a0b01f7ab13571860 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 14 Jan 2025 17:50:20 +0100 Subject: [PATCH 1/5] feat: consistently name `selector` parameters in `Table` --- src/safeds/data/tabular/containers/_table.py | 42 +++++++++---------- .../test_remove_rows_with_missing_values.py | 4 +- .../_table/test_remove_rows_with_outliers.py | 4 +- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 17ff76d2d..5d90a07cd 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -775,7 +775,7 @@ def has_column(self, name: str) -> bool: def remove_columns( self, - names: str | list[str], + selector: str | list[str], *, ignore_unknown_names: bool = False, ) -> Table: @@ -786,8 +786,8 @@ def remove_columns( Parameters ---------- - names: - The names of the columns to remove. + selector: + The columns to remove. ignore_unknown_names: If set to True, columns that are not present in the table will be ignored. If set to False, an error will be raised if any of the specified columns do not exist. 
@@ -835,14 +835,14 @@ def remove_columns( - [remove_columns_with_missing_values][safeds.data.tabular.containers._table.Table.remove_columns_with_missing_values] - [remove_non_numeric_columns][safeds.data.tabular.containers._table.Table.remove_non_numeric_columns] """ - if isinstance(names, str): - names = [names] + if isinstance(selector, str): + selector = [selector] if not ignore_unknown_names: - _check_columns_exist(self, names) + _check_columns_exist(self, selector) return Table._from_polars_lazy_frame( - self._lazy_frame.drop(names, strict=not ignore_unknown_names), + self._lazy_frame.drop(selector, strict=not ignore_unknown_names), ) def remove_columns_with_missing_values( @@ -1611,7 +1611,7 @@ def remove_rows_by_column( def remove_rows_with_missing_values( self, *, - column_names: str | list[str] | None = None, + selector: str | list[str] | None = None, ) -> Table: """ Remove rows that contain missing values in the specified columns and return the result as a new table. @@ -1624,8 +1624,8 @@ def remove_rows_with_missing_values( Parameters ---------- - column_names: - The names of the columns to check. If None, all columns are checked. + selector: + The columns to check. If None, all columns are checked. 
Returns ------- @@ -1645,7 +1645,7 @@ def remove_rows_with_missing_values( | 1 | 4 | +-----+-----+ - >>> table.remove_rows_with_missing_values(column_names=["b"]) + >>> table.remove_rows_with_missing_values(selector=["b"]) +------+-----+ | a | b | | --- | --- | @@ -1669,18 +1669,18 @@ def remove_rows_with_missing_values( - [remove_duplicate_rows][safeds.data.tabular.containers._table.Table.remove_duplicate_rows] - [remove_rows_with_outliers][safeds.data.tabular.containers._table.Table.remove_rows_with_outliers] """ - if isinstance(column_names, list) and not column_names: + if isinstance(selector, list) and not selector: # polars panics in this case return self return Table._from_polars_lazy_frame( - self._lazy_frame.drop_nulls(subset=column_names), + self._lazy_frame.drop_nulls(subset=selector), ) def remove_rows_with_outliers( self, *, - column_names: str | list[str] | None = None, + selector: str | list[str] | None = None, z_score_threshold: float = 3, ) -> Table: """ @@ -1701,8 +1701,8 @@ def remove_rows_with_outliers( Parameters ---------- - column_names: - Names of the columns to consider. If None, all numeric columns are considered. + selector: + The columns to check. If None, all numeric columns are checked. z_score_threshold: The z-score threshold for detecting outliers. Must be greater than or equal to 0. @@ -1755,14 +1755,14 @@ def remove_rows_with_outliers( lower_bound=_ClosedBound(0), ) - if column_names is None: - column_names = self.column_names + if selector is None: + selector = self.column_names import polars as pl import polars.selectors as cs # polar's `all_horizontal` raises a `ComputeError` if there are no columns - selected = self._lazy_frame.select(cs.numeric() & cs.by_name(column_names)) + selected = self._lazy_frame.select(cs.numeric() & cs.by_name(selector)) if not selected.collect_schema().names(): return self @@ -2268,9 +2268,9 @@ def join( right_table: The table to join with the left table. 
left_names: - Name or list of names of columns to join on in the left table. + Names of columns to join on in the left table. right_names: - Name or list of names of columns to join on in the right table. + Names of columns to join on in the right table. mode: Specify which type of join you want to use. diff --git a/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_missing_values.py b/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_missing_values.py index 0e1e1e39c..b8d7a8484 100644 --- a/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_missing_values.py +++ b/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_missing_values.py @@ -68,7 +68,7 @@ def test_should_remove_rows_with_missing_values( column_names: str | list[str] | None, expected: Table, ) -> None: - actual = table_factory().remove_rows_with_missing_values(column_names=column_names) + actual = table_factory().remove_rows_with_missing_values(selector=column_names) assert actual == expected def test_should_not_mutate_receiver( @@ -78,5 +78,5 @@ def test_should_not_mutate_receiver( expected: Table, # noqa: ARG002 ) -> None: original = table_factory() - original.remove_rows_with_missing_values(column_names=column_names) + original.remove_rows_with_missing_values(selector=column_names) assert original == table_factory() diff --git a/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_outliers.py b/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_outliers.py index db08bb701..15381a25c 100644 --- a/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_outliers.py +++ b/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_outliers.py @@ -94,7 +94,7 @@ def test_should_remove_rows_with_outliers( expected: Table, ) -> None: actual = table_factory().remove_rows_with_outliers( - column_names=column_names, + selector=column_names, z_score_threshold=z_score_threshold, ) assert actual == expected @@ 
-108,7 +108,7 @@ def test_should_not_mutate_receiver( ) -> None: original = table_factory() original.remove_rows_with_outliers( - column_names=column_names, + selector=column_names, z_score_threshold=z_score_threshold, ) assert original == table_factory() From 95e5459857aa5471ec0ab809f89b152fe8be6305 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 14 Jan 2025 17:53:13 +0100 Subject: [PATCH 2/5] feat: consistently name `selector` parameters in `TableTransformer`s --- docs/tutorials/classification.ipynb | 4 +-- docs/tutorials/data_processing.ipynb | 10 +++---- .../data/labeled/containers/_image_dataset.py | 2 +- .../tabular/transformation/_discretizer.py | 20 ++++++------- .../_k_nearest_neighbors_imputer.py | 18 ++++++------ .../tabular/transformation/_label_encoder.py | 28 +++++++++---------- .../transformation/_one_hot_encoder.py | 22 +++++++-------- .../tabular/transformation/_range_scaler.py | 28 +++++++++---------- .../tabular/transformation/_robust_scaler.py | 28 +++++++++---------- .../tabular/transformation/_simple_imputer.py | 23 ++++++++------- .../transformation/_standard_scaler.py | 28 +++++++++---------- .../transformation/_table_transformer.py | 12 ++++---- src/safeds/ml/nn/_model.py | 2 +- .../transformation/test_discretizer.py | 10 +++---- .../test_k_nearest_neighbors_imputer.py | 6 ++-- .../transformation/test_label_encoder.py | 14 +++++----- .../transformation/test_one_hot_encoder.py | 16 +++++------ .../transformation/test_range_scaler.py | 16 +++++------ .../transformation/test_robust_scaler.py | 14 +++++----- .../test_sequential_table_transformer.py | 10 +++---- .../transformation/test_simple_imputer.py | 8 +++--- .../transformation/test_standard_scaler.py | 14 +++++----- .../transformation/test_table_transformer.py | 12 ++++---- tests/safeds/ml/nn/test_cnn_workflow.py | 2 +- tests/safeds/ml/nn/test_forward_workflow.py | 2 +- tests/safeds/ml/nn/test_lstm_workflow.py | 2 +- 26 files changed, 174 insertions(+), 177 deletions(-) diff --git 
a/docs/tutorials/classification.ipynb b/docs/tutorials/classification.ipynb index 361c4d5d2..335d22ea7 100644 --- a/docs/tutorials/classification.ipynb +++ b/docs/tutorials/classification.ipynb @@ -208,7 +208,7 @@ "source": [ "from safeds.data.tabular.transformation import SimpleImputer\n", "\n", - "simple_imputer = SimpleImputer(column_names=[\"age\", \"fare\"], strategy=SimpleImputer.Strategy.mean())\n", + "simple_imputer = SimpleImputer(selector=[\"age\", \"fare\"], strategy=SimpleImputer.Strategy.mean())\n", "fitted_simple_imputer_train, transformed_train_data = simple_imputer.fit_and_transform(train_table)\n", "transformed_test_data = fitted_simple_imputer_train.transform(test_table)" ] @@ -241,7 +241,7 @@ "from safeds.data.tabular.transformation import OneHotEncoder\n", "\n", "fitted_one_hot_encoder_train, transformed_train_data = OneHotEncoder(\n", - " column_names=[\"sex\", \"port_embarked\"],\n", + " selector=[\"sex\", \"port_embarked\"],\n", ").fit_and_transform(transformed_train_data)\n", "transformed_test_data = fitted_one_hot_encoder_train.transform(transformed_test_data)" ] diff --git a/docs/tutorials/data_processing.ipynb b/docs/tutorials/data_processing.ipynb index a49e73663..c73d6162a 100644 --- a/docs/tutorials/data_processing.ipynb +++ b/docs/tutorials/data_processing.ipynb @@ -510,7 +510,7 @@ "source": [ "from safeds.data.tabular.transformation import SimpleImputer\n", "\n", - "imputer = SimpleImputer(SimpleImputer.Strategy.constant(0), column_names=[\"age\", \"fare\", \"cabin\", \"port_embarked\"]).fit(\n", + "imputer = SimpleImputer(SimpleImputer.Strategy.constant(0), selector=[\"age\", \"fare\", \"cabin\", \"port_embarked\"]).fit(\n", " titanic,\n", ")\n", "imputer.transform(titanic_slice)" @@ -583,7 +583,7 @@ "source": [ "from safeds.data.tabular.transformation import LabelEncoder\n", "\n", - "encoder = LabelEncoder(column_names=[\"sex\", \"port_embarked\"]).fit(titanic)\n", + "encoder = LabelEncoder(selector=[\"sex\", 
\"port_embarked\"]).fit(titanic)\n", "encoder.transform(titanic_slice)" ] }, @@ -674,7 +674,7 @@ "source": [ "from safeds.data.tabular.transformation import OneHotEncoder\n", "\n", - "encoder = OneHotEncoder(column_names=[\"sex\", \"port_embarked\"]).fit(titanic)\n", + "encoder = OneHotEncoder(selector=[\"sex\", \"port_embarked\"]).fit(titanic)\n", "encoder.transform(titanic_slice)" ] }, @@ -745,7 +745,7 @@ "source": [ "from safeds.data.tabular.transformation import RangeScaler\n", "\n", - "scaler = RangeScaler(column_names=\"age\", min_=0.0, max_=1.0).fit(titanic)\n", + "scaler = RangeScaler(selector=\"age\", min_=0.0, max_=1.0).fit(titanic)\n", "scaler.transform(titanic_slice)" ] }, @@ -816,7 +816,7 @@ "source": [ "from safeds.data.tabular.transformation import StandardScaler\n", "\n", - "scaler = StandardScaler(column_names=[\"age\", \"travel_class\"]).fit(titanic)\n", + "scaler = StandardScaler(selector=[\"age\", \"travel_class\"]).fit(titanic)\n", "scaler.transform(titanic_slice)" ] }, diff --git a/src/safeds/data/labeled/containers/_image_dataset.py b/src/safeds/data/labeled/containers/_image_dataset.py index 4434d1c43..aaff20991 100644 --- a/src/safeds/data/labeled/containers/_image_dataset.py +++ b/src/safeds/data/labeled/containers/_image_dataset.py @@ -448,7 +448,7 @@ def __init__(self, column: Column) -> None: ) # TODO: should not one-hot-encode the target. label encoding without order is sufficient. should also not # be done automatically? 
- self._one_hot_encoder = OneHotEncoder(column_names=self._column_name).fit(column_as_table) + self._one_hot_encoder = OneHotEncoder(selector=self._column_name).fit(column_as_table) self._tensor = torch.Tensor( self._one_hot_encoder.transform(column_as_table)._data_frame.to_torch(dtype=pl.Float32), ).to(_get_device()) diff --git a/src/safeds/data/tabular/transformation/_discretizer.py b/src/safeds/data/tabular/transformation/_discretizer.py index e55b66d77..68cfbbc9e 100644 --- a/src/safeds/data/tabular/transformation/_discretizer.py +++ b/src/safeds/data/tabular/transformation/_discretizer.py @@ -24,7 +24,7 @@ class Discretizer(TableTransformer): ---------- bin_count: The number of bins to be created. - column_names: + selector: The list of columns used to fit the transformer. If `None`, all numeric columns are used. Raises @@ -41,9 +41,9 @@ def __init__( self, bin_count: int = 5, *, - column_names: str | list[str] | None = None, + selector: str | list[str] | None = None, ) -> None: - TableTransformer.__init__(self, column_names) + TableTransformer.__init__(self, selector) _check_bounds("bin_count", bin_count, lower_bound=_ClosedBound(2)) @@ -104,10 +104,10 @@ def fit(self, table: Table) -> Discretizer: if table.row_count == 0: raise ValueError("The Discretizer cannot be fitted because the table contains 0 rows") - if self._column_names is None: + if self._selector is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) _check_columns_are_numeric(table, column_names, operation="fit a Discretizer") @@ -117,7 +117,7 @@ def fit(self, table: Table) -> Discretizer: table.select_columns(column_names)._data_frame, ) - result = Discretizer(self._bin_count, column_names=column_names) + result = Discretizer(self._bin_count, selector=column_names) result._wrapped_transformer = wrapped_transformer return result @@ 
-150,21 +150,21 @@ def transform(self, table: Table) -> Table: If one of the columns, that should be fitted is non-numeric. """ # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: + if self._wrapped_transformer is None or self._selector is None: raise NotFittedError(kind="transformer") if table.row_count == 0: raise ValueError("The table cannot be transformed because it contains 0 rows") # Input table does not contain all columns used to fit the transformer - _check_columns_exist(table, self._column_names) + _check_columns_exist(table, self._selector) - for column in self._column_names: + for column in self._selector: if not table.get_column(column).type.is_numeric: raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") new_data = self._wrapped_transformer.transform( - table.select_columns(self._column_names)._data_frame, + table.select_columns(self._selector)._data_frame, ) return Table._from_polars_lazy_frame( table._lazy_frame.update(new_data.lazy()), diff --git a/src/safeds/data/tabular/transformation/_k_nearest_neighbors_imputer.py b/src/safeds/data/tabular/transformation/_k_nearest_neighbors_imputer.py index 1b0b19ab7..193e39cdf 100644 --- a/src/safeds/data/tabular/transformation/_k_nearest_neighbors_imputer.py +++ b/src/safeds/data/tabular/transformation/_k_nearest_neighbors_imputer.py @@ -21,7 +21,7 @@ class KNearestNeighborsImputer(TableTransformer): ---------- neighbor_count: The number of neighbors to consider when imputing missing values. - column_names: + selector: The list of columns used to impute missing values. If 'None', all columns are used. value_to_replace: The placeholder for the missing values. All occurrences of`missing_values` will be imputed. 
@@ -35,10 +35,10 @@ def __init__( self, neighbor_count: int, *, - column_names: str | list[str] | None = None, + selector: str | list[str] | None = None, value_to_replace: float | str | None = None, ) -> None: - super().__init__(column_names) + super().__init__(selector) _check_bounds(name="neighbor_count", actual=neighbor_count, lower_bound=_ClosedBound(1)) @@ -106,10 +106,10 @@ def fit(self, table: Table) -> KNearestNeighborsImputer: if table.row_count == 0: raise ValueError("The KNearestNeighborsImputer cannot be fitted because the table contains 0 rows.") - if self._column_names is None: + if self._selector is None: column_names = table.column_names else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) value_to_replace = self._value_to_replace @@ -125,7 +125,7 @@ def fit(self, table: Table) -> KNearestNeighborsImputer: table.select_columns(column_names)._data_frame, ) - result = KNearestNeighborsImputer(self._neighbor_count, column_names=column_names) + result = KNearestNeighborsImputer(self._neighbor_count, selector=column_names) result._wrapped_transformer = wrapped_transformer return result @@ -153,13 +153,13 @@ def transform(self, table: Table) -> Table: ColumnNotFoundError If one of the columns, that should be transformed is not in the table. 
""" - if self._column_names is None or self._wrapped_transformer is None: + if self._selector is None or self._wrapped_transformer is None: raise NotFittedError(kind="transformer") - _check_columns_exist(table, self._column_names) + _check_columns_exist(table, self._selector) new_data = self._wrapped_transformer.transform( - table.select_columns(self._column_names)._data_frame, + table.select_columns(self._selector)._data_frame, ) return Table._from_polars_lazy_frame( diff --git a/src/safeds/data/tabular/transformation/_label_encoder.py b/src/safeds/data/tabular/transformation/_label_encoder.py index cf1dfcbd5..601d7e20b 100644 --- a/src/safeds/data/tabular/transformation/_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_label_encoder.py @@ -17,7 +17,7 @@ class LabelEncoder(InvertibleTableTransformer): Parameters ---------- - column_names: + selector: The list of columns used to fit the transformer. If `None`, all non-numeric columns are used. partial_order: The partial order of the labels. The labels are encoded in the order of the given list. Additional values are @@ -31,10 +31,10 @@ class LabelEncoder(InvertibleTableTransformer): def __init__( self, *, - column_names: str | list[str] | None = None, + selector: str | list[str] | None = None, partial_order: list[Any] | None = None, ) -> None: - super().__init__(column_names) + super().__init__(selector) if partial_order is None: partial_order = [] @@ -94,10 +94,10 @@ def fit(self, table: Table) -> LabelEncoder: ValueError If the table contains 0 rows. 
""" - if self._column_names is None: + if self._selector is None: column_names = [name for name in table.column_names if not table.get_column_type(name).is_numeric] else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) _warn_if_columns_are_numeric(table, column_names) @@ -121,7 +121,7 @@ def fit(self, table: Table) -> LabelEncoder: reverse_mapping[name][label] = value # Create a copy with the learned transformation - result = LabelEncoder(column_names=column_names, partial_order=self._partial_order) + result = LabelEncoder(selector=column_names, partial_order=self._partial_order) result._mapping = mapping result._inverse_mapping = reverse_mapping @@ -155,14 +155,14 @@ def transform(self, table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._mapping is None: + if self._selector is None or self._mapping is None: raise NotFittedError(kind="transformer") - _check_columns_exist(table, self._column_names) + _check_columns_exist(table, self._selector) columns = [ pl.col(name).replace_strict(self._mapping[name], default=None, return_dtype=pl.UInt32) - for name in self._column_names + for name in self._selector ] return Table._from_polars_lazy_frame( @@ -197,19 +197,17 @@ def inverse_transform(self, transformed_table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._inverse_mapping is None: + if self._selector is None or self._inverse_mapping is None: raise NotFittedError(kind="transformer") - _check_columns_exist(transformed_table, self._column_names) + _check_columns_exist(transformed_table, self._selector) _check_columns_are_numeric( transformed_table, - self._column_names, + self._selector, operation="inverse-transform with a LabelEncoder", ) - columns = [ - pl.col(name).replace_strict(self._inverse_mapping[name], default=None) for 
name in self._column_names - ] + columns = [pl.col(name).replace_strict(self._inverse_mapping[name], default=None) for name in self._selector] return Table._from_polars_lazy_frame( transformed_table._lazy_frame.with_columns(columns), diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index 5b4005244..5dc2d21eb 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -42,7 +42,7 @@ class OneHotEncoder(InvertibleTableTransformer): Parameters ---------- - column_names: + selector: The list of columns used to fit the transformer. If `None`, all non-numeric columns are used. separator: The separator used to separate the original column name from the value in the new column names. @@ -73,10 +73,10 @@ class OneHotEncoder(InvertibleTableTransformer): def __init__( self, *, - column_names: str | list[str] | None = None, + selector: str | list[str] | None = None, separator: str = "__", ) -> None: - super().__init__(column_names) + super().__init__(selector) # Parameters self._separator = separator @@ -142,10 +142,10 @@ def fit(self, table: Table) -> OneHotEncoder: ValueError If the table contains 0 rows. 
""" - if self._column_names is None: + if self._selector is None: column_names = [name for name in table.column_names if not table.get_column_type(name).is_numeric] else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) _warn_if_columns_are_numeric(table, column_names) @@ -175,7 +175,7 @@ def fit(self, table: Table) -> OneHotEncoder: mapping[name].append((new_name, value)) # Create a copy with the learned transformation - result = OneHotEncoder(column_names=column_names, separator=self._separator) + result = OneHotEncoder(selector=column_names, separator=self._separator) result._new_column_names = new_column_names result._mapping = mapping @@ -207,21 +207,21 @@ def transform(self, table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._mapping is None: + if self._selector is None or self._mapping is None: raise NotFittedError(kind="transformer") # TODO: raise schema error instead - _check_columns_exist(table, self._column_names) + _check_columns_exist(table, self._selector) expressions = [ # UInt8 can be used without conversion in scikit-learn pl.col(column_name).eq_missing(value).alias(new_name).cast(pl.UInt8) - for column_name in self._column_names + for column_name in self._selector for new_name, value in self._mapping[column_name] ] return Table._from_polars_lazy_frame( - table._lazy_frame.with_columns(expressions).drop(self._column_names), + table._lazy_frame.with_columns(expressions).drop(self._selector), ) def inverse_transform(self, transformed_table: Table) -> Table: @@ -252,7 +252,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._new_column_names is None or self._mapping is None: + if self._selector is None or self._new_column_names is None or self._mapping is None: raise 
NotFittedError(kind="transformer") _check_columns_exist(transformed_table, self._new_column_names) diff --git a/src/safeds/data/tabular/transformation/_range_scaler.py b/src/safeds/data/tabular/transformation/_range_scaler.py index 5d1d320c0..b025939ba 100644 --- a/src/safeds/data/tabular/transformation/_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_range_scaler.py @@ -23,7 +23,7 @@ class RangeScaler(InvertibleTableTransformer): The minimum of the new range after the transformation max_: The maximum of the new range after the transformation - column_names: + selector: The list of columns used to fit the transformer. If `None`, all numeric columns are used. Raises @@ -36,8 +36,8 @@ class RangeScaler(InvertibleTableTransformer): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, *, column_names: str | list[str] | None = None, min_: float = 0.0, max_: float = 1.0) -> None: - super().__init__(column_names) + def __init__(self, *, selector: str | list[str] | None = None, min_: float = 0.0, max_: float = 1.0) -> None: + super().__init__(selector) if min_ >= max_: raise ValueError('Parameter "max_" must be greater than parameter "min_".') @@ -106,10 +106,10 @@ def fit(self, table: Table) -> RangeScaler: ValueError If the table contains 0 rows. 
""" - if self._column_names is None: + if self._selector is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) _check_columns_are_numeric(table, column_names, operation="fit a RangeScaler") @@ -121,7 +121,7 @@ def fit(self, table: Table) -> RangeScaler: _data_max = table._lazy_frame.select(column_names).max().collect() # Create a copy with the learned transformation - result = RangeScaler(min_=self._min, max_=self._max, column_names=column_names) + result = RangeScaler(min_=self._min, max_=self._max, selector=column_names) result._data_min = _data_min result._data_max = _data_max @@ -155,11 +155,11 @@ def transform(self, table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._data_min is None or self._data_max is None: + if self._selector is None or self._data_min is None or self._data_max is None: raise NotFittedError(kind="transformer") - _check_columns_exist(table, self._column_names) - _check_columns_are_numeric(table, self._column_names, operation="transform with a RangeScaler") + _check_columns_exist(table, self._selector) + _check_columns_are_numeric(table, self._selector, operation="transform with a RangeScaler") columns = [ ( @@ -168,7 +168,7 @@ def transform(self, table: Table) -> Table: * (self._max - self._min) + self._min ) - for name in self._column_names + for name in self._selector ] return Table._from_polars_lazy_frame( @@ -203,13 +203,13 @@ def inverse_transform(self, transformed_table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._data_min is None or self._data_max is None: + if self._selector is None or self._data_min is None or self._data_max is None: raise NotFittedError(kind="transformer") - 
_check_columns_exist(transformed_table, self._column_names) + _check_columns_exist(transformed_table, self._selector) _check_columns_are_numeric( transformed_table, - self._column_names, + self._selector, operation="inverse-transform with a RangeScaler", ) @@ -220,7 +220,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: * (self._data_max.get_column(name) - self._data_min.get_column(name)) + self._data_min.get_column(name) ) - for name in self._column_names + for name in self._selector ] return Table._from_polars_lazy_frame( diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index aaed50711..a0565c72f 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -21,7 +21,7 @@ class RobustScaler(InvertibleTableTransformer): Parameters ---------- - column_names: + selector: The list of columns used to fit the transformer. If `None`, all numeric columns are used. 
""" @@ -29,8 +29,8 @@ class RobustScaler(InvertibleTableTransformer): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, *, column_names: str | list[str] | None = None) -> None: - super().__init__(column_names) + def __init__(self, *, selector: str | list[str] | None = None) -> None: + super().__init__(selector) # Internal state self._data_median: pl.DataFrame | None = None @@ -80,10 +80,10 @@ def fit(self, table: Table) -> RobustScaler: """ import polars as pl - if self._column_names is None: + if self._selector is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) _check_columns_are_numeric(table, column_names, operation="fit a RobustScaler") @@ -102,7 +102,7 @@ def fit(self, table: Table) -> RobustScaler: ) # Create a copy with the learned transformation - result = RobustScaler(column_names=column_names) + result = RobustScaler(selector=column_names) result._data_median = _data_median result._data_scale = _data_scale @@ -136,15 +136,15 @@ def transform(self, table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._data_median is None or self._data_scale is None: + if self._selector is None or self._data_median is None or self._data_scale is None: raise NotFittedError(kind="transformer") - _check_columns_exist(table, self._column_names) - _check_columns_are_numeric(table, self._column_names, operation="transform with a RobustScaler") + _check_columns_exist(table, self._selector) + _check_columns_are_numeric(table, self._selector, operation="transform with a RobustScaler") columns = [ (pl.col(name) - self._data_median.get_column(name)) / self._data_scale.get_column(name) - for name in self._column_names + for name 
in self._selector ] return Table._from_polars_lazy_frame( @@ -179,19 +179,19 @@ def inverse_transform(self, transformed_table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._data_median is None or self._data_scale is None: + if self._selector is None or self._data_median is None or self._data_scale is None: raise NotFittedError(kind="transformer") - _check_columns_exist(transformed_table, self._column_names) + _check_columns_exist(transformed_table, self._selector) _check_columns_are_numeric( transformed_table, - self._column_names, + self._selector, operation="inverse-transform with a RobustScaler", ) columns = [ pl.col(name) * self._data_scale.get_column(name) + self._data_median.get_column(name) - for name in self._column_names + for name in self._selector ] return Table._from_polars_lazy_frame( diff --git a/src/safeds/data/tabular/transformation/_simple_imputer.py b/src/safeds/data/tabular/transformation/_simple_imputer.py index 22b2d76b3..41639677d 100644 --- a/src/safeds/data/tabular/transformation/_simple_imputer.py +++ b/src/safeds/data/tabular/transformation/_simple_imputer.py @@ -22,7 +22,7 @@ class SimpleImputer(TableTransformer): How to replace missing values. value_to_replace: The value that should be replaced. - column_names: + selector: The list of columns used to fit the transformer. If `None`, all columns are used. Examples @@ -95,10 +95,10 @@ def __init__( self, strategy: SimpleImputer.Strategy, *, - column_names: str | list[str] | None = None, + selector: str | list[str] | None = None, value_to_replace: float | str | None = None, ) -> None: - super().__init__(column_names) + super().__init__(selector) # Parameters self._strategy = strategy @@ -165,17 +165,17 @@ def fit(self, table: Table) -> SimpleImputer: data. 
""" if isinstance(self._strategy, _Mean | _Median): - if self._column_names is None: + if self._selector is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) _check_columns_are_numeric(table, column_names, operation="fit a SimpleImputer") else: # noqa: PLR5501 - if self._column_names is None: + if self._selector is None: column_names = table.column_names else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) if table.row_count == 0: @@ -185,7 +185,7 @@ def fit(self, table: Table) -> SimpleImputer: replacement = self._strategy._get_replacement(table) # Create a copy with the learned transformation - result = SimpleImputer(self._strategy, column_names=column_names, value_to_replace=self._value_to_replace) + result = SimpleImputer(self._strategy, selector=column_names, value_to_replace=self._value_to_replace) result._replacement = replacement return result @@ -216,14 +216,13 @@ def transform(self, table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._replacement is None: + if self._selector is None or self._replacement is None: raise NotFittedError(kind="transformer") - _check_columns_exist(table, self._column_names) + _check_columns_exist(table, self._selector) columns = [ - (pl.col(name).replace(old=self._value_to_replace, new=self._replacement[name])) - for name in self._column_names + (pl.col(name).replace(old=self._value_to_replace, new=self._replacement[name])) for name in self._selector ] return Table._from_polars_lazy_frame( diff --git a/src/safeds/data/tabular/transformation/_standard_scaler.py b/src/safeds/data/tabular/transformation/_standard_scaler.py index 322dd4dbc..5db98dade 100644 --- 
a/src/safeds/data/tabular/transformation/_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_standard_scaler.py @@ -18,7 +18,7 @@ class StandardScaler(InvertibleTableTransformer): Parameters ---------- - column_names: + selector: The list of columns used to fit the transformer. If `None`, all numeric columns are used. """ @@ -26,8 +26,8 @@ class StandardScaler(InvertibleTableTransformer): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, *, column_names: str | list[str] | None = None) -> None: - super().__init__(column_names) + def __init__(self, *, selector: str | list[str] | None = None) -> None: + super().__init__(selector) # Internal state self._data_mean: pl.DataFrame | None = None @@ -75,10 +75,10 @@ def fit(self, table: Table) -> StandardScaler: ValueError If the table contains 0 rows. """ - if self._column_names is None: + if self._selector is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: - column_names = self._column_names + column_names = self._selector _check_columns_exist(table, column_names) _check_columns_are_numeric(table, column_names, operation="fit a StandardScaler") @@ -90,7 +90,7 @@ def fit(self, table: Table) -> StandardScaler: _data_standard_deviation = table._lazy_frame.select(column_names).std(ddof=0).collect() # Create a copy with the learned transformation - result = StandardScaler(column_names=column_names) + result = StandardScaler(selector=column_names) result._data_mean = _data_mean result._data_standard_deviation = _data_standard_deviation @@ -124,15 +124,15 @@ def transform(self, table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._data_mean is None or self._data_standard_deviation is None: + if self._selector is None or self._data_mean is None or 
self._data_standard_deviation is None: raise NotFittedError(kind="transformer") - _check_columns_exist(table, self._column_names) - _check_columns_are_numeric(table, self._column_names, operation="transform with a StandardScaler") + _check_columns_exist(table, self._selector) + _check_columns_are_numeric(table, self._selector, operation="transform with a StandardScaler") columns = [ (pl.col(name) - self._data_mean.get_column(name)) / self._data_standard_deviation.get_column(name) - for name in self._column_names + for name in self._selector ] return Table._from_polars_lazy_frame( @@ -167,19 +167,19 @@ def inverse_transform(self, transformed_table: Table) -> Table: import polars as pl # Used in favor of is_fitted, so the type checker is happy - if self._column_names is None or self._data_mean is None or self._data_standard_deviation is None: + if self._selector is None or self._data_mean is None or self._data_standard_deviation is None: raise NotFittedError(kind="transformer") - _check_columns_exist(transformed_table, self._column_names) + _check_columns_exist(transformed_table, self._selector) _check_columns_are_numeric( transformed_table, - self._column_names, + self._selector, operation="inverse-transform with a StandardScaler", ) columns = [ pl.col(name) * self._data_standard_deviation.get_column(name) + self._data_mean.get_column(name) - for name in self._column_names + for name in self._selector ] return Table._from_polars_lazy_frame( diff --git a/src/safeds/data/tabular/transformation/_table_transformer.py b/src/safeds/data/tabular/transformation/_table_transformer.py index bebdbfd42..019cda385 100644 --- a/src/safeds/data/tabular/transformation/_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_table_transformer.py @@ -15,7 +15,7 @@ class TableTransformer(ABC): Parameters ---------- - column_names: + selector: The list of columns used to fit the transformer. If `None`, all suitable columns are used. 
""" @@ -25,18 +25,18 @@ class TableTransformer(ABC): # The decorator is needed so the class really cannot be instantiated @abstractmethod - def __init__(self, column_names: str | list[str] | None) -> None: - if isinstance(column_names, str): - column_names = [column_names] + def __init__(self, selector: str | list[str] | None) -> None: + if isinstance(selector, str): + selector = [selector] - self._column_names: list[str] | None = column_names + self._selector: list[str] | None = selector # The decorator ensures that the method is overridden in all subclasses @abstractmethod def __hash__(self) -> int: return _structural_hash( self.__class__.__qualname__, - self._column_names, + self._selector, self.is_fitted, ) diff --git a/src/safeds/ml/nn/_model.py b/src/safeds/ml/nn/_model.py index 9b2d4f024..e2bc1743c 100644 --- a/src/safeds/ml/nn/_model.py +++ b/src/safeds/ml/nn/_model.py @@ -756,7 +756,7 @@ def from_pretrained_model(huggingface_repo: str) -> NeuralNetworkClassifier: # label_dict: dict[str, str] = config.id2label column_name = "label" labels_table = Table({column_name: [label for _, label in label_dict.items()]}) - one_hot_encoder = OneHotEncoder(column_names=[column_name]).fit(labels_table) + one_hot_encoder = OneHotEncoder(selector=[column_name]).fit(labels_table) in_conversion = InputConversionImageToColumn(input_size) diff --git a/tests/safeds/data/tabular/transformation/test_discretizer.py b/tests/safeds/data/tabular/transformation/test_discretizer.py index ffac3be94..7b5dd78a8 100644 --- a/tests/safeds/data/tabular/transformation/test_discretizer.py +++ b/tests/safeds/data/tabular/transformation/test_discretizer.py @@ -66,7 +66,7 @@ def test_should_raise_errors( error_message: str | None, ) -> None: with pytest.raises(error, match=error_message): - Discretizer(column_names=columns).fit(table) + Discretizer(selector=columns).fit(table) def test_should_not_change_original_transformer(self) -> None: table = Table( @@ -79,7 +79,7 @@ def 
test_should_not_change_original_transformer(self) -> None: transformer.fit(table) assert transformer._wrapped_transformer is None - assert transformer._column_names is None + assert transformer._selector is None class TestTransform: @@ -134,7 +134,7 @@ def test_should_raise_errors( }, ) - transformer = Discretizer(column_names=columns).fit(table_to_fit) + transformer = Discretizer(selector=columns).fit(table_to_fit) with pytest.raises(error, match=error_message): transformer.transform(table_to_transform) @@ -210,7 +210,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = Discretizer(column_names=column_names).fit_and_transform(table) + fitted_transformer, transformed_table = Discretizer(selector=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected @@ -252,7 +252,7 @@ def test_should_return_transformed_table_with_correct_number_of_bins( bin_count: int, expected: Table, ) -> None: - fitted_transformer, transformed_table = Discretizer(bin_count, column_names="col1").fit_and_transform(table) + fitted_transformer, transformed_table = Discretizer(bin_count, selector="col1").fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected diff --git a/tests/safeds/data/tabular/transformation/test_k_nearest_neighbors_imputer.py b/tests/safeds/data/tabular/transformation/test_k_nearest_neighbors_imputer.py index 53c11fb17..8e8ee8276 100644 --- a/tests/safeds/data/tabular/transformation/test_k_nearest_neighbors_imputer.py +++ b/tests/safeds/data/tabular/transformation/test_k_nearest_neighbors_imputer.py @@ -36,7 +36,7 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - KNearestNeighborsImputer(neighbor_count=5, column_names=["col2", "col3"]).fit(table) + KNearestNeighborsImputer(neighbor_count=5, selector=["col2", 
"col3"]).fit(table) def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises( @@ -55,7 +55,7 @@ def test_should_not_change_original_transformer(self) -> None: transformer = KNearestNeighborsImputer(neighbor_count=5) transformer.fit(table) - assert transformer._column_names is None + assert transformer._selector is None assert transformer._wrapped_transformer is None @@ -154,7 +154,7 @@ def test_should_return_fitted_transformer_and_transformed_table( ) -> None: fitted_transformer, transformed_table = KNearestNeighborsImputer( neighbor_count=1, - column_names=None, + selector=None, value_to_replace=None, ).fit_and_transform(table) assert fitted_transformer.is_fitted diff --git a/tests/safeds/data/tabular/transformation/test_label_encoder.py b/tests/safeds/data/tabular/transformation/test_label_encoder.py index b19a72cda..4aa67f821 100644 --- a/tests/safeds/data/tabular/transformation/test_label_encoder.py +++ b/tests/safeds/data/tabular/transformation/test_label_encoder.py @@ -14,11 +14,11 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - LabelEncoder(column_names=["col2", "col3"]).fit(table) + LabelEncoder(selector=["col2", "col3"]).fit(table) def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The LabelEncoder cannot be fitted because the table contains 0 rows"): - LabelEncoder(column_names="col1").fit(Table({"col1": []})) + LabelEncoder(selector="col1").fit(Table({"col1": []})) def test_should_warn_if_table_contains_numerical_data(self) -> None: with pytest.warns( @@ -28,7 +28,7 @@ def test_should_warn_if_table_contains_numerical_data(self) -> None: r" values into numerical values" ), ): - LabelEncoder(column_names="col1").fit(Table({"col1": [1, 2]})) + LabelEncoder(selector="col1").fit(Table({"col1": [1, 2]})) def test_should_not_change_original_transformer(self) -> None: table = Table( @@ -40,7 +40,7 @@ def 
test_should_not_change_original_transformer(self) -> None: transformer = LabelEncoder() transformer.fit(table) - assert transformer._column_names is None + assert transformer._selector is None assert transformer._mapping is None assert transformer._inverse_mapping is None @@ -136,7 +136,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = LabelEncoder(column_names=column_names).fit_and_transform(table) + fitted_transformer, transformed_table = LabelEncoder(selector=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected @@ -208,12 +208,12 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - LabelEncoder(column_names=["col1", "col2"]).fit( + LabelEncoder(selector=["col1", "col2"]).fit( Table({"col1": ["one", "two"], "col2": ["three", "four"]}), ).inverse_transform(Table({"col3": [1.0, 0.0]})) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - LabelEncoder(column_names=["col1", "col2"]).fit( + LabelEncoder(selector=["col1", "col2"]).fit( Table({"col1": ["one", "two"], "col2": ["three", "four"]}), ).inverse_transform(Table({"col1": ["1", "null"], "col2": ["2", "apple"]})) diff --git a/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py b/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py index 67431c63c..57ea91c29 100644 --- a/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py +++ b/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py @@ -22,11 +22,11 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - OneHotEncoder(column_names=["col2", "col3"]).fit(table) + OneHotEncoder(selector=["col2", "col3"]).fit(table) def 
test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The OneHotEncoder cannot be fitted because the table contains 0 rows"): - OneHotEncoder(column_names="col1").fit(Table({"col1": []})) + OneHotEncoder(selector="col1").fit(Table({"col1": []})) def test_should_warn_if_table_contains_numerical_data(self) -> None: with pytest.warns( @@ -36,7 +36,7 @@ def test_should_warn_if_table_contains_numerical_data(self) -> None: r" values into numerical values" ), ): - OneHotEncoder(column_names="col1").fit(Table({"col1": [1, 2, 3]})) + OneHotEncoder(selector="col1").fit(Table({"col1": [1, 2, 3]})) @pytest.mark.parametrize( "table", @@ -58,7 +58,7 @@ def test_should_not_change_original_transformer(self, table: Table) -> None: transformer = OneHotEncoder() transformer.fit(table) - assert transformer._column_names is None + assert transformer._selector is None assert transformer._new_column_names is None assert transformer._mapping is None @@ -248,7 +248,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = OneHotEncoder(column_names=column_names).fit_and_transform(table) + fitted_transformer, transformed_table = OneHotEncoder(selector=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected @@ -340,7 +340,7 @@ def test_should_return_original_table( column_names: list[str], table_to_transform: Table, ) -> None: - transformer = OneHotEncoder(column_names=column_names).fit(table_to_fit) + transformer = OneHotEncoder(selector=column_names).fit(table_to_fit) result = transformer.inverse_transform(transformer.transform(table_to_transform)) @@ -388,12 +388,12 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - OneHotEncoder(column_names="col1").fit(Table({"col1": ["one", 
"two"]})).inverse_transform( + OneHotEncoder(selector="col1").fit(Table({"col1": ["one", "two"]})).inverse_transform( Table({"col1": [1.0, 0.0]}), ) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - OneHotEncoder(column_names="col1").fit(Table({"col1": ["one", "two"]})).inverse_transform( + OneHotEncoder(selector="col1").fit(Table({"col1": ["one", "two"]})).inverse_transform( Table({"col1__one": ["1", "null"], "col1__two": ["2", "ok"]}), ) diff --git a/tests/safeds/data/tabular/transformation/test_range_scaler.py b/tests/safeds/data/tabular/transformation/test_range_scaler.py index 85e4069b6..55a2fcfb1 100644 --- a/tests/safeds/data/tabular/transformation/test_range_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_range_scaler.py @@ -20,11 +20,11 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - RangeScaler(column_names=["col2", "col3"]).fit(table) + RangeScaler(selector=["col2", "col3"]).fit(table) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - RangeScaler(column_names=["col1", "col2"]).fit(Table({"col1": ["a", "b"], "col2": [1, "c"]})) + RangeScaler(selector=["col1", "col2"]).fit(Table({"col1": ["a", "b"], "col2": [1, "c"]})) def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The RangeScaler cannot be fitted because the table contains 0 rows"): @@ -40,7 +40,7 @@ def test_should_not_change_original_transformer(self) -> None: transformer = RangeScaler() transformer.fit(table) - assert transformer._column_names is None + assert transformer._selector is None assert transformer._data_min is None assert transformer._data_max is None @@ -79,7 +79,7 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - 
RangeScaler(column_names=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( + RangeScaler(selector=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( Table({"col1": ["a", "b", "c"], "col2": ["c", "d", "e"]}), ) @@ -142,7 +142,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = RangeScaler(column_names=column_names).fit_and_transform(table) + fitted_transformer, transformed_table = RangeScaler(selector=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected @@ -189,7 +189,7 @@ def test_should_return_fitted_transformer_and_transformed_table_with_correct_ran fitted_transformer, transformed_table = RangeScaler( min_=-10.0, max_=10.0, - column_names=column_names, + selector=column_names, ).fit_and_transform( table, ) @@ -263,7 +263,7 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - RangeScaler(column_names=["col1", "col2"]).fit( + RangeScaler(selector=["col1", "col2"]).fit( Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ).inverse_transform( Table({"col3": [1, 2, 3]}), @@ -271,7 +271,7 @@ def test_should_raise_if_column_not_found(self) -> None: def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - RangeScaler(column_names=["col1", "col2"]).fit( + RangeScaler(selector=["col1", "col2"]).fit( Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ).inverse_transform( Table({"col1": ["1", "2", "three"], "col2": [1, 2, "four"]}), diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py index 49e110c64..6e56b5075 100644 --- a/tests/safeds/data/tabular/transformation/test_robust_scaler.py +++ 
b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -15,11 +15,11 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - RobustScaler(column_names=["col2", "col3"]).fit(table) + RobustScaler(selector=["col2", "col3"]).fit(table) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - RobustScaler(column_names=["col1", "col2"]).fit( + RobustScaler(selector=["col1", "col2"]).fit( Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), ) @@ -37,7 +37,7 @@ def test_should_not_change_original_transformer(self) -> None: transformer = RobustScaler() transformer.fit(table) - assert transformer._column_names is None + assert transformer._selector is None assert transformer._data_median is None assert transformer._data_scale is None @@ -99,7 +99,7 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - RobustScaler(column_names=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( + RobustScaler(selector=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( Table({"col1": ["a", "b", "c"], "col2": ["b", "c", "e"]}), ) @@ -149,7 +149,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = RobustScaler(column_names=column_names).fit_and_transform(table) + fitted_transformer, transformed_table = RobustScaler(selector=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert_tables_are_equal(transformed_table, expected) @@ -221,7 +221,7 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - RobustScaler(column_names=["col1", "col2"]).fit( + 
RobustScaler(selector=["col1", "col2"]).fit( Table({"col1": [1, 2, 3, 4], "col2": [2, 3, 4, 5]}), ).inverse_transform( Table({"col3": [0, 1, 2, 3]}), @@ -229,7 +229,7 @@ def test_should_raise_if_column_not_found(self) -> None: def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - RobustScaler(column_names=["col1", "col2"]).fit( + RobustScaler(selector=["col1", "col2"]).fit( Table({"col1": [1, 2, 3, 4], "col2": [2, 3, 4, 5]}), ).inverse_transform( Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), diff --git a/tests/safeds/data/tabular/transformation/test_sequential_table_transformer.py b/tests/safeds/data/tabular/transformation/test_sequential_table_transformer.py index 0fa39e612..b63622ec6 100644 --- a/tests/safeds/data/tabular/transformation/test_sequential_table_transformer.py +++ b/tests/safeds/data/tabular/transformation/test_sequential_table_transformer.py @@ -139,11 +139,11 @@ class TestInverseTransform: @pytest.mark.parametrize( "transformers", [ - [Discretizer(bin_count=3, column_names="col1")], + [Discretizer(bin_count=3, selector="col1")], [SimpleImputer(SimpleImputer.Strategy.constant(0))], [SimpleImputer(SimpleImputer.Strategy.constant(0)), Discretizer(bin_count=3)], [ - LabelEncoder(column_names="col2", partial_order=["a", "b", "c"]), + LabelEncoder(selector="col2", partial_order=["a", "b", "c"]), SimpleImputer(SimpleImputer.Strategy.mean()), ], ], @@ -170,11 +170,11 @@ def test_should_raise_transformer_not_invertible_error_on_non_invertible_transfo "transformers", [ [OneHotEncoder()], - [OneHotEncoder(), StandardScaler(column_names=["col1", "col3"])], + [OneHotEncoder(), StandardScaler(selector=["col1", "col3"])], [ - LabelEncoder(column_names="col2", partial_order=["a", "b", "c"]), + LabelEncoder(selector="col2", partial_order=["a", "b", "c"]), OneHotEncoder(), - StandardScaler(column_names=["col1", "col3"]), + StandardScaler(selector=["col1", "col3"]), ], 
[LabelEncoder(), LabelEncoder()], ], diff --git a/tests/safeds/data/tabular/transformation/test_simple_imputer.py b/tests/safeds/data/tabular/transformation/test_simple_imputer.py index 047f68f0a..19e0528e1 100644 --- a/tests/safeds/data/tabular/transformation/test_simple_imputer.py +++ b/tests/safeds/data/tabular/transformation/test_simple_imputer.py @@ -175,7 +175,7 @@ def test_should_raise_if_column_not_found(self, strategy: SimpleImputer.Strategy ) with pytest.raises(ColumnNotFoundError): - SimpleImputer(strategy, column_names=["b", "c"]).fit(table) + SimpleImputer(strategy, selector=["b", "c"]).fit(table) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) def test_should_raise_if_table_contains_no_rows(self, strategy: SimpleImputer.Strategy) -> None: @@ -197,7 +197,7 @@ def test_should_raise_if_table_contains_non_numerical_data( strategy: SimpleImputer.Strategy, ) -> None: with pytest.raises(ColumnTypeError): - SimpleImputer(strategy, column_names=col_names).fit(table) + SimpleImputer(strategy, selector=col_names).fit(table) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) def test_should_not_change_original_transformer(self, strategy: SimpleImputer.Strategy) -> None: @@ -210,7 +210,7 @@ def test_should_not_change_original_transformer(self, strategy: SimpleImputer.St transformer = SimpleImputer(strategy) transformer.fit(table) - assert transformer._column_names is None + assert transformer._selector is None assert transformer._replacement is None @@ -411,7 +411,7 @@ def test_should_return_fitted_transformer_and_transformed_table( ) fitted_transformer, transformed_table = SimpleImputer( strategy, - column_names=column_names, + selector=column_names, value_to_replace=value_to_replace, ).fit_and_transform(table) diff --git a/tests/safeds/data/tabular/transformation/test_standard_scaler.py b/tests/safeds/data/tabular/transformation/test_standard_scaler.py index fd355c92f..39334d0a9 100644 
--- a/tests/safeds/data/tabular/transformation/test_standard_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_standard_scaler.py @@ -15,11 +15,11 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - StandardScaler(column_names=["col2", "col3"]).fit(table) + StandardScaler(selector=["col2", "col3"]).fit(table) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - StandardScaler(column_names=["col1", "col2"]).fit( + StandardScaler(selector=["col1", "col2"]).fit( Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), ) @@ -37,7 +37,7 @@ def test_should_not_change_original_transformer(self) -> None: transformer = StandardScaler() transformer.fit(table) - assert transformer._column_names is None + assert transformer._selector is None assert transformer._data_mean is None assert transformer._data_standard_deviation is None @@ -76,7 +76,7 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - StandardScaler(column_names=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( + StandardScaler(selector=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( Table({"col1": ["a", "b", "c"], "col2": ["b", "c", "e"]}), ) @@ -126,7 +126,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = StandardScaler(column_names=column_names).fit_and_transform(table) + fitted_transformer, transformed_table = StandardScaler(selector=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert_tables_are_equal(transformed_table, expected) @@ -198,7 +198,7 @@ def test_should_raise_if_not_fitted(self) -> None: def 
test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - StandardScaler(column_names=["col1", "col2"]).fit( + StandardScaler(selector=["col1", "col2"]).fit( Table({"col1": [1, 2, 4], "col2": [2, 3, 4]}), ).inverse_transform( Table({"col3": [0, 1, 2]}), @@ -206,7 +206,7 @@ def test_should_raise_if_column_not_found(self) -> None: def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - StandardScaler(column_names=["col1", "col2"]).fit( + StandardScaler(selector=["col1", "col2"]).fit( Table({"col1": [1, 2, 4], "col2": [2, 3, 4]}), ).inverse_transform( Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), diff --git a/tests/safeds/data/tabular/transformation/test_table_transformer.py b/tests/safeds/data/tabular/transformation/test_table_transformer.py index c4113c53c..38fb87aae 100644 --- a/tests/safeds/data/tabular/transformation/test_table_transformer.py +++ b/tests/safeds/data/tabular/transformation/test_table_transformer.py @@ -30,10 +30,10 @@ def transformers_numeric() -> list[TableTransformer]: The list of numeric transformers to test. """ return [ - StandardScaler(column_names="col1"), - RangeScaler(column_names="col1"), - Discretizer(column_names="col1"), - RobustScaler(column_names="col1"), + StandardScaler(selector="col1"), + RangeScaler(selector="col1"), + Discretizer(selector="col1"), + RobustScaler(selector="col1"), ] @@ -50,8 +50,8 @@ def transformers_non_numeric() -> list[TableTransformer]: The list of non-numeric transformers to test. 
""" return [ - OneHotEncoder(column_names="col1"), - LabelEncoder(column_names="col1"), + OneHotEncoder(selector="col1"), + LabelEncoder(selector="col1"), ] diff --git a/tests/safeds/ml/nn/test_cnn_workflow.py b/tests/safeds/ml/nn/test_cnn_workflow.py index e39f99d56..c46f8b414 100644 --- a/tests/safeds/ml/nn/test_cnn_workflow.py +++ b/tests/safeds/ml/nn/test_cnn_workflow.py @@ -79,7 +79,7 @@ def test_should_train_and_predict_model( if groups is not None: classes.append(groups.group(2)) image_classes = Table({"class": classes}) - one_hot_encoder = OneHotEncoder(column_names="class").fit(image_classes) + one_hot_encoder = OneHotEncoder(selector="class").fit(image_classes) image_classes_one_hot_encoded = one_hot_encoder.transform(image_classes) image_dataset = ImageDataset(image_list, image_classes_one_hot_encoded) num_of_classes: int = image_dataset.output_size if isinstance(image_dataset.output_size, int) else 0 diff --git a/tests/safeds/ml/nn/test_forward_workflow.py b/tests/safeds/ml/nn/test_forward_workflow.py index 2c2252634..b3c1f7bce 100644 --- a/tests/safeds/ml/nn/test_forward_workflow.py +++ b/tests/safeds/ml/nn/test_forward_workflow.py @@ -30,7 +30,7 @@ def test_forward_model(device: Device) -> None: table_2 = table_2.add_columns([(table_1.slice_rows(start=14)).get_column("value").rename("target")]) train_table, test_table = table_2.split_rows(0.8) - ss = StandardScaler(column_names="value") + ss = StandardScaler(selector="value") _, train_table = ss.fit_and_transform(train_table) _, test_table = ss.fit_and_transform(test_table) model = NeuralNetworkRegressor( diff --git a/tests/safeds/ml/nn/test_lstm_workflow.py b/tests/safeds/ml/nn/test_lstm_workflow.py index 31443afd2..9ba589d77 100644 --- a/tests/safeds/ml/nn/test_lstm_workflow.py +++ b/tests/safeds/ml/nn/test_lstm_workflow.py @@ -26,7 +26,7 @@ def test_lstm_model(device: Device) -> None: # Create a DataFrame _inflation_path = "_datas/US_Inflation_rates.csv" table = 
Table.from_csv_file(path=resolve_resource_path(_inflation_path)) - rs = RangeScaler(column_names="value") + rs = RangeScaler(selector="value") _, table = rs.fit_and_transform(table) train_table, test_table = table.split_rows(0.8) From c97c6572e1f89944cdd1270cbc548e38b6d41e58 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 14 Jan 2025 17:55:26 +0100 Subject: [PATCH 3/5] feat: consistently name `selector` parameters in validators --- .../_check_column_is_numeric_module.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/safeds/_validation/_check_column_is_numeric_module.py b/src/safeds/_validation/_check_column_is_numeric_module.py index 91399fbdd..d77395763 100644 --- a/src/safeds/_validation/_check_column_is_numeric_module.py +++ b/src/safeds/_validation/_check_column_is_numeric_module.py @@ -49,21 +49,19 @@ def _check_column_is_numeric( def _check_columns_are_numeric( table_or_schema: Table | Schema, - column_names: str | list[str], + selector: str | list[str], *, operation: str = "do a numeric operation", ) -> None: """ - Check if the columns with the specified names are numeric and raise an error if they are not. - - Missing columns are ignored. Use `_check_columns_exist` to check for missing columns. + Check if the specified columns are numeric and raise an error if they are not. Missing columns are ignored. Parameters ---------- table_or_schema: The table or schema to check. - column_names: - The column names to check. + selector: + The columns to check. operation: The operation that is performed on the columns. This is used in the error message. 
@@ -76,17 +74,17 @@ def _check_columns_are_numeric( if isinstance(table_or_schema, Table): table_or_schema = table_or_schema.schema - if isinstance(column_names, str): - column_names = [column_names] + if isinstance(selector, str): + selector = [selector] - if len(column_names) > 1: + if len(selector) > 1: # Create a set for faster containment checks known_names: Container = set(table_or_schema.column_names) else: known_names = table_or_schema.column_names non_numeric_names = [ - name for name in column_names if name in known_names and not table_or_schema.get_column_type(name).is_numeric + name for name in selector if name in known_names and not table_or_schema.get_column_type(name).is_numeric ] if non_numeric_names: message = _build_error_message(non_numeric_names, operation) From 220b67e395b3e55d17f0720983303bcb905b4b92 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 14 Jan 2025 18:04:01 +0100 Subject: [PATCH 4/5] test: ignore some branches --- src/safeds/_validation/_check_bounds_module.py | 4 ++-- src/safeds/_validation/_check_column_is_numeric_module.py | 2 +- src/safeds/_validation/_check_schema_module.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/safeds/_validation/_check_bounds_module.py b/src/safeds/_validation/_check_bounds_module.py index 5788e23f8..d54dc72fa 100644 --- a/src/safeds/_validation/_check_bounds_module.py +++ b/src/safeds/_validation/_check_bounds_module.py @@ -36,7 +36,7 @@ def _check_bounds( if actual is None: return # Skip the check if the actual value is None (i.e., not provided). 
- if lower_bound is None: + if lower_bound is None: # pragma: no cover lower_bound = _OpenBound(float("-inf")) if upper_bound is None: upper_bound = _OpenBound(float("inf")) @@ -148,7 +148,7 @@ def _to_string_as_upper_bound(self) -> str: def _float_to_string(value: float) -> str: - if value == float("-inf"): + if value == float("-inf"): # pragma: no cover return "-\u221e" elif value == float("inf"): return "\u221e" diff --git a/src/safeds/_validation/_check_column_is_numeric_module.py b/src/safeds/_validation/_check_column_is_numeric_module.py index d77395763..a92495b44 100644 --- a/src/safeds/_validation/_check_column_is_numeric_module.py +++ b/src/safeds/_validation/_check_column_is_numeric_module.py @@ -74,7 +74,7 @@ def _check_columns_are_numeric( if isinstance(table_or_schema, Table): table_or_schema = table_or_schema.schema - if isinstance(selector, str): + if isinstance(selector, str): # pragma: no cover selector = [selector] if len(selector) > 1: diff --git a/src/safeds/_validation/_check_schema_module.py b/src/safeds/_validation/_check_schema_module.py index 1ea7bbda3..70e50dc35 100644 --- a/src/safeds/_validation/_check_schema_module.py +++ b/src/safeds/_validation/_check_schema_module.py @@ -75,7 +75,7 @@ def _check_schema( def _check_types(expected_schema: Schema, actual_schema: Schema, *, check_types: _TypeCheckingMode) -> None: - if check_types == "off": + if check_types == "off": # pragma: no cover return mismatched_types: list[tuple[str, pl.DataType, pl.DataType]] = [] From 506b34f0adc8a6a9db6be8d276ed3be8c8ae3367 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 14 Jan 2025 18:09:57 +0100 Subject: [PATCH 5/5] feat: remove ability to select columns by custom predicate --- src/safeds/data/tabular/containers/_table.py | 36 ++++++------------- .../containers/_table/test_select_columns.py | 12 ------- 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py 
b/src/safeds/data/tabular/containers/_table.py index 5d90a07cd..0e8df2ef7 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -831,7 +831,7 @@ def remove_columns( Related ------- - [select_columns][safeds.data.tabular.containers._table.Table.select_columns]: - Keep only a subset of the columns. This method accepts either column names, or a predicate. + Keep only a subset of the columns. - [remove_columns_with_missing_values][safeds.data.tabular.containers._table.Table.remove_columns_with_missing_values] - [remove_non_numeric_columns][safeds.data.tabular.containers._table.Table.remove_non_numeric_columns] """ @@ -900,7 +900,7 @@ def remove_columns_with_missing_values( - [KNearestNeighborsImputer][safeds.data.tabular.transformation._k_nearest_neighbors_imputer.KNearestNeighborsImputer]: Replace missing values with a value computed from the nearest neighbors. - [select_columns][safeds.data.tabular.containers._table.Table.select_columns]: - Keep only a subset of the columns. This method accepts either column names, or a predicate. + Keep only a subset of the columns. - [remove_columns][safeds.data.tabular.containers._table.Table.remove_columns]: Remove columns from the table by name. - [remove_non_numeric_columns][safeds.data.tabular.containers._table.Table.remove_non_numeric_columns] @@ -955,7 +955,7 @@ def remove_non_numeric_columns(self) -> Table: Related ------- - [select_columns][safeds.data.tabular.containers._table.Table.select_columns]: - Keep only a subset of the columns. This method accepts either column names, or a predicate. + Keep only a subset of the columns. - [remove_columns][safeds.data.tabular.containers._table.Table.remove_columns]: Remove columns from the table by name. 
- [remove_columns_with_missing_values][safeds.data.tabular.containers._table.Table.remove_columns_with_missing_values] @@ -1113,21 +1113,17 @@ def replace_column( def select_columns( self, - selector: str | list[str] | Callable[[Column], bool], + selector: str | list[str], ) -> Table: """ Select a subset of the columns and return the result as a new table. - **Notes:** - - - The original table is not modified. - - If the `selector` is a custom function, this operation must fully load the data into memory, which can be - expensive. + **Note:** The original table is not modified. Parameters ---------- selector: - The names of the columns to keep, or a predicate that decides whether to keep a column. + The columns to keep. Returns ------- @@ -1161,23 +1157,11 @@ def select_columns( - [remove_columns_with_missing_values][safeds.data.tabular.containers._table.Table.remove_columns_with_missing_values] - [remove_non_numeric_columns][safeds.data.tabular.containers._table.Table.remove_non_numeric_columns] """ - import polars as pl - - # Select by predicate - if callable(selector): - return Table._from_polars_lazy_frame( - pl.LazyFrame( - [column._series for column in self.to_columns() if selector(column)], - ), - ) - - # Select by column names - else: - _check_columns_exist(self, selector) + _check_columns_exist(self, selector) - return Table._from_polars_lazy_frame( - self._lazy_frame.select(selector), - ) + return Table._from_polars_lazy_frame( + self._lazy_frame.select(selector), + ) def transform_columns( self, diff --git a/tests/safeds/data/tabular/containers/_table/test_select_columns.py b/tests/safeds/data/tabular/containers/_table/test_select_columns.py index 324a5ca69..befc65309 100644 --- a/tests/safeds/data/tabular/containers/_table/test_select_columns.py +++ b/tests/safeds/data/tabular/containers/_table/test_select_columns.py @@ -14,11 +14,6 @@ [], Table({}), ), - ( - lambda: Table({}), - lambda column: column.name.endswith("1"), - Table({}), - ), ( lambda: 
Table({"col1": [], "col2": []}), [], @@ -34,11 +29,6 @@ ["col1", "col2"], Table({"col1": [], "col2": []}), ), - ( - lambda: Table({"col1": [], "col2": []}), - lambda column: column.name.endswith("1"), - Table({"col1": []}), - ), # Related to https://github.com/Safe-DS/Library/issues/115 ( lambda: Table({"A": [1], "B": [2], "C": [3]}), @@ -48,11 +38,9 @@ ], ids=[ "empty table, empty list", - "empty table, predicate", "non-empty table, empty list", "non-empty table, single column", "non-empty table, multiple columns", - "non-empty table, predicate", "swapped order", ], )