diff --git a/benchmarks/metrics/__init__.py b/benchmarks/metrics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/metrics/classification.py b/benchmarks/metrics/classification.py new file mode 100644 index 000000000..6252a7889 --- /dev/null +++ b/benchmarks/metrics/classification.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from timeit import timeit +from typing import TYPE_CHECKING + +import polars as pl + +from benchmarks.table.utils import create_synthetic_table +from safeds.data.tabular.containers import Table +from safeds.ml.metrics import ClassificationMetrics + + +REPETITIONS = 10 + + +def _run_accuracy() -> None: + ClassificationMetrics.accuracy(table.get_column("predicted"), table.get_column("expected")) + + +def _run_f1_score() -> None: + ClassificationMetrics.f1_score(table.get_column("predicted"), table.get_column("expected"), 1) + + +def _run_precision() -> None: + ClassificationMetrics.precision(table.get_column("predicted"), table.get_column("expected"), 1) + + +def _run_recall() -> None: + ClassificationMetrics.recall(table.get_column("predicted"), table.get_column("expected"), 1) + + +if __name__ == "__main__": + # Create a synthetic Table + table = ( + create_synthetic_table(10000, 2) + .rename_column("column_0", "predicted") + .rename_column("column_1", "expected") + ) + + # Run the benchmarks + timings: dict[str, float] = { + "accuracy": timeit( + _run_accuracy, + number=REPETITIONS, + ), + "f1_score": timeit( + _run_f1_score, + number=REPETITIONS, + ), + "precision": timeit( + _run_precision, + number=REPETITIONS, + ), + "recall": timeit( + _run_recall, + number=REPETITIONS, + ), + } + + # Print the timings + with pl.Config( + tbl_rows=-1, + ): + print( + Table( + { + "method": list(timings.keys()), + "timing": list(timings.values()), + } + ) + ) diff --git a/benchmarks/table/column_operations_polars.py b/benchmarks/table/column_operations.py similarity index 84% rename from 
benchmarks/table/column_operations_polars.py rename to benchmarks/table/column_operations.py index c452edca5..8b3752149 100644 --- a/benchmarks/table/column_operations_polars.py +++ b/benchmarks/table/column_operations.py @@ -1,8 +1,8 @@ from timeit import timeit -from safeds.data.tabular.containers import ExperimentalTable +from safeds.data.tabular.containers import Table -from benchmarks.table.utils import create_synthetic_table_polars +from benchmarks.table.utils import create_synthetic_table REPETITIONS = 10 @@ -21,7 +21,7 @@ def _run_summarize_statistics() -> None: if __name__ == "__main__": # Create a synthetic Table - table = create_synthetic_table_polars(100, 5000) + table = create_synthetic_table(100, 5000) # Run the benchmarks timings: dict[str, float] = { @@ -41,7 +41,7 @@ def _run_summarize_statistics() -> None: # Print the timings print( - ExperimentalTable( + Table( { "method": list(timings.keys()), "timing": list(timings.values()), diff --git a/benchmarks/table/row_operations.py b/benchmarks/table/row_operations.py index d4f433bab..daf479e33 100644 --- a/benchmarks/table/row_operations.py +++ b/benchmarks/table/row_operations.py @@ -1,5 +1,7 @@ from timeit import timeit +import polars as pl + from safeds.data.tabular.containers import Table from benchmarks.table.utils import create_synthetic_table @@ -7,16 +9,12 @@ REPETITIONS = 10 -def _run_group_rows() -> None: - table.group_rows(lambda row: row.get_value("column_0") % 2 == 0) - - def _run_remove_duplicate_rows() -> None: - table.remove_duplicate_rows() + table.remove_duplicate_rows()._lazy_frame.collect() def _run_remove_rows_with_missing_values() -> None: - table.remove_rows_with_missing_values() + table.remove_rows_with_missing_values()._lazy_frame.collect() def _run_remove_rows_with_outliers() -> None: @@ -24,31 +22,37 @@ def _run_remove_rows_with_outliers() -> None: def _run_remove_rows() -> None: - table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0) + table.remove_rows(lambda 
row: row.get_value("column_0") % 2 == 0)._lazy_frame.collect() + + +def _run_remove_rows_by_column() -> None: + table.remove_rows_by_column("column_0", lambda cell: cell % 2 == 0)._lazy_frame.collect() def _run_shuffle_rows() -> None: - table.shuffle_rows() + table.shuffle_rows()._lazy_frame.collect() def _run_slice_rows() -> None: - table.slice_rows(end=table.number_of_rows // 2) + table.slice_rows(length=table.number_of_rows // 2)._lazy_frame.collect() def _run_sort_rows() -> None: - table.sort_rows(lambda row1, row2: row1.get_value("column_0") - row2.get_value("column_0")) + table.sort_rows(lambda row: row.get_value("column_0"))._lazy_frame.collect() -def _run_split_rows() -> None: - table.split_rows(0.5) +def _run_sort_rows_by_column() -> None: + table.sort_rows_by_column("column_0")._lazy_frame.collect() -def _run_to_rows() -> None: - table.to_rows() +def _run_split_rows() -> None: + table_1, table_2 = table.split_rows(0.5) + table_1._lazy_frame.collect() + table_2._lazy_frame.collect() def _run_transform_column() -> None: - table.transform_column("column_0", lambda row: row.get_value("column_0") * 2) + table.transform_column("column_0", lambda value: value * 2)._lazy_frame.collect() if __name__ == "__main__": @@ -57,10 +61,6 @@ def _run_transform_column() -> None: # Run the benchmarks timings: dict[str, float] = { - "group_rows": timeit( - _run_group_rows, - number=REPETITIONS, - ), "remove_duplicate_rows": timeit( _run_remove_duplicate_rows, number=REPETITIONS, @@ -77,6 +77,10 @@ def _run_transform_column() -> None: _run_remove_rows, number=REPETITIONS, ), + "remove_rows_by_column": timeit( + _run_remove_rows_by_column, + number=REPETITIONS, + ), "shuffle_rows": timeit( _run_shuffle_rows, number=REPETITIONS, @@ -89,26 +93,29 @@ def _run_transform_column() -> None: _run_sort_rows, number=REPETITIONS, ), - "split_rows": timeit( - _run_split_rows, + "sort_rows_by_column": timeit( + _run_sort_rows_by_column, number=REPETITIONS, ), - "to_rows": timeit( - 
_run_to_rows, + "split_rows": timeit( + _run_split_rows, number=REPETITIONS, ), - "transform_colum": timeit( + "transform_column": timeit( _run_transform_column, number=REPETITIONS, ), } # Print the timings - print( - Table( - { # noqa: T201 - "method": list(timings.keys()), - "timing": list(timings.values()), - } + with pl.Config( + tbl_rows=-1, + ): + print( + Table( + { + "method": list(timings.keys()), + "timing": list(timings.values()), + } + ) ) - ) diff --git a/benchmarks/table/row_operations_polars.py b/benchmarks/table/row_operations_polars.py deleted file mode 100644 index 403bfb80d..000000000 --- a/benchmarks/table/row_operations_polars.py +++ /dev/null @@ -1,121 +0,0 @@ -from timeit import timeit - -import polars as pl - -from safeds.data.tabular.containers import ExperimentalTable - -from benchmarks.table.utils import create_synthetic_table_polars - -REPETITIONS = 10 - - -def _run_remove_duplicate_rows() -> None: - table.remove_duplicate_rows()._lazy_frame.collect() - - -def _run_remove_rows_with_missing_values() -> None: - table.remove_rows_with_missing_values()._lazy_frame.collect() - - -def _run_remove_rows_with_outliers() -> None: - table.remove_rows_with_outliers() - - -def _run_remove_rows() -> None: - table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)._lazy_frame.collect() - - -def _run_remove_rows_by_column() -> None: - table.remove_rows_by_column("column_0", lambda cell: cell % 2 == 0)._lazy_frame.collect() - - -def _run_shuffle_rows() -> None: - table.shuffle_rows()._lazy_frame.collect() - - -def _run_slice_rows() -> None: - table.slice_rows(length=table.number_of_rows // 2)._lazy_frame.collect() - - -def _run_sort_rows() -> None: - table.sort_rows(lambda row: row.get_value("column_0"))._lazy_frame.collect() - - -def _run_sort_rows_by_column() -> None: - table.sort_rows_by_column("column_0")._lazy_frame.collect() - - -def _run_split_rows() -> None: - table_1, table_2 = table.split_rows(0.5) - table_1._lazy_frame.collect() - 
table_2._lazy_frame.collect() - - -def _run_transform_column() -> None: - table.transform_column("column_0", lambda value: value * 2)._lazy_frame.collect() - - -if __name__ == "__main__": - # Create a synthetic Table - table = create_synthetic_table_polars(1000, 50) - - # Run the benchmarks - timings: dict[str, float] = { - "remove_duplicate_rows": timeit( - _run_remove_duplicate_rows, - number=REPETITIONS, - ), - "remove_rows_with_missing_values": timeit( - _run_remove_rows_with_missing_values, - number=REPETITIONS, - ), - "remove_rows_with_outliers": timeit( - _run_remove_rows_with_outliers, - number=REPETITIONS, - ), - "remove_rows": timeit( - _run_remove_rows, - number=REPETITIONS, - ), - "remove_rows_by_column": timeit( - _run_remove_rows_by_column, - number=REPETITIONS, - ), - "shuffle_rows": timeit( - _run_shuffle_rows, - number=REPETITIONS, - ), - "slice_rows": timeit( - _run_slice_rows, - number=REPETITIONS, - ), - "sort_rows": timeit( - _run_sort_rows, - number=REPETITIONS, - ), - "sort_rows_by_column": timeit( - _run_sort_rows_by_column, - number=REPETITIONS, - ), - "split_rows": timeit( - _run_split_rows, - number=REPETITIONS, - ), - "transform_column": timeit( - _run_transform_column, - number=REPETITIONS, - ), - } - - # Print the timings - with pl.Config( - tbl_rows=-1, - ): - print( - ExperimentalTable( - { - "method": list(timings.keys()), - "timing": list(timings.values()), - } - ) - ) diff --git a/benchmarks/table/utils/__init__.py b/benchmarks/table/utils/__init__.py index bb2b4ee56..b46ac105d 100644 --- a/benchmarks/table/utils/__init__.py +++ b/benchmarks/table/utils/__init__.py @@ -1,7 +1,5 @@ from .create_synthetic_table import create_synthetic_table -from .create_synthetic_table_polars import create_synthetic_table_polars __all__ = [ "create_synthetic_table", - "create_synthetic_table_polars", ] diff --git a/benchmarks/table/utils/create_synthetic_table_polars.py b/benchmarks/table/utils/create_synthetic_table_polars.py deleted file mode 
100644 index 34a354b13..000000000 --- a/benchmarks/table/utils/create_synthetic_table_polars.py +++ /dev/null @@ -1,37 +0,0 @@ -from random import randrange - -from safeds.data.tabular.containers import ExperimentalTable - - -def create_synthetic_table_polars( - number_of_rows: int, - number_of_columns: int, - *, - min_value: int = 0, - max_value: int = 1000, -) -> ExperimentalTable: - """ - Create a synthetic Table with random numerical data. - - Parameters - ---------- - number_of_rows: - Number of rows in the Table. - number_of_columns: - Number of columns in the Table. - min_value: - Minimum value of the random data. - max_value: - Maximum value of the random data. - - Returns - ------- - Table - A Table with random numerical data. - """ - return ExperimentalTable( - { - f"column_{i}": [randrange(min_value, max_value) for _ in range(number_of_rows)] - for i in range(number_of_columns) - } - ) diff --git a/docs/glossary.md b/docs/glossary.md index d66cac8a2..fef1f6206 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -53,7 +53,7 @@ It is analogous to a column within a table. Linear Regression is the supervised Machine Learning model in which the model finds the best fit linear line between the independent and dependent variable i.e. it finds the linear relationship between the dependent and independent variable. -Implemented in Safe-DS as [LinearRegression][safeds.ml.classical.regression.LinearRegressionRegressor]. +Implemented in Safe-DS as [LinearRegressor][safeds.ml.classical.regression.LinearRegressor]. ## Machine Learning (ML) Machine Learning is a generic term for artificially generating knowledge through experience. 
diff --git a/docs/tutorials/machine_learning.ipynb b/docs/tutorials/machine_learning.ipynb index 006da0103..e571d3f0f 100644 --- a/docs/tutorials/machine_learning.ipynb +++ b/docs/tutorials/machine_learning.ipynb @@ -19,7 +19,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from safeds.data.tabular.containers import Table\n", "\n", @@ -36,7 +35,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -52,7 +52,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from safeds.ml.classical.regression import LinearRegressionRegressor\n", "\n", @@ -61,7 +60,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -77,7 +77,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "test_set = Table({\n", " \"a\": [1, 1, 0, 2, 4],\n", @@ -88,7 +87,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -133,7 +133,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from sklearn.metrics import confusion_matrix\n", "\n", @@ -146,7 +145,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -177,7 +177,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", "\n", @@ -189,7 +188,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -217,7 +217,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from sklearn.metrics import f1_score\n", "\n", @@ -229,7 +228,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -257,7 +257,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from sklearn.metrics import precision_score\n", "\n", @@ -269,7 +268,8 @@ ], "metadata": { 
"collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -297,7 +297,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from sklearn.metrics import recall_score\n", "\n", @@ -309,7 +308,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", diff --git a/mkdocs.yml b/mkdocs.yml index ecfab50e8..313688f7d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -70,7 +70,12 @@ plugins: docstring_style: numpy filters: - "!^_" # Hide internal & dunder methods + inherited_members: true show_if_no_docstring: true + show_labels: false + show_signature: false + show_symbol_type_heading: true + show_symbol_type_toc: true - gen-files: scripts: - docs/reference/generate_reference_pages.py diff --git a/src/safeds/data/image/containers/_image.py b/src/safeds/data/image/containers/_image.py index 1324f4eb1..fb104ce8d 100644 --- a/src/safeds/data/image/containers/_image.py +++ b/src/safeds/data/image/containers/_image.py @@ -165,7 +165,7 @@ def __sizeof__(self) -> int: """ return sys.getsizeof(self._image_tensor) + self._image_tensor.element_size() * self._image_tensor.nelement() - def __array__(self, numpy_dtype: str | dtype = None) -> ndarray: + def __array__(self, numpy_dtype: str | dtype | None = None) -> ndarray: """ Return the image as a numpy array. 
diff --git a/src/safeds/data/labeled/containers/__init__.py b/src/safeds/data/labeled/containers/__init__.py index 8eed70294..e6237ec24 100644 --- a/src/safeds/data/labeled/containers/__init__.py +++ b/src/safeds/data/labeled/containers/__init__.py @@ -5,7 +5,6 @@ import apipkg if TYPE_CHECKING: - from ._experimental_tabular_dataset import ExperimentalTabularDataset from ._image_dataset import ImageDataset from ._tabular_dataset import TabularDataset from ._time_series_dataset import TimeSeriesDataset @@ -13,7 +12,6 @@ apipkg.initpkg( __name__, { - "ExperimentalTabularDataset": "._experimental_tabular_dataset:ExperimentalTabularDataset", "ImageDataset": "._image_dataset:ImageDataset", "TabularDataset": "._tabular_dataset:TabularDataset", "TimeSeriesDataset": "._time_series_dataset:TimeSeriesDataset", @@ -21,7 +19,6 @@ ) __all__ = [ - "ExperimentalTabularDataset", "ImageDataset", "TabularDataset", "TimeSeriesDataset", diff --git a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py deleted file mode 100644 index b6711045a..000000000 --- a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import annotations - -import sys -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash - -if TYPE_CHECKING: - from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable - - -class ExperimentalTabularDataset: - """ - A dataset containing tabular data. It can be used to train machine learning models. - - Columns in a tabular dataset are divided into three categories: - - * The target column is the column that a model should predict. - * Feature columns are columns that a model should use to make predictions. - * Extra columns are columns that are neither feature nor target. They can be used to provide additional context, - like an ID column. 
- - Feature columns are implicitly defined as all columns except the target and extra columns. If no extra columns - are specified, all columns except the target column are used as features. - - Parameters - ---------- - data: - The data. - target_name: - Name of the target column. - extra_names: - Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but - the target column are used as features. - - Raises - ------ - KeyError - If a column name is not found in the data. - ValueError - If the target column is also an extra column. - ValueError - If no feature columns remains. - - Examples - -------- - >>> from safeds.data.labeled.containers import TabularDataset - >>> dataset = TabularDataset( - ... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3]}, - ... target_name="target", - ... extra_names=["id"] - ... ) - """ - - # ------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - def __init__( - self, - data: ExperimentalTable, - target_name: str, - extra_names: list[str] | None = None, - ): - # Preprocess inputs - if extra_names is None: - extra_names = [] - - # Derive feature names - non_feature_names = {target_name, *extra_names} # perf: Comprehensions evaluate their condition every iteration - feature_names = [name for name in data.column_names if name not in non_feature_names] - - # Validate inputs - if target_name in extra_names: - raise ValueError(f"Column '{target_name}' cannot be both target and extra.") - if len(feature_names) == 0: - raise ValueError("At least one feature column must remain.") - - # Set attributes - self._table: ExperimentalTable = data - self._features: ExperimentalTable = data.remove_columns_except(feature_names) - self._target: ExperimentalColumn = data.get_column(target_name) - 
self._extras: ExperimentalTable = data.remove_columns_except(extra_names) - - def __eq__(self, other: object) -> bool: - if not isinstance(other, ExperimentalTabularDataset): - return NotImplemented - if self is other: - return True - return self.target == other.target and self.features == other.features and self._extras == other._extras - - def __hash__(self) -> int: - return _structural_hash(self.target, self.features, self._extras) - - def __repr__(self) -> str: - return self._table.__repr__() - - def __sizeof__(self) -> int: - return sys.getsizeof(self._target) + sys.getsizeof(self._features) + sys.getsizeof(self._extras) - - def __str__(self) -> str: - return self._table.__str__() - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - def features(self) -> ExperimentalTable: - """The feature columns of the tabular dataset.""" - return self._features - - @property - def target(self) -> ExperimentalColumn: - """The target column of the tabular dataset.""" - return self._target - - @property - def extras(self) -> ExperimentalTable: - """ - Additional columns of the tabular dataset that are neither features nor target. - - These can be used to store additional information about instances, such as IDs. - """ - return self._extras - - # ------------------------------------------------------------------------------------------------------------------ - # Conversion - # ------------------------------------------------------------------------------------------------------------------ - - def to_table(self) -> ExperimentalTable: - """ - Return a table containing all columns of the tabular dataset. - - Returns - ------- - table: - A table containing all columns of the tabular dataset. 
- """ - return self._table - - # ------------------------------------------------------------------------------------------------------------------ - # IPython integration - # ------------------------------------------------------------------------------------------------------------------ - - def _repr_html_(self) -> str: - """ - Return a compact HTML representation of the tabular dataset for IPython. - - Returns - ------- - html: - The generated HTML. - """ - return self._table._repr_html_() diff --git a/src/safeds/data/labeled/containers/_image_dataset.py b/src/safeds/data/labeled/containers/_image_dataset.py index 97b95551e..899432f50 100644 --- a/src/safeds/data/labeled/containers/_image_dataset.py +++ b/src/safeds/data/labeled/containers/_image_dataset.py @@ -74,11 +74,11 @@ def __init__(self, input_data: ImageList, output_data: T, batch_size: int = 1, s non_numerical_columns = [] wrong_interval_columns = [] for column_name in output_data.column_names: - if not output_data.get_column_type(column_name).is_numeric(): + if not output_data.get_column_type(column_name).is_numeric: non_numerical_columns.append(column_name) elif ( - output_data.get_column(column_name).minimum() < 0 - or output_data.get_column(column_name).maximum() > 1 + output_data.get_column(column_name).min() < 0 + or output_data.get_column(column_name).max() > 1 ): wrong_interval_columns.append(column_name) if len(non_numerical_columns) > 0: @@ -295,7 +295,7 @@ def __init__(self, table: Table) -> None: _init_default_device() self._column_names = table.column_names - self._tensor = torch.Tensor(table._data.to_numpy(copy=True)).to(torch.get_default_device()) + self._tensor = torch.Tensor(table._data_frame.to_numpy()).to(torch.get_default_device()) if not torch.all(self._tensor.sum(dim=1) == torch.ones(self._tensor.size(dim=0))): raise ValueError( @@ -356,7 +356,7 @@ def __init__(self, column: Column) -> None: category=UserWarning, ) self._one_hot_encoder = OneHotEncoder().fit(column_as_table, 
[self._column_name]) - self._tensor = torch.Tensor(self._one_hot_encoder.transform(column_as_table)._data.to_numpy(copy=True)).to( + self._tensor = torch.Tensor(self._one_hot_encoder.transform(column_as_table)._data_frame.to_numpy()).to( torch.get_default_device(), ) diff --git a/src/safeds/data/labeled/containers/_tabular_dataset.py b/src/safeds/data/labeled/containers/_tabular_dataset.py index 938e62469..776f60acd 100644 --- a/src/safeds/data/labeled/containers/_tabular_dataset.py +++ b/src/safeds/data/labeled/containers/_tabular_dataset.py @@ -1,26 +1,33 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from safeds._config import _get_device, _init_default_device from safeds._utils import _structural_hash -from safeds.data.tabular.containers import Column, Table if TYPE_CHECKING: from collections.abc import Mapping, Sequence - from typing import Any - import torch from torch import Tensor from torch.utils.data import DataLoader, Dataset + from safeds.data.tabular.containers import Column, Table + class TabularDataset: """ - A tabular dataset maps feature columns to a target column. + A dataset containing tabular data. It can be used to train machine learning models. + + Columns in a tabular dataset are divided into three categories: + + - The target column is the column that a model should predict. + - Feature columns are columns that a model should use to make predictions. + - Extra columns are columns that are neither feature nor target. They can be used to provide additional context, + like an ID column. - Create a tabular dataset from a mapping of column names to their values. + Feature columns are implicitly defined as all columns except the target and extra columns. If no extra columns + are specified, all columns except the target column are used as features. 
Parameters ---------- @@ -34,8 +41,8 @@ class TabularDataset: Raises ------ - ColumnLengthMismatchError - If columns have different lengths. + KeyError + If a column name is not found in the data. ValueError If the target column is also an extra column. ValueError @@ -44,11 +51,14 @@ class TabularDataset: Examples -------- >>> from safeds.data.labeled.containers import TabularDataset - >>> dataset = TabularDataset( - ... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3]}, - ... target_name="target", - ... extra_names=["id"] + >>> table = Table( + ... { + ... "id": [1, 2, 3], + ... "feature": [4, 5, 6], + ... "target": [1, 2, 3], + ... }, ... ) + >>> dataset = table.to_tabular_dataset(target_name="target", extra_names=["id"]) """ # ------------------------------------------------------------------------------------------------------------------ @@ -61,14 +71,17 @@ def __init__( target_name: str, extra_names: list[str] | None = None, ): + from safeds.data.tabular.containers import Table + # Preprocess inputs if not isinstance(data, Table): data = Table(data) if extra_names is None: extra_names = [] - # Derive feature names - feature_names = [name for name in data.column_names if name not in {target_name, *extra_names}] + # Derive feature names (build the set once, since comprehensions evaluate their condition every iteration) + non_feature_names = {target_name, *extra_names} + feature_names = [name for name in data.column_names if name not in non_feature_names] # Validate inputs if target_name in extra_names: @@ -78,19 +91,11 @@ def __init__( # Set attributes self._table: Table = data - self._features: Table = data.keep_only_columns(feature_names) + self._features: Table = data.remove_columns_except(feature_names) self._target: Column = data.get_column(target_name) - self._extras: Table = data.keep_only_columns(extra_names) + self._extras: Table = data.remove_columns_except(extra_names) def __eq__(self, other: object) -> bool: - """ - Compare two tabular 
datasets. - - Returns - ------- - equals: - 'True' if features and targets are equal, 'False' otherwise. - """ if not isinstance(other, TabularDataset): return NotImplemented if self is other: @@ -98,27 +103,17 @@ def __eq__(self, other: object) -> bool: return self.target == other.target and self.features == other.features and self._extras == other._extras def __hash__(self) -> int: - """ - Return a deterministic hash value for this tabular dataset. - - Returns - ------- - hash: - The hash value. - """ return _structural_hash(self.target, self.features, self._extras) - def __sizeof__(self) -> int: - """ - Return the complete size of this object. + def __repr__(self) -> str: + return self._table.__repr__() - Returns - ------- - size: - Size of this object in bytes. - """ + def __sizeof__(self) -> int: return sys.getsizeof(self._target) + sys.getsizeof(self._features) + sys.getsizeof(self._extras) + def __str__(self) -> str: + return self._table.__str__() + # ------------------------------------------------------------------------------------------------------------------ # Properties # ------------------------------------------------------------------------------------------------------------------ @@ -148,17 +143,31 @@ def extras(self) -> Table: def to_table(self) -> Table: """ - Return a new `Table` containing the feature columns and the target column. - - The original `TabularDataset` is not modified. + Return a table containing all columns of the tabular dataset. Returns ------- table: - A table containing the feature columns and the target column. + A table containing all columns of the tabular dataset. 
""" return self._table + # ------------------------------------------------------------------------------------------------------------------ + # IPython integration + # ------------------------------------------------------------------------------------------------------------------ + + def _repr_html_(self) -> str: + """ + Return a compact HTML representation of the tabular dataset for IPython. + + Returns + ------- + html: + The generated HTML. + """ + return self._table._repr_html_() + + # TODO def _into_dataloader_with_classes(self, batch_size: int, num_of_classes: int) -> DataLoader: """ Return a Dataloader for the data stored in this table, used for training neural networks. @@ -184,8 +193,8 @@ def _into_dataloader_with_classes(self, batch_size: int, num_of_classes: int) -> if num_of_classes <= 2: return DataLoader( dataset=_create_dataset( - torch.Tensor(self.features._data.values).to(_get_device()), - torch.Tensor(self.target._data).to(_get_device()).unsqueeze(dim=-1), + torch.Tensor(self.features._data_frame.to_numpy()).to(_get_device()), + torch.Tensor(self.target._series.to_numpy()).to(_get_device()).unsqueeze(dim=-1), ), batch_size=batch_size, shuffle=True, @@ -194,9 +203,9 @@ def _into_dataloader_with_classes(self, batch_size: int, num_of_classes: int) -> else: return DataLoader( dataset=_create_dataset( - torch.Tensor(self.features._data.values).to(_get_device()), + torch.Tensor(self.features._data_frame.to_numpy()).to(_get_device()), torch.nn.functional.one_hot( - torch.LongTensor(self.target._data).to(_get_device()), + torch.LongTensor(self.target._series.to_numpy()).to(_get_device()), num_classes=num_of_classes, ), ), @@ -205,22 +214,7 @@ def _into_dataloader_with_classes(self, batch_size: int, num_of_classes: int) -> generator=torch.Generator(device=_get_device()), ) - # ------------------------------------------------------------------------------------------------------------------ - # IPython integration - # 
------------------------------------------------------------------------------------------------------------------ - - def _repr_html_(self) -> str: - """ - Return an HTML representation of the tabular dataset. - - Returns - ------- - output: - The generated HTML. - """ - return self._table._repr_html_() - - +# TODO def _create_dataset(features: Tensor, target: Tensor) -> Dataset: import torch from torch.utils.data import Dataset diff --git a/src/safeds/data/labeled/containers/_time_series_dataset.py b/src/safeds/data/labeled/containers/_time_series_dataset.py index 1603d21f6..b7b013c9a 100644 --- a/src/safeds/data/labeled/containers/_time_series_dataset.py +++ b/src/safeds/data/labeled/containers/_time_series_dataset.py @@ -1,20 +1,20 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from safeds._config import _init_default_device from safeds._utils import _structural_hash -from safeds.data.tabular.containers import Column, Table from safeds.exceptions import ClosedBound, OutOfBoundsError if TYPE_CHECKING: from collections.abc import Mapping, Sequence - from typing import Any import torch from torch.utils.data import DataLoader, Dataset + from safeds.data.tabular.containers import Column, Table + class TimeSeriesDataset: """ @@ -64,14 +64,17 @@ def __init__( time_name: str, extra_names: list[str] | None = None, ): + from safeds.data.tabular.containers import Table + # Preprocess inputs if not isinstance(data, Table): data = Table(data) if extra_names is None: extra_names = [] - # Derive feature names - feature_names = [name for name in data.column_names if name not in {target_name, *extra_names, time_name}] + # Derive feature names (build the set once, since comprehensions evaluate their condition every iteration) + non_feature_names = {target_name, time_name, *extra_names} + feature_names = [name for name in data.column_names if name not in non_feature_names] # Validate inputs if time_name in 
extra_names: @@ -83,10 +86,10 @@ def __init__( # Set attributes self._table: Table = data - self._features: Table = data.keep_only_columns(feature_names) + self._features: Table = data.remove_columns_except(feature_names) self._target: Column = data.get_column(target_name) self._time: Column = data.get_column(time_name) - self._extras: Table = data.keep_only_columns(extra_names) + self._extras: Table = data.remove_columns_except(extra_names) def __eq__(self, other: object) -> bool: """ @@ -211,7 +214,7 @@ def _into_dataloader_with_window(self, window_size: int, forecast_horizon: int, _init_default_device() - target_tensor = torch.tensor(self.target._data.values, dtype=torch.float32) + target_tensor = torch.tensor(self.target._series.to_numpy(), dtype=torch.float32) x_s = [] y_s = [] @@ -231,8 +234,8 @@ def _into_dataloader_with_window(self, window_size: int, forecast_horizon: int, window = target_tensor[i : i + window_size] label = target_tensor[i + window_size + forecast_horizon] for col in feature_cols: - data = torch.tensor(col._data.values, dtype=torch.float32) - window = torch.cat((window, data[i : i + window_size]), dim=0) + data = torch.tensor(col._series.to_numpy(), dtype=torch.float32) + window = torch.cat((window, data[i: i + window_size]), dim=0) x_s.append(window) y_s.append(label) x_s_tensor = torch.stack(x_s) @@ -276,7 +279,7 @@ def _into_dataloader_with_window_predict( _init_default_device() - target_tensor = torch.tensor(self.target._data.values, dtype=torch.float32) + target_tensor = self.target._series.to_torch() x_s = [] size = target_tensor.size(0) @@ -291,7 +294,7 @@ def _into_dataloader_with_window_predict( for i in range(size - (forecast_horizon + window_size)): window = target_tensor[i : i + window_size] for col in feature_cols: - data = torch.tensor(col._data.values, dtype=torch.float32) + data = torch.tensor(col._series.to_numpy(), dtype=torch.float32) window = torch.cat((window, data[i : i + window_size]), dim=-1) x_s.append(window) diff 
--git a/src/safeds/data/tabular/containers/__init__.py b/src/safeds/data/tabular/containers/__init__.py index 59968c086..5512b0b1e 100644 --- a/src/safeds/data/tabular/containers/__init__.py +++ b/src/safeds/data/tabular/containers/__init__.py @@ -5,33 +5,24 @@ import apipkg if TYPE_CHECKING: + from ._cell import Cell from ._column import Column - from ._experimental_cell import ExperimentalCell - from ._experimental_column import ExperimentalColumn - from ._experimental_row import ExperimentalRow - from ._experimental_table import ExperimentalTable from ._row import Row from ._table import Table apipkg.initpkg( __name__, { + "Cell": "._cell:Cell", "Column": "._column:Column", - "ExperimentalCell": "._experimental_cell:ExperimentalCell", - "ExperimentalColumn": "._experimental_column:ExperimentalColumn", - "ExperimentalRow": "._experimental_row:ExperimentalRow", - "ExperimentalTable": "._experimental_table:ExperimentalTable", "Row": "._row:Row", "Table": "._table:Table", }, ) __all__ = [ + "Cell", "Column", - "ExperimentalCell", - "ExperimentalColumn", - "ExperimentalRow", - "ExperimentalTable", "Row", "Table", ] diff --git a/src/safeds/data/tabular/containers/_experimental_cell.py b/src/safeds/data/tabular/containers/_cell.py similarity index 69% rename from src/safeds/data/tabular/containers/_experimental_cell.py rename to src/safeds/data/tabular/containers/_cell.py index 7c77046e5..79ff31327 100644 --- a/src/safeds/data/tabular/containers/_experimental_cell.py +++ b/src/safeds/data/tabular/containers/_cell.py @@ -11,7 +11,7 @@ R = TypeVar("R") -class ExperimentalCell(ABC, Generic[T]): +class Cell(ABC, Generic[T]): """ A single value in a table. @@ -25,106 +25,106 @@ class ExperimentalCell(ABC, Generic[T]): # "Boolean" operators (actually bitwise) ----------------------------------- @abstractmethod - def __invert__(self) -> ExperimentalCell[bool]: ... + def __invert__(self) -> Cell[bool]: ... 
@abstractmethod - def __and__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: ... + def __and__(self, other: bool | Cell[bool]) -> Cell[bool]: ... @abstractmethod - def __rand__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: ... + def __rand__(self, other: bool | Cell[bool]) -> Cell[bool]: ... @abstractmethod - def __or__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: ... + def __or__(self, other: bool | Cell[bool]) -> Cell[bool]: ... @abstractmethod - def __ror__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: ... + def __ror__(self, other: bool | Cell[bool]) -> Cell[bool]: ... @abstractmethod - def __xor__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: ... + def __xor__(self, other: bool | Cell[bool]) -> Cell[bool]: ... @abstractmethod - def __rxor__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: ... + def __rxor__(self, other: bool | Cell[bool]) -> Cell[bool]: ... # Comparison --------------------------------------------------------------- @abstractmethod - def __eq__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[override] + def __eq__(self, other: object) -> Cell[bool]: # type: ignore[override] ... @abstractmethod - def __ge__(self, other: Any) -> ExperimentalCell[bool]: ... + def __ge__(self, other: Any) -> Cell[bool]: ... @abstractmethod - def __gt__(self, other: Any) -> ExperimentalCell[bool]: ... + def __gt__(self, other: Any) -> Cell[bool]: ... @abstractmethod - def __le__(self, other: Any) -> ExperimentalCell[bool]: ... + def __le__(self, other: Any) -> Cell[bool]: ... @abstractmethod - def __lt__(self, other: Any) -> ExperimentalCell[bool]: ... + def __lt__(self, other: Any) -> Cell[bool]: ... @abstractmethod - def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[override] + def __ne__(self, other: object) -> Cell[bool]: # type: ignore[override] ... 
# Numeric operators -------------------------------------------------------- @abstractmethod - def __abs__(self) -> ExperimentalCell[R]: ... + def __abs__(self) -> Cell[R]: ... @abstractmethod - def __ceil__(self) -> ExperimentalCell[R]: ... + def __ceil__(self) -> Cell[R]: ... @abstractmethod - def __floor__(self) -> ExperimentalCell[R]: ... + def __floor__(self) -> Cell[R]: ... @abstractmethod - def __neg__(self) -> ExperimentalCell[R]: ... + def __neg__(self) -> Cell[R]: ... @abstractmethod - def __pos__(self) -> ExperimentalCell[R]: ... + def __pos__(self) -> Cell[R]: ... @abstractmethod - def __add__(self, other: Any) -> ExperimentalCell[R]: ... + def __add__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __radd__(self, other: Any) -> ExperimentalCell[R]: ... + def __radd__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __floordiv__(self, other: Any) -> ExperimentalCell[R]: ... + def __floordiv__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __rfloordiv__(self, other: Any) -> ExperimentalCell[R]: ... + def __rfloordiv__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __mod__(self, other: Any) -> ExperimentalCell[R]: ... + def __mod__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __rmod__(self, other: Any) -> ExperimentalCell[R]: ... + def __rmod__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __mul__(self, other: Any) -> ExperimentalCell[R]: ... + def __mul__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __rmul__(self, other: Any) -> ExperimentalCell[R]: ... + def __rmul__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __pow__(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: ... + def __pow__(self, other: float | Cell[P]) -> Cell[R]: ... @abstractmethod - def __rpow__(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: ... + def __rpow__(self, other: float | Cell[P]) -> Cell[R]: ... 
@abstractmethod - def __sub__(self, other: Any) -> ExperimentalCell[R]: ... + def __sub__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __rsub__(self, other: Any) -> ExperimentalCell[R]: ... + def __rsub__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __truediv__(self, other: Any) -> ExperimentalCell[R]: ... + def __truediv__(self, other: Any) -> Cell[R]: ... @abstractmethod - def __rtruediv__(self, other: Any) -> ExperimentalCell[R]: ... + def __rtruediv__(self, other: Any) -> Cell[R]: ... # Other -------------------------------------------------------------------- @@ -138,14 +138,14 @@ def __sizeof__(self) -> int: ... # Boolean operations # ------------------------------------------------------------------------------------------------------------------ - def not_(self) -> ExperimentalCell[bool]: + def not_(self) -> Cell[bool]: """ Negate a boolean. This is equivalent to the `~` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [True, False]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [True, False]) >>> column.transform(lambda cell: cell.not_()) +---------+ | example | @@ -168,14 +168,14 @@ def not_(self) -> ExperimentalCell[bool]: """ return self.__invert__() - def and_(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def and_(self, other: bool | Cell[bool]) -> Cell[bool]: """ Perform a boolean AND operation. This is equivalent to the `&` operator. 
Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [True, False]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [True, False]) >>> column.transform(lambda cell: cell.and_(False)) +---------+ | example | @@ -198,14 +198,14 @@ def and_(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: """ return self.__and__(other) - def or_(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def or_(self, other: bool | Cell[bool]) -> Cell[bool]: """ Perform a boolean OR operation. This is equivalent to the `|` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [True, False]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [True, False]) >>> column.transform(lambda cell: cell.or_(True)) +---------+ | example | @@ -228,14 +228,14 @@ def or_(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: """ return self.__or__(other) - def xor(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def xor(self, other: bool | Cell[bool]) -> Cell[bool]: """ Perform a boolean XOR operation. This is equivalent to the `^` operator. 
Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [True, False]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [True, False]) >>> column.transform(lambda cell: cell.xor(True)) +---------+ | example | @@ -262,14 +262,14 @@ def xor(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: # Numeric operations # ------------------------------------------------------------------------------------------------------------------ - def abs(self) -> ExperimentalCell[R]: + def abs(self) -> Cell[R]: """ Get the absolute value. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1, -2]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1, -2]) >>> column.transform(lambda cell: cell.abs()) +---------+ | example | @@ -282,14 +282,14 @@ def abs(self) -> ExperimentalCell[R]: """ return self.__abs__() - def ceil(self) -> ExperimentalCell[R]: + def ceil(self) -> Cell[R]: """ Round up to the nearest integer. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1.1, 2.9]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1.1, 2.9]) >>> column.transform(lambda cell: cell.ceil()) +---------+ | example | @@ -302,14 +302,14 @@ def ceil(self) -> ExperimentalCell[R]: """ return self.__ceil__() - def floor(self) -> ExperimentalCell[R]: + def floor(self) -> Cell[R]: """ Round down to the nearest integer. 
Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1.1, 2.9]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1.1, 2.9]) >>> column.transform(lambda cell: cell.floor()) +---------+ | example | @@ -322,14 +322,14 @@ def floor(self) -> ExperimentalCell[R]: """ return self.__floor__() - def neg(self) -> ExperimentalCell[R]: + def neg(self) -> Cell[R]: """ Negate the value. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1, -2]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1, -2]) >>> column.transform(lambda cell: cell.neg()) +---------+ | example | @@ -342,14 +342,14 @@ def neg(self) -> ExperimentalCell[R]: """ return self.__neg__() - def add(self, other: Any) -> ExperimentalCell[R]: + def add(self, other: Any) -> Cell[R]: """ Add a value. This is equivalent to the `+` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1, 2]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1, 2]) >>> column.transform(lambda cell: cell.add(3)) +---------+ | example | @@ -372,14 +372,14 @@ def add(self, other: Any) -> ExperimentalCell[R]: """ return self.__add__(other) - def mod(self, other: Any) -> ExperimentalCell[R]: + def mod(self, other: Any) -> Cell[R]: """ Perform a modulo operation. This is equivalent to the `%` operator. 
Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [5, 6]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [5, 6]) >>> column.transform(lambda cell: cell.mod(3)) +---------+ | example | @@ -402,14 +402,14 @@ def mod(self, other: Any) -> ExperimentalCell[R]: """ return self.__mod__(other) - def mul(self, other: Any) -> ExperimentalCell[R]: + def mul(self, other: Any) -> Cell[R]: """ Multiply by a value. This is equivalent to the `*` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [2, 3]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [2, 3]) >>> column.transform(lambda cell: cell.mul(4)) +---------+ | example | @@ -432,14 +432,14 @@ def mul(self, other: Any) -> ExperimentalCell[R]: """ return self.__mul__(other) - def pow(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: + def pow(self, other: float | Cell[P]) -> Cell[R]: """ Raise to a power. This is equivalent to the `**` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [2, 3]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [2, 3]) >>> column.transform(lambda cell: cell.pow(3)) +---------+ | example | @@ -462,14 +462,14 @@ def pow(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: """ return self.__pow__(other) - def sub(self, other: Any) -> ExperimentalCell[R]: + def sub(self, other: Any) -> Cell[R]: """ Subtract a value. This is equivalent to the `-` operator. 
Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [5, 6]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [5, 6]) >>> column.transform(lambda cell: cell.sub(3)) +---------+ | example | @@ -492,14 +492,14 @@ def sub(self, other: Any) -> ExperimentalCell[R]: """ return self.__sub__(other) - def div(self, other: Any) -> ExperimentalCell[R]: + def div(self, other: Any) -> Cell[R]: """ Divide by a value. This is equivalent to the `/` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [6, 8]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [6, 8]) >>> column.transform(lambda cell: cell.div(2)) +---------+ | example | @@ -526,14 +526,14 @@ def div(self, other: Any) -> ExperimentalCell[R]: # Comparison operations # ------------------------------------------------------------------------------------------------------------------ - def eq(self, other: Any) -> ExperimentalCell[bool]: + def eq(self, other: Any) -> Cell[bool]: """ Check if equal to a value. This is equivalent to the `==` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1, 2]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1, 2]) >>> column.transform(lambda cell: cell.eq(2)) +---------+ | example | @@ -556,14 +556,14 @@ def eq(self, other: Any) -> ExperimentalCell[bool]: """ return self.__eq__(other) - def ge(self, other: Any) -> ExperimentalCell[bool]: + def ge(self, other: Any) -> Cell[bool]: """ Check if greater than or equal to a value. This is equivalent to the `>=` operator. 
Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1, 2]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1, 2]) >>> column.transform(lambda cell: cell.ge(2)) +---------+ | example | @@ -586,14 +586,14 @@ def ge(self, other: Any) -> ExperimentalCell[bool]: """ return self.__ge__(other) - def gt(self, other: Any) -> ExperimentalCell[bool]: + def gt(self, other: Any) -> Cell[bool]: """ Check if greater than a value. This is equivalent to the `>` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1, 2]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1, 2]) >>> column.transform(lambda cell: cell.gt(2)) +---------+ | example | @@ -616,14 +616,14 @@ def gt(self, other: Any) -> ExperimentalCell[bool]: """ return self.__gt__(other) - def le(self, other: Any) -> ExperimentalCell[bool]: + def le(self, other: Any) -> Cell[bool]: """ Check if less than or equal to a value. This is equivalent to the `<=` operator. Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1, 2]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1, 2]) >>> column.transform(lambda cell: cell.le(2)) +---------+ | example | @@ -646,14 +646,14 @@ def le(self, other: Any) -> ExperimentalCell[bool]: """ return self.__le__(other) - def lt(self, other: Any) -> ExperimentalCell[bool]: + def lt(self, other: Any) -> Cell[bool]: """ Check if less than a value. This is equivalent to the `<` operator. 
Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("example", [1, 2]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("example", [1, 2]) >>> column.transform(lambda cell: cell.lt(2)) +---------+ | example | diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py index a71102a1d..07b13f99d 100644 --- a/src/safeds/data/tabular/containers/_column.py +++ b/src/safeds/data/tabular/containers/_column.py @@ -1,27 +1,23 @@ from __future__ import annotations -import io -import sys -from collections.abc import Sequence -from numbers import Number +from collections.abc import Callable, Iterator, Sequence from typing import TYPE_CHECKING, Any, TypeVar, overload from safeds._utils import _structural_hash -from safeds.data.image.containers import Image -from safeds.data.tabular.typing import ColumnType -from safeds.exceptions import ( - ColumnLengthMismatchError, - ColumnSizeError, - IndexOutOfBoundsError, - NonNumericColumnError, -) +from safeds.data.tabular.plotting import ColumnPlotter +from safeds.data.tabular.typing._polars_data_type import _PolarsDataType +from safeds.exceptions import IndexOutOfBoundsError + +from ._vectorized_cell import _VectorizedCell if TYPE_CHECKING: - from collections.abc import Callable, Iterator + from polars import Series + + from safeds.data.tabular.typing import DataType - import pandas as pd + from ._cell import Cell + from ._table import Table - from safeds.data.tabular.containers import Table T = TypeVar("T") R = TypeVar("R") @@ -29,54 +25,38 @@ class Column(Sequence[T]): """ - A column is a named collection of values. + A named, one-dimensional collection of homogeneous values. Parameters ---------- name: The name of the column. data: - The data. + The data of the column. If None, an empty column is created. 
Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) + >>> Column("test", [1, 2, 3]) + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 2 | + | 3 | + +------+ """ # ------------------------------------------------------------------------------------------------------------------ - # Creation + # Import # ------------------------------------------------------------------------------------------------------------------ @staticmethod - def _from_pandas_series(data: pd.Series, type_: ColumnType | None = None) -> Column: - """ - Create a column from a `pandas.Series`. - - Parameters - ---------- - data: - The data. - type_: - The type. If None, the type is inferred from the data. - - Returns - ------- - column: - The created column. - - Examples - -------- - >>> import pandas as pd - >>> from safeds.data.tabular.containers import Column - >>> column = Column._from_pandas_series(pd.Series([1, 2, 3], name="test")) - """ + def _from_polars_series(data: Series) -> Column: result = object.__new__(Column) - result._name = data.name - result._data = data - # noinspection PyProtectedMember - result._type = type_ if type_ is not None else ColumnType._data_type(data) - + result._series = data return result # ------------------------------------------------------------------------------------------------------------------ @@ -84,69 +64,22 @@ def _from_pandas_series(data: pd.Series, type_: ColumnType | None = None) -> Col # ------------------------------------------------------------------------------------------------------------------ def __init__(self, name: str, data: Sequence[T] | None = None) -> None: - """ - Create a column. - - Parameters - ---------- - name: - The name of the column. - data: - The data. If None, an empty column is created. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) - """ - import pandas as pd - - # Enable copy-on-write for pandas dataframes - pd.options.mode.copy_on_write = True + import polars as pl if data is None: data = [] - self._name: str = name - self._data: pd.Series = data.rename(name) if isinstance(data, pd.Series) else pd.Series(data, name=name) - # noinspection PyProtectedMember - self._type: ColumnType = ColumnType._data_type(self._data) + self._series: pl.Series = pl.Series(name, data) def __contains__(self, item: Any) -> bool: - return item in self._data + return self._series.__contains__(item) def __eq__(self, other: object) -> bool: - """ - Check whether this column is equal to another object. - - Parameters - ---------- - other: - The other object. - - Returns - ------- - equal: - True if the other object is an identical column. False if the other object is a different column. - NotImplemented if the other object is not a column. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column1 = Column("test", [1, 2, 3]) - >>> column2 = Column("test", [1, 2, 3]) - >>> column1 == column2 - True - - >>> column3 = Column("test", [3, 4, 5]) - >>> column1 == column3 - False - """ if not isinstance(other, Column): return NotImplemented if self is other: return True - return self.name == other.name and self._data.equals(other._data) + return self._series.equals(other._series) @overload def __getitem__(self, index: int) -> T: ... @@ -155,238 +88,106 @@ def __getitem__(self, index: int) -> T: ... def __getitem__(self, index: slice) -> Column[T]: ... def __getitem__(self, index: int | slice) -> T | Column[T]: - """ - Return the value of the specified row or rows. - - Parameters - ---------- - index: - The index of the row, or a slice specifying the start and end index. - - Returns - ------- - value: - The single row's value, or rows' values. 
- - Raises - ------ - IndexOutOfBoundsError - If the given index or indices do not exist in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) - >>> column[0] - 1 - """ if isinstance(index, int): - if index < 0 or index >= self._data.size: - raise IndexOutOfBoundsError(index) - return self._data[index] - - if isinstance(index, slice): - if index.start < 0 or index.start > self._data.size: - raise IndexOutOfBoundsError(index) - if index.stop < 0 or index.stop > self._data.size: - raise IndexOutOfBoundsError(index) - data = self._data[index].reset_index(drop=True).rename(self.name) - return Column._from_pandas_series(data, self._type) + return self.get_value(index) + else: + return self._from_polars_series(self._series.__getitem__(index)) def __hash__(self) -> int: - """ - Return a deterministic hash value for this column. - - Returns - ------- - hash: - The hash value. - """ - return _structural_hash(self.name, self.type.__repr__(), self.number_of_rows) + return _structural_hash( + self.name, + self.type.__repr__(), + self.number_of_rows, + ) def __iter__(self) -> Iterator[T]: - r""" - Create an iterator for the data of this column. This way e.g. for-each loops can be used on it. - - Returns - ------- - iterator: - The iterator. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", ["A", "B", "C"]) - >>> string = "" - >>> for val in column: - ... string += val + ", " - >>> string - 'A, B, C, ' - """ - return iter(self._data) + return self._series.__iter__() def __len__(self) -> int: - """ - Return the size of the column. - - Returns - ------- - n_rows: - The size of the column. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) - >>> len(column) - 3 - """ - return len(self._data) + return self.number_of_rows def __repr__(self) -> str: - """ - Return an unambiguous string representation of this column. - - Returns - ------- - representation: - The string representation. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) - >>> repr(column) - "Column('test', [1, 2, 3])" - """ - return f"Column({self._name!r}, {list(self._data)!r})" + return self.to_table().__repr__() def __sizeof__(self) -> int: - """ - Return the complete size of this object. - - Returns - ------- - size: - Size of this object in bytes. - """ - return sys.getsizeof(self._data) + sys.getsizeof(self._name) + sys.getsizeof(self._type) + return self._series.estimated_size() def __str__(self) -> str: - """ - Return a user-friendly string representation of this column. - - Returns - ------- - representation: - The string representation. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) - >>> str(column) - "'test': [1, 2, 3]" - """ - return f"{self._name!r}: {list(self._data)!r}" + return self.to_table().__str__() # ------------------------------------------------------------------------------------------------------------------ # Properties # ------------------------------------------------------------------------------------------------------------------ @property - def name(self) -> str: - """ - Return the name of the column. + def is_numeric(self) -> bool: + """Whether the column is numeric.""" + return self._series.dtype.is_numeric() - Returns - ------- - name: - The name of the column. 
+ @property + def is_temporal(self) -> bool: + """Whether the column is temporal.""" + return self._series.dtype.is_temporal() - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) - >>> column.name - 'test' - """ - return self._name + @property + def name(self) -> str: + """The name of the column.""" + return self._series.name @property def number_of_rows(self) -> int: - """ - Return the number of elements in the column. - - Returns - ------- - number_of_rows: - The number of elements. - """ - return len(self._data) + """The number of rows in the column.""" + return self._series.len() @property - def type(self) -> ColumnType: - """ - Return the type of the column. - - Returns - ------- - type: - The type of the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) - >>> column.type - Integer + def plot(self) -> ColumnPlotter: + """The plotter for the column.""" + return ColumnPlotter(self) - >>> column = Column("test", ['a', 'b', 'c']) - >>> column.type - String - """ - return self._type + @property + def type(self) -> DataType: + """The type of the column.""" + return _PolarsDataType(self._series.dtype) # ------------------------------------------------------------------------------------------------------------------ - # Getters + # Value operations # ------------------------------------------------------------------------------------------------------------------ - def get_unique_values(self) -> list[T]: + def get_distinct_values(self) -> list[T]: """ - Return a list of all unique values in the column. + Return the distinct values in the column. Returns ------- - unique_values: - List of unique values in the column. + distinct_values: + The distinct values in the column. 
Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3, 2, 4, 3]) - >>> column.get_unique_values() - [1, 2, 3, 4] + >>> column = Column("test", [1, 2, 3, 2]) + >>> column.get_distinct_values() + [1, 2, 3] """ - return list(self._data.unique()) + return self._series.unique().sort().to_list() def get_value(self, index: int) -> T: """ - Return column value at specified index, starting at 0. + Return the column value at specified index. Indexing starts at 0. Parameters ---------- index: - Index of requested element. + Index of requested value. Returns ------- value: - Value at index in column. + Value at index. Raises ------ - IndexOutOfBoundsError + IndexError If the given index does not exist in the column. Examples @@ -396,123 +197,168 @@ def get_value(self, index: int) -> T: >>> column.get_value(1) 2 """ - if index < 0 or index >= self._data.size: + if index < 0 or index >= self.number_of_rows: raise IndexOutOfBoundsError(index) - return self._data[index] + return self._series.__getitem__(index) # ------------------------------------------------------------------------------------------------------------------ - # Information + # Reductions # ------------------------------------------------------------------------------------------------------------------ - def all(self, predicate: Callable[[T], bool]) -> bool: + def all(self, predicate: Callable[[Cell[T]], Cell[bool]]) -> bool: """ - Check if all values have a given property. + Return whether all values in the column satisfy the predicate. Parameters ---------- predicate: - Callable that is used to find matches. + The predicate to apply to each value. Returns ------- - result: - True if all match. + all_satisfy_predicate: + Whether all values in the column satisfy the predicate. + + Raises + ------ + TypeError + If the predicate does not return a boolean cell. 
Examples -------- >>> from safeds.data.tabular.containers import Column >>> column = Column("test", [1, 2, 3]) - >>> column.all(lambda x: x < 4) + >>> column.all(lambda cell: cell > 0) True - >>> column.all(lambda x: x < 2) + >>> column.all(lambda cell: cell < 3) False """ - return all(predicate(value) for value in self._data) + import polars as pl + + result = predicate(_VectorizedCell(self)) + if not isinstance(result, _VectorizedCell) or not result._series.dtype.is_(pl.Boolean): + raise TypeError("The predicate must return a boolean cell.") + + return result._series.all() - def any(self, predicate: Callable[[T], bool]) -> bool: + def any(self, predicate: Callable[[Cell[T]], Cell[bool]]) -> bool: """ - Check if any value has a given property. + Return whether any value in the column satisfies the predicate. Parameters ---------- predicate: - Callable that is used to find matches. + The predicate to apply to each value. Returns ------- - result: - True if any match. + any_satisfy_predicate: + Whether any value in the column satisfies the predicate. + + Raises + ------ + TypeError + If the predicate does not return a boolean cell. Examples -------- >>> from safeds.data.tabular.containers import Column >>> column = Column("test", [1, 2, 3]) - >>> column.any(lambda x: x < 2) + >>> column.any(lambda cell: cell > 2) True - >>> column.any(lambda x: x < 1) + >>> column.any(lambda cell: cell < 0) False """ - return any(predicate(value) for value in self._data) + import polars as pl + + result = predicate(_VectorizedCell(self)) + if not isinstance(result, _VectorizedCell) or not result._series.dtype.is_(pl.Boolean): + raise TypeError("The predicate must return a boolean cell.") + + return result._series.any() - def none(self, predicate: Callable[[T], bool]) -> bool: + def count(self, predicate: Callable[[Cell[T]], Cell[bool]]) -> int: """ - Check if no values has a given property. + Return how many values in the column satisfy the predicate. 
Parameters ---------- predicate: - Callable that is used to find matches. + The predicate to apply to each value. Returns ------- - result: - True if none match. + count: + The number of values in the column that satisfy the predicate. + + Raises + ------ + TypeError + If the predicate does not return a boolean cell. Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column1 = Column("test", [1, 2, 3]) - >>> column1.none(lambda x: x < 1) - True + >>> column = Column("test", [1, 2, 3]) + >>> column.count(lambda cell: cell > 1) + 2 - >>> column2 = Column("test", [1, 2, 3]) - >>> column2.none(lambda x: x > 1) - False + >>> column.count(lambda cell: cell < 0) + 0 """ - return all(not predicate(value) for value in self._data) + import polars as pl + + result = predicate(_VectorizedCell(self)) + if not isinstance(result, _VectorizedCell) or not result._series.dtype.is_(pl.Boolean): + raise TypeError("The predicate must return a boolean cell.") - def has_missing_values(self) -> bool: + return result._series.sum() + + def none(self, predicate: Callable[[Cell[T]], Cell[bool]]) -> bool: """ - Return whether the column has missing values. + Return whether no value in the column satisfies the predicate. + + Parameters + ---------- + predicate: + The predicate to apply to each value. Returns ------- - missing_values_exist: - True if missing values exist. + none_satisfy_predicate: + Whether no value in the column satisfies the predicate. + + Raises + ------ + TypeError + If the predicate does not return a boolean cell. 
Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column1 = Column("test", [1, 2, 3, None]) - >>> column1.has_missing_values() + >>> column = Column("test", [1, 2, 3]) + >>> column.none(lambda cell: cell < 0) True - >>> column2 = Column("test", [1, 2, 3]) - >>> column2.has_missing_values() + >>> column.none(lambda cell: cell > 2) False """ - import numpy as np + import polars as pl + + result = predicate(_VectorizedCell(self)) + if not isinstance(result, _VectorizedCell) or not result._series.dtype.is_(pl.Boolean): + raise TypeError("The predicate must return a boolean cell.") - return self.any(lambda value: value is None or (isinstance(value, Number) and np.isnan(value))) + return (~result._series).all() # ------------------------------------------------------------------------------------------------------------------ # Transformations # ------------------------------------------------------------------------------------------------------------------ - def rename(self, new_name: str) -> Column: + def rename(self, new_name: str) -> Column[T]: """ Return a new column with a new name. @@ -525,41 +371,65 @@ def rename(self, new_name: str) -> Column: Returns ------- - column: + renamed_column: A new column with the new name. Examples -------- >>> from safeds.data.tabular.containers import Column >>> column = Column("test", [1, 2, 3]) - >>> column.rename("new name") - Column('new name', [1, 2, 3]) - """ - return Column._from_pandas_series(self._data.rename(new_name), self._type) - - def transform(self, transformer: Callable[[T], R]) -> Column[R]: - """ - Apply a transform method to every data point. 
+ >>> column.rename("new_name") + +----------+ + | new_name | + | --- | + | i64 | + +==========+ + | 1 | + | 2 | + | 3 | + +----------+ + """ + return self._from_polars_series(self._series.rename(new_name)) + + def transform( + self, + transformer: Callable[[Cell[T]], Cell[R]], + ) -> Column[R]: + """ + Return a new column with values transformed by the transformer. The original column is not modified. Parameters ---------- transformer: - Function that will be applied to all data points. + The transformer to apply to each value. Returns ------- transformed_column: - The transformed column. + A new column with transformed values. Examples -------- >>> from safeds.data.tabular.containers import Column - >>> price = Column("price", [4.99, 5.99, 2.49]) - >>> sale = price.transform(lambda amount: amount * 0.8) - """ - return Column(self.name, self._data.apply(transformer, convert_dtype=True)) + >>> column = Column("test", [1, 2, 3]) + >>> column.transform(lambda cell: 2 * cell) + +------+ + | test | + | --- | + | i64 | + +======+ + | 2 | + | 4 | + | 6 | + +------+ + """ + result = transformer(_VectorizedCell(self)) + if not isinstance(result, _VectorizedCell): + raise TypeError("The transformer must return a cell.") + + return self._from_polars_series(result._series) # ------------------------------------------------------------------------------------------------------------------ # Statistics @@ -567,9 +437,7 @@ def transform(self, transformer: Callable[[T], R]) -> Column[R]: def summarize_statistics(self) -> Table: """ - Return a table with a number of statistical key values. - - The original Column is not modified. + Return a table with important statistics about the column. 
Returns ------- @@ -581,38 +449,75 @@ def summarize_statistics(self) -> Table: >>> from safeds.data.tabular.containers import Column >>> column = Column("a", [1, 3]) >>> column.summarize_statistics() - metric a - 0 minimum 1 - 1 maximum 3 - 2 mean 2.0 - 3 mode [1, 3] - 4 median 2.0 - 5 variance 2.0 - 6 standard deviation 1.4142135623730951 - 7 missing value count 0 - 8 missing value ratio 0.0 - 9 idness 1.0 - 10 stability 0.5 - """ - from safeds.data.tabular.containers import Table + +----------------------+--------------------+ + | metric | a | + | --- | --- | + | str | str | + +===========================================+ + | min | 1 | + | max | 3 | + | mean | 2.0 | + | median | 2.0 | + | standard deviation | 1.4142135623730951 | + | distinct value count | 2 | + | idness | 1.0 | + | missing value ratio | 0.0 | + | stability | 0.5 | + +----------------------+--------------------+ + """ + from ._table import Table + + # TODO: turn this around (call table method, implement in table; allows parallelization) + mean = self.mean() or "-" + median = self.median() or "-" + standard_deviation = self.standard_deviation() or "-" + + return Table( + { + "metric": [ + "min", + "max", + "mean", + "median", + "standard deviation", + "distinct value count", + "idness", + "missing value ratio", + "stability", + ], + self.name: [ + str(self.min()), + str(self.max()), + str(mean), + str(median), + str(standard_deviation), + str(self.distinct_value_count()), + str(self.idness()), + str(self.missing_value_ratio()), + str(self.stability()), + ], + }, + ) + + def correlation_with(self, other: Column) -> float: + """ + Calculate the Pearson correlation between this column and another column. + + The Pearson correlation is a value between -1 and 1 that indicates how much the two columns are linearly related: + + - A correlation of -1 indicates a perfect negative linear relationship. + - A correlation of 0 indicates no linear relationship. 
+ - A correlation of 1 indicates a perfect positive linear relationship. - return Table({self._name: self._data}).summarize_statistics() - - def correlation_with(self, other_column: Column) -> float: - """ - Calculate Pearson correlation between this and another column. Both columns have to be numerical. + Parameters + ---------- + other: + The other column to calculate the correlation with. Returns ------- correlation: - Correlation between the two columns. - - Raises - ------ - NonNumericColumnError - If one of the columns is not numerical. - ColumnLengthMismatchError - If the columns have different lengths. + The Pearson correlation between the two columns. Examples -------- @@ -622,41 +527,47 @@ def correlation_with(self, other_column: Column) -> float: >>> column1.correlation_with(column2) 1.0 - >>> column1 = Column("test", [1, 2, 3]) - >>> column2 = Column("test", [0.5, 4, -6]) - >>> column1.correlation_with(column2) - -0.6404640308067906 + >>> column4 = Column("test", [3, 2, 1]) + >>> column1.correlation_with(column4) + -1.0 + """ + import polars as pl + + return pl.DataFrame({"a": self._series, "b": other._series}).corr().item(row=1, column="a") + + def distinct_value_count(self) -> int: + """ + Return the number of distinct values in the column. + + Returns + ------- + distinct_value_count: + The number of distinct values in the column. + + Examples + -------- + >>> from safeds.data.tabular.containers import Column + >>> column = Column("test", [1, 2, 3, 2]) + >>> column.distinct_value_count() + 3 """ - if not self._type.is_numeric() or not other_column._type.is_numeric(): - raise NonNumericColumnError( - f"Columns must be numerical. 
{self.name} is {self._type}, {other_column.name} is {other_column._type}.", - ) - if self._data.size != other_column._data.size: - raise ColumnLengthMismatchError( - f"{self.name} is of size {self._data.size}, {other_column.name} is of size {other_column._data.size}.", - ) - return self._data.corr(other_column._data) + return self._series.n_unique() def idness(self) -> float: - r""" + """ Calculate the idness of this column. - We define the idness as follows: + We define the idness as the number of distinct values divided by the number of rows. If the column is empty, + the idness is 1.0. - $$ - \frac{\text{number of different values}}{\text{number of rows}} - $$ + A high idness indicates that the column most values in the column are unique. In this case, you must be careful + when using the column for analysis, as a model may learn a mapping from this column to the target. Returns ------- idness: The idness of the column. - Raises - ------ - ColumnSizeError - If this column is empty. - Examples -------- >>> from safeds.data.tabular.containers import Column @@ -668,48 +579,39 @@ def idness(self) -> float: >>> column2.idness() 0.75 """ - if self._data.size == 0: - raise ColumnSizeError("> 0", "0") - return self._data.nunique() / self._data.size + if self.number_of_rows == 0: + return 1.0 # All values are unique (since there are none) + + return self.distinct_value_count() / self.number_of_rows - def maximum(self) -> float: + def max(self) -> T: """ - Return the maximum value of the column. The column has to be numerical. + Return the maximum value in the column. Returns ------- max: - The maximum value. - - Raises - ------ - NonNumericColumnError - If the data contains non-numerical data. + The maximum value in the column. 
Examples -------- >>> from safeds.data.tabular.containers import Column >>> column = Column("test", [1, 2, 3]) - >>> column.maximum() + >>> column.max() 3 """ - if not self._type.is_numeric(): - raise NonNumericColumnError(f"{self.name} is of type {self._type}.") - return self._data.max() + return self._series.max() - def mean(self) -> float: + def mean(self) -> T: """ - Return the mean value of the column. The column has to be numerical. + Return the mean of the values in the column. + + The mean is the sum of the values divided by the number of values. Returns ------- mean: - The mean value. - - Raises - ------ - NonNumericColumnError - If the data contains non-numerical data. + The mean of the values in the column. Examples -------- @@ -718,64 +620,46 @@ def mean(self) -> float: >>> column.mean() 2.0 """ - if not self._type.is_numeric(): - raise NonNumericColumnError(f"{self.name} is of type {self._type}.") - return self._data.mean() + return self._series.mean() - def median(self) -> float: + def median(self) -> T: """ - Return the median value of the column. The column has to be numerical. + Return the median of the values in the column. + + The median is the value in the middle of the sorted list of values. If the number of values is even, the median + is the mean of the two middle values. Returns ------- median: - The median value. - - Raises - ------ - NonNumericColumnError - If the data contains non-numerical data. + The median of the values in the column. 
Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3, 4]) - >>> column.median() - 2.5 - - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3, 4, 5]) + >>> column = Column("test", [1, 2, 3]) >>> column.median() - 3.0 + 2.0 """ - if not self._type.is_numeric(): - raise NonNumericColumnError(f"{self.name} is of type {self._type}.") - return self._data.median() + return self._series.median() - def minimum(self) -> float: + def min(self) -> T: """ - Return the minimum value of the column. The column has to be numerical. + Return the minimum value in the column. Returns ------- min: - The minimum value. - - Raises - ------ - NonNumericColumnError - If the data contains non-numerical data. + The minimum value in the column. Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3, 4]) - >>> column.minimum() + >>> column = Column("test", [1, 2, 3]) + >>> column.min() 1 """ - if not self._type.is_numeric(): - raise NonNumericColumnError(f"{self.name} is of type {self._type}.") - return self._data.min() + return self._series.min() def missing_value_count(self) -> int: """ @@ -783,329 +667,169 @@ def missing_value_count(self) -> int: Returns ------- - count: - The number of missing values. + missing_value_count: + The number of missing values in the column. Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column = Column("col_1", [None, 'a', None]) + >>> column = Column("test", [1, None, 3]) >>> column.missing_value_count() - 2 + 1 """ - return self._data.isna().sum() + return self._series.null_count() def missing_value_ratio(self) -> float: """ - Return the ratio of missing values to the total number of elements in the column. + Return the missing value ratio. + + We define the missing value ratio as the number of missing values in the column divided by the number of rows. 
+ If the column is empty, the missing value ratio is 1.0. + + A high missing value ratio indicates that the column is dominated by missing values. In this case, the column + may not be useful for analysis. Returns ------- - ratio: - The ratio of missing values to the total number of elements in the column. - - Raises - ------ - ColumnSizeError - If the column is empty. + missing_value_ratio: + The ratio of missing values in the column. + """ + if self.number_of_rows == 0: + return 1.0 # All values are missing (since there are none) - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column1 = Column("test", [1, 2, 3, 4]) - >>> column1.missing_value_ratio() - 0.0 + return self._series.null_count() / self.number_of_rows - >>> column2 = Column("test", [1, 2, 3, None]) - >>> column2.missing_value_ratio() - 0.25 + def mode(self) -> Column[T]: """ - if self._data.size == 0: - raise ColumnSizeError("> 0", "0") - return self.missing_value_count() / self._data.size + Return the mode of the values in the column. - def mode(self) -> list[T]: - """ - Return the mode of the column. + The mode is the value that appears most frequently in the column. If multiple values occur equally often, all + of them are returned. The values are sorted in ascending order. Returns ------- mode: - Returns a list with the most common values. + The mode of the values in the column. Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column1 = Column("test", [1, 2, 3, 3, 4]) - >>> column1.mode() - [3] - - >>> column2 = Column("test", [1, 2, 3, 3, 4, 4]) - >>> column2.mode() - [3, 4] + >>> column = Column("test", [3, 1, 2, 1, 3]) + >>> column.mode() + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 3 | + +------+ """ - return self._data.mode().tolist() + return self._from_polars_series(self._series.mode().sort()) def stability(self) -> float: - r""" - Calculate the stability of this column. 
- - We define the stability as follows: + """ + Return the stability of the column. - $$ - \frac{\text{number of occurrences of most common non-null value}}{\text{number of non-null values}} - $$ + We define the stability as the number of occurrences of the most common non-missing value divided by the total + number of non-missing values. If the column is empty or all values are missing, the stability is 1.0. - The stability is not definded for a column with only null values. + A high stability indicates that the column is dominated by a single value. In this case, the column may not be + useful for analysis. Returns ------- stability: The stability of the column. - Raises - ------ - ColumnSizeError - If the column is empty. - Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column1 = Column("test", [1, 1, 2, 3]) - >>> column1.stability() + >>> column = Column("test", [1, 1, 2, 3, None]) + >>> column.stability() 0.5 - - >>> column2 = Column("test", [1, 2, 2, 2, 3]) - >>> column2.stability() - 0.6 """ - if self._data.size == 0: - raise ColumnSizeError("> 0", "0") + non_missing = self._series.drop_nulls() + if non_missing.len() == 0: + return 1.0 # All non-null values are the same (since there is are none) - if self.all(lambda x: x is None): - raise ValueError("Stability is not definded for a column with only null values.") + mode_count = non_missing.unique_counts().max() - return self._data.value_counts()[self.mode()[0]] / self._data.count() + return mode_count / non_missing.len() - def standard_deviation(self) -> float: + def standard_deviation(self) -> float | None: """ - Return the standard deviation of the column. The column has to be numerical. + Return the standard deviation of the values in the column. + + The standard deviation is the square root of the variance. Returns ------- - sum: - The standard deviation of all values. - - Raises - ------ - NonNumericColumnError - If the data contains non-numerical data. 
+ standard_deviation: + The standard deviation of the values in the column. If no standard deviation can be calculated due to the + type of the column, None is returned. Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column1 = Column("test", [1, 2, 3]) - >>> column1.standard_deviation() + >>> column = Column("test", [1, 2, 3]) + >>> column.standard_deviation() 1.0 - - >>> column2 = Column("test", [1, 2, 4, 8, 16]) - >>> column2.standard_deviation() - 6.099180272790763 """ - if not self.type.is_numeric(): - raise NonNumericColumnError(f"{self.name} is of type {self._type}.") - return self._data.std() + from polars.exceptions import InvalidOperationError + + try: + return self._series.std() + except InvalidOperationError: + return None - def variance(self) -> float: + def variance(self) -> float | None: """ - Return the variance of the column. The column has to be numerical. + Return the variance of the values in the column. + + The variance is the average of the squared differences from the mean. Returns ------- - sum: - The variance of all values. - - Raises - ------ - NonNumericColumnError - If the data contains non-numerical data. + variance: + The variance of the values in the column. If no variance can be calculated due to the type of the column, + None is returned. 
Examples -------- >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3, 4, 5]) + >>> column = Column("test", [1, 2, 3]) >>> column.variance() - 2.5 + 1.0 """ - if not self.type.is_numeric(): - raise NonNumericColumnError(f"{self.name} is of type {self._type}.") + from polars.exceptions import InvalidOperationError - return self._data.var() + try: + return self._series.var() + except InvalidOperationError: + return None # ------------------------------------------------------------------------------------------------------------------ - # Plotting + # Export # ------------------------------------------------------------------------------------------------------------------ - def plot_boxplot(self) -> Image: - """ - Plot this column in a boxplot. This function can only plot real numerical data. - - Returns - ------- - plot: - The plot as an image. - - Raises - ------ - NonNumericColumnError - If the data contains non-numerical data. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("test", [1, 2, 3]) - >>> boxplot = column.plot_boxplot() + def to_list(self) -> list[T]: """ - import matplotlib.pyplot as plt - import seaborn as sns - - if not self.type.is_numeric(): - raise NonNumericColumnError(f"{self.name} is of type {self._type}.") - - fig = plt.figure() - ax = sns.boxplot(data=self._data) - ax.set(title=self.name) - ax.set_xticks([]) - ax.set_ylabel("") - plt.tight_layout() - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) - - def plot_histogram(self, *, number_of_bins: int = 10) -> Image: - """ - Plot a column in a histogram. - - Parameters - ---------- - number_of_bins: - The number of bins to use in the histogram. Default is 10. + Return the values of the column in a list. Returns ------- - plot: - The plot as an image. 
+ values: + The values of the column in a list. Examples -------- >>> from safeds.data.tabular.containers import Column >>> column = Column("test", [1, 2, 3]) - >>> histogram = column.plot_histogram() - """ - from safeds.data.tabular.containers import Table - - return Table({self._name: self._data}).plot_histograms(number_of_bins=number_of_bins) - - def plot_compare_columns(self, column_list: list[Column]) -> Image: - """ - Create a plot comparing the numerical values of columns using IDs as the x-axis. - - Parameters - ---------- - column_list: - A list of time columns to be plotted. - - Returns - ------- - plot: - A plot with all the Columns plotted by the ID on the x-axis. - - Raises - ------ - NonNumericColumnError - if the target column contains non numerical values - ValueError - if the columns do not have the same size - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> col1 =Column("target", [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) - >>> col2 =Column("target", [42, 51, 63, 71, 83, 91, 10, 11, 12, 13]) - >>> image = col1.plot_compare_columns([col2]) - """ - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - data = pd.DataFrame() - column_list.append(self) - size = len(column_list[0]) - data["INDEX"] = pd.DataFrame({"INDEX": range(size)}) - for index, col in enumerate(column_list): - if not col.type.is_numeric(): - raise NonNumericColumnError("The time series plotted column contains non-numerical columns.") - if len(col) != size: - raise ValueError("The columns must have the same size.") - data[col.name + " " + str(index)] = col._data - - fig = plt.figure() - data = pd.melt(data, ["INDEX"]) - sns.lineplot(x="INDEX", y="value", hue="variable", data=data) - plt.title("Multiple Series Plot") - plt.xlabel("Time") - - plt.tight_layout() - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return 
Image.from_bytes(buffer.read()) - - def plot_lagplot(self, lag: int) -> Image: - """ - Plot a lagplot for the given column. - - Parameters - ---------- - lag: - The amount of lag used to plot - - Returns - ------- - plot: - The plot as an image. - - Raises - ------ - NonNumericColumnError - If the column contains non-numerical values. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Column("values", [1,2,3,4,3,2]) - >>> image = table.plot_lagplot(2) + >>> column.to_list() + [1, 2, 3] """ - import matplotlib.pyplot as plt - import pandas as pd - - if not self.type.is_numeric(): - raise NonNumericColumnError("This time series target contains non-numerical columns.") - ax = pd.plotting.lag_plot(self._data, lag=lag) - fig = ax.figure - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) - - # ------------------------------------------------------------------------------------------------------------------ - # Conversion - # ------------------------------------------------------------------------------------------------------------------ + return self._series.to_list() def to_table(self) -> Table: """ @@ -1115,31 +839,25 @@ def to_table(self) -> Table: ------- table: The table with this column. - """ - # Must be imported here to avoid circular imports - from safeds.data.tabular.containers import Table - - return Table.from_columns([self]) - - def to_html(self) -> str: - """ - Return an HTML representation of the column. - - Returns - ------- - output: - The generated HTML. 
Examples -------- >>> from safeds.data.tabular.containers import Column >>> column = Column("test", [1, 2, 3]) - >>> html = column.to_html() + >>> column.to_table() + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 2 | + | 3 | + +------+ """ - frame = self._data.to_frame() - frame.columns = [self.name] + from ._table import Table - return frame.to_html(max_rows=self._data.size, max_cols=1) + return Table._from_polars_data_frame(self._series.to_frame()) # ------------------------------------------------------------------------------------------------------------------ # IPython integration @@ -1147,20 +865,13 @@ def to_html(self) -> str: def _repr_html_(self) -> str: """ - Return an HTML representation of the column. + Return a compact HTML representation of the column for IPython. + + Note that this operation must fully load the data into memory, which can be expensive. Returns ------- - output: + html: The generated HTML. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column - >>> column = Column("col_1", ['a', 'b', 'c']) - >>> html = column._repr_html_() """ - frame = self._data.to_frame() - frame.columns = [self.name] - - return frame.to_html(max_rows=self._data.size, max_cols=1, notebook=True) + return self._series._repr_html_() diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py deleted file mode 100644 index f54526465..000000000 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ /dev/null @@ -1,897 +0,0 @@ -from __future__ import annotations - -from collections.abc import Callable, Iterator, Sequence -from typing import TYPE_CHECKING, Any, TypeVar, overload - -from safeds._utils import _structural_hash -from safeds.data.tabular.plotting._experimental_column_plotter import ExperimentalColumnPlotter -from safeds.data.tabular.typing._experimental_polars_data_type import _PolarsDataType -from safeds.exceptions import 
IndexOutOfBoundsError - -from ._column import Column -from ._experimental_vectorized_cell import _VectorizedCell - -if TYPE_CHECKING: - from polars import Series - - from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType - - from ._experimental_cell import ExperimentalCell - from ._experimental_table import ExperimentalTable - - -T = TypeVar("T") -R = TypeVar("R") - - -class ExperimentalColumn(Sequence[T]): - """ - A named, one-dimensional collection of homogeneous values. - - Parameters - ---------- - name: - The name of the column. - data: - The data of the column. If None, an empty column is created. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> ExperimentalColumn("test", [1, 2, 3]) - +------+ - | test | - | --- | - | i64 | - +======+ - | 1 | - | 2 | - | 3 | - +------+ - """ - - # ------------------------------------------------------------------------------------------------------------------ - # Import - # ------------------------------------------------------------------------------------------------------------------ - - @staticmethod - def _from_polars_series(data: Series) -> ExperimentalColumn: - result = object.__new__(ExperimentalColumn) - result._series = data - return result - - # ------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - def __init__(self, name: str, data: Sequence[T] | None = None) -> None: - import polars as pl - - if data is None: - data = [] - - self._series: pl.Series = pl.Series(name, data) - - def __contains__(self, item: Any) -> bool: - return self._series.__contains__(item) - - def __eq__(self, other: object) -> bool: - if not isinstance(other, ExperimentalColumn): - return NotImplemented - if self is other: - return True - return 
self._series.equals(other._series) - - @overload - def __getitem__(self, index: int) -> T: ... - - @overload - def __getitem__(self, index: slice) -> ExperimentalColumn[T]: ... - - def __getitem__(self, index: int | slice) -> T | ExperimentalColumn[T]: - return self._series.__getitem__(index) - - def __hash__(self) -> int: - return _structural_hash( - self.name, - self.type.__repr__(), - self.number_of_rows, - ) - - def __iter__(self) -> Iterator[T]: - return self._series.__iter__() - - def __len__(self) -> int: - return self.number_of_rows - - def __repr__(self) -> str: - return self.to_table().__repr__() - - def __sizeof__(self) -> int: - return self._series.estimated_size() - - def __str__(self) -> str: - return self.to_table().__str__() - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - def is_numeric(self) -> bool: - """Whether the column is numeric.""" - return self._series.dtype.is_numeric() - - @property - def is_temporal(self) -> bool: - """Whether the column is temporal.""" - return self._series.dtype.is_temporal() - - @property - def name(self) -> str: - """The name of the column.""" - return self._series.name - - @property - def number_of_rows(self) -> int: - """The number of rows in the column.""" - return self._series.len() - - @property - def plot(self) -> ExperimentalColumnPlotter: - """The plotter for the column.""" - return ExperimentalColumnPlotter(self) - - @property - def type(self) -> ExperimentalDataType: - """The type of the column.""" - return _PolarsDataType(self._series.dtype) - - # ------------------------------------------------------------------------------------------------------------------ - # Value operations - # 
------------------------------------------------------------------------------------------------------------------ - - def get_distinct_values(self) -> list[T]: - """ - Return the distinct values in the column. - - Returns - ------- - distinct_values: - The distinct values in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3, 2]) - >>> column.get_distinct_values() - [1, 2, 3] - """ - return self._series.unique().sort().to_list() - - def get_value(self, index: int) -> T: - """ - Return the column value at specified index. Indexing starts at 0. - - Parameters - ---------- - index: - Index of requested value. - - Returns - ------- - value: - Value at index. - - Raises - ------ - IndexError - If the given index does not exist in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.get_value(1) - 2 - """ - if index < 0 or index >= self.number_of_rows: - raise IndexOutOfBoundsError(index) - - return self._series[index] - - # ------------------------------------------------------------------------------------------------------------------ - # Reductions - # ------------------------------------------------------------------------------------------------------------------ - - def all(self, predicate: Callable[[ExperimentalCell[T]], ExperimentalCell[bool]]) -> bool: - """ - Return whether all values in the column satisfy the predicate. - - Parameters - ---------- - predicate: - The predicate to apply to each value. - - Returns - ------- - all_satisfy_predicate: - Whether all values in the column satisfy the predicate. - - Raises - ------ - TypeError - If the predicate does not return a boolean cell. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.all(lambda cell: cell > 0) - True - - >>> column.all(lambda cell: cell < 3) - False - """ - import polars as pl - - result = predicate(_VectorizedCell(self)) - if not isinstance(result, _VectorizedCell) or not result._series.dtype.is_(pl.Boolean): - raise TypeError("The predicate must return a boolean cell.") - - return result._series.all() - - def any(self, predicate: Callable[[ExperimentalCell[T]], ExperimentalCell[bool]]) -> bool: - """ - Return whether any value in the column satisfies the predicate. - - Parameters - ---------- - predicate: - The predicate to apply to each value. - - Returns - ------- - any_satisfy_predicate: - Whether any value in the column satisfies the predicate. - - Raises - ------ - TypeError - If the predicate does not return a boolean cell. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.any(lambda cell: cell > 2) - True - - >>> column.any(lambda cell: cell < 0) - False - """ - import polars as pl - - result = predicate(_VectorizedCell(self)) - if not isinstance(result, _VectorizedCell) or not result._series.dtype.is_(pl.Boolean): - raise TypeError("The predicate must return a boolean cell.") - - return result._series.any() - - def count(self, predicate: Callable[[ExperimentalCell[T]], ExperimentalCell[bool]]) -> int: - """ - Return how many values in the column satisfy the predicate. - - Parameters - ---------- - predicate: - The predicate to apply to each value. - - Returns - ------- - count: - The number of values in the column that satisfy the predicate. - - Raises - ------ - TypeError - If the predicate does not return a boolean cell. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.count(lambda cell: cell > 1) - 2 - - >>> column.count(lambda cell: cell < 0) - 0 - """ - import polars as pl - - result = predicate(_VectorizedCell(self)) - if not isinstance(result, _VectorizedCell) or not result._series.dtype.is_(pl.Boolean): - raise TypeError("The predicate must return a boolean cell.") - - return result._series.sum() - - def none(self, predicate: Callable[[ExperimentalCell[T]], ExperimentalCell[bool]]) -> bool: - """ - Return whether no value in the column satisfies the predicate. - - Parameters - ---------- - predicate: - The predicate to apply to each value. - - Returns - ------- - none_satisfy_predicate: - Whether no value in the column satisfies the predicate. - - Raises - ------ - TypeError - If the predicate does not return a boolean cell. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.none(lambda cell: cell < 0) - True - - >>> column.none(lambda cell: cell > 2) - False - """ - import polars as pl - - result = predicate(_VectorizedCell(self)) - if not isinstance(result, _VectorizedCell) or not result._series.dtype.is_(pl.Boolean): - raise TypeError("The predicate must return a boolean cell.") - - return (~result._series).all() - - # ------------------------------------------------------------------------------------------------------------------ - # Transformations - # ------------------------------------------------------------------------------------------------------------------ - - def rename(self, new_name: str) -> ExperimentalColumn[T]: - """ - Return a new column with a new name. - - The original column is not modified. - - Parameters - ---------- - new_name: - The new name of the column. - - Returns - ------- - renamed_column: - A new column with the new name. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.rename("new_name") - +----------+ - | new_name | - | --- | - | i64 | - +==========+ - | 1 | - | 2 | - | 3 | - +----------+ - """ - return self._from_polars_series(self._series.rename(new_name)) - - def transform( - self, - transformer: Callable[[ExperimentalCell[T]], ExperimentalCell[R]], - ) -> ExperimentalColumn[R]: - """ - Return a new column with values transformed by the transformer. - - The original column is not modified. - - Parameters - ---------- - transformer: - The transformer to apply to each value. - - Returns - ------- - transformed_column: - A new column with transformed values. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.transform(lambda cell: 2 * cell) - +------+ - | test | - | --- | - | i64 | - +======+ - | 2 | - | 4 | - | 6 | - +------+ - """ - result = transformer(_VectorizedCell(self)) - if not isinstance(result, _VectorizedCell): - raise TypeError("The transformer must return a cell.") - - return self._from_polars_series(result._series) - - # ------------------------------------------------------------------------------------------------------------------ - # Statistics - # ------------------------------------------------------------------------------------------------------------------ - - def summarize_statistics(self) -> ExperimentalTable: - """ - Return a table with important statistics about the column. - - Returns - ------- - statistics: - The table with statistics. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("a", [1, 3]) - >>> column.summarize_statistics() - +----------------------+--------------------+ - | metric | a | - | --- | --- | - | str | str | - +===========================================+ - | min | 1 | - | max | 3 | - | mean | 2.0 | - | median | 2.0 | - | standard deviation | 1.4142135623730951 | - | distinct value count | 2 | - | idness | 1.0 | - | missing value ratio | 0.0 | - | stability | 0.5 | - +----------------------+--------------------+ - """ - from ._experimental_table import ExperimentalTable - - # TODO: turn this around (call table method, implement in table; allows parallelization) - mean = self.mean() or "-" - median = self.median() or "-" - standard_deviation = self.standard_deviation() or "-" - - return ExperimentalTable( - { - "metric": [ - "min", - "max", - "mean", - "median", - "standard deviation", - "distinct value count", - "idness", - "missing value ratio", - "stability", - ], - self.name: [ - str(self.min()), - str(self.max()), - str(mean), - str(median), - str(standard_deviation), - str(self.distinct_value_count()), - str(self.idness()), - str(self.missing_value_ratio()), - str(self.stability()), - ], - }, - ) - - def correlation_with(self, other: ExperimentalColumn) -> float: - """ - Calculate the Pearson correlation between this column and another column. - - The Pearson correlation is a value between -1 and 1 that indicates how much the two columns are linearly related: - - * A correlation of -1 indicates a perfect negative linear relationship. - * A correlation of 0 indicates no linear relationship. - * A correlation of 1 indicates a perfect positive linear relationship. - - Parameters - ---------- - other: - The other column to calculate the correlation with. - - Returns - ------- - correlation: - The Pearson correlation between the two columns. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column1 = ExperimentalColumn("test", [1, 2, 3]) - >>> column2 = ExperimentalColumn("test", [2, 4, 6]) - >>> column1.correlation_with(column2) - 1.0 - - >>> column4 = ExperimentalColumn("test", [3, 2, 1]) - >>> column1.correlation_with(column4) - -1.0 - """ - import polars as pl - - return pl.DataFrame({"a": self._series, "b": other._series}).corr().item(row=1, column="a") - - def distinct_value_count(self) -> int: - """ - Return the number of distinct values in the column. - - Returns - ------- - distinct_value_count: - The number of distinct values in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3, 2]) - >>> column.distinct_value_count() - 3 - """ - return self._series.n_unique() - - def idness(self) -> float: - """ - Calculate the idness of this column. - - We define the idness as the number of distinct values divided by the number of rows. If the column is empty, - the idness is 1.0. - - A high idness indicates that the column most values in the column are unique. In this case, you must be careful - when using the column for analysis, as a model may learn a mapping from this column to the target. - - Returns - ------- - idness: - The idness of the column. - - Raises - ------ - ColumnSizeError - If this column is empty. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column1 = ExperimentalColumn("test", [1, 2, 3]) - >>> column1.idness() - 1.0 - - >>> column2 = ExperimentalColumn("test", [1, 2, 3, 2]) - >>> column2.idness() - 0.75 - """ - if self.number_of_rows == 0: - return 1.0 - - return self.distinct_value_count() / self.number_of_rows - - def max(self) -> T: - """ - Return the maximum value in the column. - - Returns - ------- - max: - The maximum value in the column. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.max() - 3 - """ - return self._series.max() - - def mean(self) -> T: - """ - Return the mean of the values in the column. - - The mean is the sum of the values divided by the number of values. - - Returns - ------- - mean: - The mean of the values in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.mean() - 2.0 - """ - return self._series.mean() - - def median(self) -> T: - """ - Return the median of the values in the column. - - The median is the value in the middle of the sorted list of values. If the number of values is even, the median - is the mean of the two middle values. - - Returns - ------- - median: - The median of the values in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.median() - 2.0 - """ - return self._series.median() - - def min(self) -> T: - """ - Return the minimum value in the column. - - Returns - ------- - min: - The minimum value in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.min() - 1 - """ - return self._series.min() - - def missing_value_count(self) -> int: - """ - Return the number of missing values in the column. - - Returns - ------- - missing_value_count: - The number of missing values in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, None, 3]) - >>> column.missing_value_count() - 1 - """ - return self._series.null_count() - - def missing_value_ratio(self) -> float: - """ - Return the missing value ratio. 
- - We define the missing value ratio as the number of missing values in the column divided by the number of rows. - If the column is empty, the missing value ratio is 1.0. - - A high missing value ratio indicates that the column is dominated by missing values. In this case, the column - may not be useful for analysis. - - Returns - ------- - missing_value_ratio: - The ratio of missing values in the column. - """ - if self.number_of_rows == 0: - return 1.0 - - return self._series.null_count() / self.number_of_rows - - def mode(self) -> ExperimentalColumn[T]: - """ - Return the mode of the values in the column. - - The mode is the value that appears most frequently in the column. If multiple values occur equally often, all - of them are returned. The values are sorted in ascending order. - - Returns - ------- - mode: - The mode of the values in the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [3, 1, 2, 1, 3]) - >>> column.mode() - +------+ - | test | - | --- | - | i64 | - +======+ - | 1 | - | 3 | - +------+ - """ - return self._from_polars_series(self._series.mode().sort()) - - def stability(self) -> float: - """ - Return the stability of the column. - - We define the stability as the number of occurrences of the most common non-missing value divided by the total - number of non-missing values. If the column is empty or all values are missing, the stability is 1.0. - - A high stability indicates that the column is dominated by a single value. In this case, the column may not be - useful for analysis. - - Returns - ------- - stability: - The stability of the column. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 1, 2, 3, None]) - >>> column.stability() - 0.5 - """ - non_missing = self._series.drop_nulls() - if non_missing.len() == 0: - return 1.0 - - mode_count = non_missing.unique_counts().max() - - return mode_count / non_missing.len() - - def standard_deviation(self) -> float | None: - """ - Return the standard deviation of the values in the column. - - The standard deviation is the square root of the variance. - - Returns - ------- - standard_deviation: - The standard deviation of the values in the column. If no standard deviation can be calculated due to the - type of the column, None is returned. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.standard_deviation() - 1.0 - """ - from polars.exceptions import InvalidOperationError - - try: - return self._series.std() - except InvalidOperationError: - return None - - def variance(self) -> float | None: - """ - Return the variance of the values in the column. - - The variance is the average of the squared differences from the mean. - - Returns - ------- - variance: - The variance of the values in the column. If no variance can be calculated due to the type of the column, - None is returned. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.variance() - 1.0 - """ - from polars.exceptions import InvalidOperationError - - try: - return self._series.var() - except InvalidOperationError: - return None - - # ------------------------------------------------------------------------------------------------------------------ - # Export - # ------------------------------------------------------------------------------------------------------------------ - - def to_list(self) -> list[T]: - """ - Return the values of the column in a list. - - Returns - ------- - values: - The values of the column in a list. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.to_list() - [1, 2, 3] - """ - return self._series.to_list() - - def to_table(self) -> ExperimentalTable: - """ - Create a table that contains only this column. - - Returns - ------- - table: - The table with this column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) - >>> column.to_table() - +------+ - | test | - | --- | - | i64 | - +======+ - | 1 | - | 2 | - | 3 | - +------+ - """ - from ._experimental_table import ExperimentalTable - - return ExperimentalTable._from_polars_data_frame(self._series.to_frame()) - - def temporary_to_old_column(self) -> Column: - """ - Convert the column to the old column format. This method is temporary and will be removed in a later version. - - Returns - ------- - old_column: - The column in the old format. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("a", [1, 2, 3]) - >>> old_column = column.temporary_to_old_column() - """ - return Column._from_pandas_series(self._series.to_pandas()) - - # ------------------------------------------------------------------------------------------------------------------ - # IPython integration - # ------------------------------------------------------------------------------------------------------------------ - - def _repr_html_(self) -> str: - """ - Return a compact HTML representation of the column for IPython. - - Note that this operation must fully load the data into memory, which can be expensive. - - Returns - ------- - html: - The generated HTML. - """ - return self._series._repr_html_() diff --git a/src/safeds/data/tabular/containers/_experimental_row.py b/src/safeds/data/tabular/containers/_experimental_row.py deleted file mode 100644 index 49a1deb8b..000000000 --- a/src/safeds/data/tabular/containers/_experimental_row.py +++ /dev/null @@ -1,115 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from collections.abc import Iterator, Mapping -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from safeds.data.tabular.typing import ExperimentalSchema - from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType - - from ._experimental_cell import ExperimentalCell - - -class ExperimentalRow(ABC, Mapping[str, Any]): - """ - A one-dimensional collection of named, heterogeneous values. - - This class cannot be instantiated directly. It is only used for arguments of callbacks. 
- """ - - # ------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - def __contains__(self, name: Any) -> bool: - return self.has_column(name) - - @abstractmethod - def __eq__(self, other: object) -> bool: ... - - def __getitem__(self, name: str) -> ExperimentalCell: - return self.get_value(name) - - @abstractmethod - def __hash__(self) -> int: ... - - def __iter__(self) -> Iterator[Any]: - return iter(self.column_names) - - def __len__(self) -> int: - return self.number_of_columns - - @abstractmethod - def __sizeof__(self) -> int: ... - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - @abstractmethod - def column_names(self) -> list[str]: - """The names of the columns in the row.""" - - @property - @abstractmethod - def number_of_columns(self) -> int: - """The number of columns in the row.""" - - @property - @abstractmethod - def schema(self) -> ExperimentalSchema: - """The schema of the row.""" - - # ------------------------------------------------------------------------------------------------------------------ - # Column operations - # ------------------------------------------------------------------------------------------------------------------ - - @abstractmethod - def get_value(self, name: str) -> ExperimentalCell: - """ - Get the value of the specified column. - - Parameters - ---------- - name: - The name of the column. - - Returns - ------- - value: - The value of the column. - """ - - @abstractmethod - def get_column_type(self, name: str) -> ExperimentalDataType: - """ - Get the type of the specified column. 
- - Parameters - ---------- - name: - The name of the column. - - Returns - ------- - type: - The type of the column. - """ - - @abstractmethod - def has_column(self, name: str) -> bool: - """ - Check if the row has a column with the specified name. - - Parameters - ---------- - name: - The name of the column. - - Returns - ------- - has_column: - Whether the row has a column with the specified name. - """ diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py deleted file mode 100644 index 7fd7d700f..000000000 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ /dev/null @@ -1,1904 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, Literal - -from safeds._config._polars import _get_polars_config -from safeds._utils import _check_and_normalize_file_path, _structural_hash -from safeds._utils._random import _get_random_seed -from safeds.data.labeled.containers import ExperimentalTabularDataset -from safeds.data.tabular.plotting._experimental_table_plotter import ExperimentalTablePlotter -from safeds.data.tabular.typing._experimental_polars_data_type import _PolarsDataType -from safeds.data.tabular.typing._experimental_polars_schema import _PolarsSchema -from safeds.exceptions import ( - ClosedBound, - ColumnLengthMismatchError, - DuplicateColumnNameError, - OutOfBoundsError, - UnknownColumnNameError, -) - -from ._experimental_column import ExperimentalColumn -from ._experimental_lazy_cell import _LazyCell -from ._experimental_lazy_vectorized_row import _LazyVectorizedRow -from ._table import Table - -if TYPE_CHECKING: - from collections.abc import Callable, Mapping, Sequence - from pathlib import Path - - import polars as pl - - from safeds.data.tabular.transformation import ( - ExperimentalInvertibleTableTransformer, - ExperimentalTableTransformer, - ) - from safeds.data.tabular.typing import ExperimentalSchema - from 
safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType - - from ._experimental_cell import ExperimentalCell - from ._experimental_row import ExperimentalRow - - -class ExperimentalTable: - """ - A two-dimensional collection of data. It can either be seen as a list of rows or as a list of columns. - - To create a `Table` call the constructor or use one of the following static methods: - - | Method | Description | - | ----------------------------------------------------------------------------------------------------------- | -------------------------------------- | - | [from_csv_file][safeds.data.tabular.containers._experimental_table.ExperimentalTable.from_csv_file] | Create a table from a CSV file. | - | [from_json_file][safeds.data.tabular.containers._experimental_table.ExperimentalTable.from_json_file] | Create a table from a JSON file. | - | [from_parquet_file][safeds.data.tabular.containers._experimental_table.ExperimentalTable.from_parquet_file] | Create a table from a Parquet file. | - | [from_columns][safeds.data.tabular.containers._experimental_table.ExperimentalTable.from_columns] | Create a table from a list of columns. | - | [from_dict][safeds.data.tabular.containers._experimental_table.ExperimentalTable.from_dict] | Create a table from a dictionary. | - - Parameters - ---------- - data: - The data of the table. If None, an empty table is created. - - Raises - ------ - ValueError - If columns have different lengths. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - """ - - # ------------------------------------------------------------------------------------------------------------------ - # Import - # ------------------------------------------------------------------------------------------------------------------ - - @staticmethod - def from_columns(columns: ExperimentalColumn | list[ExperimentalColumn]) -> ExperimentalTable: - """ - Create a table from a list of columns. - - Parameters - ---------- - columns: - The columns. - - Returns - ------- - table: - The created table. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable - >>> a = ExperimentalColumn("a", [1, 2, 3]) - >>> b = ExperimentalColumn("b", [4, 5, 6]) - >>> ExperimentalTable.from_columns([a, b]) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 2 | 5 | - | 3 | 6 | - +-----+-----+ - """ - import polars as pl - - # TODO: raises - - if isinstance(columns, ExperimentalColumn): - columns = [columns] - - return ExperimentalTable._from_polars_lazy_frame( - pl.LazyFrame([column._series for column in columns]), - ) - - @staticmethod - def from_csv_file(path: str | Path) -> ExperimentalTable: - """ - Create a table from a CSV file. - - Parameters - ---------- - path: - The path to the CSV file. If the file extension is omitted, it is assumed to be ".csv". - - Returns - ------- - table: - The created table. - - Raises - ------ - FileNotFoundError - If no file exists at the given path. - ValueError - If the path has an extension that is not ".csv". 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> ExperimentalTable.from_csv_file("./src/resources/from_csv_file.csv") - +-----+-----+-----+ - | a | b | c | - | --- | --- | --- | - | i64 | i64 | i64 | - +=================+ - | 1 | 2 | 1 | - | 0 | 0 | 7 | - +-----+-----+-----+ - """ - import polars as pl - - path = _check_and_normalize_file_path(path, ".csv", [".csv"], check_if_file_exists=True) - return ExperimentalTable._from_polars_lazy_frame(pl.scan_csv(path)) - - @staticmethod - def from_dict(data: dict[str, list[Any]]) -> ExperimentalTable: - """ - Create a table from a dictionary that maps column names to column values. - - Parameters - ---------- - data: - The data. - - Returns - ------- - table: - The generated table. - - Raises - ------ - ValueError - If columns have different lengths. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> data = {'a': [1, 2, 3], 'b': [4, 5, 6]} - >>> ExperimentalTable.from_dict(data) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 2 | 5 | - | 3 | 6 | - +-----+-----+ - """ - return ExperimentalTable(data) - - @staticmethod - def from_json_file(path: str | Path) -> ExperimentalTable: - """ - Create a table from a JSON file. - - Parameters - ---------- - path: - The path to the JSON file. If the file extension is omitted, it is assumed to be ".json". - - Returns - ------- - table: - The created table. - - Raises - ------ - FileNotFoundError - If no file exists at the given path. - ValueError - If the path has an extension that is not ".json". 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> ExperimentalTable.from_json_file("./src/resources/from_json_file_2.json") - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 2 | 5 | - | 3 | 6 | - +-----+-----+ - """ - import polars as pl - - path = _check_and_normalize_file_path(path, ".json", [".json"], check_if_file_exists=True) - return ExperimentalTable._from_polars_data_frame(pl.read_json(path)) - - @staticmethod - def from_parquet_file(path: str | Path) -> ExperimentalTable: - """ - Create a table from a Parquet file. - - Parameters - ---------- - path: - The path to the Parquet file. If the file extension is omitted, it is assumed to be ".parquet". - - Returns - ------- - table: - The created table. - - Raises - ------ - FileNotFoundError - If no file exists at the given path. - ValueError - If the path has an extension that is not ".parquet". - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> ExperimentalTable.from_parquet_file("./src/resources/from_parquet_file.parquet") - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 2 | 5 | - | 3 | 6 | - +-----+-----+ - """ - import polars as pl - - path = _check_and_normalize_file_path(path, ".parquet", [".parquet"], check_if_file_exists=True) - return ExperimentalTable._from_polars_lazy_frame(pl.scan_parquet(path)) - - @staticmethod - def _from_polars_data_frame(data: pl.DataFrame) -> ExperimentalTable: - result = object.__new__(ExperimentalTable) - result._lazy_frame = data.lazy() - result.__data_frame_cache = data - return result - - @staticmethod - def _from_polars_lazy_frame(data: pl.LazyFrame) -> ExperimentalTable: - result = object.__new__(ExperimentalTable) - result._lazy_frame = data - result.__data_frame_cache = None - return result - - # 
------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None: - import polars as pl - - if data is None: - data = {} - - # Validation - expected_length: int | None = None - for column_values in data.values(): - if expected_length is None: - expected_length = len(column_values) - elif len(column_values) != expected_length: - raise ColumnLengthMismatchError( - "\n".join(f"{column_name}: {len(column_values)}" for column_name, column_values in data.items()), - ) - - # Implementation - self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data) - self.__data_frame_cache: pl.DataFrame | None = None - - def __eq__(self, other: object) -> bool: - if not isinstance(other, ExperimentalTable): - return False - if self is other: - return True - - return self._data_frame.frame_equal(other._data_frame) - - def __hash__(self) -> int: - return _structural_hash(self.schema, self.number_of_rows) - - def __repr__(self) -> str: - with _get_polars_config(): - return self._data_frame.__repr__() - - def __sizeof__(self) -> int: - return self._data_frame.estimated_size() - - def __str__(self) -> str: - with _get_polars_config(): - return self._data_frame.__str__() - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - def _data_frame(self) -> pl.DataFrame: - if self.__data_frame_cache is None: - self.__data_frame_cache = self._lazy_frame.collect() - - return self.__data_frame_cache - - @property - def column_names(self) -> list[str]: - """ - The names of the columns in the table. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.column_names - ['a', 'b'] - """ - return self._lazy_frame.columns - - @property - def number_of_columns(self) -> int: - """ - The number of columns in the table. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.number_of_columns - 2 - """ - return self._lazy_frame.width - - @property - def number_of_rows(self) -> int: - """ - The number of rows in the table. - - **Note:** This operation must fully load the data into memory, which can be expensive. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.number_of_rows - 3 - """ - return self._data_frame.height - - @property - def plot(self) -> ExperimentalTablePlotter: - """The plotter for the table.""" - return ExperimentalTablePlotter(self) - - @property - def schema(self) -> ExperimentalSchema: - """The schema of the table.""" - return _PolarsSchema(self._lazy_frame.schema) - - # ------------------------------------------------------------------------------------------------------------------ - # Column operations - # ------------------------------------------------------------------------------------------------------------------ - - def add_columns( - self, - columns: ExperimentalColumn | list[ExperimentalColumn], - ) -> ExperimentalTable: - """ - Return a new table with additional columns. - - **Notes:** - - * The original table is not modified. - * This operation must fully load the data into memory, which can be expensive. - - Parameters - ---------- - columns: - The columns to add. - - Returns - ------- - new_table: - The table with the additional columns. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3]}) - >>> new_column = ExperimentalColumn("b", [4, 5, 6]) - >>> table.add_columns(new_column) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 2 | 5 | - | 3 | 6 | - +-----+-----+ - """ - if isinstance(columns, ExperimentalColumn): - columns = [columns] - - if len(columns) == 0: - return self - - return ExperimentalTable._from_polars_data_frame( - self._data_frame.hstack([column._series for column in columns]), - ) - - def add_computed_column( - self, - name: str, - computer: Callable[[ExperimentalRow], ExperimentalCell], - ) -> ExperimentalTable: - """ - Return a new table with an additional computed column. - - **Note:** The original table is not modified. - - Parameters - ---------- - name: - The name of the new column. - computer: - The function that computes the values of the new column. - - Returns - ------- - new_table: - The table with the computed column. - - Raises - ------ - ValueError - If the column name already exists. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.add_computed_column("c", lambda row: row.get_value("a") + row.get_value("b")) - +-----+-----+-----+ - | a | b | c | - | --- | --- | --- | - | i64 | i64 | i64 | - +=================+ - | 1 | 4 | 5 | - | 2 | 5 | 7 | - | 3 | 6 | 9 | - +-----+-----+-----+ - """ - if self.has_column(name): - raise DuplicateColumnNameError(name) - - computed_column = computer(_LazyVectorizedRow(self)) - - return self._from_polars_lazy_frame( - self._lazy_frame.with_columns(computed_column._polars_expression.alias(name)), - ) - - def get_column(self, name: str) -> ExperimentalColumn: - """ - Get a column from the table. 
- - **Note:** This operation must fully load the data into memory, which can be expensive. - - Parameters - ---------- - name: - The name of the column. - - Returns - ------- - column: - The column. - - Raises - ------ - KeyError - If the column does not exist. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.get_column("a") - +-----+ - | a | - | --- | - | i64 | - +=====+ - | 1 | - | 2 | - | 3 | - +-----+ - """ - if not self.has_column(name): - raise UnknownColumnNameError([name]) - - return ExperimentalColumn._from_polars_series(self._data_frame.get_column(name)) - - def get_column_type(self, name: str) -> ExperimentalDataType: - """ - Get the data type of a column. - - Parameters - ---------- - name: - The name of the column. - - Returns - ------- - type: - The data type of the column. - - Raises - ------ - KeyError - If the column does not exist. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.get_column_type("a") - Int64 - """ - if not self.has_column(name): - raise UnknownColumnNameError([name]) - - return _PolarsDataType(self._lazy_frame.schema[name]) - - def has_column(self, name: str) -> bool: - """ - Check if the table has a column with a specific name. - - Parameters - ---------- - name: - The name of the column. - - Returns - ------- - has_column: - Whether the table has a column with the specified name. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.has_column("a") - True - """ - return name in self.column_names - - def remove_columns( - self, - names: str | list[str], - /, - ) -> ExperimentalTable: - """ - Return a new table without the specified columns. 
- - **Note:** The original table is not modified. - - Parameters - ---------- - names: - The names of the columns to remove. - - Returns - ------- - new_table: - The table with the columns removed. - - Raises - ------ - KeyError - If a column does not exist. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.remove_columns("a") - +-----+ - | b | - | --- | - | i64 | - +=====+ - | 4 | - | 5 | - | 6 | - +-----+ - """ - if isinstance(names, str): - names = [names] - - # TODO: raises? - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.drop(names), - ) - - def remove_columns_except( - self, - names: str | list[str], - /, - ) -> ExperimentalTable: - """ - Return a new table with only the specified columns. - - Parameters - ---------- - names: - The names of the columns to keep. - - Returns - ------- - new_table: - The table with only the specified columns. - - Raises - ------ - KeyError - If a column does not exist. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.remove_columns_except("a") - +-----+ - | a | - | --- | - | i64 | - +=====+ - | 1 | - | 2 | - | 3 | - +-----+ - """ - if isinstance(names, str): - names = [names] - - # TODO: raises? - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.select(names), - ) - - def remove_columns_with_missing_values(self) -> ExperimentalTable: - """ - Return a new table without columns that contain missing values. - - **Notes:** - - * The original table is not modified. - * This operation must fully load the data into memory, which can be expensive. - - Returns - ------- - new_table: - The table without columns containing missing values. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, None]}) - >>> table.remove_columns_with_missing_values() - +-----+ - | a | - | --- | - | i64 | - +=====+ - | 1 | - | 2 | - | 3 | - +-----+ - """ - import polars as pl - - return ExperimentalTable._from_polars_lazy_frame( - pl.LazyFrame( - [series for series in self._data_frame.get_columns() if series.null_count() == 0], - ), - ) - - def remove_non_numeric_columns(self) -> ExperimentalTable: - """ - Return a new table without non-numeric columns. - - **Note:** The original table is not modified. - - Returns - ------- - new_table: - The table without non-numeric columns. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": ["4", "5", "6"]}) - >>> table.remove_non_numeric_columns() - +-----+ - | a | - | --- | - | i64 | - +=====+ - | 1 | - | 2 | - | 3 | - +-----+ - """ - import polars.selectors as cs - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.select(cs.numeric()), - ) - - def rename_column(self, old_name: str, new_name: str) -> ExperimentalTable: - """ - Return a new table with a column renamed. - - **Note:** The original table is not modified. - - Parameters - ---------- - old_name: - The name of the column to rename. - new_name: - The new name of the column. - - Returns - ------- - new_table: - The table with the column renamed. - - Raises - ------ - KeyError - If no column with the old name exists. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.rename_column("a", "c") - +-----+-----+ - | c | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 2 | 5 | - | 3 | 6 | - +-----+-----+ - """ - if not self.has_column(old_name): - raise UnknownColumnNameError([old_name]) - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.rename({old_name: new_name}), - ) - - def replace_column( - self, - old_name: str, - new_columns: ExperimentalColumn | list[ExperimentalColumn], - ) -> ExperimentalTable: - """ - Return a new table with a column replaced by zero or more columns. - - **Note:** The original table is not modified. - - Parameters - ---------- - old_name: - The name of the column to replace. - new_columns: - The new column or columns. - - Returns - ------- - new_table: - The table with the column replaced. - - Raises - ------ - KeyError - If no column with the old name exists. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.replace_column("a", []) - +-----+ - | b | - | --- | - | i64 | - +=====+ - | 4 | - | 5 | - | 6 | - +-----+ - - >>> column1 = ExperimentalColumn("c", [7, 8, 9]) - >>> table.replace_column("a", column1) - +-----+-----+ - | c | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 7 | 4 | - | 8 | 5 | - | 9 | 6 | - +-----+-----+ - - >>> column2 = ExperimentalColumn("d", [10, 11, 12]) - >>> table.replace_column("a", [column1, column2]) - +-----+-----+-----+ - | c | d | b | - | --- | --- | --- | - | i64 | i64 | i64 | - +=================+ - | 7 | 10 | 4 | - | 8 | 11 | 5 | - | 9 | 12 | 6 | - +-----+-----+-----+ - """ - if not self.has_column(old_name): - raise UnknownColumnNameError([old_name]) - - if isinstance(new_columns, ExperimentalColumn): - new_columns = [new_columns] - - if len(new_columns) == 0: - return self.remove_columns(old_name) - - if len(new_columns) == 1: - new_column = new_columns[0] - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.with_columns(new_column._series.alias(old_name)).rename({old_name: new_column.name}), - ) - - import polars as pl - - index = self.column_names.index(old_name) - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.select( - *[pl.col(name) for name in self.column_names[:index]], - *[column._series for column in new_columns], - *[pl.col(name) for name in self.column_names[index + 1 :]], - ), - ) - - def transform_column( - self, - name: str, - transformer: Callable[[ExperimentalCell], ExperimentalCell], - ) -> ExperimentalTable: - """ - Return a new table with a column transformed. - - **Note:** The original table is not modified. - - Parameters - ---------- - name: - The name of the column to transform. - - transformer: - The function that transforms the column. 
- - Returns - ------- - new_table: - The table with the transformed column. - - Raises - ------ - KeyError - If no column with the specified name exists. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.transform_column("a", lambda cell: cell + 1) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 2 | 4 | - | 3 | 5 | - | 4 | 6 | - +-----+-----+ - """ - if not self.has_column(name): - raise UnknownColumnNameError([name]) # TODO: in the error, compute similar column names - - import polars as pl - - transformed_column = transformer(_LazyCell(pl.col(name))) - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.with_columns(transformed_column._polars_expression), - ) - - # ------------------------------------------------------------------------------------------------------------------ - # Row operations - # ------------------------------------------------------------------------------------------------------------------ - - # TODO: Rethink group_rows/group_rows_by_column. They should not return a dict. - - def remove_duplicate_rows(self) -> ExperimentalTable: - """ - Return a new table without duplicate rows. - - **Note:** The original table is not modified. - - Returns - ------- - new_table: - The table without duplicate rows. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 2], "b": [4, 5, 5]}) - >>> table.remove_duplicate_rows() - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 2 | 5 | - +-----+-----+ - """ - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.unique(maintain_order=True), - ) - - def remove_rows( - self, - query: Callable[[ExperimentalRow], ExperimentalCell[bool]], - ) -> ExperimentalTable: - """ - Return a new table without rows that satisfy a condition. - - **Note:** The original table is not modified. - - Parameters - ---------- - query: - The function that determines which rows to remove. - - Returns - ------- - new_table: - The table without the specified rows. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.remove_rows(lambda row: row.get_value("a") == 2) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 3 | 6 | - +-----+-----+ - """ - mask = query(_LazyVectorizedRow(self)) - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.filter(~mask._polars_expression), - ) - - def remove_rows_by_column( - self, - name: str, - query: Callable[[ExperimentalCell], ExperimentalCell[bool]], - ) -> ExperimentalTable: - """ - Return a new table without rows that satisfy a condition on a specific column. - - **Note:** The original table is not modified. - - Parameters - ---------- - name: - The name of the column. - query: - The function that determines which rows to remove. - - Returns - ------- - new_table: - The table without the specified rows. - - Raises - ------ - KeyError - If the column does not exist. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.remove_rows_by_column("a", lambda cell: cell == 2) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 3 | 6 | - +-----+-----+ - """ - import polars as pl - - if not self.has_column(name): - raise UnknownColumnNameError([name]) - - mask = query(_LazyCell(pl.col(name))) - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.filter(~mask._polars_expression), - ) - - def remove_rows_with_missing_values( - self, - column_names: list[str] | None = None, - ) -> ExperimentalTable: - """ - Return a new table without rows containing missing values in the specified columns. - - **Note:** The original table is not modified. - - Parameters - ---------- - column_names: - Names of the columns to consider. If None, all columns are considered. - - Returns - ------- - new_table: - The table without rows containing missing values in the specified columns. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, None, 3], "b": [4, 5, None]}) - >>> table.remove_rows_with_missing_values() - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - +-----+-----+ - """ - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.drop_nulls(subset=column_names), - ) - - def remove_rows_with_outliers( - self, - column_names: list[str] | None = None, - *, - z_score_threshold: float = 3, - ) -> ExperimentalTable: - """ - Return a new table without rows containing outliers in the specified columns. - - Whether a data point is an outlier in a column is determined by its z-score. The z-score is the distance of the - data point from the mean of the column divided by the standard deviation of the column. 
If the z-score is - greater than the given threshold, the data point is considered an outlier. Missing values are ignored during the - calculation of the z-score. - - The z-score is only defined for numeric columns. Non-numeric columns are ignored, even if they are specified in - `column_names`. - - **Notes:** - - * The original table is not modified. - * This operation must fully load the data into memory, which can be expensive. - - Parameters - ---------- - column_names: - Names of the columns to consider. If None, all numeric columns are considered. - z_score_threshold: - The z-score threshold for detecting outliers. - - Returns - ------- - new_table: - The table without rows containing outliers in the specified columns. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable( - ... { - ... "a": [1, 2, 3, 4, 5, 6, 1000, None], - ... "b": [1, 2, 3, 4, 5, 6, 7, 8], - ... } - ... ) - >>> table.remove_rows_with_outliers(z_score_threshold=2) - +------+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +============+ - | 1 | 1 | - | 2 | 2 | - | 3 | 3 | - | 4 | 4 | - | 5 | 5 | - | 6 | 6 | - | null | 8 | - +------+-----+ - """ - if column_names is None: - column_names = self.column_names - - import polars as pl - import polars.selectors as cs - - non_outlier_mask = pl.all_horizontal( - self._data_frame.select(cs.numeric() & cs.by_name(column_names)).select( - pl.all().is_null() | (((pl.all() - pl.all().mean()) / pl.all().std()).abs() <= z_score_threshold), - ), - ) - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.filter(non_outlier_mask), - ) - - def shuffle_rows(self) -> ExperimentalTable: - """ - Return a new table with the rows shuffled. - - **Note:** The original table is not modified. - - Returns - ------- - new_table: - The table with the rows shuffled. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.shuffle_rows() - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 3 | 6 | - | 2 | 5 | - | 1 | 4 | - +-----+-----+ - """ - return ExperimentalTable._from_polars_data_frame( - self._data_frame.sample( - fraction=1, - shuffle=True, - seed=_get_random_seed(), - ), - ) - - def slice_rows(self, start: int = 0, length: int | None = None) -> ExperimentalTable: - """ - Return a new table with a slice of rows. - - **Note:** The original table is not modified. - - Parameters - ---------- - start: - The start index of the slice. - length: - The length of the slice. If None, the slice contains all rows starting from `start`. - - Returns - ------- - new_table: - The table with the slice of rows. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.slice_rows(start=1) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 2 | 5 | - | 3 | 6 | - +-----+-----+ - - >>> table.slice_rows(start=1, length=1) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 2 | 5 | - +-----+-----+ - """ - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.slice(start, length), - ) - - def sort_rows( - self, - key_selector: Callable[[ExperimentalRow], ExperimentalCell], - *, - descending: bool = False, - ) -> ExperimentalTable: - """ - Return a new table with the rows sorted. - - **Note:** The original table is not modified. - - Parameters - ---------- - key_selector: - The function that selects the key to sort by. - descending: - Whether to sort in descending order. - - Returns - ------- - new_table: - The table with the rows sorted. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [2, 1, 3], "b": [1, 1, 2]}) - >>> table.sort_rows(lambda row: row.get_value("a") - row.get_value("b")) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 1 | - | 2 | 1 | - | 3 | 2 | - +-----+-----+ - """ - key = key_selector(_LazyVectorizedRow(self)) - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.sort( - key._polars_expression, - descending=descending, - maintain_order=True, - ), - ) - - def sort_rows_by_column( - self, - name: str, - *, - descending: bool = False, - ) -> ExperimentalTable: - """ - Return a new table with the rows sorted by a specific column. - - **Note:** The original table is not modified. - - Parameters - ---------- - name: - The name of the column to sort by. - descending: - Whether to sort in descending order. - - Returns - ------- - new_table: - The table with the rows sorted by the specified column. - - Raises - ------ - KeyError - If the column does not exist. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [2, 1, 3], "b": [1, 1, 2]}) - >>> table.sort_rows_by_column("a") - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 1 | - | 2 | 1 | - | 3 | 2 | - +-----+-----+ - """ - if not self.has_column(name): - raise UnknownColumnNameError([name]) - - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.sort( - name, - descending=descending, - maintain_order=True, - ), - ) - - def split_rows( - self, - percentage_in_first: float, - *, - shuffle: bool = True, - ) -> tuple[ExperimentalTable, ExperimentalTable]: - """ - Create two tables by splitting the rows of the current table. - - The first table contains a percentage of the rows specified by `percentage_in_first`, and the second table - contains the remaining rows. 
- - **Note:** The original table is not modified. - - Parameters - ---------- - percentage_in_first: - The percentage of rows to include in the first table. Must be between 0 and 1. - shuffle: - Whether to shuffle the rows before splitting. - - Returns - ------- - first_table: - The first table. - second_table: - The second table. - - Raises - ------ - ValueError - If `percentage_in_first` is not between 0 and 1. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3, 4, 5], "b": [6, 7, 8, 9, 10]}) - >>> first_table, second_table = table.split_rows(0.6) - >>> first_table - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 6 | - | 4 | 9 | - | 3 | 8 | - +-----+-----+ - >>> second_table - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 5 | 10 | - | 2 | 7 | - +-----+-----+ - """ - if percentage_in_first < 0 or percentage_in_first > 1: - raise OutOfBoundsError( - actual=percentage_in_first, - name="percentage_in_first", - lower_bound=ClosedBound(0), - upper_bound=ClosedBound(1), - ) - - input_table = self.shuffle_rows() if shuffle else self - number_of_rows_in_first = round(percentage_in_first * input_table.number_of_rows) - - return ( - input_table.slice_rows(length=number_of_rows_in_first), - input_table.slice_rows(start=number_of_rows_in_first), - ) - - # ------------------------------------------------------------------------------------------------------------------ - # Table operations - # ------------------------------------------------------------------------------------------------------------------ - - def add_table_as_columns(self, other: ExperimentalTable) -> ExperimentalTable: - """ - Return a new table with the columns of another table added. - - **Notes:** - - * The original tables are not modified. - * This operation must fully load the data into memory, which can be expensive. 
- - Parameters - ---------- - other: - The table to add as columns. - - Returns - ------- - new_table: - The table with the columns added. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table1 = ExperimentalTable({"a": [1, 2, 3]}) - >>> table2 = ExperimentalTable({"b": [4, 5, 6]}) - >>> table1.add_table_as_columns(table2) - +-----+-----+ - | a | b | - | --- | --- | - | i64 | i64 | - +===========+ - | 1 | 4 | - | 2 | 5 | - | 3 | 6 | - +-----+-----+ - """ - # TODO: raises? - - return ExperimentalTable._from_polars_data_frame( - self._data_frame.hstack(other._data_frame), - ) - - def add_table_as_rows(self, other: ExperimentalTable) -> ExperimentalTable: - """ - Return a new table with the rows of another table added. - - **Notes:** - - * The original tables are not modified. - * This operation must fully load the data into memory, which can be expensive. - - Parameters - ---------- - other: - The table to add as rows. - - Returns - ------- - new_table: - The table with the rows added. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table1 = ExperimentalTable({"a": [1, 2, 3]}) - >>> table2 = ExperimentalTable({"a": [4, 5, 6]}) - >>> table1.add_table_as_rows(table2) - +-----+ - | a | - | --- | - | i64 | - +=====+ - | 1 | - | 2 | - | 3 | - | 4 | - | 5 | - | 6 | - +-----+ - """ - # TODO: raises? - - return ExperimentalTable._from_polars_data_frame( - self._data_frame.vstack(other._data_frame), - ) - - def inverse_transform_table(self, fitted_transformer: ExperimentalInvertibleTableTransformer) -> ExperimentalTable: - """ - Return a new table inverse-transformed by a **fitted, invertible** transformer. - - **Notes:** - - * The original table is not modified. - * Depending on the transformer, this operation might fully load the data into memory, which can be expensive. - - Parameters - ---------- - fitted_transformer: - The fitted, invertible transformer to apply. 
- - Returns - ------- - new_table: - The inverse-transformed table. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> from safeds.data.tabular.transformation import ExperimentalRangeScaler - >>> table = ExperimentalTable({"a": [1, 2, 3]}) - >>> transformer, transformed_table = ExperimentalRangeScaler(min_=0, max_=1).fit_and_transform(table, ["a"]) - >>> transformed_table.inverse_transform_table(transformer) - +---------+ - | a | - | --- | - | f64 | - +=========+ - | 1.00000 | - | 2.00000 | - | 3.00000 | - +---------+ - """ - return fitted_transformer.inverse_transform(self) - - def transform_table(self, fitted_transformer: ExperimentalTableTransformer) -> ExperimentalTable: - """ - Return a new table transformed by a **fitted** transformer. - - **Notes:** - - * The original table is not modified. - * Depending on the transformer, this operation might fully load the data into memory, which can be expensive. - - - Parameters - ---------- - fitted_transformer: - The fitted transformer to apply. - - Returns - ------- - new_table: - The transformed table. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> from safeds.data.tabular.transformation import ExperimentalRangeScaler - >>> table = ExperimentalTable({"a": [1, 2, 3]}) - >>> transformer = ExperimentalRangeScaler(min_=0, max_=1).fit(table, ["a"]) - >>> table.transform_table(transformer) - +---------+ - | a | - | --- | - | f64 | - +=========+ - | 0.00000 | - | 0.50000 | - | 1.00000 | - +---------+ - """ - return fitted_transformer.transform(self) - - # ------------------------------------------------------------------------------------------------------------------ - # Statistics - # ------------------------------------------------------------------------------------------------------------------ - - def summarize_statistics(self) -> ExperimentalTable: - """ - Return a table with important statistics about this table. 
- - Returns - ------- - statistics: - The table with statistics. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 3]}) - >>> table.summarize_statistics() - +----------------------+--------------------+ - | metric | a | - | --- | --- | - | str | str | - +===========================================+ - | min | 1 | - | max | 3 | - | mean | 2.0 | - | median | 2.0 | - | standard deviation | 1.4142135623730951 | - | distinct value count | 2 | - | idness | 1.0 | - | missing value ratio | 0.0 | - | stability | 0.5 | - +----------------------+--------------------+ - """ - if self.number_of_columns == 0: - return ExperimentalTable() - - head = self.get_column(self.column_names[0]).summarize_statistics() - tail = [self.get_column(name).summarize_statistics().get_column(name)._series for name in self.column_names[1:]] - - return ExperimentalTable._from_polars_data_frame( - head._lazy_frame.collect().hstack(tail, in_place=True), - ) - - # ------------------------------------------------------------------------------------------------------------------ - # Export - # ------------------------------------------------------------------------------------------------------------------ - - def to_columns(self) -> list[ExperimentalColumn]: - """ - Return the data of the table as a list of columns. - - Returns - ------- - columns: - List of columns. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> columns = table.to_columns() - """ - return [ExperimentalColumn._from_polars_series(column) for column in self._data_frame.get_columns()] - - def to_csv_file(self, path: str | Path) -> None: - """ - Write the table to a CSV file. - - If the file and/or the parent directories do not exist, they will be created. If the file exists already, it - will be overwritten. 
- - Parameters - ---------- - path: - The path to the CSV file. If the file extension is omitted, it is assumed to be ".csv". - - Raises - ------ - ValueError - If the path has an extension that is not ".csv". - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.to_csv_file("./src/resources/to_csv_file.csv") - """ - path = _check_and_normalize_file_path(path, ".csv", [".csv"]) - path.parent.mkdir(parents=True, exist_ok=True) - - self._lazy_frame.sink_csv(path) - - def to_dict(self) -> dict[str, list[Any]]: - """ - Return a dictionary that maps column names to column values. - - Returns - ------- - dict_: - Dictionary representation of the table. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.to_dict() - {'a': [1, 2, 3], 'b': [4, 5, 6]} - """ - return self._data_frame.to_dict(as_series=False) - - def to_json_file( - self, - path: str | Path, - *, - orientation: Literal["column", "row"] = "column", - ) -> None: - """ - Write the table to a JSON file. - - If the file and/or the parent directories do not exist, they will be created. If the file exists already, it - will be overwritten. - - **Note:** This operation must fully load the data into memory, which can be expensive. - - Parameters - ---------- - path: - The path to the JSON file. If the file extension is omitted, it is assumed to be ".json". - orientation: - The orientation of the JSON file. If "column", the JSON file will be structured as a list of columns. If - "row", the JSON file will be structured as a list of rows. Row orientation is more human-readable, but - slower and less memory-efficient. - - Raises - ------ - ValueError - If the path has an extension that is not ".json". 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.to_json_file("./src/resources/to_json_file_2.json") - """ - path = _check_and_normalize_file_path(path, ".json", [".json"]) - path.parent.mkdir(parents=True, exist_ok=True) - - # Write JSON to file - self._data_frame.write_json(path, row_oriented=(orientation == "row")) - - def to_parquet_file(self, path: str | Path) -> None: - """ - Write the table to a Parquet file. - - If the file and/or the parent directories do not exist, they will be created. If the file exists already, it - will be overwritten. - - Parameters - ---------- - path: - The path to the Parquet file. If the file extension is omitted, it is assumed to be ".parquet". - - Raises - ------ - ValueError - If the path has an extension that is not ".parquet". - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.to_parquet_file("./src/resources/to_parquet_file.parquet") - """ - path = _check_and_normalize_file_path(path, ".parquet", [".parquet"]) - path.parent.mkdir(parents=True, exist_ok=True) - - self._lazy_frame.sink_parquet(path) - - def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> ExperimentalTabularDataset: - """ - Return a new `TabularDataset` with columns marked as a target, feature, or extra. - - * The target column is the column that a model should predict. - * Feature columns are columns that a model should use to make predictions. - * Extra columns are columns that are neither feature nor target. They can be used to provide additional context, - like an ID column. - - Feature columns are implicitly defined as all columns except the target and extra columns. If no extra columns - are specified, all columns except the target column are used as features. 
- - Parameters - ---------- - target_name: - Name of the target column. - extra_names: - Names of the columns that are neither feature nor target. If None, no extra columns are used, i.e. all but - the target column are used as features. - - Returns - ------- - dataset: - A new tabular dataset with the given target and feature names. - - Raises - ------ - ValueError - If the target column is also a feature column. - ValueError - If no feature columns are specified. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable( - ... { - ... "item": ["apple", "milk", "beer"], - ... "price": [1.10, 1.19, 1.79], - ... "amount_bought": [74, 72, 51], - ... } - ... ) - >>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"]) - """ - return ExperimentalTabularDataset(self, target_name, extra_names) - - def temporary_to_old_table(self) -> Table: - """ - Convert the table to the old table format. This method is temporary and will be removed in a later version. - - Returns - ------- - old_table: - The table in the old format. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> old_table = table.temporary_to_old_table() - """ - return Table._from_pandas_dataframe(self._data_frame.to_pandas()) - - # ------------------------------------------------------------------------------------------------------------------ - # Dataframe interchange protocol - # ------------------------------------------------------------------------------------------------------------------ - - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): # type: ignore[no-untyped-def] - """ - Return a dataframe object that conforms to the dataframe interchange protocol. - - Generally, there is no reason to call this method directly. 
The dataframe interchange protocol is designed to - allow libraries to consume tabular data from different sources, such as `pandas` or `polars`. If you still - decide to call this method, you should not rely on any capabilities of the returned object beyond the dataframe - interchange protocol. - - The specification of the dataframe interchange protocol can be found - [here](https://data-apis.org/dataframe-protocol/latest/index.html). - - **Note:** This operation must fully load the data into memory, which can be expensive. - - Parameters - ---------- - nan_as_null: - This parameter is deprecated and will be removed in a later revision of the dataframe interchange protocol. - Setting it has no effect. - allow_copy: - Whether memory may be copied to create the dataframe object. - - Returns - ------- - dataframe: - A dataframe object that conforms to the dataframe interchange protocol. - """ - return self._data_frame.__dataframe__(allow_copy=allow_copy) - - # ------------------------------------------------------------------------------------------------------------------ - # IPython integration - # ------------------------------------------------------------------------------------------------------------------ - - def _repr_html_(self) -> str: - """ - Return a compact HTML representation of the table for IPython. - - **Note:** This operation must fully load the data into memory, which can be expensive. - - Returns - ------- - html: - The generated HTML. 
- """ - return self._data_frame._repr_html_() diff --git a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py b/src/safeds/data/tabular/containers/_lazy_cell.py similarity index 67% rename from src/safeds/data/tabular/containers/_experimental_lazy_cell.py rename to src/safeds/data/tabular/containers/_lazy_cell.py index d3b5e56cf..f82144d22 100644 --- a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py +++ b/src/safeds/data/tabular/containers/_lazy_cell.py @@ -4,7 +4,7 @@ from safeds._utils import _structural_hash -from ._experimental_cell import ExperimentalCell +from ._cell import Cell if TYPE_CHECKING: import polars as pl @@ -14,7 +14,7 @@ R = TypeVar("R") -class _LazyCell(ExperimentalCell[T]): +class _LazyCell(Cell[T]): """ A single value in a table. @@ -30,123 +30,123 @@ def __init__(self, expression: pl.Expr) -> None: # "Boolean" operators (actually bitwise) ----------------------------------- - def __invert__(self) -> ExperimentalCell[bool]: + def __invert__(self) -> Cell[bool]: return _wrap(self._expression.__invert__()) - def __and__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __and__(self, other: bool | Cell[bool]) -> Cell[bool]: return _wrap(self._expression.__and__(other)) - def __rand__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __rand__(self, other: bool | Cell[bool]) -> Cell[bool]: return _wrap(self._expression.__rand__(other)) - def __or__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __or__(self, other: bool | Cell[bool]) -> Cell[bool]: return _wrap(self._expression.__or__(other)) - def __ror__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __ror__(self, other: bool | Cell[bool]) -> Cell[bool]: return _wrap(self._expression.__ror__(other)) - def __xor__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __xor__(self, other: bool | Cell[bool]) -> Cell[bool]: return 
_wrap(self._expression.__xor__(other)) - def __rxor__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __rxor__(self, other: bool | Cell[bool]) -> Cell[bool]: return _wrap(self._expression.__rxor__(other)) # Comparison --------------------------------------------------------------- - def __eq__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[override] + def __eq__(self, other: object) -> Cell[bool]: # type: ignore[override] other = _unwrap(other) return _wrap(self._expression.__eq__(other)) - def __ge__(self, other: Any) -> ExperimentalCell[bool]: + def __ge__(self, other: Any) -> Cell[bool]: other = _unwrap(other) return _wrap(self._expression.__ge__(other)) - def __gt__(self, other: Any) -> ExperimentalCell[bool]: + def __gt__(self, other: Any) -> Cell[bool]: other = _unwrap(other) return _wrap(self._expression.__gt__(other)) - def __le__(self, other: Any) -> ExperimentalCell[bool]: + def __le__(self, other: Any) -> Cell[bool]: other = _unwrap(other) return _wrap(self._expression.__le__(other)) - def __lt__(self, other: Any) -> ExperimentalCell[bool]: + def __lt__(self, other: Any) -> Cell[bool]: other = _unwrap(other) return _wrap(self._expression.__lt__(other)) - def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[override] + def __ne__(self, other: object) -> Cell[bool]: # type: ignore[override] other = _unwrap(other) return _wrap(self._expression.__ne__(other)) # Numeric operators -------------------------------------------------------- - def __abs__(self) -> ExperimentalCell[R]: + def __abs__(self) -> Cell[R]: return _wrap(self._expression.__abs__()) - def __ceil__(self) -> ExperimentalCell[R]: + def __ceil__(self) -> Cell[R]: return _wrap(self._expression.ceil()) - def __floor__(self) -> ExperimentalCell[R]: + def __floor__(self) -> Cell[R]: return _wrap(self._expression.floor()) - def __neg__(self) -> ExperimentalCell[R]: + def __neg__(self) -> Cell[R]: return 
_wrap(self._expression.__neg__()) - def __pos__(self) -> ExperimentalCell[R]: + def __pos__(self) -> Cell[R]: return _wrap(self._expression.__pos__()) - def __add__(self, other: Any) -> ExperimentalCell[R]: + def __add__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__add__(other)) - def __radd__(self, other: Any) -> ExperimentalCell[R]: + def __radd__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__radd__(other)) - def __floordiv__(self, other: Any) -> ExperimentalCell[R]: + def __floordiv__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__floordiv__(other)) - def __rfloordiv__(self, other: Any) -> ExperimentalCell[R]: + def __rfloordiv__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__rfloordiv__(other)) - def __mod__(self, other: Any) -> ExperimentalCell[R]: + def __mod__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__mod__(other)) - def __rmod__(self, other: Any) -> ExperimentalCell[R]: + def __rmod__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__rmod__(other)) - def __mul__(self, other: Any) -> ExperimentalCell[R]: + def __mul__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__mul__(other)) - def __rmul__(self, other: Any) -> ExperimentalCell[R]: + def __rmul__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__rmul__(other)) - def __pow__(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: + def __pow__(self, other: float | Cell[P]) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__pow__(other)) - def __rpow__(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: + def __rpow__(self, other: float | Cell[P]) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__rpow__(other)) - def __sub__(self, other: Any) -> 
ExperimentalCell[R]: + def __sub__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__sub__(other)) - def __rsub__(self, other: Any) -> ExperimentalCell[R]: + def __rsub__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__rsub__(other)) - def __truediv__(self, other: Any) -> ExperimentalCell[R]: + def __truediv__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__truediv__(other)) - def __rtruediv__(self, other: Any) -> ExperimentalCell[R]: + def __rtruediv__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._expression.__rtruediv__(other)) diff --git a/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py b/src/safeds/data/tabular/containers/_lazy_vectorized_row.py similarity index 81% rename from src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py rename to src/safeds/data/tabular/containers/_lazy_vectorized_row.py index 7f29db99c..00e9cc30f 100644 --- a/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py +++ b/src/safeds/data/tabular/containers/_lazy_vectorized_row.py @@ -4,17 +4,16 @@ from safeds.exceptions import UnknownColumnNameError -from ._experimental_lazy_cell import _LazyCell -from ._experimental_row import ExperimentalRow +from ._lazy_cell import _LazyCell +from ._row import Row if TYPE_CHECKING: - from safeds.data.tabular.typing import ExperimentalSchema - from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType + from safeds.data.tabular.typing import DataType, Schema - from ._experimental_table import ExperimentalTable + from ._table import Table -class _LazyVectorizedRow(ExperimentalRow): +class _LazyVectorizedRow(Row): """ A one-dimensional collection of named, heterogeneous values. 
@@ -29,8 +28,8 @@ class _LazyVectorizedRow(ExperimentalRow): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, table: ExperimentalTable): - self._table: ExperimentalTable = table + def __init__(self, table: Table): + self._table: Table = table def __eq__(self, other: object) -> bool: if not isinstance(other, _LazyVectorizedRow): @@ -58,7 +57,7 @@ def number_of_columns(self) -> int: return self._table.number_of_columns @property - def schema(self) -> ExperimentalSchema: + def schema(self) -> Schema: return self._table.schema # ------------------------------------------------------------------------------------------------------------------ @@ -73,7 +72,7 @@ def get_value(self, name: str) -> _LazyCell: return _LazyCell(pl.col(name)) - def get_column_type(self, name: str) -> ExperimentalDataType: + def get_column_type(self, name: str) -> DataType: return self._table.get_column_type(name) def has_column(self, name: str) -> bool: diff --git a/src/safeds/data/tabular/containers/_row.py b/src/safeds/data/tabular/containers/_row.py index ff3312eb9..b3c2d1ea7 100644 --- a/src/safeds/data/tabular/containers/_row.py +++ b/src/safeds/data/tabular/containers/_row.py @@ -1,507 +1,114 @@ from __future__ import annotations -import sys -from collections.abc import Mapping +from abc import ABC, abstractmethod +from collections.abc import Iterator, Mapping from typing import TYPE_CHECKING, Any -from safeds._utils import _structural_hash -from safeds.data.tabular.typing import ColumnType, Schema -from safeds.exceptions import UnknownColumnNameError - if TYPE_CHECKING: - from collections.abc import Iterator + from safeds.data.tabular.typing import DataType, Schema - import pandas as pd + from ._cell import Cell -class Row(Mapping[str, Any]): +class Row(ABC, Mapping[str, Any]): """ - A row is a collection of named values. - - Parameters - ---------- - data: - The data. 
If None, an empty row is created. + A one-dimensional collection of named, heterogeneous values. - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) + This class cannot be instantiated directly. It is only used for arguments of callbacks. """ - # ------------------------------------------------------------------------------------------------------------------ - # Creation - # ------------------------------------------------------------------------------------------------------------------ - - @staticmethod - def from_dict(data: dict[str, Any]) -> Row: - """ - Create a row from a dictionary that maps column names to column values. - - Parameters - ---------- - data: - The data. - - Returns - ------- - row: - The created row. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row.from_dict({"a": 1, "b": 2}) - """ - return Row(data) - - @staticmethod - def _from_pandas_dataframe(data: pd.DataFrame, schema: Schema | None = None) -> Row: - """ - Create a row from a `pandas.DataFrame`. - - Parameters - ---------- - data: - The data. - schema: - The schema. If None, the schema is inferred from the data. - - Returns - ------- - row: - The created row. - - Raises - ------ - ValueError - If the dataframe does not contain exactly one row. 
- - Examples - -------- - >>> import pandas as pd - >>> from safeds.data.tabular.containers import Row - >>> row = Row._from_pandas_dataframe(pd.DataFrame({"a": [1], "b": [2]})) - """ - if data.shape[0] != 1: - raise ValueError("The dataframe has to contain exactly one row.") - - data = data.reset_index(drop=True) - - result = object.__new__(Row) - result._data = data - - if schema is None: - # noinspection PyProtectedMember - result._schema = Schema._from_pandas_dataframe(data) - else: - result._schema = schema - - return result - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, data: Mapping[str, Any] | None = None) -> None: - """ - Create a row from a mapping of column names to column values. - - Parameters - ---------- - data: - The data. If None, an empty row is created. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - """ - import pandas as pd - - # Enable copy-on-write for pandas dataframes - pd.options.mode.copy_on_write = True - - if data is None: - data = {} - - data = {key: [value] for key, value in data.items()} - - self._data: pd.DataFrame = pd.DataFrame(data) - # noinspection PyProtectedMember - self._schema: Schema = Schema._from_pandas_dataframe(self._data) + def __contains__(self, name: Any) -> bool: + return self.has_column(name) - def __contains__(self, obj: Any) -> bool: - """ - Check whether the row contains an object as key. - - Parameters - ---------- - obj: - The object. - - Returns - ------- - has_column: - True, if the row contains the object as key, False otherwise. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> "a" in row - True - - >>> "c" in row - False - """ - return isinstance(obj, str) and self.has_column(obj) - - def __eq__(self, other: object) -> bool: - """ - Check whether this row is equal to another object. - - Parameters - ---------- - other: - The other object. - - Returns - ------- - equal: - True if the other object is an identical row. False if the other object is a different row. NotImplemented - if the other object is not a row. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row1 = Row({"a": 1, "b": 2}) - >>> row2 = Row({"a": 1, "b": 2}) - >>> row1 == row2 - True + @abstractmethod + def __eq__(self, other: object) -> bool: ... - >>> row3 = Row({"a": 1, "b": 3}) - >>> row1 == row3 - False - """ - if not isinstance(other, Row): - return NotImplemented - if self is other: - return True - return self._schema == other._schema and self._data.equals(other._data) - - def __getitem__(self, column_name: str) -> Any: - """ - Return the value of a specified column. - - Parameters - ---------- - column_name: - The column name. - - Returns - ------- - value: - The column value. + def __getitem__(self, name: str) -> Cell: + return self.get_value(name) - Raises - ------ - UnknownColumnNameError - If the row does not contain the specified column. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> row["a"] - 1 - """ - return self.get_value(column_name) - - def __hash__(self) -> int: - """ - Return a deterministic hash value for this row. - - Returns - ------- - hash: - The hash value. - """ - return _structural_hash(self._schema, [str(self.get_value(value)) for value in self]) + @abstractmethod + def __hash__(self) -> int: ... def __iter__(self) -> Iterator[Any]: - """ - Create an iterator for the column names of this row. 
- - Returns - ------- - iterator: - The iterator. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> list(row) - ['a', 'b'] - """ return iter(self.column_names) def __len__(self) -> int: - """ - Return the number of columns in this row. - - Returns - ------- - number_of_columns: - The number of columns. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> len(row) - 2 - """ - return self._data.shape[1] - - def __repr__(self) -> str: - """ - Return an unambiguous string representation of this row. - - Returns - ------- - representation: - The string representation. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1}) - >>> repr(row) - "Row({'a': 1})" - """ - return f"Row({self!s})" - - def __sizeof__(self) -> int: - """ - Return the complete size of this object. - - Returns - ------- - size: - Size of this object in bytes. - """ - return sys.getsizeof(self._data) + sys.getsizeof(self._schema) - - def __str__(self) -> str: - """ - Return a user-friendly string representation of this row. + return self.number_of_columns - Returns - ------- - representation: - The string representation. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1}) - >>> str(row) - "{'a': 1}" - """ - match len(self): - case 0: - return "{}" - case 1: - return str(self.to_dict()) - case _: - lines = (f" {name!r}: {value!r}" for name, value in self.to_dict().items()) - joined = ",\n".join(lines) - return f"{{\n{joined}\n}}" + @abstractmethod + def __sizeof__(self) -> int: ... 
# ------------------------------------------------------------------------------------------------------------------ # Properties # ------------------------------------------------------------------------------------------------------------------ @property + @abstractmethod def column_names(self) -> list[str]: - """ - Return a list of all column names in the row. - - Returns - ------- - column_names: - The column names. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> row.column_names - ['a', 'b'] - """ - return self._schema.column_names + """The names of the columns in the row.""" @property + @abstractmethod def number_of_columns(self) -> int: - """ - Return the number of columns in this row. - - Returns - ------- - number_of_columns: - The number of columns. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> row.number_of_columns - 2 - """ - return self._data.shape[1] + """The number of columns in the row.""" @property + @abstractmethod def schema(self) -> Schema: - """ - Return the schema of the row. - - Returns - ------- - schema: - The schema. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> schema = row.schema - """ - return self._schema + """The schema of the row.""" # ------------------------------------------------------------------------------------------------------------------ - # Getters + # Column operations # ------------------------------------------------------------------------------------------------------------------ - def get_value(self, column_name: str) -> Any: + @abstractmethod + def get_value(self, name: str) -> Cell: """ - Return the value of a specified column. + Get the value of the specified column. Parameters ---------- - column_name: - The column name. + name: + The name of the column. Returns ------- value: - The column value. 
- - Raises - ------ - UnknownColumnNameError - If the row does not contain the specified column. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> row.get_value("a") - 1 + The value of the column. """ - if not self.has_column(column_name): - raise UnknownColumnNameError([column_name]) - - return self._data.loc[0, column_name] - def has_column(self, column_name: str) -> bool: + @abstractmethod + def get_column_type(self, name: str) -> DataType: """ - Check whether the row contains a given column. + Get the type of the specified column. Parameters ---------- - column_name: - The column name. - - Returns - ------- - has_column: - True, if the row contains the column, False otherwise. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> row.has_column("a") - True - - >>> row.has_column("c") - False - """ - return self._schema.has_column(column_name) - - def get_column_type(self, column_name: str) -> ColumnType: - """ - Return the type of the specified column. - - Parameters - ---------- - column_name: - The column name. + name: + The name of the column. Returns ------- type: The type of the column. - - Raises - ------ - UnknownColumnNameError - If the row does not contain the specified column. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> row.get_column_type("a") - Integer """ - return self._schema.get_column_type(column_name) - - # ------------------------------------------------------------------------------------------------------------------ - # Conversion - # ------------------------------------------------------------------------------------------------------------------ - def to_dict(self) -> dict[str, Any]: + @abstractmethod + def has_column(self, name: str) -> bool: """ - Return a dictionary that maps column names to column values. 
- - Returns - ------- - data: - Dictionary representation of the row. + Check if the row has a column with the specified name. - Examples - -------- - >>> from safeds.data.tabular.containers import Row - >>> row = Row({"a": 1, "b": 2}) - >>> row.to_dict() - {'a': 1, 'b': 2} - """ - return {column_name: self.get_value(column_name) for column_name in self.column_names} - - # ------------------------------------------------------------------------------------------------------------------ - # IPython integration - # ------------------------------------------------------------------------------------------------------------------ - - def _repr_html_(self) -> str: - """ - Return an HTML representation of the row. + Parameters + ---------- + name: + The name of the column. Returns ------- - output: - The generated HTML. + has_column: + Whether the row has a column with the specified name. """ - return self._data.to_html(max_rows=1, max_cols=self._data.shape[1], notebook=True) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index dbcd923e1..f1eb78707 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -1,66 +1,68 @@ from __future__ import annotations -import functools -import io -import sys -import warnings -from pathlib import Path -from typing import TYPE_CHECKING, Any, TypeVar +from typing import TYPE_CHECKING, Any, Literal from safeds._config import _get_device, _init_default_device -from safeds._utils import _structural_hash -from safeds.data.image.containers import Image -from safeds.data.tabular.typing import ColumnType, Schema +from safeds._config._polars import _get_polars_config +from safeds._utils import _check_and_normalize_file_path, _structural_hash +from safeds._utils._random import _get_random_seed +from safeds.data.labeled.containers import TabularDataset, TimeSeriesDataset +from safeds.data.tabular.plotting import TablePlotter +from 
safeds.data.tabular.typing._polars_data_type import _PolarsDataType +from safeds.data.tabular.typing._polars_schema import _PolarsSchema from safeds.exceptions import ( + ClosedBound, ColumnLengthMismatchError, - ColumnSizeError, DuplicateColumnNameError, - IndexOutOfBoundsError, - NonNumericColumnError, + OutOfBoundsError, UnknownColumnNameError, - WrongFileExtensionError, ) from ._column import Column -from ._row import Row +from ._lazy_cell import _LazyCell +from ._lazy_vectorized_row import _LazyVectorizedRow if TYPE_CHECKING: from collections.abc import Callable, Mapping, Sequence + from pathlib import Path - import numpy as np - import pandas as pd + import polars as pl import torch + from torch import Tensor from torch.utils.data import DataLoader, Dataset - from safeds.data.labeled.containers import TabularDataset, TimeSeriesDataset - from safeds.data.tabular.transformation import InvertibleTableTransformer, TableTransformer + from safeds.data.tabular.transformation import ( + InvertibleTableTransformer, + TableTransformer, + ) + from safeds.data.tabular.typing import DataType, Schema + + from ._cell import Cell + from ._row import Row -# noinspection PyProtectedMember class Table: """ - A table is a two-dimensional collection of data. It can either be seen as a list of rows or as a list of columns. + A two-dimensional collection of data. It can either be seen as a list of rows or as a list of columns. To create a `Table` call the constructor or use one of the following static methods: - | Method | Description | - | ---------------------------------------------------------------------------- | -------------------------------------- | - | [from_csv_file][safeds.data.tabular.containers._table.Table.from_csv_file] | Create a table from a CSV file. | - | [from_json_file][safeds.data.tabular.containers._table.Table.from_json_file] | Create a table from a JSON file. 
| - | [from_dict][safeds.data.tabular.containers._table.Table.from_dict] | Create a table from a dictionary. | - | [from_columns][safeds.data.tabular.containers._table.Table.from_columns] | Create a table from a list of columns. | - | [from_rows][safeds.data.tabular.containers._table.Table.from_rows] | Create a table from a list of rows. | - - Note: When removing the last column of the table, the `number_of_columns` property will be set to 0. + | Method | Description | + | ---------------------------------------------------------------------------------- | -------------------------------------- | + | [from_csv_file][safeds.data.tabular.containers._table.Table.from_csv_file] | Create a table from a CSV file. | + | [from_json_file][safeds.data.tabular.containers._table.Table.from_json_file] | Create a table from a JSON file. | + | [from_parquet_file][safeds.data.tabular.containers._table.Table.from_parquet_file] | Create a table from a Parquet file. | + | [from_columns][safeds.data.tabular.containers._table.Table.from_columns] | Create a table from a list of columns. | + | [from_dict][safeds.data.tabular.containers._table.Table.from_dict] | Create a table from a dictionary. | Parameters ---------- data: - The data. If None, an empty table is created. + The data of the table. If None, an empty table is created. Raises ------ - ColumnLengthMismatchError + ValueError If columns have different lengths. Examples @@ -70,97 +72,90 @@ class Table: """ # ------------------------------------------------------------------------------------------------------------------ - # Creation + # Import # ------------------------------------------------------------------------------------------------------------------ @staticmethod - def from_csv_file(path: str | Path) -> Table: + def from_columns(columns: Column | list[Column]) -> Table: """ - Read data from a CSV file into a table. + Create a table from a list of columns. Parameters ---------- - path: - The path to the CSV file. 
+ columns: + The columns. Returns ------- table: - The table created from the CSV file. - - Raises - ------ - FileNotFoundError - If the specified file does not exist. - WrongFileExtensionError - If the file is not a csv file. + The created table. Examples -------- - >>> from safeds.data.tabular.containers import Table - >>> Table.from_csv_file('./src/resources/from_csv_file.csv') - a b c - 0 1 2 1 - 1 0 0 7 - """ - import pandas as pd - - path = Path(path) - if path.suffix != ".csv": - raise WrongFileExtensionError(path, ".csv") - if path.exists(): - with path.open(encoding="utf-8") as f: - if f.read().replace("\n", "") == "": - return Table() - - return Table._from_pandas_dataframe(pd.read_csv(path)) - else: - raise FileNotFoundError(f'File "{path}" does not exist') + >>> from safeds.data.tabular.containers import Column, Table + >>> a = Column("a", [1, 2, 3]) + >>> b = Column("b", [4, 5, 6]) + >>> Table.from_columns([a, b]) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + import polars as pl + + # TODO: raises + + if isinstance(columns, Column): + columns = [columns] + + return Table._from_polars_lazy_frame( + pl.LazyFrame([column._series for column in columns]), + ) @staticmethod - def from_json_file(path: str | Path) -> Table: + def from_csv_file(path: str | Path) -> Table: """ - Read data from a JSON file into a table. + Create a table from a CSV file. Parameters ---------- path: - The path to the JSON file. + The path to the CSV file. If the file extension is omitted, it is assumed to be ".csv". Returns ------- table: - The table created from the JSON file. + The created table. Raises ------ FileNotFoundError - If the specified file does not exist. - WrongFileExtensionError - If the file is not a JSON file. + If no file exists at the given path. + ValueError + If the path has an extension that is not ".csv". 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> Table.from_json_file('./src/resources/from_json_file.json') - a b - 0 1 4 - 1 2 5 - 2 3 6 - """ - import pandas as pd - - path = Path(path) - if path.suffix != ".json": - raise WrongFileExtensionError(path, ".json") - if path.exists(): - with path.open(encoding="utf-8") as f: - if f.read().replace("\n", "") in ("", "{}"): - return Table() - - return Table._from_pandas_dataframe(pd.read_json(path)) - else: - raise FileNotFoundError(f'File "{path}" does not exist') + >>> Table.from_csv_file("./src/resources/from_csv_file.csv") + +-----+-----+-----+ + | a | b | c | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 1 | 2 | 1 | + | 0 | 0 | 7 | + +-----+-----+-----+ + """ + import polars as pl + + path = _check_and_normalize_file_path(path, ".csv", [".csv"], check_if_file_exists=True) + return Table._from_polars_lazy_frame(pl.scan_csv(path)) @staticmethod def from_dict(data: dict[str, list[Any]]) -> Table: @@ -179,164 +174,120 @@ def from_dict(data: dict[str, list[Any]]) -> Table: Raises ------ - ColumnLengthMismatchError + ValueError If columns have different lengths. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> d = {'a': [1, 2], 'b': [3, 4]} - >>> Table.from_dict(d) - a b - 0 1 3 - 1 2 4 + >>> data = {'a': [1, 2, 3], 'b': [4, 5, 6]} + >>> Table.from_dict(data) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ """ return Table(data) @staticmethod - def from_columns(columns: list[Column]) -> Table: + def from_json_file(path: str | Path) -> Table: """ - Return a table created from a list of columns. + Create a table from a JSON file. Parameters ---------- - columns: - The columns to be combined. They need to have the same size. + path: + The path to the JSON file. If the file extension is omitted, it is assumed to be ".json". 
Returns ------- table: - The generated table. + The created table. Raises ------ - ColumnLengthMismatchError - If any of the column sizes does not match with the others. - DuplicateColumnNameError - If multiple columns have the same name. + FileNotFoundError + If no file exists at the given path. + ValueError + If the path has an extension that is not ".json". Examples -------- - >>> from safeds.data.tabular.containers import Column, Table - >>> col1 = Column("a", [1, 2, 3]) - >>> col2 = Column("b", [4, 5, 6]) - >>> Table.from_columns([col1, col2]) - a b - 0 1 4 - 1 2 5 - 2 3 6 - """ - import pandas as pd - from pandas import DataFrame - - dataframe: DataFrame = pd.DataFrame() - column_names = [] - - for column in columns: - if column._data.size != columns[0]._data.size: - raise ColumnLengthMismatchError( - "\n".join(f"{column.name}: {column._data.size}" for column in columns), - ) - if column.name in column_names: - raise DuplicateColumnNameError(column.name) - column_names.append(column.name) - dataframe[column.name] = column._data - - return Table._from_pandas_dataframe(dataframe) + >>> from safeds.data.tabular.containers import Table + >>> Table.from_json_file("./src/resources/from_json_file_2.json") + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + import polars as pl + + path = _check_and_normalize_file_path(path, ".json", [".json"], check_if_file_exists=True) + return Table._from_polars_data_frame(pl.read_json(path)) @staticmethod - def from_rows(rows: list[Row]) -> Table: + def from_parquet_file(path: str | Path) -> Table: """ - Return a table created from a list of rows. + Create a table from a Parquet file. Parameters ---------- - rows: - The rows to be combined. They need to have a matching schema. + path: + The path to the Parquet file. If the file extension is omitted, it is assumed to be ".parquet". Returns ------- table: - The generated table. + The created table. 
Raises ------ - UnknownColumnNameError - If any of the row column names does not match with the first row. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row, Table - >>> row1 = Row({"a": 1, "b": 2}) - >>> row2 = Row({"a": 3, "b": 4}) - >>> Table.from_rows([row1, row2]) - a b - 0 1 2 - 1 3 4 - """ - import pandas as pd - from pandas import DataFrame - - if len(rows) == 0: - return Table._from_pandas_dataframe(pd.DataFrame()) - - column_names_compare: list = list(rows[0].column_names) - unknown_column_names = set() - row_array: list[pd.DataFrame] = [] - - for row in rows: - unknown_column_names.update(set(column_names_compare) - set(row.column_names)) - row_array.append(row._data) - if len(unknown_column_names) > 0: - raise UnknownColumnNameError(list(unknown_column_names)) - - dataframe: DataFrame = pd.concat(row_array, ignore_index=True) - dataframe.columns = column_names_compare - - schema = Schema._merge_multiple_schemas([row.schema for row in rows]) - - return Table._from_pandas_dataframe(dataframe, schema) - - @staticmethod - def _from_pandas_dataframe(data: pd.DataFrame, schema: Schema | None = None) -> Table: - """ - Create a table from a `pandas.DataFrame`. - - Parameters - ---------- - data: - The data. - schema: - The schema. If None, the schema is inferred from the data. - - Returns - ------- - table: - The created table. + FileNotFoundError + If no file exists at the given path. + ValueError + If the path has an extension that is not ".parquet". 
Examples -------- - >>> import pandas as pd >>> from safeds.data.tabular.containers import Table - >>> Table._from_pandas_dataframe(pd.DataFrame({"a": [1], "b": [2]})) - a b - 0 1 2 - """ - import pandas as pd - - data = data.reset_index(drop=True) + >>> Table.from_parquet_file("./src/resources/from_parquet_file.parquet") + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + import polars as pl + + path = _check_and_normalize_file_path(path, ".parquet", [".parquet"], check_if_file_exists=True) + return Table._from_polars_lazy_frame(pl.scan_parquet(path)) + @staticmethod + def _from_polars_data_frame(data: pl.DataFrame) -> Table: result = object.__new__(Table) - result._data = data - - if schema is None: - # noinspection PyProtectedMember - result._schema = Schema._from_pandas_dataframe(data) - else: - result._schema = schema - if result._data.empty: - result._data = pd.DataFrame(columns=schema.column_names) + result._lazy_frame = data.lazy() + result.__data_frame_cache = data + return result + @staticmethod + def _from_polars_lazy_frame(data: pl.LazyFrame) -> Table: + result = object.__new__(Table) + result._lazy_frame = data + result.__data_frame_cache = None return result # ------------------------------------------------------------------------------------------------------------------ @@ -344,32 +295,7 @@ def _from_pandas_dataframe(data: pd.DataFrame, schema: Schema | None = None) -> # ------------------------------------------------------------------------------------------------------------------ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None: - """ - Create a table from a mapping of column names to their values. - - Parameters - ---------- - data: - The data. If None, an empty table is created. - - Raises - ------ - ColumnLengthMismatchError - If columns have different lengths. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> Table({"a": [1, 2, 3], "b": [4, 5, 6]}) - a b - 0 1 4 - 1 2 5 - 2 3 6 - """ - import pandas as pd - - # Enable copy-on-write for pandas dataframes - pd.options.mode.copy_on_write = True + import polars as pl if data is None: data = {} @@ -385,1705 +311,1405 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None: ) # Implementation - self._data: pd.DataFrame = pd.DataFrame(data) - self._data = self._data.reset_index(drop=True) - self._schema: Schema = Schema._from_pandas_dataframe(self._data) + self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data) + self.__data_frame_cache: pl.DataFrame | None = None def __eq__(self, other: object) -> bool: - """ - Compare two table instances. - - Returns - ------- - equals: - 'True' if contents are equal, 'False' otherwise. - - Examples - -------- - >>> from safeds.data.tabular.containers import Row, Table - >>> row1 = Row({"a": 1, "b": 2}) - >>> row2 = Row({"a": 3, "b": 4}) - >>> row3 = Row({"a": 5, "b": 6}) - >>> table1 = Table.from_rows([row1, row2]) - >>> table2 = Table.from_rows([row1, row2]) - >>> table3 = Table.from_rows([row1, row3]) - >>> table1 == table2 - True - >>> table1 == table3 - False - """ if not isinstance(other, Table): - return NotImplemented + return False if self is other: return True - if self.number_of_columns == 0 and other.number_of_columns == 0: - return True - table1 = self.sort_columns() - table2 = other.sort_columns() - if table1.number_of_rows == 0 and table2.number_of_rows == 0: - return table1.column_names == table2.column_names - return table1._schema == table2._schema and table1._data.equals(table2._data) - def __hash__(self) -> int: - """ - Return a deterministic hash value for this table. + return self._data_frame.frame_equal(other._data_frame) - Returns - ------- - hash: - The hash value. 
- """ - return _structural_hash(self._schema, self.number_of_rows) + def __hash__(self) -> int: + return _structural_hash(self.schema, self.number_of_rows) def __repr__(self) -> str: - r""" - Display the table in only one line. - - Returns - ------- - representation: - A string representation of the table in only one line. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> repr(table) - ' a b\n0 1 2\n1 3 4' - """ - tmp = self._data.reset_index(drop=True) - tmp.columns = self.column_names - return tmp.__repr__() + with _get_polars_config(): + return self._data_frame.__repr__() def __sizeof__(self) -> int: - """ - Return the complete size of this object. - - Returns - ------- - size: - Size of this object in bytes. - """ - return sys.getsizeof(self._data) + sys.getsizeof(self._schema) + return self._data_frame.estimated_size() def __str__(self) -> str: - tmp = self._data.reset_index(drop=True) - tmp.columns = self.column_names - return tmp.__str__() + with _get_polars_config(): + return self._data_frame.__str__() # ------------------------------------------------------------------------------------------------------------------ # Properties # ------------------------------------------------------------------------------------------------------------------ @property - def column_names(self) -> list[str]: - """ - Return a list of all column names in this table. + def _data_frame(self) -> pl.DataFrame: + if self.__data_frame_cache is None: + self.__data_frame_cache = self._lazy_frame.collect() - Alias for self.schema.column_names -> list[str]. + return self.__data_frame_cache - Returns - ------- - column_names: - The list of the column names. + @property + def column_names(self) -> list[str]: + """ + The names of the columns in the table. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"col1": [1, 3], "col2": [2, 4]}) + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) >>> table.column_names - ['col1', 'col2'] + ['a', 'b'] """ - return self._schema.column_names + return self._lazy_frame.columns @property def number_of_columns(self) -> int: """ - Return the number of columns. - - Returns - ------- - number_of_columns: - The number of columns. + The number of columns in the table. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1], "b": [2]}) + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) >>> table.number_of_columns 2 """ - return self._data.shape[1] + return self._lazy_frame.width @property def number_of_rows(self) -> int: """ - Return the number of rows. + The number of rows in the table. - Returns - ------- - number_of_rows: - The number of rows. + **Note:** This operation must fully load the data into memory, which can be expensive. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1], "b": [2]}) + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) >>> table.number_of_rows - 1 + 3 """ - return self._data.shape[0] + return self._data_frame.height @property - def schema(self) -> Schema: - """ - Return the schema of the table. + def plot(self) -> TablePlotter: + """The plotter for the table.""" + return TablePlotter(self) - Returns - ------- - schema: - The schema. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import Row, Table - >>> row = Row({"a": 1, "b": 2.5, "c": "ff"}) - >>> table = Table.from_dict({"a": [1, 8], "b": [2.5, 9], "c": ["g", "g"]}) - >>> table.schema - Schema({ - 'a': Integer, - 'b': RealNumber, - 'c': String - }) - >>> table.schema == row.schema - True - """ - return self._schema + @property + def schema(self) -> Schema: + """The schema of the table.""" + return _PolarsSchema(self._lazy_frame.schema) # ------------------------------------------------------------------------------------------------------------------ - # Getters + # Column operations # ------------------------------------------------------------------------------------------------------------------ - def get_column(self, column_name: str) -> Column: + def add_columns( + self, + columns: Column | list[Column], + ) -> Table: """ - Return a column with the data of the specified column. + Return a new table with additional columns. + + **Notes:** + + - The original table is not modified. + - This operation must fully load the data into memory, which can be expensive. Parameters ---------- - column_name: - The name of the column. + columns: + The columns to add. Returns ------- - column: - The column. + new_table: + The table with the additional columns. Raises ------ - UnknownColumnNameError - If the specified target column name does not exist. + ValueError + If a column name already exists. + ValueError + If the columns have incompatible lengths. 
Examples -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1], "b": [2]}) - >>> table.get_column("b") - Column('b', [2]) - """ - if not self.has_column(column_name): - similar_columns = self._get_similar_columns(column_name) - raise UnknownColumnNameError([column_name], similar_columns) + >>> from safeds.data.tabular.containers import Column, Table + >>> table = Table({"a": [1, 2, 3]}) + >>> new_column = Column("b", [4, 5, 6]) + >>> table.add_columns(new_column) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + if isinstance(columns, Column): + columns = [columns] + + if len(columns) == 0: + return self - return Column._from_pandas_series( - self._data[column_name], - self.get_column_type(column_name), + return Table._from_polars_data_frame( + self._data_frame.hstack([column._series for column in columns]), ) - def has_column(self, column_name: str) -> bool: - """ - Return whether the table contains a given column. - - Alias for self.schema.hasColumn(column_name: str) -> bool. - - Parameters - ---------- - column_name: - The name of the column. - - Returns - ------- - contains: - True if the column exists. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1], "b": [2]}) - >>> table.has_column("b") - True - >>> table.has_column("c") - False - """ - return self._schema.has_column(column_name) - - def get_column_type(self, column_name: str) -> ColumnType: + def add_computed_column( + self, + name: str, + computer: Callable[[Row], Cell], + ) -> Table: """ - Return the type of the given column. + Return a new table with an additional computed column. - Alias for self.schema.get_type_of_column(column_name: str) -> ColumnType. + **Note:** The original table is not modified. Parameters ---------- - column_name: - The name of the column to be queried. 
+ name: + The name of the new column. + computer: + The function that computes the values of the new column. Returns ------- - type: - The type of the column. + new_table: + The table with the computed column. Raises ------ - UnknownColumnNameError - If the specified target column name does not exist. + ValueError + If the column name already exists. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1], "b": [2.5]}) - >>> table.get_column_type("b") - RealNumber + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.add_computed_column("c", lambda row: row.get_value("a") + row.get_value("b")) + +-----+-----+-----+ + | a | b | c | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 1 | 4 | 5 | + | 2 | 5 | 7 | + | 3 | 6 | 9 | + +-----+-----+-----+ """ - return self._schema.get_column_type(column_name) - - def _get_similar_columns(self, column_name: str) -> list[str]: - """ - Get all the column names in a Table that are similar to a given name. - - Parameters - ---------- - column_name: - The name to compare the Table's column names to. - - Returns - ------- - similar_columns: - A list of all column names in the Table that are similar or equal to the given column name. 
- """ - import Levenshtein - - similar_columns = [] - similarity = 0.6 - i = 0 - while i < len(self.column_names): - if Levenshtein.jaro_winkler(self.column_names[i], column_name) >= similarity: - similar_columns.append(self.column_names[i]) - i += 1 - if len(similar_columns) == 4 and similarity < 0.9: - similarity += 0.1 - similar_columns = [] - i = 0 - - return similar_columns - - # ------------------------------------------------------------------------------------------------------------------ - # Information - # ------------------------------------------------------------------------------------------------------------------ - - def summarize_statistics(self) -> Table: - """ - Return a table with a number of statistical key values. - - The original table is not modified. - - Returns - ------- - result: - The table with statistics. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> table.summarize_statistics() - metric a b - 0 minimum 1 2 - 1 maximum 3 4 - 2 mean 2.0 3.0 - 3 mode [1, 3] [2, 4] - 4 median 2.0 3.0 - 5 variance 2.0 2.0 - 6 standard deviation 1.4142135623730951 1.4142135623730951 - 7 missing value count 0 0 - 8 missing value ratio 0.0 0.0 - 9 idness 1.0 1.0 - 10 stability 0.5 0.5 - """ - import pandas as pd - - if self.number_of_columns == 0: - return Table( - { - "metric": [ - "minimum", - "maximum", - "mean", - "mode", - "median", - "variance", - "standard deviation", - "missing value count", - "missing value ratio", - "idness", - "stability", - ], - }, - ) - elif self.number_of_rows == 0: - table = Table( - { - "metric": [ - "minimum", - "maximum", - "mean", - "mode", - "median", - "variance", - "standard deviation", - "missing value count", - "missing value ratio", - "idness", - "stability", - ], - }, - ) - for name in self.column_names: - table = table.add_column(Column(name, ["-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"])) - return table - - 
columns = self.to_columns() - result = pd.DataFrame() - statistics = {} - - for column in columns: - statistics = { - "minimum": column.minimum, - "maximum": column.maximum, - "mean": column.mean, - "mode": column.mode, - "median": column.median, - "variance": column.variance, - "standard deviation": column.standard_deviation, - "missing value count": column.missing_value_count, - "missing value ratio": column.missing_value_ratio, - "idness": column.idness, - "stability": column.stability, - } - values = [] - - for function in statistics.values(): - try: - values.append(str(function())) - except (NonNumericColumnError, ValueError): - values.append("-") - - result = pd.concat([result, pd.DataFrame(values)], axis=1) - - result = pd.concat([pd.DataFrame(list(statistics.keys())), result], axis=1) - result.columns = ["metric", *self.column_names] - - return Table._from_pandas_dataframe(result) - - # ------------------------------------------------------------------------------------------------------------------ - # Transformations - # ------------------------------------------------------------------------------------------------------------------ - - def _as_table(self: Table) -> Table: - """ - Transform the table to an instance of the Table class. + if self.has_column(name): + raise DuplicateColumnNameError(name) - This method is meant as a way to "cast" instances of subclasses of `Table` to a proper `Table`, dropping any - additional constraints that might have to hold in the subclass. Override accordingly in subclasses. + computed_column = computer(_LazyVectorizedRow(self)) - Returns - ------- - table: - The table, as an instance of the Table class. - """ - return self + return self._from_polars_lazy_frame( + self._lazy_frame.with_columns(computed_column._polars_expression.alias(name)), + ) - def add_column(self, column: Column) -> Table: + def get_column(self, name: str) -> Column: """ - Return a new table with the provided column attached at the end. 
+ Get a column from the table. - The original table is not modified. + **Note:** This operation must fully load the data into memory, which can be expensive. - !!! warning "Deprecated" - Use [add_columns][safeds.data.tabular.containers._table.Table.add_columns] instead. + Parameters + ---------- + name: + The name of the column. Returns ------- - result: - The table with the column attached. + column: + The column. Raises ------ - DuplicateColumnNameError - If the new column already exists. - ColumnSizeError - If the size of the column does not match the number of rows. + KeyError + If the column does not exist. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> col = Column("c", ["d", "e"]) - >>> table.add_column(col) - a b c - 0 1 2 d - 1 3 4 e - """ - warnings.warn( - "This method is deprecated and will be removed in a future version. Use `Table.add_columns` instead.", - DeprecationWarning, - stacklevel=2, - ) - - return self.add_columns([column]) - - def add_columns(self, columns: list[Column] | Table) -> Table: + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.get_column("a") + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ """ - Return a new `Table` with multiple added columns. + self._check_columns_exist(name) + return Column._from_polars_series(self._data_frame.get_column(name)) - The original table is not modified. + def get_column_type(self, name: str) -> DataType: + """ + Get the data type of a column. Parameters ---------- - columns: - The columns to be added. + name: + The name of the column. Returns ------- - result: - A new table combining the original table and the given columns. + type: + The data type of the column. Raises ------ - DuplicateColumnNameError - If at least one column name from the provided column list already exists in the table. 
- ColumnSizeError - If at least one of the column sizes from the provided column list does not match the table. + KeyError + If the column does not exist. Examples -------- - >>> from safeds.data.tabular.containers import Column, Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> col1 = Column("c", ["d", "e"]) - >>> col2 = Column("d", [3.5, 7.9]) - >>> table.add_columns([col1, col2]) - a b c d - 0 1 2 d 3.5 - 1 3 4 e 7.9 + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.get_column_type("a") + Int64 """ - if isinstance(columns, Table): - columns = columns.to_columns() - result = self._data.reset_index(drop=True) - result.columns = self._schema.column_names - for column in columns: - if column.name in result.columns: - raise DuplicateColumnNameError(column.name) + self._check_columns_exist(name) + return _PolarsDataType(self._lazy_frame.schema[name]) - if column.number_of_rows != self.number_of_rows and self.number_of_columns != 0: - raise ColumnSizeError(str(self.number_of_rows), str(column._data.size)) - - result[column.name] = column._data - return Table._from_pandas_dataframe(result) - - def filter_rows(self, query: Callable[[Row], bool]) -> Table: + def has_column(self, name: str) -> bool: """ - Return a new table containing only the rows that satisfy the query. - - The original table is not modified. - - !!! warning "Deprecated" - Use [keep_only_rows][safeds.data.tabular.containers._table.Table.keep_only_rows] instead. + Check if the table has a column with a specific name. Parameters ---------- - query: - A callable that returns True if a row should be included in the new table. + name: + The name of the column. Returns ------- - table: - A table containing only the rows that satisfy the query. - - See Also - -------- - [remove_rows][safeds.data.tabular.containers._table.Table.remove_rows]: - Remove rows that satifsfy a query. 
+ has_column: + Whether the table has a column with the specified name. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> table.filter_rows(lambda x: x["a"] < 2) - a b - 0 1 2 - """ - warnings.warn( - "This method is deprecated and will be removed in a future version. Use `Table.keep_only_rows` instead.", - DeprecationWarning, - stacklevel=2, - ) - - return self.keep_only_rows(query) - - _T = TypeVar("_T") + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.has_column("a") + True + """ + return name in self.column_names - def keep_only_columns(self, column_names: list[str]) -> Table: + def remove_columns( + self, + names: str | list[str], + /, + ) -> Table: """ - Return a new table with only the given column(s). + Return a new table without the specified columns. - The original table is not modified. + **Notes:** - Note: When removing the last column of the table, the `number_of_columns` property will be set to 0. + - The original table is not modified. + - This method does not raise if a column does not exist. You can use it to ensure that the resulting table does + not contain certain columns. Parameters ---------- - column_names: - A list containing only the columns to be kept. + names: + The names of the columns to remove. Returns ------- - table: - A table containing only the given column(s). - - Raises - ------ - UnknownColumnNameError - If any of the given columns does not exist. - IllegalSchemaModificationError - If removing the columns would violate an invariant in the subclass. + new_table: + The table with the columns removed. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> table.keep_only_columns(["b"]) - b - 0 2 - 1 4 - """ - invalid_columns = [] - similar_columns: list[str] = [] - for name in column_names: - if not self._schema.has_column(name): - similar_columns = similar_columns + self._get_similar_columns(name) - invalid_columns.append(name) - if len(invalid_columns) != 0: - raise UnknownColumnNameError(invalid_columns, similar_columns) - - return self.remove_columns(list(set(self.column_names) - set(column_names))) + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_columns("a") + +-----+ + | b | + | --- | + | i64 | + +=====+ + | 4 | + | 5 | + | 6 | + +-----+ + + >>> table.remove_columns(["c"]) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + if isinstance(names, str): + names = [names] + + return Table._from_polars_lazy_frame( + self._lazy_frame.drop(names), + ) - def remove_columns(self, column_names: list[str]) -> Table: + def remove_columns_except( + self, + names: str | list[str], + /, + ) -> Table: """ - Return a new table without the given column(s). - - The original table is not modified. - - Note: When removing the last column of the table, the `number_of_columns` property will be set to 0. + Return a new table with only the specified columns. Parameters ---------- - column_names: - A list containing all columns to be dropped. + names: + The names of the columns to keep. Returns ------- - table: - A table without the given columns. + new_table: + The table with only the specified columns. Raises ------ - UnknownColumnNameError - If any of the given columns does not exist. - IllegalSchemaModificationError - If removing the columns would violate an invariant in the subclass. + KeyError + If a column does not exist. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> table.remove_columns(["b"]) - a - 0 1 - 1 3 - """ - invalid_columns = [] - similar_columns: list[str] = [] - for name in column_names: - if not self._schema.has_column(name): - similar_columns = similar_columns + self._get_similar_columns(name) - invalid_columns.append(name) - if len(invalid_columns) != 0: - raise UnknownColumnNameError(invalid_columns, similar_columns) - - transformed_data = self._data.drop(labels=column_names, axis="columns") - transformed_data.columns = [name for name in self._schema.column_names if name not in column_names] - - if len(transformed_data.columns) == 0: - return Table() - - return Table._from_pandas_dataframe(transformed_data) + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_columns_except("a") + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ + if isinstance(names, str): + names = [names] + + self._check_columns_exist(names) + + return Table._from_polars_lazy_frame( + self._lazy_frame.select(names), + ) def remove_columns_with_missing_values(self) -> Table: """ - Return a new table without the columns that contain missing values. + Return a new table without columns that contain missing values. - The original table is not modified. + **Notes:** - Note: When removing the last column of the table, the `number_of_columns` property will be set to 0. + - The original table is not modified. + - This operation must fully load the data into memory, which can be expensive. Returns ------- - table: - A table without the columns that contain missing values. - - Raises - ------ - IllegalSchemaModificationError - If removing the columns would violate an invariant in the subclass. + new_table: + The table without columns containing missing values. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 2], "b": [None, 2]}) + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, None]}) >>> table.remove_columns_with_missing_values() - a - 0 1 - 1 2 - """ - return Table.from_columns([column for column in self.to_columns() if not column.has_missing_values()]) + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ + import polars as pl + + return Table._from_polars_lazy_frame( + pl.LazyFrame( + [series for series in self._data_frame.get_columns() if series.null_count() == 0], + ), + ) - def remove_columns_with_non_numerical_values(self) -> Table: + def remove_non_numeric_columns(self) -> Table: """ - Return a new table without the columns that contain non-numerical values. - - The original table is not modified. + Return a new table without non-numeric columns. - Note: When removing the last column of the table, the `number_of_columns` property will be set to 0. + **Note:** The original table is not modified. Returns ------- - table: - A table without the columns that contain non-numerical values. - - Raises - ------ - IllegalSchemaModificationError - If removing the columns would violate an invariant in the subclass. + new_table: + The table without non-numeric columns. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 0], "b": ["test", 2]}) - >>> table.remove_columns_with_non_numerical_values() - a - 0 1 - 1 0 - """ - return Table.from_columns([column for column in self.to_columns() if column.type.is_numeric()]) + >>> table = Table({"a": [1, 2, 3], "b": ["4", "5", "6"]}) + >>> table.remove_non_numeric_columns() + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ + import polars.selectors as cs + + return Table._from_polars_lazy_frame( + self._lazy_frame.select(cs.numeric()), + ) - def keep_only_rows(self, query: Callable[[Row], bool]) -> Table: + def rename_column(self, old_name: str, new_name: str) -> Table: """ - Return a new table containing only the rows that satisfy the query. + Return a new table with a column renamed. - The original table is not modified. + **Note:** The original table is not modified. Parameters ---------- - query: - A callable that returns True if a row should be included in the new table. + old_name: + The name of the column to rename. + new_name: + The new name of the column. Returns ------- - table: - A table containing only the rows that satisfy the query. + new_table: + The table with the column renamed. - See Also - -------- - [remove_rows][safeds.data.tabular.containers._table.Table.remove_rows]: - Remove rows that satifsfy a query. + Raises + ------ + KeyError + If no column with the old name exists. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> table.keep_only_rows(lambda x: x["a"] < 2) - a b - 0 1 2 - """ - import pandas as pd - - rows: list[Row] = [row for row in self.to_rows() if query(row)] - if len(rows) == 0: - result_table = Table._from_pandas_dataframe(pd.DataFrame(), self._schema) - else: - result_table = self.from_rows(rows) - return result_table + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.rename_column("a", "c") + +-----+-----+ + | c | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + self._check_columns_exist(old_name) + + return Table._from_polars_lazy_frame( + self._lazy_frame.rename({old_name: new_name}), + ) - def remove_rows(self, query: Callable[[Row], bool]) -> Table: + def replace_column( + self, + old_name: str, + new_columns: Column | list[Column], + ) -> Table: """ - Return a new table without the rows that satisfy the query. + Return a new table with a column replaced by zero or more columns. - The original table is not modified. + **Note:** The original table is not modified. Parameters ---------- - query: - A callable that returns True if the row should be removed. - - Returns - ------- - table: - A table without the rows that satisfy the query. - - See Also - -------- - [keep_only_rows][safeds.data.tabular.containers._table.Table.keep_only_rows]: - Create a table containing only the rows that satisfy a query. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3], "b": [2, 4]}) - >>> table.remove_rows(lambda x: x["a"] < 2) - a b - 0 3 4 - """ - import pandas as pd - - rows: list[Row] = [row for row in self.to_rows() if not query(row)] - if len(rows) == 0: - result_table = Table._from_pandas_dataframe(pd.DataFrame(), self._schema) - else: - result_table = self.from_rows(rows) - return result_table - - def remove_duplicate_rows(self) -> Table: - """ - Return a new table with every duplicate row removed. - - The original table is not modified. + old_name: + The name of the column to replace. + new_columns: + The new column or columns. Returns ------- - result: - The table with the duplicate rows removed. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3, 3], "b": [2, 4, 4]}) - >>> table.remove_duplicate_rows() - a b - 0 1 2 - 1 3 4 - """ - result = self._data.drop_duplicates(ignore_index=True) - result.columns = self._schema.column_names - return Table._from_pandas_dataframe(result) - - def remove_rows_with_missing_values(self) -> Table: - """ - Return a new table without the rows that contain missing values. - - The original table is not modified. + new_table: + The table with the column replaced. - Returns - ------- - table: - A table without the rows that contain missing values. + Raises + ------ + KeyError + If no column with the old name exists. Examples -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1.0, None, 3], "b": [2, 4.0, None]}) - >>> table.remove_rows_with_missing_values() - a b - 0 1.0 2.0 - """ - result = self._data.dropna(axis="index") - return Table._from_pandas_dataframe(result) - - def remove_rows_with_outliers(self) -> Table: - """ - Return a new table without those rows that contain at least one outlier. 
- - We define an outlier as a value that has a distance of more than 3 standard deviations from the column mean. - Missing values are not considered outliers. They are also ignored during the calculation of the standard - deviation. + >>> from safeds.data.tabular.containers import Column, Table + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.replace_column("a", []) + +-----+ + | b | + | --- | + | i64 | + +=====+ + | 4 | + | 5 | + | 6 | + +-----+ + + >>> column1 = Column("c", [7, 8, 9]) + >>> table.replace_column("a", column1) + +-----+-----+ + | c | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 7 | 4 | + | 8 | 5 | + | 9 | 6 | + +-----+-----+ + + >>> column2 = Column("d", [10, 11, 12]) + >>> table.replace_column("a", [column1, column2]) + +-----+-----+-----+ + | c | d | b | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 7 | 10 | 4 | + | 8 | 11 | 5 | + | 9 | 12 | 6 | + +-----+-----+-----+ + """ + self._check_columns_exist(old_name) + + if isinstance(new_columns, Column): + new_columns = [new_columns] + + if len(new_columns) == 0: + return self.remove_columns(old_name) + + if len(new_columns) == 1: + new_column = new_columns[0] + return Table._from_polars_lazy_frame( + self._lazy_frame.with_columns(new_column._series.alias(old_name)).rename({old_name: new_column.name}), + ) - The original table is not modified. + import polars as pl - Returns - ------- - new_table: - A new table without rows containing outliers. 
+ index = self.column_names.index(old_name) - Examples - -------- - >>> from safeds.data.tabular.containers import Column, Table - >>> c1 = Column("a", [1, 3, 1, 0.1, 0, 0, 0, 0, 0, 0, 0, 0]) - >>> c2 = Column("b", [1.5, 1, 0.5, 0.01, 0, 0, 0, 0, 0, 0, 0, 0]) - >>> c3 = Column("c", [0.1, 0.00, 0.4, 0.2, 0, 0, 0, 0, 0, 0, 0, 0]) - >>> c4 = Column("d", [-1000000, 1000000, -1000000, -1000000, -1000000, -1000000, -1000000, -1000000, -1000000, -1000000, -1000000, -1000000]) - >>> table = Table.from_columns([c1, c2, c3, c4]) - >>> table.remove_rows_with_outliers() - a b c d - 0 1.0 1.50 0.1 -1000000 - 1 1.0 0.50 0.4 -1000000 - 2 0.1 0.01 0.2 -1000000 - 3 0.0 0.00 0.0 -1000000 - 4 0.0 0.00 0.0 -1000000 - 5 0.0 0.00 0.0 -1000000 - 6 0.0 0.00 0.0 -1000000 - 7 0.0 0.00 0.0 -1000000 - 8 0.0 0.00 0.0 -1000000 - 9 0.0 0.00 0.0 -1000000 - 10 0.0 0.00 0.0 -1000000 - """ - import numpy as np - from scipy import stats - - table_without_nonnumericals = self.remove_columns_with_non_numerical_values() - z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit")) - filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1) - - return Table._from_pandas_dataframe(self._data[filter_], self._schema) + return Table._from_polars_lazy_frame( + self._lazy_frame.select( + *[pl.col(name) for name in self.column_names[:index]], + *[column._series for column in new_columns], + *[pl.col(name) for name in self.column_names[index + 1:]], + ), + ) - def rename_column(self, old_name: str, new_name: str) -> Table: + def transform_column( + self, + name: str, + transformer: Callable[[Cell], Cell], + ) -> Table: """ - Return a new `Table` with a single column renamed. + Return a new table with a column transformed. - The original table is not modified. + **Note:** The original table is not modified. Parameters ---------- - old_name: - The old name of the target column. - new_name: - The new name of the target column. + name: + The name of the column to transform. 
+ + transformer: + The function that transforms the column. Returns ------- - table: - The Table with the renamed column. + new_table: + The table with the transformed column. Raises ------ - UnknownColumnNameError - If the specified old target column name does not exist. - DuplicateColumnNameError - If the specified new target column name already exists. + KeyError + If no column with the specified name exists. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1], "b": [2]}) - >>> table.rename_column("b", "c") - a c - 0 1 2 - """ - if old_name not in self._schema.column_names: - similar_columns = self._get_similar_columns(old_name) - raise UnknownColumnNameError([old_name], similar_columns) - if old_name == new_name: - return self - if new_name in self._schema.column_names: - raise DuplicateColumnNameError(new_name) - - new_df = self._data.reset_index(drop=True) - new_df.columns = self._schema.column_names - return Table._from_pandas_dataframe(new_df.rename(columns={old_name: new_name})) - - def replace_column(self, old_column_name: str, new_columns: list[Column]) -> Table: - """ - Return a new table with the specified old column replaced by a list of new columns. + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.transform_column("a", lambda cell: cell + 1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 4 | + | 3 | 5 | + | 4 | 6 | + +-----+-----+ + """ + self._check_columns_exist(name) + + import polars as pl + + transformed_column = transformer(_LazyCell(pl.col(name))) + + return Table._from_polars_lazy_frame( + self._lazy_frame.with_columns(transformed_column._polars_expression), + ) - The order of columns is kept. 
+ # ------------------------------------------------------------------------------------------------------------------ + # Row operations + # ------------------------------------------------------------------------------------------------------------------ - The original table is not modified. + # TODO: Rethink group_rows/group_rows_by_column. They should not return a dict. - Parameters - ---------- - old_column_name: - The name of the column to be replaced. + def remove_duplicate_rows(self) -> Table: + """ + Return a new table without duplicate rows. - new_columns: - The list of new columns replacing the old column. + **Note:** The original table is not modified. Returns ------- - result: - A table with the old column replaced by the new columns. - - Raises - ------ - UnknownColumnNameError - If the old column does not exist. - DuplicateColumnNameError - If at least one of the new columns already exists and the existing column is not affected by the replacement. - ColumnSizeError - If the size of at least one of the new columns does not match the amount of rows. - IllegalSchemaModificationError - If replacing the column would violate an invariant in the subclass. + new_table: + The table without duplicate rows. 
Examples -------- - >>> from safeds.data.tabular.containers import Column, Table - >>> table = Table.from_dict({"a": [1], "b": [2]}) - >>> new_col = Column("new", [3]) - >>> table.replace_column("b", [new_col]) - a new - 0 1 3 - """ - if old_column_name not in self._schema.column_names: - similar_columns = self._get_similar_columns(old_column_name) - raise UnknownColumnNameError([old_column_name], similar_columns) - - columns = list[Column]() - for old_column in self.column_names: - if old_column == old_column_name: - for new_column in new_columns: - if new_column.name in self.column_names and new_column.name != old_column_name: - raise DuplicateColumnNameError(new_column.name) - - if self.number_of_rows != new_column.number_of_rows: - raise ColumnSizeError(str(self.number_of_rows), str(new_column.number_of_rows)) - columns.append(new_column) - else: - columns.append(self.get_column(old_column)) - - return Table.from_columns(columns) + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"a": [1, 2, 2], "b": [4, 5, 5]}) + >>> table.remove_duplicate_rows() + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + +-----+-----+ + """ + return Table._from_polars_lazy_frame( + self._lazy_frame.unique(maintain_order=True), + ) - def shuffle_rows(self) -> Table: + def remove_rows( + self, + query: Callable[[Row], Cell[bool]], + ) -> Table: """ - Return a new `Table` with randomly shuffled rows of this `Table`. + Return a new table without rows that satisfy a condition. - The original table is not modified. + **Note:** The original table is not modified. + + Parameters + ---------- + query: + The function that determines which rows to remove. Returns ------- - result: - The shuffled Table. + new_table: + The table without the specified rows. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> import numpy as np - >>> np.random.seed(123456) - >>> table = Table.from_dict({"a": [1, 3, 5], "b": [2, 4, 6]}) - >>> table.shuffle_rows() - a b - 0 5 6 - 1 1 2 - 2 3 4 - """ - new_df = self._data.sample(frac=1.0) - new_df.columns = self._schema.column_names - return Table._from_pandas_dataframe(new_df) + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_rows(lambda row: row.get_value("a") == 2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 3 | 6 | + +-----+-----+ + """ + mask = query(_LazyVectorizedRow(self)) + + return Table._from_polars_lazy_frame( + self._lazy_frame.filter(~mask._polars_expression), + ) - def slice_rows( + def remove_rows_by_column( self, - start: int | None = None, - end: int | None = None, - step: int = 1, + name: str, + query: Callable[[Cell], Cell[bool]], ) -> Table: """ - Slice a part of the table into a new table. + Return a new table without rows that satisfy a condition on a specific column. - The original table is not modified. + **Note:** The original table is not modified. Parameters ---------- - start: - The first index of the range to be copied into a new table, None by default. - end: - The last index of the range to be copied into a new table, None by default. - step: - The step size used to iterate through the table, 1 by default. + name: + The name of the column. + query: + The function that determines which rows to remove. Returns ------- - result : Table - The resulting table. + new_table: + The table without the specified rows. Raises ------ - IndexOutOfBoundsError - If the index is out of bounds. + KeyError + If the column does not exist. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3, 5], "b": [2, 4, 6]}) - >>> table.slice_rows(0, 2) - a b - 0 1 2 - 1 3 4 + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_rows_by_column("a", lambda cell: cell == 2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 3 | 6 | + +-----+-----+ """ - if start is None: - start = 0 + self._check_columns_exist(name) - if end is None: - end = self.number_of_rows + import polars as pl - if end < start: - raise IndexOutOfBoundsError(slice(start, end)) - if start < 0 or end < 0 or start > self.number_of_rows or end > self.number_of_rows: - raise IndexOutOfBoundsError(start if start < 0 or start > self.number_of_rows else end) + mask = query(_LazyCell(pl.col(name))) - new_df = self._data.iloc[start:end:step] - new_df.columns = self._schema.column_names - return Table._from_pandas_dataframe(new_df) + return Table._from_polars_lazy_frame( + self._lazy_frame.filter(~mask._polars_expression), + ) - def sort_columns( + def remove_rows_with_missing_values( self, - comparator: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name) - - (col1.name < col2.name), + column_names: list[str] | None = None, ) -> Table: """ - Sort the columns of a `Table` with the given comparator and return a new `Table`. + Return a new table without rows containing missing values in the specified columns. - The comparator is a function that takes two columns `col1` and `col2` and - returns an integer: - - * If `col1` should be ordered before `col2`, the function should return a negative number. - * If `col1` should be ordered after `col2`, the function should return a positive number. - * If the original order of `col1` and `col2` should be kept, the function should return 0. - - If no comparator is given, the columns will be sorted alphabetically by their name. - - The original table is not modified. 
+        **Note:** The original table is not modified.
 
         Parameters
         ----------
-        comparator:
-            The function used to compare two columns.
+        column_names:
+            Names of the columns to consider. If None, all columns are considered.
 
         Returns
         -------
         new_table:
-            A new table with sorted columns.
+            The table without rows containing missing values in the specified columns.
 
         Examples
         --------
         >>> from safeds.data.tabular.containers import Table
-        >>> table = Table.from_dict({"a": [1], "b": [2] })
-        >>> table.sort_columns(lambda col1, col2: 1)
-           a  b
-        0  1  2
-        >>> table.sort_columns(lambda col1, col2: -1)
-           b  a
-        0  2  1
-        >>> table2 = Table.from_dict({"b": [2], "a": [1]})
-        >>> table2.sort_columns()
-           a  b
-        0  1  2
-        """
-        columns = self.to_columns()
-        columns.sort(key=functools.cmp_to_key(comparator))
-        return Table.from_columns(columns)
+        >>> table = Table({"a": [1, None, 3], "b": [4, 5, None]})
+        >>> table.remove_rows_with_missing_values()
+        +-----+-----+
+        | a   | b   |
+        | --- | --- |
+        | i64 | i64 |
+        +===========+
+        | 1   | 4   |
+        +-----+-----+
+        """
+        return Table._from_polars_lazy_frame(
+            self._lazy_frame.drop_nulls(subset=column_names),
+        )
 
-    def sort_rows(self, comparator: Callable[[Row, Row], int]) -> Table:
+    def remove_rows_with_outliers(
+        self,
+        column_names: list[str] | None = None,
+        *,
+        z_score_threshold: float = 3,
+    ) -> Table:
         """
-        Sort the rows of a `Table` with the given comparator and return a new `Table`.
+        Return a new table without rows containing outliers in the specified columns.
 
-        The comparator is a function that takes two rows `row1` and `row2` and
-        returns an integer:
+        Whether a data point is an outlier in a column is determined by its z-score. The z-score is the distance of the
+        data point from the mean of the column divided by the standard deviation of the column. If the z-score is
+        greater than the given threshold, the data point is considered an outlier. Missing values are ignored during the
+        calculation of the z-score.
- * If `row1` should be ordered before `row2`, the function should return a negative number. - * If `row1` should be ordered after `row2`, the function should return a positive number. - * If the original order of `row1` and `row2` should be kept, the function should return 0. + The z-score is only defined for numeric columns. Non-numeric columns are ignored, even if they are specified in + `column_names`. - The original table is not modified. + **Notes:** + + - The original table is not modified. + - This operation must fully load the data into memory, which can be expensive. Parameters ---------- - comparator: - The function used to compare two rows. + column_names: + Names of the columns to consider. If None, all numeric columns are considered. + z_score_threshold: + The z-score threshold for detecting outliers. Returns ------- new_table: - A new table with sorted rows. + The table without rows containing outliers in the specified columns. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 3, 5], "b": [2, 4, 6] }) - >>> table.sort_rows(lambda row1, row2: 1) - a b - 0 1 2 - 1 3 4 - 2 5 6 - >>> table.sort_rows(lambda row1, row2: -1) - a b - 0 5 6 - 1 3 4 - 2 1 2 - >>> table.sort_rows(lambda row1, row2: 0) - a b - 0 1 2 - 1 3 4 - 2 5 6 - """ - rows = self.to_rows() - rows.sort(key=functools.cmp_to_key(comparator)) - return Table.from_rows(rows) - - def split_rows(self, percentage_in_first: float) -> tuple[Table, Table]: - """ - Split the table into two new tables. + >>> table = Table( + ... { + ... "a": [1, 2, 3, 4, 5, 6, 1000, None], + ... "b": [1, 2, 3, 4, 5, 6, 7, 8], + ... } + ... 
) + >>> table.remove_rows_with_outliers(z_score_threshold=2) + +------+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +============+ + | 1 | 1 | + | 2 | 2 | + | 3 | 3 | + | 4 | 4 | + | 5 | 5 | + | 6 | 6 | + | null | 8 | + +------+-----+ + """ + if column_names is None: + column_names = self.column_names + + import polars as pl + import polars.selectors as cs + + non_outlier_mask = pl.all_horizontal( + self._data_frame.select(cs.numeric() & cs.by_name(column_names)).select( + pl.all().is_null() | (((pl.all() - pl.all().mean()) / pl.all().std()).abs() <= z_score_threshold), + ), + ) - The original table is not modified. + return Table._from_polars_lazy_frame( + self._lazy_frame.filter(non_outlier_mask), + ) - Parameters - ---------- - percentage_in_first: - The desired size of the first table in percentage to the given table; must be between 0 and 1. + def shuffle_rows(self) -> Table: + """ + Return a new table with the rows shuffled. + + **Note:** The original table is not modified. Returns ------- - result: - A tuple containing the two resulting tables. The first table has the specified size, the second table - contains the rest of the data. - - Raises - ------ - ValueError: - if the 'percentage_in_first' is not between 0 and 1. + new_table: + The table with the rows shuffled. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - >>> slices = table.split_rows(0.4) - >>> slices[0] - temperature sales - 0 10 54 - 1 15 74 - >>> slices[1] - temperature sales - 0 20 90 - 1 25 206 - 2 30 210 - """ - if percentage_in_first < 0 or percentage_in_first > 1: - raise ValueError("The given percentage is not between 0 and 1") - if self.number_of_rows == 0: - return Table(), Table() - return ( - self.slice_rows(0, round(percentage_in_first * self.number_of_rows)), - self.slice_rows(round(percentage_in_first * self.number_of_rows)), + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.shuffle_rows() + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 3 | 6 | + | 2 | 5 | + | 1 | 4 | + +-----+-----+ + """ + return Table._from_polars_data_frame( + self._data_frame.sample( + fraction=1, + shuffle=True, + seed=_get_random_seed(), + ), ) - def transform_column(self, name: str, transformer: Callable[[Row], Any]) -> Table: + def slice_rows(self, start: int = 0, length: int | None = None) -> Table: """ - Return a new `Table` with the provided column transformed by calling the provided transformer. + Return a new table with a slice of rows. - The original table is not modified. + **Note:** The original table is not modified. + + Parameters + ---------- + start: + The start index of the slice. + length: + The length of the slice. If None, the slice contains all rows starting from `start`. Returns ------- - result: - The table with the transformed column. - - Raises - ------ - UnknownColumnNameError - If the column does not exist. + new_table: + The table with the slice of rows. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"item": ["apple", "milk", "beer"], "price": [1.00, 1.19, 1.79]}) - >>> table.transform_column("price", lambda row: row.get_value("price") * 100) - item price - 0 apple 100.0 - 1 milk 119.0 - 2 beer 179.0 - """ - if self.has_column(name): - items: list = [transformer(item) for item in self.to_rows()] - result: list[Column] = [Column(name, items)] - return self.replace_column(name, result) - similar_columns = self._get_similar_columns(name) - raise UnknownColumnNameError([name], similar_columns) + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.slice_rows(start=1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + + >>> table.slice_rows(start=1, length=1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 5 | + +-----+-----+ + """ + return Table._from_polars_lazy_frame( + self._lazy_frame.slice(start, length), + ) - def transform_table(self, transformer: TableTransformer) -> Table: + def sort_rows( + self, + key_selector: Callable[[Row], Cell], + *, + descending: bool = False, + ) -> Table: """ - Return a new `Table` with a learned transformation applied to this table. + Return a new table with the rows sorted. - The original table is not modified. + **Note:** The original table is not modified. Parameters ---------- - transformer: - The transformer which transforms the given table. + key_selector: + The function that selects the key to sort by. + descending: + Whether to sort in descending order. Returns ------- - transformed_table: - The transformed table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - IllegalSchemaModificationError - If replacing the column would violate an invariant in the subclass. + new_table: + The table with the rows sorted. 
Examples -------- - >>> from safeds.data.tabular.transformation import OneHotEncoder >>> from safeds.data.tabular.containers import Table - >>> transformer = OneHotEncoder() - >>> table = Table.from_dict({"fruit": ["apple", "pear", "apple"], "pet": ["dog", "duck", "duck"]}) - >>> transformer = transformer.fit(table, None) - >>> table.transform_table(transformer) - fruit__apple fruit__pear pet__dog pet__duck - 0 1.0 0.0 1.0 0.0 - 1 0.0 1.0 0.0 1.0 - 2 1.0 0.0 0.0 1.0 - """ - return transformer.transform(self) + >>> table = Table({"a": [2, 1, 3], "b": [1, 1, 2]}) + >>> table.sort_rows(lambda row: row.get_value("a") - row.get_value("b")) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 1 | + | 2 | 1 | + | 3 | 2 | + +-----+-----+ + """ + key = key_selector(_LazyVectorizedRow(self)) + + return Table._from_polars_lazy_frame( + self._lazy_frame.sort( + key._polars_expression, + descending=descending, + maintain_order=True, + ), + ) - def inverse_transform_table(self, transformer: InvertibleTableTransformer) -> Table: + def sort_rows_by_column( + self, + name: str, + *, + descending: bool = False, + ) -> Table: """ - Return a new `Table` with the inverted transformation applied by the given transformer. + Return a new table with the rows sorted by a specific column. - The original table is not modified. + **Note:** The original table is not modified. Parameters ---------- - transformer: - A transformer that was fitted with columns, which are all present in the table. + name: + The name of the column to sort by. + descending: + Whether to sort in descending order. Returns ------- - table: - The original table. + new_table: + The table with the rows sorted by the specified column. Raises ------ - TransformerNotFittedError - If the transformer has not been fitted yet. + KeyError + If the column does not exist. 
Examples -------- - >>> from safeds.data.tabular.transformation import OneHotEncoder >>> from safeds.data.tabular.containers import Table - >>> transformer = OneHotEncoder() - >>> table = Table.from_dict({"a": ["j", "k", "k"], "b": ["x", "y", "x"]}) - >>> transformer = transformer.fit(table, None) - >>> transformed_table = transformer.transform(table) - >>> transformed_table.inverse_transform_table(transformer) - a b - 0 j x - 1 k y - 2 k x - >>> transformer.inverse_transform(transformed_table) - a b - 0 j x - 1 k y - 2 k x - """ - return transformer.inverse_transform(self) - - # ------------------------------------------------------------------------------------------------------------------ - # Plotting - # ------------------------------------------------------------------------------------------------------------------ - - def plot_correlation_heatmap(self) -> Image: - """ - Plot a correlation heatmap for all numerical columns of this `Table`. - - Returns - ------- - plot: - The plot as an image. + >>> table = Table({"a": [2, 1, 3], "b": [1, 1, 2]}) + >>> table.sort_rows_by_column("a") + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 1 | + | 2 | 1 | + | 3 | 2 | + +-----+-----+ + """ + self._check_columns_exist(name) + + return Table._from_polars_lazy_frame( + self._lazy_frame.sort( + name, + descending=descending, + maintain_order=True, + ), + ) - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - >>> image = table.plot_correlation_heatmap() + def split_rows( + self, + percentage_in_first: float, + *, + shuffle: bool = True, + ) -> tuple[Table, Table]: """ - import matplotlib.pyplot as plt - import seaborn as sns - - only_numerical = self.remove_columns_with_non_numerical_values() - - if self.number_of_rows == 0: - warnings.warn( - "An empty table has been used. 
A correlation heatmap on an empty table will show nothing.", - stacklevel=2, - ) - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=( - "Attempting to set identical low and high (xlims|ylims) makes transformation singular;" - " automatically expanding." - ), - ) - fig = plt.figure() - sns.heatmap( - data=only_numerical._data.corr(), - vmin=-1, - vmax=1, - xticklabels=only_numerical.column_names, - yticklabels=only_numerical.column_names, - cmap="vlag", - ) - plt.tight_layout() - else: - fig = plt.figure() - sns.heatmap( - data=only_numerical._data.corr(), - vmin=-1, - vmax=1, - xticklabels=only_numerical.column_names, - yticklabels=only_numerical.column_names, - cmap="vlag", - ) - plt.tight_layout() - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) + Create two tables by splitting the rows of the current table. - def plot_lineplot(self, x_column_name: str, y_column_name: str) -> Image: - """ - Plot two columns against each other in a lineplot. + The first table contains a percentage of the rows specified by `percentage_in_first`, and the second table + contains the remaining rows. - If there are multiple x-values for a y-value, the resulting plot will consist of a line representing the mean - and the lower-transparency area around the line representing the 95% confidence interval. + **Note:** The original table is not modified. Parameters ---------- - x_column_name: - The column name of the column to be plotted on the x-Axis. - y_column_name: - The column name of the column to be plotted on the y-Axis. + percentage_in_first: + The percentage of rows to include in the first table. Must be between 0 and 1. + shuffle: + Whether to shuffle the rows before splitting. Returns ------- - plot: - The plot as an image. + first_table: + The first table. + second_table: + The second table. 
Raises ------ - UnknownColumnNameError - If either of the columns do not exist. + ValueError + If `percentage_in_first` is not between 0 and 1. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - >>> image = table.plot_lineplot("temperature", "sales") - """ - import matplotlib.pyplot as plt - import seaborn as sns - - if not self.has_column(x_column_name) or not self.has_column(y_column_name): - similar_columns_x = self._get_similar_columns(x_column_name) - similar_columns_y = self._get_similar_columns(y_column_name) - raise UnknownColumnNameError( - ([x_column_name] if not self.has_column(x_column_name) else []) - + ([y_column_name] if not self.has_column(y_column_name) else []), - (similar_columns_x if not self.has_column(x_column_name) else []) - + (similar_columns_y if not self.has_column(y_column_name) else []), + >>> table = Table({"a": [1, 2, 3, 4, 5], "b": [6, 7, 8, 9, 10]}) + >>> first_table, second_table = table.split_rows(0.6) + >>> first_table + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 6 | + | 4 | 9 | + | 3 | 8 | + +-----+-----+ + >>> second_table + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 5 | 10 | + | 2 | 7 | + +-----+-----+ + """ + if percentage_in_first < 0 or percentage_in_first > 1: + raise OutOfBoundsError( + actual=percentage_in_first, + name="percentage_in_first", + lower_bound=ClosedBound(0), + upper_bound=ClosedBound(1), ) - fig = plt.figure() - ax = sns.lineplot( - data=self._data, - x=x_column_name, - y=y_column_name, + input_table = self.shuffle_rows() if shuffle else self + number_of_rows_in_first = round(percentage_in_first * input_table.number_of_rows) + + return ( + input_table.slice_rows(length=number_of_rows_in_first), + input_table.slice_rows(start=number_of_rows_in_first), ) - ax.set(xlabel=x_column_name, ylabel=y_column_name) - 
ax.set_xticks(ax.get_xticks()) - ax.set_xticklabels( - ax.get_xticklabels(), - rotation=45, - horizontalalignment="right", - ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels - plt.tight_layout() - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) + # ------------------------------------------------------------------------------------------------------------------ + # Table operations + # ------------------------------------------------------------------------------------------------------------------ - def plot_scatterplot(self, x_column_name: str, y_column_name: str) -> Image: + def add_table_as_columns(self, other: Table) -> Table: """ - Plot two columns against each other in a scatterplot. + Return a new table with the columns of another table added. + + **Notes:** + + - The original tables are not modified. + - This operation must fully load the data into memory, which can be expensive. Parameters ---------- - x_column_name: - The column name of the column to be plotted on the x-Axis. - y_column_name: - The column name of the column to be plotted on the y-Axis. + other: + The table to add as columns. Returns ------- - plot: - The plot as an image. - - Raises - ------ - UnknownColumnNameError - If either of the columns do not exist. + new_table: + The table with the columns added. 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - >>> image = table.plot_scatterplot("temperature", "sales") - """ - import matplotlib.pyplot as plt - import seaborn as sns - - if not self.has_column(x_column_name) or not self.has_column(y_column_name): - similar_columns_x = self._get_similar_columns(x_column_name) - similar_columns_y = self._get_similar_columns(y_column_name) - raise UnknownColumnNameError( - ([x_column_name] if not self.has_column(x_column_name) else []) - + ([y_column_name] if not self.has_column(y_column_name) else []), - (similar_columns_x if not self.has_column(x_column_name) else []) - + (similar_columns_y if not self.has_column(y_column_name) else []), - ) - - fig = plt.figure() - ax = sns.scatterplot( - data=self._data, - x=x_column_name, - y=y_column_name, + >>> table1 = Table({"a": [1, 2, 3]}) + >>> table2 = Table({"b": [4, 5, 6]}) + >>> table1.add_table_as_columns(table2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + # TODO: raises? + + return Table._from_polars_data_frame( + self._data_frame.hstack(other._data_frame), ) - ax.set(xlabel=x_column_name, ylabel=y_column_name) - ax.set_xticks(ax.get_xticks()) - ax.set_xticklabels( - ax.get_xticklabels(), - rotation=45, - horizontalalignment="right", - ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels - plt.tight_layout() - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) - def plot_boxplots(self) -> Image: + def add_table_as_rows(self, other: Table) -> Table: """ - Plot a boxplot for every numerical column. + Return a new table with the rows of another table added. 
+ + **Notes:** + + - The original tables are not modified. + - This operation must fully load the data into memory, which can be expensive. + + Parameters + ---------- + other: + The table to add as rows. Returns ------- - plot: - The plot as an image. - - Raises - ------ - NonNumericColumnError - If the table contains only non-numerical columns. + new_table: + The table with the rows added. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table({"a":[1, 2], "b": [3, 42]}) - >>> image = table.plot_boxplots() - """ - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - numerical_table = self.remove_columns_with_non_numerical_values() - if numerical_table.number_of_columns == 0: - raise NonNumericColumnError("This table contains only non-numerical columns.") - col_wrap = min(numerical_table.number_of_columns, 3) - - data = pd.melt(numerical_table._data, value_vars=numerical_table.column_names) - grid = sns.FacetGrid(data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message="Using the boxplot function without specifying `order` is likely to produce an incorrect plot.", - ) - grid.map(sns.boxplot, "variable", "value") - grid.set_xlabels("") - grid.set_ylabels("") - grid.set_titles("{col_name}") - for axes in grid.axes.flat: - axes.set_xticks([]) - plt.tight_layout() - fig = grid.fig - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) + >>> table1 = Table({"a": [1, 2, 3]}) + >>> table2 = Table({"a": [4, 5, 6]}) + >>> table1.add_table_as_rows(table2) + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + | 4 | + | 5 | + | 6 | + +-----+ + """ + # TODO: raises? 
+ + return Table._from_polars_data_frame( + self._data_frame.vstack(other._data_frame), + ) - def plot_histograms(self, *, number_of_bins: int = 10) -> Image: + def inverse_transform_table(self, fitted_transformer: InvertibleTableTransformer) -> Table: """ - Plot a histogram for every column. + Return a new table inverse-transformed by a **fitted, invertible** transformer. + + **Notes:** + + - The original table is not modified. + - Depending on the transformer, this operation might fully load the data into memory, which can be expensive. Parameters ---------- - number_of_bins: - The number of bins to use in the histogram. Default is 10. + fitted_transformer: + The fitted, invertible transformer to apply. Returns ------- - plot: - The plot as an image. + new_table: + The inverse-transformed table. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) - >>> image = table.plot_histograms() + >>> from safeds.data.tabular.transformation import RangeScaler + >>> table = Table({"a": [1, 2, 3]}) + >>> transformer, transformed_table = RangeScaler(min_=0, max_=1).fit_and_transform(table, ["a"]) + >>> transformed_table.inverse_transform_table(transformer) + +---------+ + | a | + | --- | + | f64 | + +=========+ + | 1.00000 | + | 2.00000 | + | 3.00000 | + +---------+ """ - import matplotlib.pyplot as plt - import numpy as np - import pandas as pd + return fitted_transformer.inverse_transform(self) - n_cols = min(3, self.number_of_columns) - n_rows = 1 + (self.number_of_columns - 1) // n_cols + def transform_table(self, fitted_transformer: TableTransformer) -> Table: + """ + Return a new table transformed by a **fitted** transformer. 
- if n_cols == 1 and n_rows == 1: - fig, axs = plt.subplots(1, 1, tight_layout=True) - one_col = True - else: - fig, axs = plt.subplots(n_rows, n_cols, tight_layout=True, figsize=(n_cols * 3, n_rows * 3)) - one_col = False + **Notes:** - col_names = self.column_names - for col_name, ax in zip(col_names, axs.flatten() if not one_col else [axs], strict=False): - np_col = np.array(self.get_column(col_name)) - bins = min(number_of_bins, len(pd.unique(np_col))) + - The original table is not modified. + - Depending on the transformer, this operation might fully load the data into memory, which can be expensive. - ax.set_title(col_name) - ax.set_xlabel("") - ax.set_ylabel("") + Parameters + ---------- + fitted_transformer: + The fitted transformer to apply. + + Returns + ------- + new_table: + The transformed table. - if self.get_column(col_name).type.is_numeric(): - np_col = np_col[~np.isnan(np_col)] + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> from safeds.data.tabular.transformation import RangeScaler + >>> table = Table({"a": [1, 2, 3]}) + >>> transformer = RangeScaler(min_=0, max_=1).fit(table, ["a"]) + >>> table.transform_table(transformer) + +---------+ + | a | + | --- | + | f64 | + +=========+ + | 0.00000 | + | 0.50000 | + | 1.00000 | + +---------+ + """ + return fitted_transformer.transform(self) - if bins < len(pd.unique(np_col)): - min_val = np.min(np_col) - max_val = np.max(np_col) - hist, bin_edges = np.histogram(self.get_column(col_name), bins, range=(min_val, max_val)) + # ------------------------------------------------------------------------------------------------------------------ + # Statistics + # ------------------------------------------------------------------------------------------------------------------ - bars = np.array([]) - for i in range(len(hist)): - bars = np.append(bars, f"{round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)}") + def summarize_statistics(self) -> Table: + """ + Return a table with 
important statistics about this table. - ax.bar(bars, hist, edgecolor="black") - ax.set_xticks(np.arange(len(hist)), bars, rotation=45, horizontalalignment="right") - continue + Returns + ------- + statistics: + The table with statistics. - np_col = np_col.astype(str) - unique_values = np.unique(np_col) - hist = np.array([np.sum(np_col == value) for value in unique_values]) - ax.bar(unique_values, hist, edgecolor="black") - ax.set_xticks(np.arange(len(unique_values)), unique_values, rotation=45, horizontalalignment="right") + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"a": [1, 3]}) + >>> table.summarize_statistics() + +----------------------+--------------------+ + | metric | a | + | --- | --- | + | str | str | + +===========================================+ + | min | 1 | + | max | 3 | + | mean | 2.0 | + | median | 2.0 | + | standard deviation | 1.4142135623730951 | + | distinct value count | 2 | + | idness | 1.0 | + | missing value ratio | 0.0 | + | stability | 0.5 | + +----------------------+--------------------+ + """ + if self.number_of_columns == 0: + return Table() - for i in range(len(col_names), n_rows * n_cols): - fig.delaxes(axs.flatten()[i]) # Remove empty subplots + head = self.get_column(self.column_names[0]).summarize_statistics() + tail = [self.get_column(name).summarize_statistics().get_column(name)._series for name in self.column_names[1:]] - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) + return Table._from_polars_data_frame( + head._lazy_frame.collect().hstack(tail, in_place=True), + ) # ------------------------------------------------------------------------------------------------------------------ - # Conversion + # Export # ------------------------------------------------------------------------------------------------------------------ - def 
to_csv_file(self, path: str | Path) -> None: + def to_columns(self) -> list[Column]: """ - Write the data from the table into a CSV file. + Return the data of the table as a list of columns. - If the file and/or the directories do not exist they will be created. If the file already exists it will be - overwritten. - - Parameters - ---------- - path: - The path to the output file. - - Raises - ------ - WrongFileExtensionError - If the file is not a csv file. + Returns + ------- + columns: + List of columns. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.to_csv_file("./src/resources/to_csv_file.csv") + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> columns = table.to_columns() """ - path = Path(path) - if path.suffix != ".csv": - raise WrongFileExtensionError(path, ".csv") - path.parent.mkdir(parents=True, exist_ok=True) - data_to_csv = self._data.reset_index(drop=True) - data_to_csv.columns = self._schema.column_names - data_to_csv.to_csv(path, index=False) + return [Column._from_polars_series(column) for column in self._data_frame.get_columns()] - def to_json_file(self, path: str | Path) -> None: + def to_csv_file(self, path: str | Path) -> None: """ - Write the data from the table into a JSON file. + Write the table to a CSV file. - If the file and/or the directories do not exist, they will be created. If the file already exists it will be - overwritten. + If the file and/or the parent directories do not exist, they will be created. If the file exists already, it + will be overwritten. Parameters ---------- path: - The path to the output file. + The path to the CSV file. If the file extension is omitted, it is assumed to be ".csv". Raises ------ - WrongFileExtensionError - If the file is not a JSON file. + ValueError + If the path has an extension that is not ".csv". 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.to_json_file("./src/resources/to_json_file.json") + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.to_csv_file("./src/resources/to_csv_file.csv") """ - path = Path(path) - if path.suffix != ".json": - raise WrongFileExtensionError(path, ".json") + path = _check_and_normalize_file_path(path, ".csv", [".csv"]) path.parent.mkdir(parents=True, exist_ok=True) - data_to_json = self._data.reset_index(drop=True) - data_to_json.columns = self._schema.column_names - data_to_json.to_json(path) + + self._lazy_frame.sink_csv(path) def to_dict(self) -> dict[str, list[Any]]: """ @@ -2091,83 +1717,104 @@ def to_dict(self) -> dict[str, list[Any]]: Returns ------- - data: + dict_: Dictionary representation of the table. Examples -------- >>> from safeds.data.tabular.containers import Table - >>> row1 = Row({"a": 1, "b": 5}) - >>> row2 = Row({"a": 2, "b": 6}) - >>> table1 = Table.from_rows([row1, row2]) - >>> table2 = Table.from_dict({"a": [1, 2], "b": [5, 6]}) - >>> table1 == table2 - True + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.to_dict() + {'a': [1, 2, 3], 'b': [4, 5, 6]} """ - return {column_name: list(self.get_column(column_name)) for column_name in self.column_names} + return self._data_frame.to_dict(as_series=False) - def to_columns(self) -> list[Column]: + def to_json_file( + self, + path: str | Path, + *, + orientation: Literal["column", "row"] = "column", + ) -> None: """ - Return a list of the columns. + Write the table to a JSON file. - Returns - ------- - columns: - List of columns. + If the file and/or the parent directories do not exist, they will be created. If the file exists already, it + will be overwritten. + + **Note:** This operation must fully load the data into memory, which can be expensive. + + Parameters + ---------- + path: + The path to the JSON file. 
If the file extension is omitted, it is assumed to be ".json". + orientation: + The orientation of the JSON file. If "column", the JSON file will be structured as a list of columns. If + "row", the JSON file will be structured as a list of rows. Row orientation is more human-readable, but + slower and less memory-efficient. + + Raises + ------ + ValueError + If the path has an extension that is not ".json". Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a":[1, 2],"b":[20, 30]}) - >>> table.to_columns() - [Column('a', [1, 2]), Column('b', [20, 30])] + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.to_json_file("./src/resources/to_json_file_2.json") """ - return [self.get_column(name) for name in self._schema.column_names] + path = _check_and_normalize_file_path(path, ".json", [".json"]) + path.parent.mkdir(parents=True, exist_ok=True) + + # Write JSON to file + self._data_frame.write_json(path, row_oriented=(orientation == "row")) - def to_rows(self) -> list[Row]: + def to_parquet_file(self, path: str | Path) -> None: """ - Return a list of the rows. + Write the table to a Parquet file. - Returns - ------- - rows: - List of rows. + If the file and/or the parent directories do not exist, they will be created. If the file exists already, it + will be overwritten. + + Parameters + ---------- + path: + The path to the Parquet file. If the file extension is omitted, it is assumed to be ".parquet". + + Raises + ------ + ValueError + If the path has an extension that is not ".parquet". 
Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"a":[1, 2],"b":[20, 30]}) - >>> table.to_rows() - [Row({ - 'a': 1, - 'b': 20 - }), Row({ - 'a': 2, - 'b': 30 - })] - """ - import pandas as pd - - return [ - Row._from_pandas_dataframe( - pd.DataFrame([list(series_row)], columns=self._schema.column_names), - self._schema, - ) - for (_, series_row) in self._data.iterrows() - ] + >>> table = Table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.to_parquet_file("./src/resources/to_parquet_file.parquet") + """ + path = _check_and_normalize_file_path(path, ".parquet", [".parquet"]) + path.parent.mkdir(parents=True, exist_ok=True) + + self._lazy_frame.sink_parquet(path) def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset: """ - Return a new `TabularDataset` with columns marked as a target column or feature columns. + Return a new `TabularDataset` with columns marked as a target, feature, or extra. - The original table is not modified. + - The target column is the column that a model should predict. + - Feature columns are columns that a model should use to make predictions. + - Extra columns are columns that are neither feature nor target. They can be used to provide additional context, + like an ID column. + + Feature columns are implicitly defined as all columns except the target and extra columns. If no extra columns + are specified, all columns except the target column are used as features. Parameters ---------- target_name: Name of the target column. extra_names: - Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but + Names of the columns that are neither feature nor target. If None, no extra columns are used, i.e. all but the target column are used as features. 
Returns @@ -2185,11 +1832,15 @@ def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = N Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table({"item": ["apple", "milk", "beer"], "price": [1.10, 1.19, 1.79], "amount_bought": [74, 72, 51]}) + >>> table = Table( + ... { + ... "item": ["apple", "milk", "beer"], + ... "price": [1.10, 1.19, 1.79], + ... "amount_bought": [74, 72, 51], + ... } + ... ) >>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"]) """ - from safeds.data.labeled.containers import TabularDataset - return TabularDataset(self, target_name, extra_names) def to_time_series_dataset( @@ -2235,56 +1886,87 @@ def to_time_series_dataset( return TimeSeriesDataset(self, target_name, time_name, extra_names) - # ------------------------------------------------------------------------------------------------------------------ - # IPython integration - # ------------------------------------------------------------------------------------------------------------------ - - def _repr_html_(self) -> str: - """ - Return an HTML representation of the table. - - Returns - ------- - output: - The generated HTML. - """ - return self._data.to_html(max_rows=self._data.shape[0], max_cols=self._data.shape[1], notebook=True) - # ------------------------------------------------------------------------------------------------------------------ # Dataframe interchange protocol # ------------------------------------------------------------------------------------------------------------------ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): # type: ignore[no-untyped-def] """ - Return a DataFrame exchange object that conforms to the dataframe interchange protocol. + Return a dataframe object that conforms to the dataframe interchange protocol. Generally, there is no reason to call this method directly. 
The dataframe interchange protocol is designed to allow libraries to consume tabular data from different sources, such as `pandas` or `polars`. If you still decide to call this method, you should not rely on any capabilities of the returned object beyond the dataframe interchange protocol. - The specification of the dataframe interchange protocol can be found on - [GitHub](https://github.com/data-apis/dataframe-api). + The specification of the dataframe interchange protocol can be found + [here](https://data-apis.org/dataframe-protocol/latest/index.html). + + **Note:** This operation must fully load the data into memory, which can be expensive. Parameters ---------- nan_as_null: - Whether to replace missing values in the data with `NaN`. + This parameter is deprecated and will be removed in a later revision of the dataframe interchange protocol. + Setting it has no effect. allow_copy: - Whether memory may be copied to create the DataFrame exchange object. + Whether memory may be copied to create the dataframe object. Returns ------- dataframe: - A DataFrame object that conforms to the dataframe interchange protocol. + A dataframe object that conforms to the dataframe interchange protocol. + """ + return self._data_frame.__dataframe__(allow_copy=allow_copy) + + # ------------------------------------------------------------------------------------------------------------------ + # IPython integration + # ------------------------------------------------------------------------------------------------------------------ + + def _repr_html_(self) -> str: + """ + Return a compact HTML representation of the table for IPython. + + **Note:** This operation must fully load the data into memory, which can be expensive. + + Returns + ------- + html: + The generated HTML. 
""" - if not allow_copy: - raise NotImplementedError("For the moment we need to copy the data, so `allow_copy` must be True.") + return self._data_frame._repr_html_() - data_copy = self._data.reset_index(drop=True) - data_copy.columns = self.column_names - return data_copy.__dataframe__(nan_as_null, allow_copy) + # ------------------------------------------------------------------------------------------------------------------ + # Internal + # ------------------------------------------------------------------------------------------------------------------ + + def _check_columns_exist(self, requested_names: str | list[str]) -> None: + """ + Check if the specified column names exist in the table and raise an error if they do not. + + Parameters + ---------- + requested_names: + The column names to check. + + Raises + ------ + KeyError + If a column name does not exist. + """ + if isinstance(requested_names, str): + requested_names = [requested_names] + + if len(requested_names) > 1: + known_names = set(self.column_names) + else: + known_names = self.column_names # type: ignore[assignment] + + unknown_names = [name for name in requested_names if name not in known_names] + if unknown_names: + raise UnknownColumnNameError(unknown_names) # TODO: in the error, compute similar column names + # TODO def _into_dataloader(self, batch_size: int) -> DataLoader: """ Return a Dataloader for the data stored in this table, used for predicting with neural networks. @@ -2302,36 +1984,27 @@ def _into_dataloader(self, batch_size: int) -> DataLoader: The DataLoader. 
""" - import numpy as np + import polars as pl import torch from torch.utils.data import DataLoader _init_default_device() - features = self.to_rows() - all_rows = [] - for row in features: - new_item = [] - for column_name in row: - new_item.append(row.get_value(column_name)) - all_rows.append(new_item.copy()) return DataLoader( - dataset=_create_dataset(np.array(all_rows)), + dataset=_create_dataset(self._data_frame.to_torch(dtype=pl.Float32)), batch_size=batch_size, generator=torch.Generator(device=_get_device()), ) - -def _create_dataset(features: np.array) -> Dataset: - import numpy as np - import torch +# TODO +def _create_dataset(features: Tensor) -> Dataset: from torch.utils.data import Dataset _init_default_device() class _CustomDataset(Dataset): - def __init__(self, features: np.array): - self.X = torch.from_numpy(features.astype(np.float32)).to(_get_device()) + def __init__(self, features: Tensor): + self.X = features self.len = self.X.shape[0] def __getitem__(self, item: int) -> torch.Tensor: diff --git a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py b/src/safeds/data/tabular/containers/_vectorized_cell.py similarity index 72% rename from src/safeds/data/tabular/containers/_experimental_vectorized_cell.py rename to src/safeds/data/tabular/containers/_vectorized_cell.py index b648cc2ab..675305b8b 100644 --- a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py +++ b/src/safeds/data/tabular/containers/_vectorized_cell.py @@ -3,23 +3,23 @@ from typing import TYPE_CHECKING, Any, TypeVar from safeds._utils import _structural_hash -from safeds.data.tabular.typing._experimental_polars_data_type import _PolarsDataType +from safeds.data.tabular.typing._polars_data_type import _PolarsDataType -from ._experimental_cell import ExperimentalCell +from ._cell import Cell if TYPE_CHECKING: import polars as pl - from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType + from safeds.data.tabular.typing 
import DataType - from ._experimental_column import ExperimentalColumn + from ._column import Column T = TypeVar("T") P = TypeVar("P") R = TypeVar("R") -class _VectorizedCell(ExperimentalCell[T]): +class _VectorizedCell(Cell[T]): """ A single value in a table. @@ -40,12 +40,12 @@ def _from_polars_series(data: pl.Series) -> _VectorizedCell: # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, column: ExperimentalColumn[T]) -> None: + def __init__(self, column: Column[T]) -> None: self._series: pl.Series = column._series # "Boolean" operators (actually bitwise) ----------------------------------- - def __invert__(self) -> ExperimentalCell[bool]: + def __invert__(self) -> Cell[bool]: import polars as pl if self._series.dtype != pl.Boolean: @@ -53,42 +53,42 @@ def __invert__(self) -> ExperimentalCell[bool]: return _wrap(self._series.__invert__()) - def __and__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __and__(self, other: bool | Cell[bool]) -> Cell[bool]: right_operand = _normalize_boolean_operation_operands(self, other) if right_operand is None: return NotImplemented return _wrap(self._series.__and__(right_operand)) - def __rand__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __rand__(self, other: bool | Cell[bool]) -> Cell[bool]: right_operand = _normalize_boolean_operation_operands(self, other) if right_operand is None: return NotImplemented return _wrap(self._series.__rand__(right_operand)) - def __or__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __or__(self, other: bool | Cell[bool]) -> Cell[bool]: right_operand = _normalize_boolean_operation_operands(self, other) if right_operand is None: return NotImplemented return _wrap(self._series.__or__(right_operand)) - def __ror__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __ror__(self, 
other: bool | Cell[bool]) -> Cell[bool]: right_operand = _normalize_boolean_operation_operands(self, other) if right_operand is None: return NotImplemented return _wrap(self._series.__ror__(right_operand)) - def __xor__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __xor__(self, other: bool | Cell[bool]) -> Cell[bool]: right_operand = _normalize_boolean_operation_operands(self, other) if right_operand is None: return NotImplemented return _wrap(self._series.__xor__(right_operand)) - def __rxor__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + def __rxor__(self, other: bool | Cell[bool]) -> Cell[bool]: right_operand = _normalize_boolean_operation_operands(self, other) if right_operand is None: return NotImplemented @@ -97,100 +97,100 @@ def __rxor__(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[boo # Comparison --------------------------------------------------------------- - def __eq__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[override] + def __eq__(self, other: object) -> Cell[bool]: # type: ignore[override] other = _unwrap(other) return _wrap(self._series.__eq__(other)) - def __ge__(self, other: Any) -> ExperimentalCell[bool]: + def __ge__(self, other: Any) -> Cell[bool]: other = _unwrap(other) return _wrap(self._series.__ge__(other)) - def __gt__(self, other: Any) -> ExperimentalCell[bool]: + def __gt__(self, other: Any) -> Cell[bool]: other = _unwrap(other) return _wrap(self._series.__gt__(other)) - def __le__(self, other: Any) -> ExperimentalCell[bool]: + def __le__(self, other: Any) -> Cell[bool]: other = _unwrap(other) return _wrap(self._series.__le__(other)) - def __lt__(self, other: Any) -> ExperimentalCell[bool]: + def __lt__(self, other: Any) -> Cell[bool]: other = _unwrap(other) return _wrap(self._series.__lt__(other)) - def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[override] + def __ne__(self, other: object) -> Cell[bool]: # type: 
ignore[override] other = _unwrap(other) return _wrap(self._series.__ne__(other)) # Numeric operators -------------------------------------------------------- - def __abs__(self) -> ExperimentalCell[R]: + def __abs__(self) -> Cell[R]: return _wrap(self._series.__abs__()) - def __ceil__(self) -> ExperimentalCell[R]: + def __ceil__(self) -> Cell[R]: return _wrap(self._series.ceil()) - def __floor__(self) -> ExperimentalCell[R]: + def __floor__(self) -> Cell[R]: return _wrap(self._series.floor()) - def __neg__(self) -> ExperimentalCell[R]: + def __neg__(self) -> Cell[R]: return _wrap(self._series.__neg__()) - def __pos__(self) -> ExperimentalCell[R]: + def __pos__(self) -> Cell[R]: return _wrap(self._series.__pos__()) - def __add__(self, other: Any) -> ExperimentalCell[R]: + def __add__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__add__(other)) - def __radd__(self, other: Any) -> ExperimentalCell[R]: + def __radd__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__radd__(other)) - def __floordiv__(self, other: Any) -> ExperimentalCell[R]: + def __floordiv__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__floordiv__(other)) - def __rfloordiv__(self, other: Any) -> ExperimentalCell[R]: + def __rfloordiv__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__rfloordiv__(other)) - def __mod__(self, other: Any) -> ExperimentalCell[R]: + def __mod__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__mod__(other)) - def __rmod__(self, other: Any) -> ExperimentalCell[R]: + def __rmod__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__rmod__(other)) - def __mul__(self, other: Any) -> ExperimentalCell[R]: + def __mul__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__mul__(other)) - def __rmul__(self, other: Any) -> ExperimentalCell[R]: + def __rmul__(self, 
other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__rmul__(other)) - def __pow__(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: + def __pow__(self, other: float | Cell[P]) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__pow__(other)) - def __rpow__(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: + def __rpow__(self, other: float | Cell[P]) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__rpow__(other)) - def __sub__(self, other: Any) -> ExperimentalCell[R]: + def __sub__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__sub__(other)) - def __rsub__(self, other: Any) -> ExperimentalCell[R]: + def __rsub__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__rsub__(other)) - def __truediv__(self, other: Any) -> ExperimentalCell[R]: + def __truediv__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__truediv__(other)) - def __rtruediv__(self, other: Any) -> ExperimentalCell[R]: + def __rtruediv__(self, other: Any) -> Cell[R]: other = _unwrap(other) return _wrap(self._series.__rtruediv__(other)) @@ -219,7 +219,7 @@ def __sizeof__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def type(self) -> ExperimentalDataType: + def type(self) -> DataType: return _PolarsDataType(self._series.dtype) # ------------------------------------------------------------------------------------------------------------------ @@ -240,7 +240,7 @@ def _equals(self, other: object) -> bool: def _normalize_boolean_operation_operands( left_operand: _VectorizedCell, - right_operand: bool | ExperimentalCell[bool], + right_operand: bool | Cell[bool], ) -> pl.Series | None: """ Normalize the operands of a boolean operation (not, and, or, xor). 
diff --git a/src/safeds/data/tabular/plotting/__init__.py b/src/safeds/data/tabular/plotting/__init__.py index a6169ca18..5cb85c2a7 100644 --- a/src/safeds/data/tabular/plotting/__init__.py +++ b/src/safeds/data/tabular/plotting/__init__.py @@ -5,18 +5,18 @@ import apipkg if TYPE_CHECKING: - from ._experimental_column_plotter import ExperimentalColumnPlotter - from ._experimental_table_plotter import ExperimentalTablePlotter + from ._column_plotter import ColumnPlotter + from ._table_plotter import TablePlotter apipkg.initpkg( __name__, { - "ExperimentalColumnPlotter": "._experimental_column_plotter:ExperimentalColumnPlotter", - "ExperimentalTablePlotter": "._experimental_table_plotter:ExperimentalTablePlotter", + "ColumnPlotter": "._column_plotter:ColumnPlotter", + "TablePlotter": "._table_plotter:TablePlotter", }, ) __all__ = [ - "ExperimentalColumnPlotter", - "ExperimentalTablePlotter", + "ColumnPlotter", + "TablePlotter", ] diff --git a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py b/src/safeds/data/tabular/plotting/_column_plotter.py similarity index 80% rename from src/safeds/data/tabular/plotting/_experimental_column_plotter.py rename to src/safeds/data/tabular/plotting/_column_plotter.py index 3402cd6d4..32bebad9a 100644 --- a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py +++ b/src/safeds/data/tabular/plotting/_column_plotter.py @@ -7,10 +7,10 @@ if TYPE_CHECKING: from safeds.data.image.containers import Image - from safeds.data.tabular.containers import ExperimentalColumn + from safeds.data.tabular.containers import Column -class ExperimentalColumnPlotter: +class ColumnPlotter: """ A class that contains plotting methods for a column. 
@@ -21,13 +21,13 @@ class ExperimentalColumnPlotter: Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("test", [1, 2, 3]) >>> plotter = column.plot """ - def __init__(self, column: ExperimentalColumn): - self._column: ExperimentalColumn = column + def __init__(self, column: Column): + self._column: Column = column def box_plot(self) -> Image: """ @@ -45,8 +45,8 @@ def box_plot(self) -> Image: Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("test", [1, 2, 3]) >>> boxplot = column.plot.box_plot() """ if not self._column.is_numeric: @@ -85,8 +85,8 @@ def histogram(self, *, number_of_bins: int = 10) -> Image: Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("test", [1, 2, 3]) >>> histogram = column.plot.histogram() """ return self._column.to_table().plot.histograms(number_of_bins=number_of_bins) @@ -112,8 +112,8 @@ def lag_plot(self, lag: int) -> Image: Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> column = ExperimentalColumn("values", [1, 2, 3, 4]) + >>> from safeds.data.tabular.containers import Column + >>> column = Column("values", [1, 2, 3, 4]) >>> image = column.plot.lag_plot(2) """ if not self._column.is_numeric: diff --git a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py b/src/safeds/data/tabular/plotting/_table_plotter.py similarity index 93% rename from src/safeds/data/tabular/plotting/_experimental_table_plotter.py rename to src/safeds/data/tabular/plotting/_table_plotter.py index 16c8e5916..66ef96f5d 
100644 --- a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py +++ b/src/safeds/data/tabular/plotting/_table_plotter.py @@ -8,12 +8,27 @@ if TYPE_CHECKING: from safeds.data.image.containers import Image - from safeds.data.tabular.containers import ExperimentalTable + from safeds.data.tabular.containers import Table -class ExperimentalTablePlotter: - def __init__(self, table: ExperimentalTable): - self._table: ExperimentalTable = table +class TablePlotter: + """ + A class that contains plotting methods for a table. + + Parameters + ---------- + table: + The table to plot. + + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table("test", [1, 2, 3]) + >>> plotter = table.plot + """ + + def __init__(self, table: Table): + self._table: Table = table def box_plots(self) -> Image: """ @@ -33,7 +48,7 @@ def box_plots(self) -> Image: -------- >>> from safeds.data.tabular.containers import Table >>> table = Table({"a":[1, 2], "b": [3, 42]}) - >>> image = table.plot_boxplots() + >>> image = table.plot.box_plots() """ # TOOD: implement using matplotlib and polars import matplotlib.pyplot as plt @@ -75,7 +90,7 @@ def correlation_heatmap(self) -> Image: -------- >>> from safeds.data.tabular.containers import Table >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - >>> image = table.plot_correlation_heatmap() + >>> image = table.plot.correlation_heatmap() """ # TODO: implement using matplotlib and polars # https://stackoverflow.com/questions/33282368/plotting-a-2d-heatmap @@ -140,7 +155,7 @@ def histograms(self, *, number_of_bins: int = 10) -> Image: -------- >>> from safeds.data.tabular.containers import Table >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) - >>> image = table.plot_histograms() + >>> image = table.plot.histograms() """ # TODO: implement using polars import matplotlib.pyplot as plt @@ -218,8 +233,8 @@ def line_plot(self, x_name: str, y_name: 
str) -> Image: Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable( + >>> from safeds.data.tabular.containers import Table + >>> table = Table( ... { ... "a": [1, 2, 3, 4, 5], ... "b": [2, 3, 4, 5, 6], @@ -288,8 +303,8 @@ def scatter_plot(self, x_name: str, y_name: str) -> Image: Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable( + >>> from safeds.data.tabular.containers import Table + >>> table = Table( ... { ... "a": [1, 2, 3, 4, 5], ... "b": [2, 3, 4, 5, 6], diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index 098af9adf..66db09f28 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -6,38 +6,23 @@ if TYPE_CHECKING: from ._discretizer import Discretizer - from ._experimental_discretizer import ExperimentalDiscretizer - from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer - from ._experimental_label_encoder import ExperimentalLabelEncoder - from ._experimental_one_hot_encoder import ExperimentalOneHotEncoder - from ._experimental_range_scaler import ExperimentalRangeScaler - from ._experimental_simple_imputer import ExperimentalSimpleImputer - from ._experimental_standard_scaler import ExperimentalStandardScaler - from ._experimental_table_transformer import ExperimentalTableTransformer - from ._imputer import Imputer + from ._invertible_table_transformer import InvertibleTableTransformer from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder from ._range_scaler import RangeScaler + from ._simple_imputer import SimpleImputer from ._standard_scaler import StandardScaler - from ._table_transformer import InvertibleTableTransformer, TableTransformer + from ._table_transformer import TableTransformer apipkg.initpkg( __name__, { 
"Discretizer": "._discretizer:Discretizer", - "ExperimentalDiscretizer": "._experimental_discretizer:ExperimentalDiscretizer", - "ExperimentalInvertibleTableTransformer": "._experimental_invertible_table_transformer:ExperimentalInvertibleTableTransformer", - "ExperimentalLabelEncoder": "._experimental_label_encoder:ExperimentalLabelEncoder", - "ExperimentalOneHotEncoder": "._experimental_one_hot_encoder:ExperimentalOneHotEncoder", - "ExperimentalRangeScaler": "._experimental_range_scaler:ExperimentalRangeScaler", - "ExperimentalSimpleImputer": "._experimental_simple_imputer:ExperimentalSimpleImputer", - "ExperimentalStandardScaler": "._experimental_standard_scaler:ExperimentalStandardScaler", - "ExperimentalTableTransformer": "._experimental_table_transformer:ExperimentalTableTransformer", - "Imputer": "._imputer:Imputer", "InvertibleTableTransformer": "._table_transformer:InvertibleTableTransformer", "LabelEncoder": "._label_encoder:LabelEncoder", "OneHotEncoder": "._one_hot_encoder:OneHotEncoder", "RangeScaler": "._range_scaler:RangeScaler", + "SimpleImputer": "._simple_imputer:SimpleImputer", "StandardScaler": "._standard_scaler:StandardScaler", "TableTransformer": "._table_transformer:TableTransformer", }, @@ -45,19 +30,11 @@ __all__ = [ "Discretizer", - "ExperimentalDiscretizer", - "ExperimentalInvertibleTableTransformer", - "ExperimentalLabelEncoder", - "ExperimentalOneHotEncoder", - "ExperimentalRangeScaler", - "ExperimentalSimpleImputer", - "ExperimentalStandardScaler", - "ExperimentalTableTransformer", - "Imputer", "InvertibleTableTransformer", "LabelEncoder", "OneHotEncoder", "RangeScaler", + "SimpleImputer", "StandardScaler", "TableTransformer", ] diff --git a/src/safeds/data/tabular/transformation/_discretizer.py b/src/safeds/data/tabular/transformation/_discretizer.py index 555c47235..f22c2cb35 100644 --- a/src/safeds/data/tabular/transformation/_discretizer.py +++ b/src/safeds/data/tabular/transformation/_discretizer.py @@ -3,7 +3,6 @@ from typing 
import TYPE_CHECKING from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation._table_transformer import TableTransformer from safeds.exceptions import ( ClosedBound, NonNumericColumnError, @@ -12,6 +11,8 @@ UnknownColumnNameError, ) +from ._table_transformer import TableTransformer + if TYPE_CHECKING: from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer @@ -84,11 +85,14 @@ def fit(self, table: Table, column_names: list[str] | None) -> Discretizer: ) for column in column_names: - if not table.get_column(column).type.is_numeric(): + if not table.get_column(column).type.is_numeric: raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._number_of_bins, encode="ordinal") - wrapped_transformer.fit(table._data[column_names]) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) result = Discretizer(self._number_of_bins) result._wrapped_transformer = wrapped_transformer @@ -141,13 +145,15 @@ def transform(self, table: Table) -> Table: ) for column in self._column_names: - if not table.get_column(column).type.is_numeric(): + if not table.get_column(column).type.is_numeric: raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") - data = table._data.reset_index(drop=True) - data.columns = table.column_names - data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return Table._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) @property def is_fitted(self) -> bool: @@ -172,7 +178,6 @@ def get_names_of_added_columns(self) -> list[str]: raise TransformerNotFittedError return [] - # (Must implement abstract 
method, cannot instantiate class otherwise.) def get_names_of_changed_columns(self) -> list[str]: """ Get the names of all columns that may have been changed by the Discretizer. diff --git a/src/safeds/data/tabular/transformation/_experimental_discretizer.py b/src/safeds/data/tabular/transformation/_experimental_discretizer.py deleted file mode 100644 index ea3485831..000000000 --- a/src/safeds/data/tabular/transformation/_experimental_discretizer.py +++ /dev/null @@ -1,215 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds.data.tabular.containers import ExperimentalTable -from safeds.exceptions import ( - ClosedBound, - NonNumericColumnError, - OutOfBoundsError, - TransformerNotFittedError, - UnknownColumnNameError, -) - -from ._experimental_table_transformer import ExperimentalTableTransformer - -if TYPE_CHECKING: - from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer - - -class ExperimentalDiscretizer(ExperimentalTableTransformer): - """ - The Discretizer bins continuous data into intervals. - - Parameters - ---------- - number_of_bins: - The number of bins to be created. - - Raises - ------ - OutOfBoundsError - If the given number_of_bins is less than 2. - """ - - def __init__(self, number_of_bins: int = 5): - self._column_names: list[str] | None = None - self._wrapped_transformer: sk_KBinsDiscretizer | None = None - - if number_of_bins < 2: - raise OutOfBoundsError(number_of_bins, name="number_of_bins", lower_bound=ClosedBound(2)) - self._number_of_bins = number_of_bins - - def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalDiscretizer: - """ - Learn a transformation for a set of columns in a table. - - This transformer is not modified. - - Parameters - ---------- - table: - The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. 
- - Returns - ------- - fitted_transformer: - The fitted transformer. - - Raises - ------ - ValueError - If the table is empty. - NonNumericColumnError - If one of the columns, that should be fitted is non-numeric. - UnknownColumnNameError - If one of the columns, that should be fitted is not in the table. - """ - from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer - - if table.number_of_rows == 0: - raise ValueError("The Discretizer cannot be fitted because the table contains 0 rows") - - if column_names is None: - column_names = table.column_names - else: - missing_columns = set(column_names) - set(table.column_names) - if len(missing_columns) > 0: - raise UnknownColumnNameError( - sorted( - missing_columns, - key={val: ix for ix, val in enumerate(column_names)}.__getitem__, - ), - ) - - for column in column_names: - if not table.get_column(column).type.is_numeric: - raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") - - wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._number_of_bins, encode="ordinal") - wrapped_transformer.set_output(transform="polars") - wrapped_transformer.fit( - table.remove_columns_except(column_names)._data_frame, - ) - - result = ExperimentalDiscretizer(self._number_of_bins) - result._wrapped_transformer = wrapped_transformer - result._column_names = column_names - - return result - - def transform(self, table: ExperimentalTable) -> ExperimentalTable: - """ - Apply the learned transformation to a table. - - The table is not modified. - - Parameters - ---------- - table: - The table to which the learned transformation is applied. - - Returns - ------- - transformed_table: - The transformed table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - ValueError - If the table is empty. - UnknownColumnNameError - If one of the columns, that should be transformed is not in the table. 
- NonNumericColumnError - If one of the columns, that should be fitted is non-numeric. - """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: - raise TransformerNotFittedError - - if table.number_of_rows == 0: - raise ValueError("The table cannot be transformed because it contains 0 rows") - - # Input table does not contain all columns used to fit the transformer - missing_columns = set(self._column_names) - set(table.column_names) - if len(missing_columns) > 0: - raise UnknownColumnNameError( - sorted( - missing_columns, - key={val: ix for ix, val in enumerate(self._column_names)}.__getitem__, - ), - ) - - for column in self._column_names: - if not table.get_column(column).type.is_numeric: - raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") - - new_data = self._wrapped_transformer.transform( - table.remove_columns_except(self._column_names)._data_frame, - ) - return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data.lazy()), - ) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the Discretizer. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the Discretizer. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the Discretizer. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the Discretizer was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py deleted file mode 100644 index a556260aa..000000000 --- a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py +++ /dev/null @@ -1,246 +0,0 @@ -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING - -from safeds.data.tabular.containers import ExperimentalTable -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError - -from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer - -if TYPE_CHECKING: - from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder - - -class ExperimentalLabelEncoder(ExperimentalInvertibleTableTransformer): - """The LabelEncoder encodes one or more given columns into labels.""" - - def __init__(self) -> None: - self._wrapped_transformer: sk_OrdinalEncoder | None = None - self._column_names: list[str] | None = None - - def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalLabelEncoder: - """ - Learn a transformation for a set of columns in a table. - - This transformer is not modified. - - Parameters - ---------- - table: - The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. 
If `None`, all columns are used. - - Returns - ------- - fitted_transformer: - The fitted transformer. - - Raises - ------ - UnknownColumnNameError - If column_names contain a column name that is missing in the table. - ValueError - If the table contains 0 rows. - """ - from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder - - if column_names is None: - column_names = table.column_names - else: - missing_columns = sorted(set(column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") - - if table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0: - warnings.warn( - "The columns" - f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" - " numerical data. The LabelEncoder is designed to encode non-numerical values into numerical values", - UserWarning, - stacklevel=2, - ) - - # TODO: use polars Enum type instead: - # my_enum = pl.Enum(['A', 'B', 'C']) <-- create this from the given order - # my_data = pl.Series(['A', 'A', 'B'], dtype=my_enum) - wrapped_transformer = sk_OrdinalEncoder() - wrapped_transformer.set_output(transform="polars") - wrapped_transformer.fit( - table.remove_columns_except(column_names)._data_frame, - ) - - result = ExperimentalLabelEncoder() - result._wrapped_transformer = wrapped_transformer - result._column_names = column_names - - return result - - def transform(self, table: ExperimentalTable) -> ExperimentalTable: - """ - Apply the learned transformation to a table. - - The table is not modified. - - Parameters - ---------- - table: - The table to which the learned transformation is applied. - - Returns - ------- - transformed_table: - The transformed table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. - ValueError - If the table contains 0 rows. - """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: - raise TransformerNotFittedError - - # Input table does not contain all columns used to fit the transformer - missing_columns = sorted(set(self._column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") - - new_data = self._wrapped_transformer.transform( - table.remove_columns_except(self._column_names)._data_frame, - ) - return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data.lazy()), - ) - - def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: - """ - Undo the learned transformation. - - The table is not modified. - - Parameters - ---------- - transformed_table: - The table to be transformed back to the original version. - - Returns - ------- - original_table: - The original table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError - If the specified columns of the input table contain non-numerical data. - ValueError - If the table contains 0 rows. 
- """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: - raise TransformerNotFittedError - - missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if transformed_table.number_of_rows == 0: - raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows") - - if transformed_table.remove_columns_except( - self._column_names, - ).remove_non_numeric_columns().number_of_columns < len(self._column_names): - raise NonNumericColumnError( - str( - sorted( - set(self._column_names) - - set( - transformed_table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .column_names, - ), - ), - ), - ) - - new_data = self._wrapped_transformer.inverse_transform( - transformed_table.remove_columns_except(self._column_names)._data_frame, - ) - return ExperimentalTable._from_polars_lazy_frame( - transformed_table._lazy_frame.update(new_data.lazy()), - ) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the LabelEncoder. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - # (Must implement abstract method, cannot instantiate class otherwise.) - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the LabelEncoder. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. 
- - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the LabelEncoder. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the LabelEncoder was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py deleted file mode 100644 index a11cf5798..000000000 --- a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py +++ /dev/null @@ -1,381 +0,0 @@ -from __future__ import annotations - -import warnings -from collections import Counter -from typing import Any - -from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable -from safeds.exceptions import ( - NonNumericColumnError, - TransformerNotFittedError, - UnknownColumnNameError, - ValueNotPresentWhenFittedError, -) - -from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer - - -class ExperimentalOneHotEncoder(ExperimentalInvertibleTableTransformer): - """ - A way to deal with categorical features that is particularly useful for unordered (i.e. nominal) data. - - It replaces a column with a set of columns, each representing a unique value in the original column. The value of - each new column is 1 if the original column had that value, and 0 otherwise. 
Take the following table as an - example: - - | col1 | - |------| - | "a" | - | "b" | - | "c" | - | "a" | - - The one-hot encoding of this table is: - - | col1__a | col1__b | col1__c | - |---------|---------|---------| - | 1 | 0 | 0 | - | 0 | 1 | 0 | - | 0 | 0 | 1 | - | 1 | 0 | 0 | - - The name "one-hot" comes from the fact that each row has exactly one 1 in it, and the rest of the values are 0s. - One-hot encoding is closely related to dummy variable / indicator variables, which are used in statistics. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> from safeds.data.tabular.transformation import OneHotEncoder - >>> table = Table({"col1": ["a", "b", "c", "a"]}) - >>> transformer = OneHotEncoder() - >>> transformer.fit_and_transform(table, ["col1"])[1] - col1__a col1__b col1__c - 0 1.0 0.0 0.0 - 1 0.0 1.0 0.0 - 2 0.0 0.0 1.0 - 3 1.0 0.0 0.0 - """ - - def __init__(self) -> None: - # Maps each old column to (list of) new columns created from it: - self._column_names: dict[str, list[str]] | None = None - # Maps concrete values (tuples of old column and value) to corresponding new column names: - self._value_to_column: dict[tuple[str, Any], str] | None = None - # Maps nan values (str of old column) to corresponding new column name - self._value_to_column_nans: dict[str, str] | None = None - - def __hash__(self) -> int: - return super().__hash__() - - def __eq__(self, other: object) -> bool: - if not isinstance(other, ExperimentalOneHotEncoder): - return NotImplemented - return ( - self._column_names == other._column_names - and self._value_to_column == other._value_to_column - and self._value_to_column_nans == other._value_to_column_nans - ) - - def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalOneHotEncoder: - """ - Learn a transformation for a set of columns in a table. - - This transformer is not modified. - - Parameters - ---------- - table: - The table used to fit the transformer. 
- column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. - - Returns - ------- - fitted_transformer: - The fitted transformer. - - Raises - ------ - UnknownColumnNameError - If column_names contain a column name that is missing in the table. - ValueError - If the table contains 0 rows. - """ - import numpy as np - - if column_names is None: - column_names = table.column_names - else: - missing_columns = sorted(set(column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The OneHotEncoder cannot be fitted because the table contains 0 rows") - - if table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0: - warnings.warn( - "The columns" - f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" - " numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values", - UserWarning, - stacklevel=2, - ) - - result = ExperimentalOneHotEncoder() - - result._column_names = {} - result._value_to_column = {} - result._value_to_column_nans = {} - - # Keep track of number of occurrences of column names; - # initially all old column names appear exactly once: - name_counter = Counter(table.column_names) - - # Iterate through all columns to-be-changed: - for column in column_names: - result._column_names[column] = [] - for element in table.get_column(column).get_distinct_values(): - base_name = f"{column}__{element}" - name_counter[base_name] += 1 - new_column_name = base_name - # Check if newly created name matches some other existing column name: - if name_counter[base_name] > 1: - new_column_name += f"#{name_counter[base_name]}" - # Update dictionary entries: - result._column_names[column] += [new_column_name] - if isinstance(element, float) and np.isnan(element): - 
result._value_to_column_nans[column] = new_column_name - else: - result._value_to_column[(column, element)] = new_column_name - - return result - - def transform(self, table: ExperimentalTable) -> ExperimentalTable: - """ - Apply the learned transformation to a table. - - The table is not modified. - - Parameters - ---------- - table: - The table to which the learned transformation is applied. - - Returns - ------- - transformed_table: - The transformed table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. - ValueError - If the table contains 0 rows. - ValueNotPresentWhenFittedError - If a column in the to-be-transformed table contains a new value that was not already present in the table the OneHotEncoder was fitted on. - """ - import numpy as np - - # Transformer has not been fitted yet - if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None: - raise TransformerNotFittedError - - # Input table does not contain all columns used to fit the transformer - missing_columns = sorted(set(self._column_names.keys()) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") - - encoded_values = {} - for new_column_name in self._value_to_column.values(): - encoded_values[new_column_name] = [0.0 for _ in range(table.number_of_rows)] - for new_column_name in self._value_to_column_nans.values(): - encoded_values[new_column_name] = [0.0 for _ in range(table.number_of_rows)] - - values_not_present_when_fitted = [] - for old_column_name in self._column_names: - for i in range(table.number_of_rows): - value = table.get_column(old_column_name).get_value(i) - try: - if isinstance(value, float) and np.isnan(value): - 
new_column_name = self._value_to_column_nans[old_column_name] - else: - new_column_name = self._value_to_column[(old_column_name, value)] - encoded_values[new_column_name][i] = 1.0 - except KeyError: - # This happens when a column in the to-be-transformed table contains a new value that was not - # already present in the table the OneHotEncoder was fitted on. - values_not_present_when_fitted.append((value, old_column_name)) - - for new_column in self._column_names[old_column_name]: - table = table.add_columns([ExperimentalColumn(new_column, encoded_values[new_column])]) - - if len(values_not_present_when_fitted) > 0: - raise ValueNotPresentWhenFittedError(values_not_present_when_fitted) - - # New columns may not be sorted: - column_names = [] - for name in table.column_names: - if name not in self._column_names: - column_names.append(name) - else: - column_names.extend( - [f_name for f_name in self._value_to_column.values() if f_name.startswith(name)] - + [f_name for f_name in self._value_to_column_nans.values() if f_name.startswith(name)], - ) - - # Drop old, non-encoded columns: - # (Don't do this earlier - we need the old column nams for sorting, - # plus we need to prevent the table from possibly having 0 columns temporarily.) - return table.remove_columns(list(self._column_names.keys())) - - def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: - """ - Undo the learned transformation. - - The table is not modified. - - Parameters - ---------- - transformed_table: - The table to be transformed back to the original version. - - Returns - ------- - table: - The original table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError - If the transformed columns of the input table contain non-numerical data. - ValueError - If the table contains 0 rows. 
- """ - # Transformer has not been fitted yet - if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None: - raise TransformerNotFittedError - - _transformed_column_names = [item for sublist in self._column_names.values() for item in sublist] - - missing_columns = sorted(set(_transformed_column_names) - set(transformed_table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if transformed_table.number_of_rows == 0: - raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows") - - if transformed_table.remove_columns_except( - _transformed_column_names, - ).remove_non_numeric_columns().number_of_columns < len(_transformed_column_names): - raise NonNumericColumnError( - str( - sorted( - set(_transformed_column_names) - - set( - transformed_table.remove_columns_except(_transformed_column_names) - .remove_non_numeric_columns() - .column_names, - ), - ), - ), - ) - - original_columns = {} - for original_column_name in self._column_names: - original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)] - - for original_column_name, value in self._value_to_column: - constructed_column = self._value_to_column[(original_column_name, value)] - for i in range(transformed_table.number_of_rows): - if transformed_table.get_column(constructed_column)[i] == 1.0: - original_columns[original_column_name][i] = value - - for original_column_name in self._value_to_column_nans: - constructed_column = self._value_to_column_nans[original_column_name] - for i in range(transformed_table.number_of_rows): - if transformed_table.get_column(constructed_column)[i] == 1.0: - original_columns[original_column_name][i] = None - - table = transformed_table - - for column_name, encoded_column in original_columns.items(): - table = table.add_columns(ExperimentalColumn(column_name, encoded_column)) - - # Drop old column names: - table = 
table.remove_columns(list(self._value_to_column.values())) - return table.remove_columns(list(self._value_to_column_nans.values())) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return ( - self._column_names is not None - and self._value_to_column is not None - and self._value_to_column_nans is not None - ) - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the OneHotEncoder. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return [name for column_names in self._column_names.values() for name in column_names] - - # (Must implement abstract method, cannot instantiate class otherwise.) - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that have been changed by the OneHotEncoder (none). - - Returns - ------- - changed_columns: - The empty list. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the OneHotEncoder. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the OneHotEncoder was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- """ - if self._column_names is None: - raise TransformerNotFittedError - return list(self._column_names.keys()) diff --git a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py deleted file mode 100644 index 7d708b721..000000000 --- a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py +++ /dev/null @@ -1,295 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds.data.tabular.containers import ExperimentalTable -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError - -from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer - -if TYPE_CHECKING: - from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler - - -class ExperimentalRangeScaler(ExperimentalInvertibleTableTransformer): - """ - The RangeScaler transforms column values by scaling each value to a given range. - - Parameters - ---------- - min_: - The minimum of the new range after the transformation - max_: - The maximum of the new range after the transformation - - Raises - ------ - ValueError - If the given minimum is greater or equal to the given maximum - """ - - def __init__(self, min_: float = 0.0, max_: float = 1.0): - self._column_names: list[str] | None = None - self._wrapped_transformer: sk_MinMaxScaler | None = None - if min_ >= max_: - raise ValueError('Parameter "maximum" must be higher than parameter "minimum".') - self._minimum = min_ - self._maximum = max_ - - def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalRangeScaler: - """ - Learn a transformation for a set of columns in a table. - - This transformer is not modified. - - Parameters - ---------- - table: - The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. 
- - Returns - ------- - fitted_transformer: - The fitted transformer. - - Raises - ------ - UnknownColumnNameError - If column_names contain a column name that is missing in the table. - NonNumericColumnError - If at least one of the specified columns in the table contains non-numerical data. - ValueError - If the table contains 0 rows. - """ - from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler - - if column_names is None: - column_names = table.column_names - else: - missing_columns = sorted(set(column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows") - - if ( - table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns - < table.remove_columns_except(column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(table.remove_columns_except(column_names).column_names) - - set( - table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, - ), - ), - ), - ) - - wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum)) - wrapped_transformer.set_output(transform="polars") - wrapped_transformer.fit( - table.remove_columns_except(column_names)._data_frame, - ) - - result = ExperimentalRangeScaler() - result._wrapped_transformer = wrapped_transformer - result._column_names = column_names - - return result - - def transform(self, table: ExperimentalTable) -> ExperimentalTable: - """ - Apply the learned transformation to a table. - - The table is not modified. - - Parameters - ---------- - table: - The table to which the learned transformation is applied. - - Returns - ------- - transformed_table: - The transformed table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError - If at least one of the columns in the input table that is used to fit contains non-numerical data. - ValueError - If the table contains 0 rows. - """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: - raise TransformerNotFittedError - - # Input table does not contain all columns used to fit the transformer - missing_columns = sorted(set(self._column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") - - if ( - table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns - < table.remove_columns_except(self._column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(table.remove_columns_except(self._column_names).column_names) - - set( - table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, - ), - ), - ), - ) - - new_data = self._wrapped_transformer.transform( - table.remove_columns_except(self._column_names)._data_frame, - ) - return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data.lazy()), - ) - - def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: - """ - Undo the learned transformation. - - The table is not modified. - - Parameters - ---------- - transformed_table: - The table to be transformed back to the original version. - - Returns - ------- - original_table: - The original table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. 
- NonNumericColumnError - If the transformed columns of the input table contain non-numerical data. - ValueError - If the table contains 0 rows. - """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: - raise TransformerNotFittedError - - missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if transformed_table.number_of_rows == 0: - raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") - - if ( - transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns - < transformed_table.remove_columns_except(self._column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(transformed_table.remove_columns_except(self._column_names).column_names) - - set( - transformed_table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .column_names, - ), - ), - ), - ) - - import polars as pl - - new_data = pl.DataFrame( - self._wrapped_transformer.inverse_transform( - transformed_table.remove_columns_except(self._column_names)._data_frame, - ) - ) - - name_mapping = dict(zip(new_data.columns, self._column_names, strict=True)) - - new_data = new_data.rename(name_mapping) - - return ExperimentalTable._from_polars_data_frame( - transformed_table._data_frame.update(new_data), - ) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the RangeScaler. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - # (Must implement abstract method, cannot instantiate class otherwise.) - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the RangeScaler. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the RangeScaler. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the RangeScaler was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py deleted file mode 100644 index c3176eb81..000000000 --- a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py +++ /dev/null @@ -1,268 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds.data.tabular.containers import ExperimentalTable -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError - -from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer - -if TYPE_CHECKING: - from sklearn.preprocessing import StandardScaler as sk_StandardScaler - - -class ExperimentalStandardScaler(ExperimentalInvertibleTableTransformer): - """The StandardScaler transforms column values to a range by removing the mean and scaling to unit variance.""" 
- - def __init__(self) -> None: - self._column_names: list[str] | None = None - self._wrapped_transformer: sk_StandardScaler | None = None - - def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalStandardScaler: - """ - Learn a transformation for a set of columns in a table. - - This transformer is not modified. - - Parameters - ---------- - table: - The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. - - Returns - ------- - fitted_transformer: - The fitted transformer. - - Raises - ------ - UnknownColumnNameError - If column_names contain a column name that is missing in the table. - NonNumericColumnError - If at least one of the specified columns in the table contains non-numerical data. - ValueError - If the table contains 0 rows. - """ - from sklearn.preprocessing import StandardScaler as sk_StandardScaler - - if column_names is None: - column_names = table.column_names - else: - missing_columns = sorted(set(column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The StandardScaler cannot be fitted because the table contains 0 rows") - - if ( - table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns - < table.remove_columns_except(column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(table.remove_columns_except(column_names).column_names) - - set( - table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, - ), - ), - ), - ) - - wrapped_transformer = sk_StandardScaler() - wrapped_transformer.set_output(transform="polars") - wrapped_transformer.fit( - table.remove_columns_except(column_names)._data_frame, - ) - - result = ExperimentalStandardScaler() - result._wrapped_transformer = wrapped_transformer - 
result._column_names = column_names - - return result - - def transform(self, table: ExperimentalTable) -> ExperimentalTable: - """ - Apply the learned transformation to a table. - - The table is not modified. - - Parameters - ---------- - table: - The table to which the learned transformation is applied. - - Returns - ------- - transformed_table: - The transformed table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError - If at least one of the columns in the input table that is used to fit contains non-numerical data. - ValueError - If the table contains 0 rows. - """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: - raise TransformerNotFittedError - - # Input table does not contain all columns used to fit the transformer - missing_columns = sorted(set(self._column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") - - if ( - table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns - < table.remove_columns_except(self._column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(table.remove_columns_except(self._column_names).column_names) - - set( - table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, - ), - ), - ), - ) - - new_data = self._wrapped_transformer.transform( - table.remove_columns_except(self._column_names)._data_frame, - ) - return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data.lazy()), - ) - - def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: - """ - Undo 
the learned transformation. - - The table is not modified. - - Parameters - ---------- - transformed_table: - The table to be transformed back to the original version. - - Returns - ------- - original_table: - The original table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError - If the transformed columns of the input table contain non-numerical data. - ValueError - If the table contains 0 rows. - """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: - raise TransformerNotFittedError - - missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if transformed_table.number_of_rows == 0: - raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") - - if ( - transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns - < transformed_table.remove_columns_except(self._column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(transformed_table.remove_columns_except(self._column_names).column_names) - - set( - transformed_table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .column_names, - ), - ), - ), - ) - - new_data = self._wrapped_transformer.inverse_transform( - transformed_table.remove_columns_except(self._column_names)._data_frame, - ) - return ExperimentalTable._from_polars_data_frame( - transformed_table._data_frame.update(new_data), - ) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been 
added by the StandardScaler. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - # (Must implement abstract method, cannot instantiate class otherwise.) - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the StandardScaler. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the StandardScaler. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the StandardScaler was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py deleted file mode 100644 index ed30ae728..000000000 --- a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Self - -from safeds._utils import _structural_hash - -if TYPE_CHECKING: - from safeds.data.tabular.containers import ExperimentalTable - - -class ExperimentalTableTransformer(ABC): - """Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns.""" - - # ------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - def __hash__(self) -> int: - """ - Return a deterministic hash value for a table transformer. - - Returns - ------- - hash: - The hash value. 
- """ - added = self.get_names_of_added_columns() if self.is_fitted else [] - changed = self.get_names_of_changed_columns() if self.is_fitted else [] - removed = self.get_names_of_removed_columns() if self.is_fitted else [] - return _structural_hash(self.__class__.__qualname__, self.is_fitted, added, changed, removed) - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - @abstractmethod - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - - # ------------------------------------------------------------------------------------------------------------------ - # Methods - # ------------------------------------------------------------------------------------------------------------------ - - @abstractmethod - def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Self: - """ - Learn a transformation for a set of columns in a table. - - This transformer is not modified. - - Parameters - ---------- - table: - The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. - - Returns - ------- - fitted_transformer: - The fitted transformer. - """ - - @abstractmethod - def transform(self, table: ExperimentalTable) -> ExperimentalTable: - """ - Apply the learned transformation to a table. - - The table is not modified. - - Parameters - ---------- - table: - The table to which the learned transformation is applied. - - Returns - ------- - transformed_table: - The transformed table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- """ - - @abstractmethod - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the transformer. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - - @abstractmethod - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that have been changed by the transformer. - - Returns - ------- - changed_columns: - A list of names of changed columns, ordered as they appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - - @abstractmethod - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the transformer. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the transformer was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - - def fit_and_transform( - self, table: ExperimentalTable, column_names: list[str] | None = None - ) -> tuple[Self, ExperimentalTable]: - """ - Learn a transformation for a set of columns in a table and apply the learned transformation to the same table. - - Neither the transformer nor the table are modified. - - Parameters - ---------- - table: - The table used to fit the transformer. The transformer is then applied to this table. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. - - Returns - ------- - fitted_transformer: - The fitted transformer. - transformed_table: - The transformed table. 
- """ - fitted_transformer = self.fit(table, column_names) - transformed_table = fitted_transformer.transform(table) - return fitted_transformer, transformed_table diff --git a/src/safeds/data/tabular/transformation/_imputer.py b/src/safeds/data/tabular/transformation/_imputer.py deleted file mode 100644 index 416c19bce..000000000 --- a/src/safeds/data/tabular/transformation/_imputer.py +++ /dev/null @@ -1,389 +0,0 @@ -from __future__ import annotations - -import sys -import warnings -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any - -import pandas as pd - -from safeds._utils import _structural_hash -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation._table_transformer import TableTransformer -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError - -if TYPE_CHECKING: - from sklearn.impute import SimpleImputer as sk_SimpleImputer - - -class Imputer(TableTransformer): - """ - Replace missing values using the given strategy. - - Parameters - ---------- - strategy: - How to replace missing values. - value_to_replace: - The value that should be replaced. - - Examples - -------- - >>> from safeds.data.tabular.containers import Column, Table - >>> from safeds.data.tabular.transformation import Imputer - >>> - >>> table = Table.from_columns( - ... [ - ... Column("a", [1, 3, None]), - ... Column("b", [None, 2, 3]), - ... ], - ... ) - >>> transformer = Imputer(Imputer.Strategy.Constant(0)) - >>> transformed_table = transformer.fit_and_transform(table) - """ - - class Strategy(ABC): - """Various strategies to replace missing values. 
Use the static methods to create instances of this class.""" - - @abstractmethod - def __eq__(self, other: object) -> bool: - pass # pragma: no cover - - @abstractmethod - def __hash__(self) -> int: - pass # pragma: no cover - - @abstractmethod - def _apply(self, imputer: sk_SimpleImputer) -> None: - """ - Set the imputer strategy of the given imputer. - - Parameters - ---------- - imputer: - The imputer to augment. - """ - - @staticmethod - def Constant(value: Any) -> Imputer.Strategy: # noqa: N802 - """ - Replace missing values with the given constant value. - - Parameters - ---------- - value: - The value to replace missing values. - """ - return _Constant(value) # pragma: no cover - - @staticmethod - def Mean() -> Imputer.Strategy: # noqa: N802 - """Replace missing values with the mean of each column.""" - return _Mean() # pragma: no cover - - @staticmethod - def Median() -> Imputer.Strategy: # noqa: N802 - """Replace missing values with the median of each column.""" - return _Median() # pragma: no cover - - @staticmethod - def Mode() -> Imputer.Strategy: # noqa: N802 - """Replace missing values with the mode of each column.""" - return _Mode() # pragma: no cover - - def __init__(self, strategy: Imputer.Strategy, *, value_to_replace: float | str | None = None): - if value_to_replace is None: - value_to_replace = pd.NA - - self._strategy = strategy - self._value_to_replace = value_to_replace - - self._wrapped_transformer: sk_SimpleImputer | None = None - self._column_names: list[str] | None = None - - @property - def strategy(self) -> Imputer.Strategy: - """The strategy used to replace missing values.""" - return self._strategy - - @property - def value_to_replace(self) -> Any: - """The value that should be replaced.""" - return self._value_to_replace - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - - def fit(self, table: Table, column_names: list[str] | None) -> Imputer: - 
""" - Learn a transformation for a set of columns in a table. - - This transformer is not modified. - - Parameters - ---------- - table: - The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. - - Returns - ------- - fitted_transformer: - The fitted transformer. - - Raises - ------ - UnknownColumnNameError - If column_names contain a column name that is missing in the table - ValueError - If the table contains 0 rows - NonNumericColumnError - If the strategy is set to either Mean or Median and the specified columns of the table contain non-numerical data. - """ - from sklearn.impute import SimpleImputer as sk_SimpleImputer - - if column_names is None: - column_names = table.column_names - else: - missing_columns = sorted(set(column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The Imputer cannot be fitted because the table contains 0 rows") - - if (isinstance(self._strategy, _Mean | _Median)) and table.keep_only_columns( - column_names, - ).remove_columns_with_non_numerical_values().number_of_columns < len( - column_names, - ): - raise NonNumericColumnError( - str( - sorted( - set(table.keep_only_columns(column_names).column_names) - - set( - table.keep_only_columns(column_names) - .remove_columns_with_non_numerical_values() - .column_names, - ), - ), - ), - ) - - if isinstance(self._strategy, _Mode): - multiple_most_frequent = {} - for name in column_names: - if len(table.get_column(name).mode()) > 1: - multiple_most_frequent[name] = table.get_column(name).mode() - if len(multiple_most_frequent) > 0: - warnings.warn( - "There are multiple most frequent values in a column given to the Imputer.\nThe lowest values" - " are being chosen in this cases. 
The following columns have multiple most frequent" - f" values:\n{multiple_most_frequent}", - UserWarning, - stacklevel=2, - ) - - wrapped_transformer = sk_SimpleImputer() - self._strategy._apply(wrapped_transformer) - wrapped_transformer.missing_values = self._value_to_replace - wrapped_transformer.fit(table._data[column_names]) - - result = Imputer(self._strategy) - result._wrapped_transformer = wrapped_transformer - result._column_names = column_names - - return result - - def transform(self, table: Table) -> Table: - """ - Apply the learned transformation to a table. - - The table is not modified. - - Parameters - ---------- - table: - The table to which the learned transformation is applied. - - Returns - ------- - transformed_table: - The transformed table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - UnknownColumnNameError - If the input table does not contain all columns used to fit the transformer. - ValueError - If the table contains 0 rows. - """ - import pandas as pd - - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: - raise TransformerNotFittedError - - # Input table does not contain all columns used to fit the transformer - missing_columns = sorted(set(self._column_names) - set(table.column_names)) - if len(missing_columns) > 0: - raise UnknownColumnNameError(missing_columns) - - if table.number_of_rows == 0: - raise ValueError("The Imputer cannot transform the table because it contains 0 rows") - - data = table._data.reset_index(drop=True) - data[self._column_names] = pd.DataFrame( - self._wrapped_transformer.transform(data[self._column_names]), - columns=self._column_names, - ) - return Table._from_pandas_dataframe(data, table.schema) - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the Imputer. 
- - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the Imputer. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the Imputer. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the Imputer was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - -# ---------------------------------------------------------------------------------------------------------------------- -# Imputation strategies -# ---------------------------------------------------------------------------------------------------------------------- - - -class _Constant(Imputer.Strategy): - def __init__(self, value: Any): - self._value = value - - @property - def value(self) -> Any: - return self._value - - def __eq__(self, other: object) -> bool: - if not isinstance(other, _Constant): - return NotImplemented - if self is other: - return True - return self._value == other._value - - def __hash__(self) -> int: - return _structural_hash(str(self)) - - def __sizeof__(self) -> int: - return sys.getsizeof(self._value) - - def __str__(self) -> str: - return f"Constant({self._value})" - - def _apply(self, imputer: sk_SimpleImputer) -> None: - imputer.strategy = "constant" - imputer.fill_value = self._value - - -class _Mean(Imputer.Strategy): - def __eq__(self, other: object) -> bool: - if not isinstance(other, _Mean): - return NotImplemented - return True - - def __hash__(self) -> int: - return _structural_hash(str(self)) - - def __str__(self) -> str: - return "Mean" - - def _apply(self, imputer: sk_SimpleImputer) -> None: - imputer.strategy = "mean" - - -class _Median(Imputer.Strategy): - def __eq__(self, other: object) -> bool: - if not isinstance(other, _Median): - return NotImplemented - return True - - def __hash__(self) -> int: - return _structural_hash(str(self)) - - def __str__(self) -> str: - return "Median" - - def _apply(self, imputer: sk_SimpleImputer) -> None: - imputer.strategy = "median" - - -class _Mode(Imputer.Strategy): - def __eq__(self, other: object) -> bool: - if not isinstance(other, _Mode): - return NotImplemented - return True - - def __hash__(self) -> int: - return _structural_hash(str(self)) - - def __str__(self) -> str: - return "Mode" - - def 
_apply(self, imputer: sk_SimpleImputer) -> None: - imputer.strategy = "most_frequent" - - -# Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type. -# This is needed for the DSL, where imputer strategies are variants of an enum. -Imputer.Strategy.Constant = _Constant # type: ignore[method-assign] -Imputer.Strategy.Mean = _Mean # type: ignore[method-assign] -Imputer.Strategy.Median = _Median # type: ignore[method-assign] -Imputer.Strategy.Mode = _Mode # type: ignore[method-assign] diff --git a/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py b/src/safeds/data/tabular/transformation/_invertible_table_transformer.py similarity index 69% rename from src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py rename to src/safeds/data/tabular/transformation/_invertible_table_transformer.py index 9e240c050..cd0e25da9 100644 --- a/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_invertible_table_transformer.py @@ -3,17 +3,17 @@ from abc import abstractmethod from typing import TYPE_CHECKING -from ._experimental_table_transformer import ExperimentalTableTransformer +from ._table_transformer import TableTransformer if TYPE_CHECKING: - from safeds.data.tabular.containers import ExperimentalTable + from safeds.data.tabular.containers import Table -class ExperimentalInvertibleTableTransformer(ExperimentalTableTransformer): +class InvertibleTableTransformer(TableTransformer): """A `TableTransformer` that can also undo the learned transformation after it has been applied.""" @abstractmethod - def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + def inverse_transform(self, transformed_table: Table) -> Table: """ Undo the learned transformation. 
diff --git a/src/safeds/data/tabular/transformation/_label_encoder.py b/src/safeds/data/tabular/transformation/_label_encoder.py index bfaa0319b..6dd2091e5 100644 --- a/src/safeds/data/tabular/transformation/_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_label_encoder.py @@ -4,11 +4,10 @@ from typing import TYPE_CHECKING from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation._table_transformer import ( - InvertibleTableTransformer, -) from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError +from ._invertible_table_transformer import InvertibleTableTransformer + if TYPE_CHECKING: from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder @@ -57,17 +56,23 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder: if table.number_of_rows == 0: raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") - if table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns > 0: + if table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0: warnings.warn( "The columns" - f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain" + f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" " numerical data. 
The LabelEncoder is designed to encode non-numerical values into numerical values", UserWarning, stacklevel=2, ) + # TODO: use polars Enum type instead: + # my_enum = pl.Enum(['A', 'B', 'C']) <-- create this from the given order + # my_data = pl.Series(['A', 'A', 'B'], dtype=my_enum) wrapped_transformer = sk_OrdinalEncoder() - wrapped_transformer.fit(table._data[column_names]) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) result = LabelEncoder() result._wrapped_transformer = wrapped_transformer @@ -112,10 +117,12 @@ def transform(self, table: Table) -> Table: if table.number_of_rows == 0: raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") - data = table._data.reset_index(drop=True) - data.columns = table.column_names - data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return Table._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) def inverse_transform(self, transformed_table: Table) -> Table: """ @@ -130,7 +137,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: Returns ------- - table: + original_table: The original table. 
Raises @@ -155,26 +162,28 @@ def inverse_transform(self, transformed_table: Table) -> Table: if transformed_table.number_of_rows == 0: raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows") - if transformed_table.keep_only_columns( + if transformed_table.remove_columns_except( self._column_names, - ).remove_columns_with_non_numerical_values().number_of_columns < len(self._column_names): + ).remove_non_numeric_columns().number_of_columns < len(self._column_names): raise NonNumericColumnError( str( sorted( set(self._column_names) - set( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .column_names, ), ), ), ) - data = transformed_table._data.reset_index(drop=True) - data.columns = transformed_table.column_names - data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + return Table._from_polars_lazy_frame( + transformed_table._lazy_frame.update(new_data.lazy()), + ) @property def is_fitted(self) -> bool: diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index f9694e6f9..262408192 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -5,9 +5,6 @@ from typing import Any from safeds.data.tabular.containers import Column, Table -from safeds.data.tabular.transformation._table_transformer import ( - InvertibleTableTransformer, -) from safeds.exceptions import ( NonNumericColumnError, TransformerNotFittedError, @@ -15,6 +12,8 @@ ValueNotPresentWhenFittedError, ) +from ._invertible_table_transformer 
import InvertibleTableTransformer + class OneHotEncoder(InvertibleTableTransformer): """ @@ -114,24 +113,15 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: if table.number_of_rows == 0: raise ValueError("The OneHotEncoder cannot be fitted because the table contains 0 rows") - if ( - table._as_table() - .keep_only_columns(column_names) - .remove_columns_with_non_numerical_values() - .number_of_columns - > 0 - ): + if table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0: warnings.warn( "The columns" - f" {table._as_table().keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain" + f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" " numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values", UserWarning, stacklevel=2, ) - data = table._data.reset_index(drop=True) - data.columns = table.column_names - result = OneHotEncoder() result._column_names = {} @@ -140,12 +130,12 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: # Keep track of number of occurrences of column names; # initially all old column names appear exactly once: - name_counter = Counter(data.columns) + name_counter = Counter(table.column_names) # Iterate through all columns to-be-changed: for column in column_names: result._column_names[column] = [] - for element in table.get_column(column).get_unique_values(): + for element in table.get_column(column).get_distinct_values(): base_name = f"{column}__{element}" name_counter[base_name] += 1 new_column_name = base_name @@ -186,7 +176,8 @@ def transform(self, table: Table) -> Table: ValueError If the table contains 0 rows. ValueNotPresentWhenFittedError - If a column in the to-be-transformed table contains a new value that was not already present in the table the OneHotEncoder was fitted on. 
+ If a column in the to-be-transformed table contains a new value that was not already present in the table + the OneHotEncoder was fitted on. """ import numpy as np @@ -243,10 +234,7 @@ def transform(self, table: Table) -> Table: # Drop old, non-encoded columns: # (Don't do this earlier - we need the old column nams for sorting, # plus we need to prevent the table from possibly having 0 columns temporarily.) - table = table.remove_columns(list(self._column_names.keys())) - - # Apply sorting and return: - return table.sort_columns(lambda col1, col2: column_names.index(col1.name) - column_names.index(col2.name)) + return table.remove_columns(list(self._column_names.keys())) def inverse_transform(self, transformed_table: Table) -> Table: """ @@ -275,8 +263,6 @@ def inverse_transform(self, transformed_table: Table) -> Table: ValueError If the table contains 0 rows. """ - import numpy as np - # Transformer has not been fitted yet if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None: raise TransformerNotFittedError @@ -290,16 +276,16 @@ def inverse_transform(self, transformed_table: Table) -> Table: if transformed_table.number_of_rows == 0: raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows") - if transformed_table._as_table().keep_only_columns( + if transformed_table.remove_columns_except( _transformed_column_names, - ).remove_columns_with_non_numerical_values().number_of_columns < len(_transformed_column_names): + ).remove_non_numeric_columns().number_of_columns < len(_transformed_column_names): raise NonNumericColumnError( str( sorted( set(_transformed_column_names) - set( - transformed_table.keep_only_columns(_transformed_column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(_transformed_column_names) + .remove_non_numeric_columns() .column_names, ), ), @@ -320,33 +306,16 @@ def inverse_transform(self, transformed_table: 
Table) -> Table: constructed_column = self._value_to_column_nans[original_column_name] for i in range(transformed_table.number_of_rows): if transformed_table.get_column(constructed_column)[i] == 1.0: - original_columns[original_column_name][i] = np.nan + original_columns[original_column_name][i] = None table = transformed_table for column_name, encoded_column in original_columns.items(): - table = table.add_column(Column(column_name, encoded_column)) - - column_names = [ - ( - name - if name not in [value for value_list in list(self._column_names.values()) for value in value_list] - else list(self._column_names.keys())[ - next( - list(self._column_names.values()).index(value) - for value in list(self._column_names.values()) - if name in value - ) - ] - ) - for name in table.column_names - ] + table = table.add_columns(Column(column_name, encoded_column)) # Drop old column names: table = table.remove_columns(list(self._value_to_column.values())) - table = table.remove_columns(list(self._value_to_column_nans.values())) - - return table.sort_columns(lambda col1, col2: column_names.index(col1.name) - column_names.index(col2.name)) + return table.remove_columns(list(self._value_to_column_nans.values())) @property def is_fitted(self) -> bool: diff --git a/src/safeds/data/tabular/transformation/_range_scaler.py b/src/safeds/data/tabular/transformation/_range_scaler.py index 1bf2cf433..14c65c332 100644 --- a/src/safeds/data/tabular/transformation/_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_range_scaler.py @@ -3,9 +3,10 @@ from typing import TYPE_CHECKING from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation._table_transformer import InvertibleTableTransformer from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError +from ._invertible_table_transformer import InvertibleTableTransformer + if TYPE_CHECKING: from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler @@ 
-16,9 +17,9 @@ class RangeScaler(InvertibleTableTransformer): Parameters ---------- - minimum: + min_: The minimum of the new range after the transformation - maximum: + max_: The maximum of the new range after the transformation Raises @@ -27,13 +28,13 @@ class RangeScaler(InvertibleTableTransformer): If the given minimum is greater or equal to the given maximum """ - def __init__(self, minimum: float = 0.0, maximum: float = 1.0): + def __init__(self, min_: float = 0.0, max_: float = 1.0): self._column_names: list[str] | None = None self._wrapped_transformer: sk_MinMaxScaler | None = None - if minimum >= maximum: + if min_ >= max_: raise ValueError('Parameter "maximum" must be higher than parameter "minimum".') - self._minimum = minimum - self._maximum = maximum + self._minimum = min_ + self._maximum = max_ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: """ @@ -75,24 +76,25 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows") if ( - table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns - < table.keep_only_columns(column_names).number_of_columns + table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(column_names).column_names) + set(table.remove_columns_except(column_names).column_names) - set( - table.keep_only_columns(column_names) - .remove_columns_with_non_numerical_values() - .column_names, + table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, ), ), ), ) wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum)) - wrapped_transformer.fit(table._data[column_names]) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + 
table.remove_columns_except(column_names)._data_frame, + ) result = RangeScaler() result._wrapped_transformer = wrapped_transformer @@ -140,26 +142,26 @@ def transform(self, table: Table) -> Table: raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") if ( - table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns - < table.keep_only_columns(self._column_names).number_of_columns + table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(self._column_names).column_names) + set(table.remove_columns_except(self._column_names).column_names) - set( - table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() - .column_names, + table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, ), ), ), ) - data = table._data.reset_index(drop=True) - data.columns = table.column_names - data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return Table._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) def inverse_transform(self, transformed_table: Table) -> Table: """ @@ -174,7 +176,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: Returns ------- - table: + original_table: The original table. 
Raises @@ -200,28 +202,37 @@ def inverse_transform(self, transformed_table: Table) -> Table: raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") if ( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() - .number_of_columns - < transformed_table.keep_only_columns(self._column_names).number_of_columns + transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < transformed_table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(transformed_table.keep_only_columns(self._column_names).column_names) + set(transformed_table.remove_columns_except(self._column_names).column_names) - set( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .column_names, ), ), ), ) - data = transformed_table._data.reset_index(drop=True) - data.columns = transformed_table.column_names - data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + import polars as pl + + new_data = pl.DataFrame( + self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ), + ) + + name_mapping = dict(zip(new_data.columns, self._column_names, strict=True)) + + new_data = new_data.rename(name_mapping) + + return Table._from_polars_data_frame( + transformed_table._data_frame.update(new_data), + ) @property def is_fitted(self) -> bool: diff --git a/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py b/src/safeds/data/tabular/transformation/_simple_imputer.py similarity index 82% rename from src/safeds/data/tabular/transformation/_experimental_simple_imputer.py rename to 
src/safeds/data/tabular/transformation/_simple_imputer.py index d75c87acf..9b7b82ea9 100644 --- a/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py +++ b/src/safeds/data/tabular/transformation/_simple_imputer.py @@ -8,16 +8,16 @@ import pandas as pd from safeds._utils import _structural_hash -from safeds.data.tabular.containers import ExperimentalTable +from safeds.data.tabular.containers import Table from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError -from ._experimental_table_transformer import ExperimentalTableTransformer +from ._table_transformer import TableTransformer if TYPE_CHECKING: from sklearn.impute import SimpleImputer as sk_SimpleImputer -class ExperimentalSimpleImputer(ExperimentalTableTransformer): +class SimpleImputer(TableTransformer): """ Replace missing values using the given strategy. @@ -31,7 +31,7 @@ class ExperimentalSimpleImputer(ExperimentalTableTransformer): Examples -------- >>> from safeds.data.tabular.containers import Column, Table - >>> from safeds.data.tabular.transformation import Imputer + >>> from safeds.data.tabular.transformation import SimpleImputer >>> >>> table = Table.from_columns( ... [ @@ -39,34 +39,28 @@ class ExperimentalSimpleImputer(ExperimentalTableTransformer): ... Column("b", [None, 2, 3]), ... ], ... ) - >>> transformer = Imputer(Imputer.Strategy.Constant(0)) + >>> transformer = SimpleImputer(SimpleImputer.Strategy.Constant(0)) >>> transformed_table = transformer.fit_and_transform(table) """ class Strategy(ABC): - """Various strategies to replace missing values. Use the static methods to create instances of this class.""" + """Various strategies to replace missing values. Use the inner classes to create instances of this class.""" @abstractmethod - def __eq__(self, other: object) -> bool: - pass # pragma: no cover + def __eq__(self, other: object) -> bool: ... 
@abstractmethod - def __hash__(self) -> int: - pass # pragma: no cover + def __hash__(self) -> int: ... @abstractmethod - def _apply(self, imputer: sk_SimpleImputer) -> None: - """ - Set the imputer strategy of the given imputer. + def __str__(self) -> str: ... - Parameters - ---------- - imputer: - The imputer to augment. - """ + @abstractmethod + def _apply(self, imputer: sk_SimpleImputer) -> None: + """Set the imputer strategy of the given imputer.""" @staticmethod - def Constant(value: Any) -> ExperimentalSimpleImputer.Strategy: # noqa: N802 + def Constant(value: Any) -> SimpleImputer.Strategy: # noqa: N802 """ Replace missing values with the given constant value. @@ -75,24 +69,24 @@ def Constant(value: Any) -> ExperimentalSimpleImputer.Strategy: # noqa: N802 value: The value to replace missing values. """ - return _Constant(value) # pragma: no cover + return _Constant(value) @staticmethod - def Mean() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 + def Mean() -> SimpleImputer.Strategy: # noqa: N802 """Replace missing values with the mean of each column.""" - return _Mean() # pragma: no cover + return _Mean() @staticmethod - def Median() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 + def Median() -> SimpleImputer.Strategy: # noqa: N802 """Replace missing values with the median of each column.""" - return _Median() # pragma: no cover + return _Median() @staticmethod - def Mode() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 + def Mode() -> SimpleImputer.Strategy: # noqa: N802 """Replace missing values with the mode of each column.""" - return _Mode() # pragma: no cover + return _Mode() - def __init__(self, strategy: ExperimentalSimpleImputer.Strategy, *, value_to_replace: float | str | None = None): + def __init__(self, strategy: SimpleImputer.Strategy, *, value_to_replace: float | str | None = None): if value_to_replace is None: value_to_replace = pd.NA @@ -103,7 +97,7 @@ def __init__(self, strategy: ExperimentalSimpleImputer.Strategy, *, 
value_to_rep self._column_names: list[str] | None = None @property - def strategy(self) -> ExperimentalSimpleImputer.Strategy: + def strategy(self) -> SimpleImputer.Strategy: """The strategy used to replace missing values.""" return self._strategy @@ -117,7 +111,7 @@ def is_fitted(self) -> bool: """Whether the transformer is fitted.""" return self._wrapped_transformer is not None - def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalSimpleImputer: + def fit(self, table: Table, column_names: list[str] | None) -> SimpleImputer: """ Learn a transformation for a set of columns in a table. @@ -142,7 +136,8 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper ValueError If the table contains 0 rows NonNumericColumnError - If the strategy is set to either Mean or Median and the specified columns of the table contain non-numerical data. + If the strategy is set to either Mean or Median and the specified columns of the table contain non-numerical + data. """ from sklearn.impute import SimpleImputer as sk_SimpleImputer @@ -193,13 +188,13 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper table.remove_columns_except(column_names)._data_frame, ) - result = ExperimentalSimpleImputer(self._strategy) + result = SimpleImputer(self._strategy) result._wrapped_transformer = wrapped_transformer result._column_names = column_names return result - def transform(self, table: ExperimentalTable) -> ExperimentalTable: + def transform(self, table: Table) -> Table: """ Apply the learned transformation to a table. 
@@ -237,7 +232,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: raise ValueError("The Imputer cannot transform the table because it contains 0 rows") new_data = self._wrapped_transformer.transform(table.remove_columns_except(self._column_names)._data_frame) - return ExperimentalTable._from_polars_lazy_frame( + return Table._from_polars_lazy_frame( table._lazy_frame.update(new_data.lazy()), ) @@ -301,7 +296,7 @@ def get_names_of_removed_columns(self) -> list[str]: # ---------------------------------------------------------------------------------------------------------------------- -class _Constant(ExperimentalSimpleImputer.Strategy): +class _Constant(SimpleImputer.Strategy): def __init__(self, value: Any): self._value = value @@ -330,7 +325,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: imputer.fill_value = self._value -class _Mean(ExperimentalSimpleImputer.Strategy): +class _Mean(SimpleImputer.Strategy): def __eq__(self, other: object) -> bool: if not isinstance(other, _Mean): return NotImplemented @@ -346,7 +341,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: imputer.strategy = "mean" -class _Median(ExperimentalSimpleImputer.Strategy): +class _Median(SimpleImputer.Strategy): def __eq__(self, other: object) -> bool: if not isinstance(other, _Median): return NotImplemented @@ -362,7 +357,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: imputer.strategy = "median" -class _Mode(ExperimentalSimpleImputer.Strategy): +class _Mode(SimpleImputer.Strategy): def __eq__(self, other: object) -> bool: if not isinstance(other, _Mode): return NotImplemented @@ -380,7 +375,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: # Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type. # This is needed for the DSL, where imputer strategies are variants of an enum. 
-ExperimentalSimpleImputer.Strategy.Constant = _Constant # type: ignore[method-assign] -ExperimentalSimpleImputer.Strategy.Mean = _Mean # type: ignore[method-assign] -ExperimentalSimpleImputer.Strategy.Median = _Median # type: ignore[method-assign] -ExperimentalSimpleImputer.Strategy.Mode = _Mode # type: ignore[method-assign] +SimpleImputer.Strategy.Constant = _Constant # type: ignore[method-assign] +SimpleImputer.Strategy.Mean = _Mean # type: ignore[method-assign] +SimpleImputer.Strategy.Median = _Median # type: ignore[method-assign] +SimpleImputer.Strategy.Mode = _Mode # type: ignore[method-assign] diff --git a/src/safeds/data/tabular/transformation/_standard_scaler.py b/src/safeds/data/tabular/transformation/_standard_scaler.py index b758f0cdd..75d7ef271 100644 --- a/src/safeds/data/tabular/transformation/_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_standard_scaler.py @@ -3,9 +3,10 @@ from typing import TYPE_CHECKING from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation._table_transformer import InvertibleTableTransformer from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError +from ._invertible_table_transformer import InvertibleTableTransformer + if TYPE_CHECKING: from sklearn.preprocessing import StandardScaler as sk_StandardScaler @@ -57,24 +58,25 @@ def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: raise ValueError("The StandardScaler cannot be fitted because the table contains 0 rows") if ( - table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns - < table.keep_only_columns(column_names).number_of_columns + table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(column_names).column_names) + 
set(table.remove_columns_except(column_names).column_names) - set( - table.keep_only_columns(column_names) - .remove_columns_with_non_numerical_values() - .column_names, + table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, ), ), ), ) wrapped_transformer = sk_StandardScaler() - wrapped_transformer.fit(table._data[column_names]) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) result = StandardScaler() result._wrapped_transformer = wrapped_transformer @@ -122,26 +124,26 @@ def transform(self, table: Table) -> Table: raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") if ( - table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns - < table.keep_only_columns(self._column_names).number_of_columns + table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(self._column_names).column_names) + set(table.remove_columns_except(self._column_names).column_names) - set( - table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() - .column_names, + table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, ), ), ), ) - data = table._data.reset_index(drop=True) - data.columns = table.column_names - data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return Table._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) def inverse_transform(self, transformed_table: Table) -> Table: """ @@ -156,7 +158,7 @@ def 
inverse_transform(self, transformed_table: Table) -> Table: Returns ------- - table: + original_table: The original table. Raises @@ -182,28 +184,28 @@ def inverse_transform(self, transformed_table: Table) -> Table: raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") if ( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() - .number_of_columns - < transformed_table.keep_only_columns(self._column_names).number_of_columns + transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < transformed_table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(transformed_table.keep_only_columns(self._column_names).column_names) + set(transformed_table.remove_columns_except(self._column_names).column_names) - set( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .column_names, ), ), ), ) - data = transformed_table._data.reset_index(drop=True) - data.columns = transformed_table.column_names - data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + return Table._from_polars_data_frame( + transformed_table._data_frame.update(new_data), + ) @property def is_fitted(self) -> bool: @@ -228,7 +230,6 @@ def get_names_of_added_columns(self) -> list[str]: raise TransformerNotFittedError return [] - # (Must implement abstract method, cannot instantiate class otherwise.) def get_names_of_changed_columns(self) -> list[str]: """ Get the names of all columns that may have been changed by the StandardScaler. 
diff --git a/src/safeds/data/tabular/transformation/_table_transformer.py b/src/safeds/data/tabular/transformation/_table_transformer.py index 277543c6a..2968df41f 100644 --- a/src/safeds/data/tabular/transformation/_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_table_transformer.py @@ -12,6 +12,10 @@ class TableTransformer(ABC): """Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns.""" + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + def __hash__(self) -> int: """ Return a deterministic hash value for a table transformer. @@ -26,11 +30,19 @@ def __hash__(self) -> int: removed = self.get_names_of_removed_columns() if self.is_fitted else [] return _structural_hash(self.__class__.__qualname__, self.is_fitted, added, changed, removed) + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + @property @abstractmethod def is_fitted(self) -> bool: """Whether the transformer is fitted.""" + # ------------------------------------------------------------------------------------------------------------------ + # Methods + # ------------------------------------------------------------------------------------------------------------------ + @abstractmethod def fit(self, table: Table, column_names: list[str] | None) -> Self: """ @@ -74,6 +86,10 @@ def transform(self, table: Table) -> Table: If the transformer has not been fitted yet. 
""" + # ------------------------------------------------------------------------------------------------------------------ + # Introspection + # ------------------------------------------------------------------------------------------------------------------ + @abstractmethod def get_names_of_added_columns(self) -> list[str]: """ @@ -122,7 +138,9 @@ def get_names_of_removed_columns(self) -> list[str]: If the transformer has not been fitted yet. """ - def fit_and_transform(self, table: Table, column_names: list[str] | None = None) -> tuple[Self, Table]: + def fit_and_transform( + self, table: Table, column_names: list[str] | None = None, + ) -> tuple[Self, Table]: """ Learn a transformation for a set of columns in a table and apply the learned transformation to the same table. @@ -137,36 +155,11 @@ def fit_and_transform(self, table: Table, column_names: list[str] | None = None) Returns ------- - fitted_transformer, transformed_table: - The fitted transformer and the transformed table.: + fitted_transformer: + The fitted transformer. + transformed_table: + The transformed table. """ fitted_transformer = self.fit(table, column_names) transformed_table = fitted_transformer.transform(table) return fitted_transformer, transformed_table - - -class InvertibleTableTransformer(TableTransformer): - """A `TableTransformer` that can also undo the learned transformation after it has been applied.""" - - @abstractmethod - def inverse_transform(self, transformed_table: Table) -> Table: - """ - Undo the learned transformation. - - The table is not modified. - - Parameters - ---------- - transformed_table: - The table to be transformed back to the original version. - - Returns - ------- - table: - The original table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- """ diff --git a/src/safeds/data/tabular/typing/__init__.py b/src/safeds/data/tabular/typing/__init__.py index e97d45e3b..92b66a5b6 100644 --- a/src/safeds/data/tabular/typing/__init__.py +++ b/src/safeds/data/tabular/typing/__init__.py @@ -5,36 +5,18 @@ import apipkg if TYPE_CHECKING: - from ._column_type import Anything, Boolean, ColumnType, Integer, Nothing, RealNumber, String - from ._experimental_data_type import ExperimentalDataType - from ._experimental_schema import ExperimentalSchema + from ._data_type import DataType from ._schema import Schema apipkg.initpkg( __name__, { - "Anything": "._column_type:Anything", - "Boolean": "._column_type:Boolean", - "ColumnType": "._column_type:ColumnType", - "ExperimentalDataType": "._experimental_data_type:ExperimentalDataType", - "ExperimentalSchema": "._experimental_schema:ExperimentalSchema", - "Integer": "._column_type:Integer", - "Nothing": "._column_type:Nothing", - "RealNumber": "._column_type:RealNumber", + "DataType": "._data_type:DataType", "Schema": "._schema:Schema", - "String": "._column_type:String", }, ) __all__ = [ - "Anything", - "Boolean", - "ColumnType", - "ExperimentalDataType", - "ExperimentalSchema", - "Integer", - "Nothing", - "RealNumber", + "DataType", "Schema", - "String", ] diff --git a/src/safeds/data/tabular/typing/_column_type.py b/src/safeds/data/tabular/typing/_column_type.py deleted file mode 100644 index f5e7a7b99..000000000 --- a/src/safeds/data/tabular/typing/_column_type.py +++ /dev/null @@ -1,377 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass -from types import NoneType -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - import pandas as pd - - -class ColumnType(ABC): - """Abstract base class for column types.""" - - _is_nullable: bool # This line is just here so the linter doesn't throw an error - - @abstractmethod - def __init__(self, is_nullable: bool = False) -> None: - """ - Abstract initializer 
for ColumnType. - - Parameters - ---------- - is_nullable: - Whether the columntype is nullable. - """ - - @staticmethod - def _data_type(data: pd.Series) -> ColumnType: - """ - Return the column type for a given `Series` from `pandas`. - - Parameters - ---------- - data: - The data to be checked. - - Returns - ------- - column_type: - The ColumnType. - - Raises - ------ - NotImplementedError - If the given data type is not supported. - """ - import numpy as np - - def column_type_of_type(cell_type: Any) -> ColumnType: - if cell_type in (int, np.int64, np.int32): - return Integer(is_nullable) - if cell_type in (float, np.float64, np.float32): - return RealNumber(is_nullable) - if cell_type == bool: - return Boolean(is_nullable) - if cell_type == str: - return String(is_nullable) - if cell_type is NoneType: - return Nothing() - else: - message = f"Unsupported numpy data type '{cell_type}'." - raise NotImplementedError(message) - - result: ColumnType = Nothing() - is_nullable = False - for cell in data: - if result == Nothing(): - result = column_type_of_type(type(cell)) - if type(cell) is NoneType: - is_nullable = True - result._is_nullable = is_nullable - if result != column_type_of_type(type(cell)): - if type(cell) is NoneType: - is_nullable = True - result._is_nullable = is_nullable - elif (isinstance(result, Integer) and isinstance(column_type_of_type(type(cell)), RealNumber)) or ( - isinstance(result, RealNumber) and isinstance(column_type_of_type(type(cell)), Integer) - ): - result = RealNumber(is_nullable) - else: - result = Anything(is_nullable) - if isinstance(cell, float) and np.isnan(cell): - is_nullable = True - result._is_nullable = is_nullable - - if isinstance(result, RealNumber) and all( - data.apply(lambda c: bool(isinstance(c, float) and np.isnan(c) or c == float(int(c)))), - ): - result = Integer(is_nullable) - - return result - - @abstractmethod - def is_nullable(self) -> bool: - """ - Return whether the given column type is nullable. 
- - Returns - ------- - is_nullable: - True if the column is nullable. - """ - - @abstractmethod - def is_numeric(self) -> bool: - """ - Return whether the given column type is numeric. - - Returns - ------- - is_numeric: - True if the column is numeric. - """ - - -@dataclass -class Anything(ColumnType): - """ - Type for a column that contains anything. - - Parameters - ---------- - is_nullable: - Whether the type also allows null values. - """ - - _is_nullable: bool - - def __init__(self, is_nullable: bool = False) -> None: - self._is_nullable = is_nullable - - def __repr__(self) -> str: - result = "Anything" - if self._is_nullable: - result += "?" - return result - - def is_nullable(self) -> bool: - """ - Return whether the given column type is nullable. - - Returns - ------- - is_nullable: - True if the column is nullable. - """ - return self._is_nullable - - def is_numeric(self) -> bool: - """ - Return whether the given column type is numeric. - - Returns - ------- - is_numeric: - True if the column is numeric. - """ - return False - - -@dataclass -class Boolean(ColumnType): - """ - Type for a column that only contains booleans. - - Parameters - ---------- - is_nullable: - Whether the type also allows null values. - """ - - _is_nullable: bool - - def __init__(self, is_nullable: bool = False) -> None: - self._is_nullable = is_nullable - - def __repr__(self) -> str: - result = "Boolean" - if self._is_nullable: - result += "?" - return result - - def is_nullable(self) -> bool: - """ - Return whether the given column type is nullable. - - Returns - ------- - is_nullable: - True if the column is nullable. - """ - return self._is_nullable - - def is_numeric(self) -> bool: - """ - Return whether the given column type is numeric. - - Returns - ------- - is_numeric: - True if the column is numeric. - """ - return False - - -@dataclass -class RealNumber(ColumnType): - """ - Type for a column that only contains real numbers. 
- - Parameters - ---------- - is_nullable: - Whether the type also allows null values. - """ - - _is_nullable: bool - - def __init__(self, is_nullable: bool = False) -> None: - self._is_nullable = is_nullable - - def __repr__(self) -> str: - result = "RealNumber" - if self._is_nullable: - result += "?" - return result - - def is_nullable(self) -> bool: - """ - Return whether the given column type is nullable. - - Returns - ------- - is_nullable: - True if the column is nullable. - """ - return self._is_nullable - - def is_numeric(self) -> bool: - """ - Return whether the given column type is numeric. - - Returns - ------- - is_numeric: - True if the column is numeric. - """ - return True - - -@dataclass -class Integer(ColumnType): - """ - Type for a column that only contains integers. - - Parameters - ---------- - is_nullable: - Whether the type also allows null values. - """ - - _is_nullable: bool - - def __init__(self, is_nullable: bool = False) -> None: - self._is_nullable = is_nullable - - def __repr__(self) -> str: - result = "Integer" - if self._is_nullable: - result += "?" - return result - - def is_nullable(self) -> bool: - """ - Return whether the given column type is nullable. - - Returns - ------- - is_nullable: - True if the column is nullable. - """ - return self._is_nullable - - def is_numeric(self) -> bool: - """ - Return whether the given column type is numeric. - - Returns - ------- - is_numeric: - True if the column is numeric. - """ - return True - - -@dataclass -class String(ColumnType): - """ - Type for a column that only contains strings. - - Parameters - ---------- - is_nullable: - Whether the type also allows null values. - """ - - _is_nullable: bool - - def __init__(self, is_nullable: bool = False) -> None: - self._is_nullable = is_nullable - - def __repr__(self) -> str: - result = "String" - if self._is_nullable: - result += "?" - return result - - def is_nullable(self) -> bool: - """ - Return whether the given column type is nullable. 
- - Returns - ------- - is_nullable: - True if the column is nullable. - """ - return self._is_nullable - - def is_numeric(self) -> bool: - """ - Return whether the given column type is numeric. - - Returns - ------- - is_numeric: - True if the column is numeric. - """ - return False - - -@dataclass -class Nothing(ColumnType): - """Type for a column that contains None Values only.""" - - _is_nullable: bool - - def __init__(self) -> None: - self._is_nullable = True - - def __repr__(self) -> str: - result = "Nothing" - if self._is_nullable: - result += "?" - return result - - def is_nullable(self) -> bool: - """ - Return whether the given column type is nullable. - - Returns - ------- - is_nullable: - True if the column is nullable. - """ - return True - - def is_numeric(self) -> bool: - """ - Return whether the given column type is numeric. - - Returns - ------- - is_numeric: - True if the column is numeric. - """ - return False diff --git a/src/safeds/data/tabular/typing/_experimental_data_type.py b/src/safeds/data/tabular/typing/_data_type.py similarity index 87% rename from src/safeds/data/tabular/typing/_experimental_data_type.py rename to src/safeds/data/tabular/typing/_data_type.py index 72a49c69f..32d60b973 100644 --- a/src/safeds/data/tabular/typing/_experimental_data_type.py +++ b/src/safeds/data/tabular/typing/_data_type.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod -class ExperimentalDataType(ABC): +class DataType(ABC): """The type of a column or cell in a table.""" # ------------------------------------------------------------------------------------------------------------------ @@ -37,8 +37,8 @@ def is_numeric(self) -> bool: Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable( + >>> from safeds.data.tabular.containers import Table + >>> table = Table( ... { ... "A": [1, 2, 3], ... 
"B": ["a", "b", "c"] @@ -60,8 +60,8 @@ def is_temporal(self) -> bool: Examples -------- >>> from datetime import datetime - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable( + >>> from safeds.data.tabular.containers import Table + >>> table = Table( ... { ... "A": [datetime.now(), datetime.now(), datetime.now()], ... "B": ["a", "b", "c"] diff --git a/src/safeds/data/tabular/typing/_experimental_schema.py b/src/safeds/data/tabular/typing/_experimental_schema.py deleted file mode 100644 index 2fb4e8f0b..000000000 --- a/src/safeds/data/tabular/typing/_experimental_schema.py +++ /dev/null @@ -1,146 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from safeds.data.tabular.typing import ExperimentalDataType - - -class ExperimentalSchema(ABC): - """The schema of a row or table.""" - - # ------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - @abstractmethod - def __eq__(self, other: object) -> bool: ... - - @abstractmethod - def __hash__(self) -> int: ... - - @abstractmethod - def __repr__(self) -> str: ... - - @abstractmethod - def __sizeof__(self) -> int: ... - - @abstractmethod - def __str__(self) -> str: ... - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - @abstractmethod - def column_names(self) -> list[str]: - """ - Return a list of all column names contained in this schema. - - Returns - ------- - column_names: - The column names. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"A": [1, 2, 3], "B": ["a", "b", "c"]}) - >>> table.schema.column_names - ['A', 'B'] - """ - - # ------------------------------------------------------------------------------------------------------------------ - # Getters - # ------------------------------------------------------------------------------------------------------------------ - - @abstractmethod - def get_column_type(self, name: str) -> ExperimentalDataType: - """ - Return the type of the given column. - - Parameters - ---------- - name: - The name of the column. - - Returns - ------- - type: - The type of the column. - - Raises - ------ - UnknownColumnNameError - If the specified column name does not exist. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"A": [1, 2, 3], "B": ["a", "b", "c"]}) - >>> type_ = table.schema.get_column_type("A") - """ - - @abstractmethod - def has_column(self, name: str) -> bool: - """ - Return whether the schema contains a given column. - - Parameters - ---------- - name: - The name of the column. - - Returns - ------- - contains: - True if the schema contains the column. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"A": [1, 2, 3], "B": ["a", "b", "c"]}) - >>> table.schema.has_column("A") - True - - >>> table.schema.has_column("C") - False - """ - - # ------------------------------------------------------------------------------------------------------------------ - # Conversion - # ------------------------------------------------------------------------------------------------------------------ - - @abstractmethod - def to_dict(self) -> dict[str, ExperimentalDataType]: - """ - Return a dictionary that maps column names to column types. 
- - Returns - ------- - data: - Dictionary representation of the schema. - - Examples - -------- - >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"A": [1, 2, 3], "B": ["a", "b", "c"]}) - >>> dict_ = table.schema.to_dict() - """ - - # ------------------------------------------------------------------------------------------------------------------ - # IPython integration - # ------------------------------------------------------------------------------------------------------------------ - - @abstractmethod - def _repr_markdown_(self) -> str: - """ - Return a Markdown representation of the schema for IPython. - - Returns - ------- - markdown: - The generated Markdown. - """ diff --git a/src/safeds/data/tabular/typing/_experimental_polars_data_type.py b/src/safeds/data/tabular/typing/_polars_data_type.py similarity index 93% rename from src/safeds/data/tabular/typing/_experimental_polars_data_type.py rename to src/safeds/data/tabular/typing/_polars_data_type.py index c4ac895ce..724998e5d 100644 --- a/src/safeds/data/tabular/typing/_experimental_polars_data_type.py +++ b/src/safeds/data/tabular/typing/_polars_data_type.py @@ -2,13 +2,13 @@ from typing import TYPE_CHECKING -from ._experimental_data_type import ExperimentalDataType +from ._data_type import DataType if TYPE_CHECKING: import polars as pl -class _PolarsDataType(ExperimentalDataType): +class _PolarsDataType(DataType): """ The type of a column or cell in a table. 
diff --git a/src/safeds/data/tabular/typing/_experimental_polars_schema.py b/src/safeds/data/tabular/typing/_polars_schema.py similarity index 90% rename from src/safeds/data/tabular/typing/_experimental_polars_schema.py rename to src/safeds/data/tabular/typing/_polars_schema.py index e7efcfd74..f7b339d16 100644 --- a/src/safeds/data/tabular/typing/_experimental_polars_schema.py +++ b/src/safeds/data/tabular/typing/_polars_schema.py @@ -6,16 +6,16 @@ from safeds._utils import _structural_hash from safeds.exceptions import UnknownColumnNameError -from ._experimental_polars_data_type import _PolarsDataType -from ._experimental_schema import ExperimentalSchema +from ._polars_data_type import _PolarsDataType +from ._schema import Schema if TYPE_CHECKING: import polars as pl - from safeds.data.tabular.typing import ExperimentalDataType + from safeds.data.tabular.typing import DataType -class _PolarsSchema(ExperimentalSchema): +class _PolarsSchema(Schema): """ The schema of a row or table. @@ -72,7 +72,7 @@ def column_names(self) -> list[str]: # Getters # ------------------------------------------------------------------------------------------------------------------ - def get_column_type(self, name: str) -> ExperimentalDataType: + def get_column_type(self, name: str) -> DataType: if not self.has_column(name): raise UnknownColumnNameError([name]) return _PolarsDataType(self._schema[name]) @@ -84,7 +84,7 @@ def has_column(self, name: str) -> bool: # Conversion # ------------------------------------------------------------------------------------------------------------------ - def to_dict(self) -> dict[str, ExperimentalDataType]: + def to_dict(self) -> dict[str, DataType]: return {name: _PolarsDataType(type_) for name, type_ in self._schema.items()} # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py index 
557996a28..c3c70bda4 100644 --- a/src/safeds/data/tabular/typing/_schema.py +++ b/src/safeds/data/tabular/typing/_schema.py @@ -1,152 +1,43 @@ from __future__ import annotations -import sys -from dataclasses import dataclass +from abc import ABC, abstractmethod from typing import TYPE_CHECKING -from safeds._utils import _structural_hash -from safeds.data.tabular.typing import Anything, Integer, Nothing, RealNumber -from safeds.data.tabular.typing._column_type import ColumnType -from safeds.exceptions import UnknownColumnNameError - if TYPE_CHECKING: - import pandas as pd - - -@dataclass -class Schema: - """ - Store column names and corresponding data types for a `Table` or `Row`. - - Parameters - ---------- - schema: - Map from column names to data types. - - Examples - -------- - >>> from safeds.data.tabular.typing import Integer, Schema, String - >>> schema = Schema({"A": Integer(), "B": String()}) - """ - - _schema: dict[str, ColumnType] - - # ------------------------------------------------------------------------------------------------------------------ - # Creation - # ------------------------------------------------------------------------------------------------------------------ - - @staticmethod - def _from_pandas_dataframe(dataframe: pd.DataFrame) -> Schema: - """ - Create a schema from a `pandas.DataFrame`. - - Parameters - ---------- - dataframe: - The dataframe. + from ._data_type import DataType - Returns - ------- - schema: - The schema. 
- """ - names = dataframe.columns - # noinspection PyProtectedMember - types = [] - for col in dataframe: - types.append(ColumnType._data_type(dataframe[col])) - return Schema(dict(zip(names, types, strict=True))) +class Schema(ABC): + """The schema of a row or table.""" # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, schema: dict[str, ColumnType]): - self._schema = dict(schema) # Defensive copy + @abstractmethod + def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: - """ - Return a deterministic hash value for the schema. + @abstractmethod + def __hash__(self) -> int: ... - Returns - ------- - hash: - The hash value. + @abstractmethod + def __repr__(self) -> str: ... - Examples - -------- - >>> from safeds.data.tabular.typing import Integer, Schema, String - >>> schema = Schema({"A": Integer(), "B": String()}) - >>> hash_value = hash(schema) - """ - column_names = self._schema.keys() - column_types = map(repr, self._schema.values()) - return _structural_hash(str(tuple(zip(column_names, column_types, strict=True)))) + @abstractmethod + def __sizeof__(self) -> int: ... - def __repr__(self) -> str: - """ - Return an unambiguous string representation of this row. + @abstractmethod + def __str__(self) -> str: ... - Returns - ------- - representation: - The string representation. - - Examples - -------- - >>> from safeds.data.tabular.typing import Integer, Schema, String - >>> schema = Schema({"A": Integer()}) - >>> repr(schema) - "Schema({'A': Integer})" - """ - return f"Schema({self!s})" - - def __sizeof__(self) -> int: - """ - Return the complete size of this object. - - Returns - ------- - size: - Size of this object in bytes. 
- """ - return ( - sum(map(sys.getsizeof, self._schema.keys())) - + sum(map(sys.getsizeof, self._schema.values())) - + sys.getsizeof(self._schema) - ) - - def __str__(self) -> str: - """ - Return a user-friendly string representation of the schema. - - Returns - ------- - string: - The string representation. - - Examples - -------- - >>> from safeds.data.tabular.typing import Integer, Schema, String - >>> schema = Schema({"A": Integer()}) - >>> str(schema) - "{'A': Integer}" - """ - match len(self._schema): - case 0: - return "{}" - case 1: - return str(self._schema) - case _: - lines = (f" {name!r}: {type_}" for name, type_ in self._schema.items()) - joined = ",\n".join(lines) - return f"{{\n{joined}\n}}" + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ @property + @abstractmethod def column_names(self) -> list[str]: """ - Return a list of all column names saved in this schema. + Return a list of all column names contained in this schema. 
Returns ------- @@ -155,74 +46,75 @@ def column_names(self) -> list[str]: Examples -------- - >>> from safeds.data.tabular.typing import Integer, Schema, String - >>> schema = Schema({"A": Integer(), "B": String()}) - >>> schema.column_names + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"A": [1, 2, 3], "B": ["a", "b", "c"]}) + >>> table.schema.column_names ['A', 'B'] """ - return list(self._schema.keys()) - def has_column(self, column_name: str) -> bool: + # ------------------------------------------------------------------------------------------------------------------ + # Getters + # ------------------------------------------------------------------------------------------------------------------ + + @abstractmethod + def get_column_type(self, name: str) -> DataType: """ - Return whether the schema contains a given column. + Return the type of the given column. Parameters ---------- - column_name: + name: The name of the column. Returns ------- - contains: - True if the schema contains the column. + type: + The type of the column. + + Raises + ------ + UnknownColumnNameError + If the specified column name does not exist. Examples -------- - >>> from safeds.data.tabular.typing import Integer, Schema, String - >>> schema = Schema({"A": Integer(), "B": String()}) - >>> schema.has_column("A") - True - - >>> schema.has_column("C") - False + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"A": [1, 2, 3], "B": ["a", "b", "c"]}) + >>> type_ = table.schema.get_column_type("A") """ - return column_name in self._schema - def get_column_type(self, column_name: str) -> ColumnType: + @abstractmethod + def has_column(self, name: str) -> bool: """ - Return the type of the given column. + Return whether the schema contains a given column. Parameters ---------- - column_name: + name: The name of the column. Returns ------- - type: - The type of the column. 
- - Raises - ------ - UnknownColumnNameError - If the specified column name does not exist. + contains: + True if the schema contains the column. Examples -------- - >>> from safeds.data.tabular.typing import Integer, Schema, String - >>> schema = Schema({"A": Integer(), "B": String()}) - >>> schema.get_column_type("A") - Integer + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"A": [1, 2, 3], "B": ["a", "b", "c"]}) + >>> table.schema.has_column("A") + True + + >>> table.schema.has_column("C") + False """ - if not self.has_column(column_name): - raise UnknownColumnNameError([column_name]) - return self._schema[column_name] # ------------------------------------------------------------------------------------------------------------------ # Conversion # ------------------------------------------------------------------------------------------------------------------ - def to_dict(self) -> dict[str, ColumnType]: + @abstractmethod + def to_dict(self) -> dict[str, DataType]: """ Return a dictionary that maps column names to column types. @@ -233,92 +125,22 @@ def to_dict(self) -> dict[str, ColumnType]: Examples -------- - >>> from safeds.data.tabular.typing import Integer, Schema, String - >>> schema = Schema({"A": Integer(), "B": String()}) - >>> schema.to_dict() - {'A': Integer, 'B': String} - """ - return dict(self._schema) # defensive copy - - @staticmethod - def _merge_multiple_schemas(schemas: list[Schema]) -> Schema: + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"A": [1, 2, 3], "B": ["a", "b", "c"]}) + >>> dict_ = table.schema.to_dict() """ - Merge multiple schemas into one. - - For each type missmatch the new schema will have the least common supertype. 
- - The type hierarchy is as follows: - * Anything - * RealNumber - * Integer - * Boolean - * String - - Parameters - ---------- - schemas: - the list of schemas you want to merge - - Returns - ------- - schema: - the new merged schema - - Raises - ------ - UnknownColumnNameError - if not all schemas have the same column names - """ - schema_dict = schemas[0]._schema - missing_col_names = set() - for schema in schemas: - missing_col_names.update(set(schema.column_names) - set(schema_dict.keys())) - if len(missing_col_names) > 0: - raise UnknownColumnNameError(list(missing_col_names)) - for schema in schemas: - if schema_dict != schema._schema: - for col_name in schema_dict: - nullable = False - if schema_dict[col_name].is_nullable() or schema.get_column_type(col_name).is_nullable(): - nullable = True - if isinstance(schema_dict[col_name], type(schema.get_column_type(col_name))): - if schema.get_column_type(col_name).is_nullable() and not schema_dict[col_name].is_nullable(): - schema_dict[col_name] = type(schema.get_column_type(col_name))(nullable) - continue - if ( - isinstance(schema_dict[col_name], RealNumber) - and isinstance(schema.get_column_type(col_name), Integer) - ) or ( - isinstance(schema_dict[col_name], Integer) - and isinstance(schema.get_column_type(col_name), RealNumber) - ): - schema_dict[col_name] = RealNumber(nullable) - continue - if isinstance(schema_dict[col_name], Nothing): - schema_dict[col_name] = type(schema.get_column_type(col_name))(nullable) - continue - if isinstance(schema.get_column_type(col_name), Nothing): - schema_dict[col_name] = type(schema_dict[col_name])(nullable) - continue - schema_dict[col_name] = Anything(nullable) - return Schema(schema_dict) # ------------------------------------------------------------------------------------------------------------------ - # IPython Integration + # IPython integration # ------------------------------------------------------------------------------------------------------------------ + 
@abstractmethod def _repr_markdown_(self) -> str: """ - Return a Markdown representation of the schema. + Return a Markdown representation of the schema for IPython. Returns ------- markdown: - The Markdown representation. + The generated Markdown. """ - if len(self._schema) == 0: - return "Empty Schema" - - lines = (f"| {name} | {type_} |" for name, type_ in self._schema.items()) - joined = "\n".join(lines) - return f"| Column Name | Column Type |\n| --- | --- |\n{joined}" diff --git a/src/safeds/exceptions/_generic.py b/src/safeds/exceptions/_generic.py index 990c99d9f..b67744328 100644 --- a/src/safeds/exceptions/_generic.py +++ b/src/safeds/exceptions/_generic.py @@ -44,11 +44,11 @@ def __init__( Raises ------ ValueError - * If one of the given Bounds is +/-inf. (For infinite Bounds, pass None instead.) - * If one of the given Bounds is nan. - * If upper_bound < lower_bound. - * If actual does not lie outside the given interval. - * If actual is not a real number. + - If one of the given Bounds is +/-inf. (For infinite Bounds, pass None instead.) + - If one of the given Bounds is nan. + - If upper_bound < lower_bound. + - If actual does not lie outside the given interval. + - If actual is not a real number. """ from numpy import isinf, isnan diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index 5b920153b..d443216f7 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -36,7 +36,7 @@ def __init__(self, reason: str): super().__init__(f"Error occurred while learning: {reason}") -class ModelNotFittedError(Exception): +class ModelNotFittedError(RuntimeError): """Raised when a model is used before fitting it.""" def __init__(self) -> None: diff --git a/src/safeds/ml/classical/__init__.py b/src/safeds/ml/classical/__init__.py index 720ab83bb..4584d8106 100644 --- a/src/safeds/ml/classical/__init__.py +++ b/src/safeds/ml/classical/__init__.py @@ -1 +1,19 @@ """Classes for classical machine learning, i.e. 
machine learning that does not use neural networks.""" + +from typing import TYPE_CHECKING + +import apipkg + +if TYPE_CHECKING: + from ._supervised_model import SupervisedModel + +apipkg.initpkg( + __name__, + { + "SupervisedModel": "._supervised_model:SupervisedModel", + }, +) + +__all__ = [ + "SupervisedModel", +] diff --git a/src/safeds/ml/classical/_bases/__init__.py b/src/safeds/ml/classical/_bases/__init__.py new file mode 100644 index 000000000..77c3cacbd --- /dev/null +++ b/src/safeds/ml/classical/_bases/__init__.py @@ -0,0 +1,34 @@ +"""Base classes for classical machine learning models.""" + +from typing import TYPE_CHECKING + +import apipkg + +if TYPE_CHECKING: + from ._ada_boost_base import _AdaBoostBase + from ._decision_tree_base import _DecisionTreeBase + from ._gradient_boosting_base import _GradientBoostingBase + from ._k_nearest_neighbors_base import _KNearestNeighborsBase + from ._random_forest_base import _RandomForestBase + from ._support_vector_machine_base import _SupportVectorMachineBase + +apipkg.initpkg( + __name__, + { + "_AdaBoostBase": "._ada_boost_base:_AdaBoostBase", + "_DecisionTreeBase": "._decision_tree_base:_DecisionTreeBase", + "_GradientBoostingBase": "._gradient_boosting_base:_GradientBoostingBase", + "_KNearestNeighborsBase": "._k_nearest_neighbors_base:_KNearestNeighborsBase", + "_RandomForestBase": "._random_forest_base:_RandomForestBase", + "_SupportVectorMachineBase": "._support_vector_machine_base:_SupportVectorMachineBase", + }, +) + +__all__ = [ + "_AdaBoostBase", + "_DecisionTreeBase", + "_GradientBoostingBase", + "_KNearestNeighborsBase", + "_RandomForestBase", + "_SupportVectorMachineBase", +] diff --git a/src/safeds/ml/classical/_bases/_ada_boost_base.py b/src/safeds/ml/classical/_bases/_ada_boost_base.py new file mode 100644 index 000000000..d4205696d --- /dev/null +++ b/src/safeds/ml/classical/_bases/_ada_boost_base.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod 
+from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError + +if TYPE_CHECKING: + from safeds.ml.classical import SupervisedModel + + +class _AdaBoostBase(ABC): + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + @abstractmethod + def __init__( + self, + maximum_number_of_learners: int, + learning_rate: float, + ) -> None: + # Validation + if maximum_number_of_learners < 1: + raise OutOfBoundsError( + maximum_number_of_learners, + name="maximum_number_of_learners", + lower_bound=ClosedBound(1), + ) + if learning_rate <= 0: + raise OutOfBoundsError(learning_rate, name="learning_rate", lower_bound=OpenBound(0)) + + # Hyperparameters + self._maximum_number_of_learners: int = maximum_number_of_learners + self._learning_rate: float = learning_rate + + def __hash__(self) -> int: + return _structural_hash( + self._maximum_number_of_learners, + self._learning_rate, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def maximum_number_of_learners(self) -> int: + """The maximum number of learners in the ensemble.""" + return self._maximum_number_of_learners + + @property + def learning_rate(self) -> float: + """The learning rate.""" + return self._learning_rate + + @property + @abstractmethod + def learner(self) -> SupervisedModel | None: + """The base learner used for training the ensemble.""" diff --git a/src/safeds/ml/classical/_bases/_decision_tree_base.py b/src/safeds/ml/classical/_bases/_decision_tree_base.py new file mode 100644 index 
000000000..f26f30750 --- /dev/null +++ b/src/safeds/ml/classical/_bases/_decision_tree_base.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OutOfBoundsError + + +class _DecisionTreeBase(ABC): + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + @abstractmethod + def __init__( + self, + maximum_depth: int | None, + minimum_number_of_samples_in_leaves: int, + ) -> None: + # Validation + if maximum_depth is not None and maximum_depth < 1: + raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1)) + if minimum_number_of_samples_in_leaves < 1: + raise OutOfBoundsError( + minimum_number_of_samples_in_leaves, + name="minimum_number_of_samples_in_leaves", + lower_bound=ClosedBound(1), + ) + + # Hyperparameters + self._maximum_depth: int | None = maximum_depth + self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves + + def __hash__(self) -> int: + return _structural_hash( + self._maximum_depth, + self._minimum_number_of_samples_in_leaves, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def maximum_depth(self) -> int | None: + """The maximum depth of the tree.""" + return self._maximum_depth + + @property + def minimum_number_of_samples_in_leaves(self) -> int: + """The minimum number of samples that must remain in the leaves of the tree.""" + return self._minimum_number_of_samples_in_leaves diff --git 
a/src/safeds/ml/classical/_bases/_gradient_boosting_base.py b/src/safeds/ml/classical/_bases/_gradient_boosting_base.py new file mode 100644 index 000000000..0750fb207 --- /dev/null +++ b/src/safeds/ml/classical/_bases/_gradient_boosting_base.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError + + +class _GradientBoostingBase(ABC): + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + @abstractmethod + def __init__( + self, + number_of_trees: int, + learning_rate: float, + ) -> None: + # Validation + if number_of_trees < 1: + raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1)) + if learning_rate <= 0: + raise OutOfBoundsError(learning_rate, name="learning_rate", lower_bound=OpenBound(0)) + + # Hyperparameters + self._number_of_trees = number_of_trees + self._learning_rate = learning_rate + + def __hash__(self) -> int: + return _structural_hash( + self._number_of_trees, + self._learning_rate, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def number_of_trees(self) -> int: + """The number of trees (estimators) in the ensemble.""" + return self._number_of_trees + + @property + def learning_rate(self) -> float: + """The learning rate.""" + return self._learning_rate diff --git a/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py b/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py new file mode 100644 index 
000000000..307d16f27 --- /dev/null +++ b/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OutOfBoundsError + + +class _KNearestNeighborsBase(ABC): + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + @abstractmethod + def __init__( + self, + number_of_neighbors: int, + ) -> None: + # Validation + if number_of_neighbors < 1: + raise OutOfBoundsError(number_of_neighbors, name="number_of_neighbors", lower_bound=ClosedBound(1)) + + # Hyperparameters + self._number_of_neighbors = number_of_neighbors + + def __hash__(self) -> int: + return _structural_hash( + self._number_of_neighbors, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def number_of_neighbors(self) -> int: + """The number of neighbors used for interpolation.""" + return self._number_of_neighbors diff --git a/src/safeds/ml/classical/_bases/_random_forest_base.py b/src/safeds/ml/classical/_bases/_random_forest_base.py new file mode 100644 index 000000000..6c6c9b7c2 --- /dev/null +++ b/src/safeds/ml/classical/_bases/_random_forest_base.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OutOfBoundsError + + +class _RandomForestBase(ABC): + + # ------------------------------------------------------------------------------------------------------------------ + # 
Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + @abstractmethod + def __init__( + self, + number_of_trees: int, + maximum_depth: int | None, + minimum_number_of_samples_in_leaves: int, + ) -> None: + # Validation + if number_of_trees < 1: + raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1)) + if maximum_depth is not None and maximum_depth < 1: + raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1)) + if minimum_number_of_samples_in_leaves < 1: + raise OutOfBoundsError( + minimum_number_of_samples_in_leaves, + name="minimum_number_of_samples_in_leaves", + lower_bound=ClosedBound(1), + ) + + # Hyperparameters + self._number_of_trees: int = number_of_trees + self._maximum_depth: int | None = maximum_depth + self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves + + def __hash__(self) -> int: + return _structural_hash( + self._number_of_trees, + self._maximum_depth, + self._minimum_number_of_samples_in_leaves, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def number_of_trees(self) -> int: + """The number of trees used in the random forest.""" + return self._number_of_trees + + @property + def maximum_depth(self) -> int | None: + """The maximum depth of each tree.""" + return self._maximum_depth + + @property + def minimum_number_of_samples_in_leaves(self) -> int: + """The minimum number of samples that must remain in the leaves of each tree.""" + return self._minimum_number_of_samples_in_leaves diff --git a/src/safeds/ml/classical/_bases/_support_vector_machine_base.py b/src/safeds/ml/classical/_bases/_support_vector_machine_base.py new file mode 100644 
index 000000000..e06dfd074 --- /dev/null +++ b/src/safeds/ml/classical/_bases/_support_vector_machine_base.py @@ -0,0 +1,245 @@ +from __future__ import annotations + +import sys +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError + +if TYPE_CHECKING: + from sklearn.svm import SVC as SklearnSVC # noqa: N811 + from sklearn.svm import SVR as SklearnSVR # noqa: N811 + + +class _SupportVectorMachineBase(ABC): + # ------------------------------------------------------------------------------------------------------------------ + # Inner classes + # ------------------------------------------------------------------------------------------------------------------ + + class Kernel(ABC): + """ + Possible kernels for the support vector machine. + + Use the static factory methods to create instances of this class. + """ + + @abstractmethod + def __eq__(self, other: object) -> bool: ... + + @abstractmethod + def __hash__(self) -> int: ... + + @abstractmethod + def __str__(self) -> str: ... + + @abstractmethod + def _apply(self, model: SklearnSVC | SklearnSVR) -> None: + """Set the kernel of the given model.""" + + @staticmethod + def linear() -> _SupportVectorMachineBase.Kernel: + """Create a linear kernel.""" + raise NotImplementedError # pragma: no cover + + @staticmethod + def polynomial(degree: int) -> _SupportVectorMachineBase.Kernel: + """ + Create a polynomial kernel. + + Parameters + ---------- + degree: + The degree of the polynomial kernel. Must be greater than 0. + + Raises + ------ + ValueError + If `degree` is not greater than 0. 
+ """ + raise NotImplementedError # pragma: no cover + + @staticmethod + def radial_basis_function() -> _SupportVectorMachineBase.Kernel: + """Create a radial basis function kernel.""" + raise NotImplementedError # pragma: no cover + + @staticmethod + def sigmoid() -> _SupportVectorMachineBase.Kernel: + """Create a sigmoid kernel.""" + raise NotImplementedError # pragma: no cover + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + @abstractmethod + def __init__( + self, + c: float, + kernel: _SupportVectorMachineBase.Kernel | None, + ) -> None: + # Validation + if c <= 0: + raise OutOfBoundsError(c, name="c", lower_bound=OpenBound(0)) + if kernel is None: + kernel = _SupportVectorMachineBase.Kernel.radial_basis_function() + + # Hyperparameters + self._c: float = c + self._kernel: _SupportVectorMachineBase.Kernel = kernel + + def __hash__(self) -> int: + return _structural_hash( + self._c, + self.kernel, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def c(self) -> float: + """The regularization strength.""" + return self._c + + # This property is abstract, so subclasses must declare a public return type. 
+ @property + @abstractmethod + def kernel(self) -> _SupportVectorMachineBase.Kernel: + """The type of kernel used.""" + + +# ---------------------------------------------------------------------------------------------------------------------- +# Kernels +# ---------------------------------------------------------------------------------------------------------------------- + +class _Linear(_SupportVectorMachineBase.Kernel): + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Linear): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(self.__class__.__qualname__) + + def __str__(self) -> str: + return "Linear" + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _apply(self, model: SklearnSVC) -> None: + model.kernel = "linear" + + +class _Polynomial(_SupportVectorMachineBase.Kernel): + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, degree: int): + if degree < 1: + raise OutOfBoundsError(degree, name="degree", lower_bound=ClosedBound(1)) + + self._degree = degree + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Polynomial): + return NotImplemented + return self._degree == other._degree + + def __hash__(self) -> int: + return _structural_hash( + self.__class__.__qualname__, + 
self._degree, + ) + + def __sizeof__(self) -> int: + return sys.getsizeof(self._degree) + + def __str__(self) -> str: + return f"Polynomial(degree={self._degree})" + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def degree(self) -> int: + """The degree of the polynomial kernel.""" + return self._degree + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _apply(self, model: SklearnSVC) -> None: + model.kernel = "poly" + model.degree = self._degree + + +class _RadialBasisFunction(_SupportVectorMachineBase.Kernel): + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _RadialBasisFunction): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(self.__class__.__qualname__) + + def __str__(self) -> str: + return "RadialBasisFunction" + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _apply(self, model: SklearnSVC) -> None: + model.kernel = "rbf" + + +class _Sigmoid(_SupportVectorMachineBase.Kernel): + + # 
------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Sigmoid): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(self.__class__.__qualname__) + + def __str__(self) -> str: + return "Sigmoid" + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _apply(self, model: SklearnSVC) -> None: + model.kernel = "sigmoid" + + +# Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type. +# This is needed for the DSL, where SVM kernels are variants of an enum. 
+_SupportVectorMachineBase.Kernel.linear = _Linear # type: ignore[method-assign] +_SupportVectorMachineBase.Kernel.polynomial = _Polynomial # type: ignore[method-assign] +_SupportVectorMachineBase.Kernel.radial_basis_function = _RadialBasisFunction # type: ignore[method-assign] +_SupportVectorMachineBase.Kernel.sigmoid = _Sigmoid # type: ignore[method-assign] diff --git a/src/safeds/ml/classical/_supervised_model.py b/src/safeds/ml/classical/_supervised_model.py new file mode 100644 index 000000000..8fc7e0ae4 --- /dev/null +++ b/src/safeds/ml/classical/_supervised_model.py @@ -0,0 +1,431 @@ +from __future__ import annotations + +import warnings +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Self + +from safeds._utils import _structural_hash +from safeds.data.labeled.containers import TabularDataset +from safeds.data.tabular.containers import Column, Table +from safeds.exceptions import ( + DatasetMissesDataError, + DatasetMissesFeaturesError, + LearningError, + MissingValuesColumnError, + ModelNotFittedError, + NonNumericColumnError, + PlainTableError, + PredictionError, +) + +if TYPE_CHECKING: + from sklearn.base import ClassifierMixin, RegressorMixin + + from safeds.data.tabular.typing import DataType, Schema + + +class SupervisedModel(ABC): + """A model for supervised learning tasks.""" + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + # The decorator is needed so the class really cannot be instantiated + @abstractmethod + def __init__(self) -> None: + self._feature_schema: Schema | None = None + self._target_name: str | None = None + self._target_type: DataType | None = None + self._wrapped_model: ClassifierMixin | RegressorMixin | None = None + + # The decorator ensures that the method is overridden in all subclasses + 
@abstractmethod + def __hash__(self) -> int: + return _structural_hash( + self.__class__.__qualname__, + self._feature_schema, + self._target_name, + self._target_type, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def is_fitted(self) -> bool: + """Whether the model is fitted.""" + return None not in (self._feature_schema, self._target_name, self._target_type, self._wrapped_model) + + # ------------------------------------------------------------------------------------------------------------------ + # Machine learning + # ------------------------------------------------------------------------------------------------------------------ + + def fit(self, training_set: TabularDataset) -> Self: + """ + Create a copy of this model and fit it with the given training data. + + **Note:** This model is not modified. + + Parameters + ---------- + training_set: + The training data containing the features and target. + + Returns + ------- + fitted_model: + The fitted model. + + Raises + ------ + LearningError + If the training data contains invalid values or if the training failed. + """ + self._check_additional_fit_preconditions(training_set) + + wrapped_model = self._get_sklearn_model() + _fit_sklearn_model_in_place(wrapped_model, training_set) + + result = self._clone() + result._feature_schema = training_set.features.schema + result._target_name = training_set.target.name + result._target_type = training_set.target.type + result._wrapped_model = wrapped_model + + return result + + def predict( + self, + dataset: Table | TabularDataset, + ) -> TabularDataset: + """ + Predict the target values on the given dataset. + + **Note:** The model must be fitted. + + Parameters + ---------- + dataset: + The dataset containing at least the features. 
+ + Returns + ------- + prediction: + The given dataset with an additional column for the predicted target values. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet. + DatasetMissesFeaturesError + If the dataset misses feature columns. + PredictionError + If predicting with the given dataset failed. + """ + self._check_additional_predict_preconditions(dataset) + + return _predict_with_sklearn_model( + self._wrapped_model, + dataset, + self.get_feature_names(), + self.get_target_name(), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Introspection + # ------------------------------------------------------------------------------------------------------------------ + + def get_feature_names(self) -> list[str]: + """ + Return the names of the feature columns. + + **Note:** The model must be fitted. + + Returns + ------- + feature_names: + The names of the feature columns. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet. + """ + # Used in favor of is_fitted, so the type checker is happy + if self._feature_schema is None: + raise ModelNotFittedError + + return self._feature_schema.column_names + + def get_features_schema(self) -> Schema: + """ + Return the schema of the feature columns. + + **Note:** The model must be fitted. + + Returns + ------- + feature_schema: + The schema of the feature columns. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet. + """ + # Used in favor of is_fitted, so the type checker is happy + if self._feature_schema is None: + raise ModelNotFittedError + + return self._feature_schema + + def get_target_name(self) -> str: + """ + Return the name of the target column. + + **Note:** The model must be fitted. + + Returns + ------- + target_name: + The name of the target column. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet. 
+ """ + # Used in favor of is_fitted, so the type checker is happy + if self._target_name is None: + raise ModelNotFittedError + + return self._target_name + + def get_target_type(self) -> DataType: + """ + Return the type of the target column. + + **Note:** The model must be fitted. + + Returns + ------- + target_type: + The type of the target column. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet. + """ + # Used in favor of is_fitted, so the type checker is happy + if self._target_type is None: + raise ModelNotFittedError + + return self._target_type + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _check_additional_fit_preconditions(self, training_set: TabularDataset): # noqa: B027 + """ + Check additional preconditions for fitting the model and raise an error if any are violated. + + Parameters + ---------- + training_set: + The training data containing the features and target. + """ + + def _check_additional_predict_preconditions(self, dataset: Table | TabularDataset): # noqa: B027 + """ + Check additional preconditions for predicting with the model and raise an error if any are violated. + + Parameters + ---------- + dataset: + The dataset containing at least the features. + """ + + @abstractmethod + def _clone(self) -> Self: + """ + Return a new instance of this model with the same hyperparameters. + + Returns + ------- + clone: + A new instance of this model. + """ + + @abstractmethod + def _get_sklearn_model(self) -> ClassifierMixin | RegressorMixin: + """ + Return a new scikit-learn model that implements the algorithm of this model. + + Returns + ------- + sklearn_model: + The scikit-learn model. 
+ """ + + +def _fit_sklearn_model_in_place(model: Any, tabular_dataset: TabularDataset) -> None: + """ + Fit a scikit-learn model in-place on the given tabular dataset. + + Parameters + ---------- + model: + Classifier or Regression from scikit-learn. + tabular_dataset: + The tabular dataset containing the feature and target vectors. + + Raises + ------ + LearningError + If the tabular dataset contains invalid values or if the training failed. + TypeError + If a table is passed instead of a tabular dataset. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. + """ + if not isinstance(tabular_dataset, TabularDataset) and isinstance(tabular_dataset, Table): + raise PlainTableError + + if tabular_dataset._table.number_of_rows == 0: + raise DatasetMissesDataError + + non_numerical_column_names = set(tabular_dataset.features.column_names) - set( + tabular_dataset.features.remove_non_numeric_columns().column_names, + ) + if len(non_numerical_column_names) != 0: + raise NonNumericColumnError( + str(non_numerical_column_names), + "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + " data.\nThe OneHotEncoder should be used if you work with nominal data. 
If your data contains too many" + " different values\nor is ordinal, you should use the LabelEncoder.", + ) + + null_containing_column_names = set(tabular_dataset.features.column_names) - set( + tabular_dataset.features.remove_columns_with_missing_values().column_names, + ) + if len(null_containing_column_names) != 0: + raise MissingValuesColumnError( + str(null_containing_column_names), + "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", + ) + + try: + model.fit( + tabular_dataset.features._data_frame, + tabular_dataset.target._series, + ) + except ValueError as exception: + raise LearningError(str(exception)) from exception + + +def _predict_with_sklearn_model( + model: Any, + dataset: Table | TabularDataset, + feature_names: list[str] | None, + target_name: str | None, +) -> TabularDataset: + """ + Predict a target vector using a dataset containing feature vectors. The model has to be trained first. + + Parameters + ---------- + model: + Classifier or regressor from scikit-learn. + dataset: + The dataset containing the features. + target_name: + The name of the target column. + feature_names: + The names of the feature columns. + + Returns + ------- + table: + A dataset containing the given features and the predicted target. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet. + DatasetMissesFeaturesError + If the dataset misses feature columns. + PredictionError + If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
+ """ + # Validation + if model is None or target_name is None or feature_names is None: + raise ModelNotFittedError + if isinstance(dataset, TabularDataset): # pragma: no cover + dataset = dataset.features + + missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] + if missing_feature_names: + raise DatasetMissesFeaturesError(missing_feature_names) + + if dataset.number_of_rows == 0: + raise DatasetMissesDataError + + non_numerical_column_names = set(dataset.column_names) - set( + dataset.remove_non_numeric_columns().column_names, + ) + if len(non_numerical_column_names) != 0: + raise NonNumericColumnError( + str(non_numerical_column_names), + "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many" + " different values\nor is ordinal, you should use the LabelEncoder.\n", + ) + + null_containing_column_names = set(dataset.column_names) - set( + dataset.remove_columns_with_missing_values().column_names, + ) + if len(null_containing_column_names) != 0: + raise MissingValuesColumnError( + str(null_containing_column_names), + "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", + ) + + dataset_df = dataset.remove_columns_except(feature_names) + + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="X does not have valid feature names") + predicted_target_vector = model.predict(dataset_df._data_frame) + output = dataset.remove_columns(target_name).add_columns( + Column(target_name, predicted_target_vector), + ) + + extra_names = [ + column_name + for column_name in dataset.column_names + if column_name != target_name and column_name not in feature_names + ] + + return TabularDataset( 
+ output, + target_name=target_name, + extra_names=extra_names, + ) + except ValueError as exception: + raise PredictionError(str(exception)) from exception diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py deleted file mode 100644 index df8ec362a..000000000 --- a/src/safeds/ml/classical/_util_sklearn.py +++ /dev/null @@ -1,240 +0,0 @@ -import warnings -from typing import Any - -from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset -from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable, Table -from safeds.exceptions import ( - DatasetMissesDataError, - DatasetMissesFeaturesError, - LearningError, - MissingValuesColumnError, - ModelNotFittedError, - NonNumericColumnError, - PlainTableError, - PredictionError, -) - - -def fit(model: Any, tabular_dataset: TabularDataset | ExperimentalTabularDataset) -> None: - """ - Fit a model for a given tabular dataset. - - Parameters - ---------- - model: - Classifier or Regression from scikit-learn. - tabular_dataset: - The tabular dataset containing the feature and target vectors. - - Raises - ------ - LearningError - If the tabular dataset contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. 
- """ - if not isinstance(tabular_dataset, TabularDataset) and isinstance(tabular_dataset, Table): - raise PlainTableError - - if tabular_dataset._table.number_of_rows == 0: - raise DatasetMissesDataError - - if isinstance(tabular_dataset, TabularDataset): - non_numerical_column_names = set(tabular_dataset.features.column_names) - set( - tabular_dataset.features.remove_columns_with_non_numerical_values().column_names, - ) - else: # pragma: no cover - non_numerical_column_names = set(tabular_dataset.features.column_names) - set( - tabular_dataset.features.remove_non_numeric_columns().column_names, - ) - if len(non_numerical_column_names) != 0: - raise NonNumericColumnError( - str(non_numerical_column_names), - "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" - " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many" - " different values\nor is ordinal, you should use the LabelEncoder.", - ) - - null_containing_column_names = set(tabular_dataset.features.column_names) - set( - tabular_dataset.features.remove_columns_with_missing_values().column_names, - ) - if len(null_containing_column_names) != 0: - raise MissingValuesColumnError( - str(null_containing_column_names), - "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" - " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", - ) - - try: - if isinstance(tabular_dataset, TabularDataset): - model.fit( - tabular_dataset.features._data, - tabular_dataset.target._data, - ) - else: # pragma: no cover - model.fit( - tabular_dataset.features._data_frame, - tabular_dataset.target._series, - ) - except ValueError as exception: - raise LearningError(str(exception)) from exception - - -# noinspection PyProtectedMember -def predict( - model: Any, - dataset: Table | ExperimentalTable | ExperimentalTabularDataset, - feature_names: 
list[str] | None, - target_name: str | None, -) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - model: - Classifier or regressor from scikit-learn. - dataset: - The dataset containing the features. - target_name: - The name of the target column. - feature_names: - The names of the feature columns. - - Returns - ------- - table: - A dataset containing the given features and the predicted target. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - # Validation - if model is None or target_name is None or feature_names is None: - raise ModelNotFittedError - if isinstance(dataset, ExperimentalTabularDataset): # pragma: no cover - dataset = dataset.features - - if isinstance(dataset, Table): - missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] - if missing_feature_names: - raise DatasetMissesFeaturesError(missing_feature_names) - - if dataset.number_of_rows == 0: - raise DatasetMissesDataError - - non_numerical_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( - dataset.keep_only_columns(feature_names).remove_columns_with_non_numerical_values().column_names, - ) - if len(non_numerical_column_names) != 0: - raise NonNumericColumnError( - str(non_numerical_column_names), - "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" - " data.\nThe OneHotEncoder should be used if you work with nominal data. 
If your data contains too many" - " different values\nor is ordinal, you should use the LabelEncoder.\n", - ) - - null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( - dataset.keep_only_columns(feature_names).remove_columns_with_missing_values().column_names, - ) - if len(null_containing_column_names) != 0: - raise MissingValuesColumnError( - str(null_containing_column_names), - "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" - " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", - ) - - dataset_df = dataset.keep_only_columns(feature_names)._data - dataset_df.columns = feature_names - - result_set = dataset._data.reset_index(drop=True) - result_set.columns = dataset.column_names - - try: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="X does not have valid feature names") - predicted_target_vector = model.predict(dataset_df.values) - result_set[target_name] = predicted_target_vector - - extra_names = [ - column_name - for column_name in dataset.column_names - if column_name != target_name and column_name not in feature_names - ] - - return Table._from_pandas_dataframe(result_set).to_tabular_dataset( - target_name=target_name, - extra_names=extra_names, - ) - except ValueError as exception: - raise PredictionError(str(exception)) from exception - elif isinstance(dataset, ExperimentalTable): # pragma: no cover - missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] - if missing_feature_names: - raise DatasetMissesFeaturesError(missing_feature_names) - - if dataset.number_of_rows == 0: - raise DatasetMissesDataError - - non_numerical_column_names_2 = set(dataset.column_names) - set( - dataset.remove_non_numeric_columns().column_names, - ) - if len(non_numerical_column_names_2) != 0: - raise NonNumericColumnError( - 
str(non_numerical_column_names_2), - "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" - " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many" - " different values\nor is ordinal, you should use the LabelEncoder.\n", - ) - - null_containing_column_names_2 = set(dataset.column_names) - set( - dataset.remove_columns_with_missing_values().column_names, - ) - if len(null_containing_column_names_2) != 0: - raise MissingValuesColumnError( - str(null_containing_column_names_2), - "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" - " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", - ) - - dataset_df = dataset.remove_columns_except(feature_names) - - try: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="X does not have valid feature names") - predicted_target_vector = model.predict(dataset_df._data_frame) - output = dataset.remove_columns(target_name).add_columns( - ExperimentalColumn(target_name, predicted_target_vector), - ) - - extra_names = [ - column_name - for column_name in dataset.column_names - if column_name != target_name and column_name not in feature_names - ] - - return TabularDataset( - output.to_dict(), - target_name=target_name, - extra_names=extra_names, - ) - except ValueError as exception: - raise PredictionError(str(exception)) from exception diff --git a/src/safeds/ml/classical/classification/__init__.py b/src/safeds/ml/classical/classification/__init__.py index da036d11a..6ad258333 100644 --- a/src/safeds/ml/classical/classification/__init__.py +++ b/src/safeds/ml/classical/classification/__init__.py @@ -5,26 +5,26 @@ import apipkg if TYPE_CHECKING: - from ._ada_boost import AdaBoostClassifier + from ._ada_boost_classifier import AdaBoostClassifier from ._classifier import Classifier - from ._decision_tree import 
DecisionTreeClassifier - from ._gradient_boosting import GradientBoostingClassifier - from ._k_nearest_neighbors import KNearestNeighborsClassifier - from ._logistic_regression import LogisticRegressionClassifier - from ._random_forest import RandomForestClassifier - from ._support_vector_machine import SupportVectorMachineClassifier + from ._decision_tree_classifier import DecisionTreeClassifier + from ._gradient_boosting_classifier import GradientBoostingClassifier + from ._k_nearest_neighbors_classifier import KNearestNeighborsClassifier + from ._logistic_classifier import LogisticClassifier + from ._random_forest_classifier import RandomForestClassifier + from ._support_vector_classifier import SupportVectorClassifier apipkg.initpkg( __name__, { - "AdaBoostClassifier": "._ada_boost:AdaBoostClassifier", + "AdaBoostClassifier": "._ada_boost_classifier:AdaBoostClassifier", "Classifier": "._classifier:Classifier", - "DecisionTreeClassifier": "._decision_tree:DecisionTreeClassifier", - "GradientBoostingClassifier": "._gradient_boosting:GradientBoostingClassifier", - "KNearestNeighborsClassifier": "._k_nearest_neighbors:KNearestNeighborsClassifier", - "LogisticRegressionClassifier": "._logistic_regression:LogisticRegressionClassifier", - "RandomForestClassifier": "._random_forest:RandomForestClassifier", - "SupportVectorMachineClassifier": "._support_vector_machine:SupportVectorMachineClassifier", + "DecisionTreeClassifier": "._decision_tree_classifier:DecisionTreeClassifier", + "GradientBoostingClassifier": "._gradient_boosting_classifier:GradientBoostingClassifier", + "KNearestNeighborsClassifier": "._k_nearest_neighbors_classifier:KNearestNeighborsClassifier", + "LogisticClassifier": "._logistic_classifier:LogisticClassifier", + "RandomForestClassifier": "._random_forest_classifier:RandomForestClassifier", + "SupportVectorClassifier": "._support_vector_classifier:SupportVectorClassifier", }, ) @@ -34,7 +34,7 @@ "DecisionTreeClassifier", 
"GradientBoostingClassifier", "KNearestNeighborsClassifier", - "LogisticRegressionClassifier", + "LogisticClassifier", "RandomForestClassifier", - "SupportVectorMachineClassifier", + "SupportVectorClassifier", ] diff --git a/src/safeds/ml/classical/classification/_ada_boost.py b/src/safeds/ml/classical/classification/_ada_boost.py deleted file mode 100644 index 20c1b2304..000000000 --- a/src/safeds/ml/classical/classification/_ada_boost.py +++ /dev/null @@ -1,207 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._classifier import Classifier - -if TYPE_CHECKING: - from sklearn.base import ClassifierMixin - from sklearn.ensemble import AdaBoostClassifier as sk_AdaBoostClassifier - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class AdaBoostClassifier(Classifier): - """ - Ada Boost classification. - - Parameters - ---------- - learner: - The learner from which the boosted ensemble is built. - maximum_number_of_learners: - The maximum number of learners at which boosting is terminated. In case of perfect fit, the learning procedure - is stopped early. Has to be greater than 0. - learning_rate: - Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution - of each classifier. Has to be greater than 0. - - Raises - ------ - OutOfBoundsError - If `maximum_number_of_learners` or `learning_rate` are less than or equal to 0. 
- """ - - def __hash__(self) -> int: - return _structural_hash( - Classifier.__hash__(self), - self._target_name, - self._feature_names, - self._learning_rate, - self._maximum_number_of_learners, - ) - - def __init__( - self, - *, - learner: Classifier | None = None, - maximum_number_of_learners: int = 50, - learning_rate: float = 1.0, - ) -> None: - # Validation - if maximum_number_of_learners < 1: - raise OutOfBoundsError( - maximum_number_of_learners, - name="maximum_number_of_learners", - lower_bound=ClosedBound(1), - ) - if learning_rate <= 0: - raise OutOfBoundsError(learning_rate, name="learning_rate", lower_bound=OpenBound(0)) - - # Hyperparameters - self._learner = learner - self._maximum_number_of_learners = maximum_number_of_learners - self._learning_rate = learning_rate - - # Internal state - self._wrapped_classifier: sk_AdaBoostClassifier | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def learner(self) -> Classifier | None: - """ - Get the base learner used for training the ensemble. - - Returns - ------- - result: - The base learner. - """ - return self._learner - - @property - def maximum_number_of_learners(self) -> int: - """ - Get the maximum number of learners in the ensemble. - - Returns - ------- - result: - The maximum number of learners. - """ - return self._maximum_number_of_learners - - @property - def learning_rate(self) -> float: - """ - Get the learning rate. - - Returns - ------- - result: - The learning rate. - """ - return self._learning_rate - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> AdaBoostClassifier: - """ - Create a copy of this classifier and fit it with the given training data. - - This classifier is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_classifier: - The fitted classifier. 
- - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_classifier = self._get_sklearn_classifier() - fit(wrapped_classifier, training_set) - - result = AdaBoostClassifier( - learner=self.learner, - maximum_number_of_learners=self.maximum_number_of_learners, - learning_rate=self._learning_rate, - ) - result._wrapped_classifier = wrapped_classifier - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. 
- """ - return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the classifier is fitted.""" - return self._wrapped_classifier is not None - - def _get_sklearn_classifier(self) -> ClassifierMixin: - """ - Return a new wrapped Classifier from sklearn. - - Returns - ------- - wrapped_classifier: - The sklearn Classifier. - """ - from sklearn.ensemble import AdaBoostClassifier as sk_AdaBoostClassifier - - learner = self.learner._get_sklearn_classifier() if self.learner is not None else None - return sk_AdaBoostClassifier( - estimator=learner, - n_estimators=self.maximum_number_of_learners, - learning_rate=self._learning_rate, - ) diff --git a/src/safeds/ml/classical/classification/_ada_boost_classifier.py b/src/safeds/ml/classical/classification/_ada_boost_classifier.py new file mode 100644 index 000000000..3fe076b1b --- /dev/null +++ b/src/safeds/ml/classical/classification/_ada_boost_classifier.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _AdaBoostBase + +from ._classifier import Classifier + +if TYPE_CHECKING: + from sklearn.base import ClassifierMixin + + +class AdaBoostClassifier(Classifier, _AdaBoostBase): + """ + Ada Boost classification. + + Parameters + ---------- + learner: + The learner from which the boosted ensemble is built. + maximum_number_of_learners: + The maximum number of learners at which boosting is terminated. In case of perfect fit, the learning procedure + is stopped early. Has to be greater than 0. + learning_rate: + Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution + of each classifier. Has to be greater than 0. + + Raises + ------ + OutOfBoundsError + If `maximum_number_of_learners` or `learning_rate` are less than or equal to 0. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + learner: Classifier | None = None, + maximum_number_of_learners: int = 50, + learning_rate: float = 1.0, + ) -> None: + # Initialize superclasses + Classifier.__init__(self) + _AdaBoostBase.__init__( + self, + maximum_number_of_learners=maximum_number_of_learners, + learning_rate=learning_rate, + ) + + # Hyperparameters + self._learner: Classifier | None = learner + + def __hash__(self) -> int: + return _structural_hash( + Classifier.__hash__(self), + _AdaBoostBase.__hash__(self), + self._learner, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def learner(self) -> Classifier | None: + """The base learner used for training the ensemble.""" + return self._learner + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> AdaBoostClassifier: + return AdaBoostClassifier( + learner=self.learner, + maximum_number_of_learners=self._maximum_number_of_learners, + learning_rate=self._learning_rate, + ) + + def _get_sklearn_model(self) -> ClassifierMixin: + from sklearn.ensemble import AdaBoostClassifier as SklearnAdaBoostClassifier + + learner = self.learner._get_sklearn_model() if self.learner is not None else None + return SklearnAdaBoostClassifier( + estimator=learner, + n_estimators=self._maximum_number_of_learners, + 
learning_rate=self._learning_rate, + ) diff --git a/src/safeds/ml/classical/classification/_classifier.py b/src/safeds/ml/classical/classification/_classifier.py index 614428092..c05159d69 100644 --- a/src/safeds/ml/classical/classification/_classifier.py +++ b/src/safeds/ml/classical/classification/_classifier.py @@ -1,99 +1,21 @@ from __future__ import annotations -from abc import ABC, abstractmethod +from abc import ABC from typing import TYPE_CHECKING -from safeds._utils import _structural_hash -from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset -from safeds.data.tabular.containers import ExperimentalTable, Table -from safeds.exceptions import PlainTableError +from safeds.data.labeled.containers import TabularDataset +from safeds.exceptions import ModelNotFittedError +from safeds.ml.classical import SupervisedModel +from safeds.ml.metrics import ClassificationMetrics if TYPE_CHECKING: from typing import Any - from sklearn.base import ClassifierMixin + from safeds.data.tabular.containers import Table -class Classifier(ABC): - """Abstract base class for all classifiers.""" - - def __hash__(self) -> int: - """ - Return a deterministic hash value for a classifier. - - Returns - ------- - hash: - The hash value. - """ - return _structural_hash(self.__class__.__qualname__, self.is_fitted) - - @abstractmethod - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> Classifier: - """ - Create a copy of this classifier and fit it with the given training data. - - This classifier is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_classifier: - The fitted classifier. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. 
- """ - - @abstractmethod - def predict( - self, - dataset: Table | ExperimentalTable | ExperimentalTabularDataset, - ) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - """ - - @property - @abstractmethod - def is_fitted(self) -> bool: - """Whether the classifier is fitted.""" - - @abstractmethod - def _get_sklearn_classifier(self) -> ClassifierMixin: - """ - Return a new wrapped Classifier from sklearn. - - Returns - ------- - wrapped_classifier: - The sklearn Classifier. - """ +class Classifier(SupervisedModel, ABC): + """A model for classification tasks.""" # ------------------------------------------------------------------------------------------------------------------ # Metrics @@ -101,12 +23,14 @@ def _get_sklearn_classifier(self) -> ClassifierMixin: def summarize_metrics( self, - validation_or_test_set: TabularDataset | ExperimentalTabularDataset, + validation_or_test_set: Table | TabularDataset, positive_class: Any, ) -> Table: """ Summarize the classifier's metrics on the given data. + **Note:** The model must be fitted. + Parameters ---------- validation_or_test_set: @@ -121,25 +45,29 @@ def summarize_metrics( Raises ------ - TypeError - If a table is passed instead of a tabular dataset. + ModelNotFittedError + If the classifier has not been fitted yet. 
""" - accuracy = self.accuracy(validation_or_test_set) - precision = self.precision(validation_or_test_set, positive_class) - recall = self.recall(validation_or_test_set, positive_class) - f1_score = self.f1_score(validation_or_test_set, positive_class) - - return Table( - { - "metric": ["accuracy", "precision", "recall", "f1_score"], - "value": [accuracy, precision, recall, f1_score], - }, + if not self.is_fitted: + raise ModelNotFittedError + + validation_or_test_set = _extract_table(validation_or_test_set) + + return ClassificationMetrics.summarize( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + positive_class, ) - def accuracy(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: + def accuracy(self, validation_or_test_set: Table | TabularDataset) -> float: """ Compute the accuracy of the classifier on the given data. + The accuracy is the proportion of predicted target values that were correct. The **higher** the accuracy, the + better. Results range from 0.0 to 1.0. + + **Note:** The model must be fitted. + Parameters ---------- validation_or_test_set: @@ -148,34 +76,35 @@ def accuracy(self, validation_or_test_set: TabularDataset | ExperimentalTabularD Returns ------- accuracy: - The calculated accuracy score, i.e. the percentage of equal data. + The classifier's accuracy. Raises ------ - TypeError - If a table is passed instead of a tabular dataset. + ModelNotFittedError + If the classifier has not been fitted yet. 
""" - from sklearn.metrics import accuracy_score as sk_accuracy_score + if not self.is_fitted: + raise ModelNotFittedError - if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): - raise PlainTableError + validation_or_test_set = _extract_table(validation_or_test_set) - if isinstance(validation_or_test_set, TabularDataset): - expected_values = validation_or_test_set.target - else: # pragma: no cover - expected_values = validation_or_test_set.target._series - predicted_values = self.predict(validation_or_test_set.features).target._data - - # TODO: more efficient implementation using polars - return sk_accuracy_score(expected_values._data, predicted_values) + return ClassificationMetrics.accuracy( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + ) - def precision( + def f1_score( self, - validation_or_test_set: TabularDataset | ExperimentalTabularDataset, + validation_or_test_set: Table | TabularDataset, positive_class: Any, ) -> float: """ - Compute the classifier's precision on the given data. + Compute the classifier's F₁ score on the given data. + + The F₁ score is the harmonic mean of precision and recall. The **higher** the F₁ score, the better the + classifier. Results range from 0.0 to 1.0. + + **Note:** The model must be fitted. Parameters ---------- @@ -186,34 +115,37 @@ def precision( Returns ------- - precision: - The calculated precision score, i.e. the ratio of correctly predicted positives to all predicted positives. - Return 1 if no positive predictions are made. + f1_score: + The classifier's F₁ score. + + Raises + ------ + ModelNotFittedError + If the classifier has not been fitted yet. 
""" - if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): - raise PlainTableError + if not self.is_fitted: + raise ModelNotFittedError - expected_values = validation_or_test_set.target - predicted_values = self.predict(validation_or_test_set.features).target + validation_or_test_set = _extract_table(validation_or_test_set) - n_true_positives = 0 - n_false_positives = 0 + return ClassificationMetrics.f1_score( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + positive_class, + ) - # TODO: more efficient implementation using polars - for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): - if predicted_value == positive_class: - if expected_value == positive_class: - n_true_positives += 1 - else: - n_false_positives += 1 + def precision( + self, + validation_or_test_set: Table | TabularDataset, + positive_class: Any, + ) -> float: + """ + Compute the classifier's precision on the given data. - if (n_true_positives + n_false_positives) == 0: - return 1.0 - return n_true_positives / (n_true_positives + n_false_positives) + The precision is the proportion of positive predictions that were correct. The **higher** the precision, the + better the classifier. Results range from 0.0 to 1.0. - def recall(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset, positive_class: Any) -> float: - """ - Compute the classifier's recall on the given data. + **Note:** The model must be fitted. Parameters ---------- @@ -224,38 +156,33 @@ def recall(self, validation_or_test_set: TabularDataset | ExperimentalTabularDat Returns ------- - recall: - The calculated recall score, i.e. the ratio of correctly predicted positives to all expected positives. - Return 1 if there are no positive expectations. + precision: + The classifier's precision. + + Raises + ------ + ModelNotFittedError + If the classifier has not been fitted yet. 
""" - if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): - raise PlainTableError + if not self.is_fitted: + raise ModelNotFittedError - expected_values = validation_or_test_set.target - predicted_values = self.predict(validation_or_test_set.features).target + validation_or_test_set = _extract_table(validation_or_test_set) - n_true_positives = 0 - n_false_negatives = 0 + return ClassificationMetrics.precision( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + positive_class, + ) - # TODO: more efficient implementation using polars - for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): - if predicted_value == positive_class: - if expected_value == positive_class: - n_true_positives += 1 - elif expected_value == positive_class: - n_false_negatives += 1 + def recall(self, validation_or_test_set: Table | TabularDataset, positive_class: Any) -> float: + """ + Compute the classifier's recall on the given data. - if (n_true_positives + n_false_negatives) == 0: - return 1.0 - return n_true_positives / (n_true_positives + n_false_negatives) + The recall is the proportion of actual positives that were predicted correctly. The **higher** the recall, the + better the classifier. Results range from 0.0 to 1.0. - def f1_score( - self, - validation_or_test_set: TabularDataset | ExperimentalTabularDataset, - positive_class: Any, - ) -> float: - """ - Compute the classifier's $F_1$-score on the given data. + **Note:** The model must be fitted. Parameters ---------- @@ -266,30 +193,29 @@ def f1_score( Returns ------- - f1_score: - The calculated $F_1$-score, i.e. the harmonic mean between precision and recall. - Return 1 if there are no positive expectations and predictions. + recall: + The classifier's recall. + + Raises + ------ + ModelNotFittedError + If the classifier has not been fitted yet. 
""" - if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): - raise PlainTableError - - expected_values = validation_or_test_set.target - predicted_values = self.predict(validation_or_test_set.features).target - - n_true_positives = 0 - n_false_negatives = 0 - n_false_positives = 0 - - # TODO: more efficient implementation using polars - for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): - if predicted_value == positive_class: - if expected_value == positive_class: - n_true_positives += 1 - else: - n_false_positives += 1 - elif expected_value == positive_class: - n_false_negatives += 1 - - if (2 * n_true_positives + n_false_positives + n_false_negatives) == 0: - return 1.0 - return 2 * n_true_positives / (2 * n_true_positives + n_false_positives + n_false_negatives) + if not self.is_fitted: + raise ModelNotFittedError + + validation_or_test_set = _extract_table(validation_or_test_set) + + return ClassificationMetrics.recall( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + positive_class, + ) + + +def _extract_table(table_or_dataset: Table | TabularDataset) -> Table: + """Extract the table from the given table or dataset.""" + if isinstance(table_or_dataset, TabularDataset): + return table_or_dataset.to_table() + else: + return table_or_dataset diff --git a/src/safeds/ml/classical/classification/_decision_tree.py b/src/safeds/ml/classical/classification/_decision_tree.py deleted file mode 100644 index e9a43466c..000000000 --- a/src/safeds/ml/classical/classification/_decision_tree.py +++ /dev/null @@ -1,164 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._classifier import Classifier - -if TYPE_CHECKING: - from 
sklearn.base import ClassifierMixin - from sklearn.tree import DecisionTreeClassifier as sk_DecisionTreeClassifier - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class DecisionTreeClassifier(Classifier): - """ - Decision tree classification. - - Parameters - ---------- - maximum_depth: - The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0. - minimum_number_of_samples_in_leaves: - The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0. - - Raises - ------ - OutOfBoundsError - If `maximum_depth` is less than 1. - OutOfBoundsError - If `minimum_number_of_samples_in_leaves` is less than 1. - """ - - def __init__( - self, - *, - maximum_depth: int | None = None, - minimum_number_of_samples_in_leaves: int = 1, - ) -> None: - # Validation - if maximum_depth is not None and maximum_depth < 1: - raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1)) - if minimum_number_of_samples_in_leaves < 1: - raise OutOfBoundsError( - minimum_number_of_samples_in_leaves, - name="minimum_number_of_samples_in_leaves", - lower_bound=ClosedBound(1), - ) - - # Hyperparameters - self._maximum_depth: int | None = maximum_depth - self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves - - # Internal state - self._wrapped_classifier: sk_DecisionTreeClassifier | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - def __hash__(self) -> int: - return _structural_hash( - Classifier.__hash__(self), - self._feature_names, - self._target_name, - ) - - @property - def maximum_depth(self) -> int | None: - """The maximum depth of the tree.""" - return self._maximum_depth - - @property - def minimum_number_of_samples_in_leaves(self) -> int: - """The minimum number of samples that must remain in 
the leaves of the tree.""" - return self._minimum_number_of_samples_in_leaves - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> DecisionTreeClassifier: - """ - Create a copy of this classifier and fit it with the given training data. - - This classifier is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_classifier: - The fitted classifier. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_classifier = self._get_sklearn_classifier() - fit(wrapped_classifier, training_set) - - result = DecisionTreeClassifier( - maximum_depth=self._maximum_depth, - minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves, - ) - result._wrapped_classifier = wrapped_classifier - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. 
- NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the classifier is fitted.""" - return self._wrapped_classifier is not None - - def _get_sklearn_classifier(self) -> ClassifierMixin: - from sklearn.tree import DecisionTreeClassifier as sk_DecisionTreeClassifier - - return sk_DecisionTreeClassifier( - max_depth=self._maximum_depth, - min_samples_leaf=self._minimum_number_of_samples_in_leaves, - ) diff --git a/src/safeds/ml/classical/classification/_decision_tree_classifier.py b/src/safeds/ml/classical/classification/_decision_tree_classifier.py new file mode 100644 index 000000000..c2b60a262 --- /dev/null +++ b/src/safeds/ml/classical/classification/_decision_tree_classifier.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _DecisionTreeBase + +from ._classifier import Classifier + +if TYPE_CHECKING: + from sklearn.base import ClassifierMixin + + +class DecisionTreeClassifier(Classifier, _DecisionTreeBase): + """ + Decision tree classification. + + Parameters + ---------- + maximum_depth: + The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0. + minimum_number_of_samples_in_leaves: + The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0. + + Raises + ------ + OutOfBoundsError + If `maximum_depth` is less than 1. + OutOfBoundsError + If `minimum_number_of_samples_in_leaves` is less than 1. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + maximum_depth: int | None = None, + minimum_number_of_samples_in_leaves: int = 1, + ) -> None: + # Initialize superclasses + Classifier.__init__(self) + _DecisionTreeBase.__init__( + self, + maximum_depth=maximum_depth, + minimum_number_of_samples_in_leaves=minimum_number_of_samples_in_leaves, + ) + + def __hash__(self) -> int: + return _structural_hash( + Classifier.__hash__(self), + _DecisionTreeBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> DecisionTreeClassifier: + return DecisionTreeClassifier( + maximum_depth=self._maximum_depth, + minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves, + ) + + def _get_sklearn_model(self) -> ClassifierMixin: + from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier + + return SklearnDecisionTreeClassifier( + max_depth=self._maximum_depth, + min_samples_leaf=self._minimum_number_of_samples_in_leaves, + ) diff --git a/src/safeds/ml/classical/classification/_gradient_boosting.py b/src/safeds/ml/classical/classification/_gradient_boosting.py deleted file mode 100644 index 869c77028..000000000 --- a/src/safeds/ml/classical/classification/_gradient_boosting.py +++ /dev/null @@ -1,173 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, 
predict - -from ._classifier import Classifier - -if TYPE_CHECKING: - from sklearn.base import ClassifierMixin - from sklearn.ensemble import GradientBoostingClassifier as sk_GradientBoostingClassifier - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class GradientBoostingClassifier(Classifier): - """ - Gradient boosting classification. - - Parameters - ---------- - number_of_trees: - The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large - number usually results in better performance. - learning_rate: - The larger the value, the more the model is influenced by each additional tree. If the learning rate is too - low, the model might underfit. If the learning rate is too high, the model might overfit. - - Raises - ------ - OutOfBoundsError - If `number_of_trees` or `learning_rate` is less than or equal to 0. - """ - - def __hash__(self) -> int: - return _structural_hash( - Classifier.__hash__(self), - self._target_name, - self._feature_names, - self._learning_rate, - self._number_of_trees, - ) - - def __init__(self, *, number_of_trees: int = 100, learning_rate: float = 0.1) -> None: - # Validation - if number_of_trees < 1: - raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1)) - if learning_rate <= 0: - raise OutOfBoundsError(learning_rate, name="learning_rate", lower_bound=OpenBound(0)) - - # Hyperparameters - self._number_of_trees = number_of_trees - self._learning_rate = learning_rate - - # Internal state - self._wrapped_classifier: sk_GradientBoostingClassifier | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def number_of_trees(self) -> int: - """ - Get the number of trees (estimators) in the ensemble. - - Returns - ------- - result: - The number of trees. 
- """ - return self._number_of_trees - - @property - def learning_rate(self) -> float: - """ - Get the learning rate. - - Returns - ------- - result: - The learning rate. - """ - return self._learning_rate - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> GradientBoostingClassifier: - """ - Create a copy of this classifier and fit it with the given training data. - - This classifier is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_classifier: - The fitted classifier. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_classifier = self._get_sklearn_classifier() - fit(wrapped_classifier, training_set) - - result = GradientBoostingClassifier(number_of_trees=self._number_of_trees, learning_rate=self._learning_rate) - result._wrapped_classifier = wrapped_classifier - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. 
- PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the classifier is fitted.""" - return self._wrapped_classifier is not None - - def _get_sklearn_classifier(self) -> ClassifierMixin: - """ - Return a new wrapped Classifier from sklearn. - - Returns - ------- - wrapped_classifier: - The sklearn Classifier. - """ - from sklearn.ensemble import GradientBoostingClassifier as sk_GradientBoostingClassifier - - return sk_GradientBoostingClassifier(n_estimators=self._number_of_trees, learning_rate=self._learning_rate) diff --git a/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py b/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py new file mode 100644 index 000000000..0d3c6ace2 --- /dev/null +++ b/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _GradientBoostingBase + +from ._classifier import Classifier + +if TYPE_CHECKING: + from sklearn.base import ClassifierMixin + + from safeds.data.labeled.containers import TabularDataset + from safeds.data.tabular.containers import Table + + +class GradientBoostingClassifier(Classifier, _GradientBoostingBase): + """ + Gradient boosting classification. + + Parameters + ---------- + number_of_trees: + The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large + number usually results in better performance. 
+ learning_rate: + The larger the value, the more the model is influenced by each additional tree. If the learning rate is too + low, the model might underfit. If the learning rate is too high, the model might overfit. + + Raises + ------ + OutOfBoundsError + If `number_of_trees` or `learning_rate` is less than or equal to 0. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + number_of_trees: int = 100, + learning_rate: float = 0.1, + ) -> None: + # Initialize superclasses + Classifier.__init__(self) + _GradientBoostingBase.__init__( + self, + number_of_trees=number_of_trees, + learning_rate=learning_rate, + ) + + def __hash__(self) -> int: + return _structural_hash( + Classifier.__hash__(self), + _GradientBoostingBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> GradientBoostingClassifier: + return GradientBoostingClassifier( + number_of_trees=self._number_of_trees, + learning_rate=self._learning_rate, + ) + + def _get_sklearn_model(self) -> ClassifierMixin: + from sklearn.ensemble import GradientBoostingClassifier as SklearnGradientBoostingClassifier + + return SklearnGradientBoostingClassifier( + n_estimators=self._number_of_trees, + learning_rate=self._learning_rate, + ) diff --git a/src/safeds/ml/classical/classification/_k_nearest_neighbors.py b/src/safeds/ml/classical/classification/_k_nearest_neighbors.py deleted file mode 100644 index 974a4f9a5..000000000 --- a/src/safeds/ml/classical/classification/_k_nearest_neighbors.py +++ /dev/null @@ -1,166 +0,0 @@ 
-from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset -from safeds.data.tabular.containers import ExperimentalTable, Table -from safeds.exceptions import ClosedBound, DatasetMissesDataError, OutOfBoundsError, PlainTableError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._classifier import Classifier - -if TYPE_CHECKING: - from sklearn.base import ClassifierMixin - from sklearn.neighbors import KNeighborsClassifier as sk_KNeighborsClassifier - - -class KNearestNeighborsClassifier(Classifier): - """ - K-nearest-neighbors classification. - - Parameters - ---------- - number_of_neighbors: - The number of neighbors to use for interpolation. Has to be greater than 0 (validated in the constructor) and - less than or equal to the sample size (validated when calling `fit`). - - Raises - ------ - OutOfBoundsError - If `number_of_neighbors` is less than 1. - """ - - def __hash__(self) -> int: - return _structural_hash( - Classifier.__hash__(self), - self._target_name, - self._feature_names, - self._number_of_neighbors, - ) - - def __init__(self, number_of_neighbors: int) -> None: - # Validation - if number_of_neighbors < 1: - raise OutOfBoundsError(number_of_neighbors, name="number_of_neighbors", lower_bound=ClosedBound(1)) - - # Hyperparameters - self._number_of_neighbors = number_of_neighbors - - # Internal state - self._wrapped_classifier: sk_KNeighborsClassifier | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def number_of_neighbors(self) -> int: - """ - Get the number of neighbors used for interpolation. - - Returns - ------- - result: - The number of neighbors. 
- """ - return self._number_of_neighbors - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> KNearestNeighborsClassifier: - """ - Create a copy of this classifier and fit it with the given training data. - - This classifier is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_classifier: - The fitted classifier. - - Raises - ------ - ValueError - If `number_of_neighbors` is greater than the sample size. - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - if not isinstance(training_set, TabularDataset) and isinstance(training_set, Table): - raise PlainTableError - if training_set._table.number_of_rows == 0: - raise DatasetMissesDataError - if self._number_of_neighbors > training_set._table.number_of_rows: - raise ValueError( - ( - f"The parameter 'number_of_neighbors' ({self._number_of_neighbors}) has to be less than or equal to" - f" the sample size ({training_set._table.number_of_rows})." - ), - ) - wrapped_classifier = self._get_sklearn_classifier() - fit(wrapped_classifier, training_set) - - result = KNearestNeighborsClassifier(self._number_of_neighbors) - result._wrapped_classifier = wrapped_classifier - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
- - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the classifier is fitted.""" - return self._wrapped_classifier is not None - - def _get_sklearn_classifier(self) -> ClassifierMixin: - """ - Return a new wrapped Classifier from sklearn. - - Returns - ------- - wrapped_classifier: - The sklearn Classifier. - """ - from sklearn.neighbors import KNeighborsClassifier as sk_KNeighborsClassifier - - return sk_KNeighborsClassifier(self._number_of_neighbors, n_jobs=-1) diff --git a/src/safeds/ml/classical/classification/_k_nearest_neighbors_classifier.py b/src/safeds/ml/classical/classification/_k_nearest_neighbors_classifier.py new file mode 100644 index 000000000..89b5972d3 --- /dev/null +++ b/src/safeds/ml/classical/classification/_k_nearest_neighbors_classifier.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _KNearestNeighborsBase + +from ._classifier import Classifier + +if TYPE_CHECKING: + from sklearn.base import ClassifierMixin + + from safeds.data.labeled.containers import TabularDataset + + +class KNearestNeighborsClassifier(Classifier, _KNearestNeighborsBase): + """ + K-nearest-neighbors classification. 
+ + Parameters + ---------- + number_of_neighbors: + The number of neighbors to use for interpolation. Has to be greater than 0 (validated in the constructor) and + less than or equal to the sample size (validated when calling `fit`). + + Raises + ------ + OutOfBoundsError + If `number_of_neighbors` is less than 1. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + number_of_neighbors: int, + ) -> None: + # Initialize superclasses + Classifier.__init__(self) + _KNearestNeighborsBase.__init__( + self, + number_of_neighbors=number_of_neighbors, + ) + + def __hash__(self) -> int: + return _structural_hash( + Classifier.__hash__(self), + _KNearestNeighborsBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _check_additional_fit_preconditions(self, training_set: TabularDataset): + if self._number_of_neighbors > training_set._table.number_of_rows: + raise ValueError( + ( + f"The parameter 'number_of_neighbors' ({self._number_of_neighbors}) has to be less than or equal to" + f" the sample size ({training_set._table.number_of_rows})." 
+ ), + ) + + def _clone(self) -> KNearestNeighborsClassifier: + return KNearestNeighborsClassifier( + number_of_neighbors=self._number_of_neighbors, + ) + + def _get_sklearn_model(self) -> ClassifierMixin: + from sklearn.neighbors import KNeighborsClassifier as SklearnKNeighborsClassifier + + return SklearnKNeighborsClassifier( + n_neighbors=self._number_of_neighbors, + n_jobs=-1, + ) diff --git a/src/safeds/ml/classical/classification/_logistic_classifier.py b/src/safeds/ml/classical/classification/_logistic_classifier.py new file mode 100644 index 000000000..e312e6b25 --- /dev/null +++ b/src/safeds/ml/classical/classification/_logistic_classifier.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _get_random_seed, _structural_hash + +from ._classifier import Classifier + +if TYPE_CHECKING: + from sklearn.base import ClassifierMixin + + +class LogisticClassifier(Classifier): + """Regularized logistic regression for classification.""" + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self) -> None: + super().__init__() + + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> LogisticClassifier: + return LogisticClassifier() + + def _get_sklearn_model(self) -> ClassifierMixin: + from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression + + return SklearnLogisticRegression( + random_state=_get_random_seed(), + n_jobs=-1, + ) diff --git 
a/src/safeds/ml/classical/classification/_logistic_regression.py b/src/safeds/ml/classical/classification/_logistic_regression.py deleted file mode 100644 index 22a6bcd00..000000000 --- a/src/safeds/ml/classical/classification/_logistic_regression.py +++ /dev/null @@ -1,116 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.ml.classical._util_sklearn import fit, predict - -from ._classifier import Classifier - -if TYPE_CHECKING: - from sklearn.base import ClassifierMixin - from sklearn.linear_model import LogisticRegression as sk_LogisticRegression - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class LogisticRegressionClassifier(Classifier): - """Regularized logistic regression.""" - - def __hash__(self) -> int: - return _structural_hash(Classifier.__hash__(self), self._target_name, self._feature_names) - - def __init__(self) -> None: - # Internal state - self._wrapped_classifier: sk_LogisticRegression | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LogisticRegressionClassifier: - """ - Create a copy of this classifier and fit it with the given training data. - - This classifier is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_classifier: - The fitted classifier. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. 
- DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_classifier = self._get_sklearn_classifier() - fit(wrapped_classifier, training_set) - - result = LogisticRegressionClassifier() - result._wrapped_classifier = wrapped_classifier - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the classifier is fitted.""" - return self._wrapped_classifier is not None - - def _get_sklearn_classifier(self) -> ClassifierMixin: - """ - Return a new wrapped Classifier from sklearn. - - Returns - ------- - wrapped_classifier: - The sklearn Classifier. 
- """ - from sklearn.linear_model import LogisticRegression as sk_LogisticRegression - - return sk_LogisticRegression(n_jobs=-1) diff --git a/src/safeds/ml/classical/classification/_random_forest.py b/src/safeds/ml/classical/classification/_random_forest.py deleted file mode 100644 index ed5bb2681..000000000 --- a/src/safeds/ml/classical/classification/_random_forest.py +++ /dev/null @@ -1,191 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._classifier import Classifier - -if TYPE_CHECKING: - from sklearn.base import ClassifierMixin - from sklearn.ensemble import RandomForestClassifier as sk_RandomForestClassifier - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class RandomForestClassifier(Classifier): - """ - Random forest classification. - - Parameters - ---------- - number_of_trees: - The number of trees to be used in the random forest. Has to be greater than 0. - maximum_depth: - The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0. - minimum_number_of_samples_in_leaves: - The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0. - - Raises - ------ - OutOfBoundsError - If `number_of_trees` is less than 1. - OutOfBoundsError - If `maximum_depth` is less than 1. - OutOfBoundsError - If `minimum_number_of_samples_in_leaves` is less than 1. 
- """ - - def __init__( - self, - *, - number_of_trees: int = 100, - maximum_depth: int | None = None, - minimum_number_of_samples_in_leaves: int = 1, - ) -> None: - # Validation - if number_of_trees < 1: - raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1)) - if maximum_depth is not None and maximum_depth < 1: - raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1)) - if minimum_number_of_samples_in_leaves < 1: - raise OutOfBoundsError( - minimum_number_of_samples_in_leaves, - name="minimum_number_of_samples_in_leaves", - lower_bound=ClosedBound(1), - ) - - # Hyperparameters - self._number_of_trees: int = number_of_trees - self._maximum_depth: int | None = maximum_depth - self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves - - # Internal state - self._wrapped_classifier: sk_RandomForestClassifier | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - def __hash__(self) -> int: - return _structural_hash( - Classifier.__hash__(self), - self._feature_names, - self._target_name, - self._number_of_trees, - self._maximum_depth, - self._minimum_number_of_samples_in_leaves, - ) - - @property - def number_of_trees(self) -> int: - """The number of trees used in the random forest.""" - return self._number_of_trees - - @property - def maximum_depth(self) -> int | None: - """The maximum depth of each tree.""" - return self._maximum_depth - - @property - def minimum_number_of_samples_in_leaves(self) -> int: - """The minimum number of samples that must remain in the leaves of each tree.""" - return self._minimum_number_of_samples_in_leaves - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RandomForestClassifier: - """ - Create a copy of this classifier and fit it with the given training data. - - This classifier is not modified. 
- - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_classifier: - The fitted classifier. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_classifier = self._get_sklearn_classifier() - fit(wrapped_classifier, training_set) - - result = RandomForestClassifier( - number_of_trees=self._number_of_trees, - maximum_depth=self._maximum_depth, - minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves, - ) - result._wrapped_classifier = wrapped_classifier - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. 
- """ - return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the classifier is fitted.""" - return self._wrapped_classifier is not None - - def _get_sklearn_classifier(self) -> ClassifierMixin: - """ - Return a new wrapped Classifier from sklearn. - - Returns - ------- - wrapped_classifier: - The sklearn Classifier. - """ - from sklearn.ensemble import RandomForestClassifier as sk_RandomForestClassifier - - return sk_RandomForestClassifier( - n_estimators=self._number_of_trees, - max_depth=self._maximum_depth, - min_samples_leaf=self._minimum_number_of_samples_in_leaves, - n_jobs=-1, - ) diff --git a/src/safeds/ml/classical/classification/_random_forest_classifier.py b/src/safeds/ml/classical/classification/_random_forest_classifier.py new file mode 100644 index 000000000..dea747058 --- /dev/null +++ b/src/safeds/ml/classical/classification/_random_forest_classifier.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _get_random_seed, _structural_hash +from safeds.ml.classical._bases import _RandomForestBase + +from ._classifier import Classifier + +if TYPE_CHECKING: + from sklearn.base import ClassifierMixin + + +class RandomForestClassifier(Classifier, _RandomForestBase): + """ + Random forest classification. + + Parameters + ---------- + number_of_trees: + The number of trees to be used in the random forest. Has to be greater than 0. + maximum_depth: + The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0. + minimum_number_of_samples_in_leaves: + The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0. + + Raises + ------ + OutOfBoundsError + If `number_of_trees` is less than 1. + OutOfBoundsError + If `maximum_depth` is less than 1. + OutOfBoundsError + If `minimum_number_of_samples_in_leaves` is less than 1. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + number_of_trees: int = 100, + maximum_depth: int | None = None, + minimum_number_of_samples_in_leaves: int = 1, + ) -> None: + # Initialize superclasses + Classifier.__init__(self) + _RandomForestBase.__init__( + self, + number_of_trees=number_of_trees, + maximum_depth=maximum_depth, + minimum_number_of_samples_in_leaves=minimum_number_of_samples_in_leaves, + ) + + def __hash__(self) -> int: + return _structural_hash( + Classifier.__hash__(self), + _RandomForestBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> RandomForestClassifier: + return RandomForestClassifier( + number_of_trees=self._number_of_trees, + maximum_depth=self._maximum_depth, + minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves, + ) + + def _get_sklearn_model(self) -> ClassifierMixin: + from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier + + return SklearnRandomForestClassifier( + n_estimators=self._number_of_trees, + max_depth=self._maximum_depth, + min_samples_leaf=self._minimum_number_of_samples_in_leaves, + random_state=_get_random_seed(), + n_jobs=-1, + ) diff --git a/src/safeds/ml/classical/classification/_support_vector_classifier.py b/src/safeds/ml/classical/classification/_support_vector_classifier.py new file mode 100644 index 000000000..509503765 --- /dev/null +++ b/src/safeds/ml/classical/classification/_support_vector_classifier.py @@ -0,0 +1,89 @@ +from __future__ import 
annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _get_random_seed, _structural_hash +from safeds.ml.classical._bases import _SupportVectorMachineBase +from safeds.ml.classical.classification import Classifier + +if TYPE_CHECKING: + from sklearn.base import ClassifierMixin + + +class SupportVectorClassifier(Classifier, _SupportVectorMachineBase): + """ + Support vector machine for classification. + + Parameters + ---------- + c: + The strength of regularization. Must be greater than 0. + kernel: + The type of kernel to be used. Defaults to a radial basis function kernel. + + Raises + ------ + OutOfBoundsError + If `c` is less than or equal to 0. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + c: float = 1.0, + kernel: SupportVectorClassifier.Kernel | None = None, + ) -> None: + # Initialize superclasses + Classifier.__init__(self) + _SupportVectorMachineBase.__init__( + self, + c=c, + kernel=kernel, + ) + + def __hash__(self) -> int: + return _structural_hash( + Classifier.__hash__(self), + _SupportVectorMachineBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def kernel(self) -> SupportVectorClassifier.Kernel: + """The type of kernel used.""" + return self._kernel + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> 
SupportVectorClassifier: + return SupportVectorClassifier( + c=self._c, + kernel=self._kernel, + ) + + def _get_sklearn_model(self) -> ClassifierMixin: + """ + Return a new wrapped Classifier from sklearn. + + Returns + ------- + wrapped_classifier: + The sklearn Classifier. + """ + from sklearn.svm import SVC as SklearnSVC # noqa: N811 + + result = SklearnSVC( + C=self._c, + random_state=_get_random_seed(), + ) + self._kernel._apply(result) + return result diff --git a/src/safeds/ml/classical/classification/_support_vector_machine.py b/src/safeds/ml/classical/classification/_support_vector_machine.py deleted file mode 100644 index 33499d782..000000000 --- a/src/safeds/ml/classical/classification/_support_vector_machine.py +++ /dev/null @@ -1,277 +0,0 @@ -from __future__ import annotations - -import sys -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict -from safeds.ml.classical.classification import Classifier - -if TYPE_CHECKING: - from sklearn.base import ClassifierMixin - from sklearn.svm import SVC as sk_SVC # noqa: N811 - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class SupportVectorMachineKernel(ABC): - """The abstract base class of the different subclasses supported by the `Kernel`.""" - - @abstractmethod - def _get_sklearn_arguments(self) -> dict[str, Any]: # TODO: use apply pattern (imputer strategy) instead - """Return the arguments to pass to scikit-learn.""" - - @abstractmethod - def __eq__(self, other: object) -> bool: - """ - Compare two kernels. 
- - Parameters - ---------- - other: - other object to compare to - - Returns - ------- - equals: - Whether the two kernels are equal - """ - - def __hash__(self) -> int: - """ - Return a deterministic hash value for this kernel. - - Returns - ------- - hash: - The hash value. - """ - return _structural_hash(self.__class__.__qualname__) - - -class SupportVectorMachineClassifier(Classifier): - """ - Support vector machine. - - Parameters - ---------- - c: - The strength of regularization. Must be strictly positive. - kernel: - The type of kernel to be used. Defaults to None. - - Raises - ------ - OutOfBoundsError - If `c` is less than or equal to 0. - """ - - def __hash__(self) -> int: - return _structural_hash(Classifier.__hash__(self), self._target_name, self._feature_names, self._c, self.kernel) - - def __init__(self, *, c: float = 1.0, kernel: SupportVectorMachineKernel | None = None) -> None: - # Inputs - if c <= 0: - raise OutOfBoundsError(c, name="c", lower_bound=OpenBound(0)) - if kernel is None: - kernel = self.Kernel.RadialBasisFunction() - - # Internal state - self._wrapped_classifier: sk_SVC | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - # Hyperparameters - self._c: float = c - self._kernel: SupportVectorMachineKernel = kernel - - @property - def c(self) -> float: - """ - Get the regularization strength. - - Returns - ------- - result: - The regularization strength. - """ - return self._c - - @property - def kernel(self) -> SupportVectorMachineKernel: - """ - Get the type of kernel used. - - Returns - ------- - result: - The type of kernel used. 
- """ - return self._kernel - - class Kernel: - class Linear(SupportVectorMachineKernel): - def _get_sklearn_arguments(self) -> dict[str, Any]: - return { - "kernel": "linear", - } - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SupportVectorMachineClassifier.Kernel.Linear): - return NotImplemented - return True - - __hash__ = SupportVectorMachineKernel.__hash__ - - class Polynomial(SupportVectorMachineKernel): - def __init__(self, degree: int): - if degree < 1: - raise OutOfBoundsError(degree, name="degree", lower_bound=ClosedBound(1)) - self._degree = degree - - @property - def degree(self) -> int: - """The degree of the polynomial kernel.""" - return self._degree - - def _get_sklearn_arguments(self) -> dict[str, Any]: - return { - "kernel": "poly", - "degree": self._degree, - } - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SupportVectorMachineClassifier.Kernel.Polynomial): - return NotImplemented - return self._degree == other._degree - - def __hash__(self) -> int: - return _structural_hash(SupportVectorMachineKernel.__hash__(self), self._degree) - - def __sizeof__(self) -> int: - """ - Return the complete size of this object. - - Returns - ------- - size: - Size of this object in bytes. 
- """ - return sys.getsizeof(self._degree) - - class Sigmoid(SupportVectorMachineKernel): - def _get_sklearn_arguments(self) -> dict[str, Any]: - return { - "kernel": "sigmoid", - } - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SupportVectorMachineClassifier.Kernel.Sigmoid): - return NotImplemented - return True - - __hash__ = SupportVectorMachineKernel.__hash__ - - class RadialBasisFunction(SupportVectorMachineKernel): - def _get_sklearn_arguments(self) -> dict[str, Any]: - return { - "kernel": "rbf", - } - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SupportVectorMachineClassifier.Kernel.RadialBasisFunction): - return NotImplemented - return True - - __hash__ = SupportVectorMachineKernel.__hash__ - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> SupportVectorMachineClassifier: - """ - Create a copy of this classifier and fit it with the given training data. - - This classifier is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_classifier: - The fitted classifier. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. 
- """ - wrapped_classifier = self._get_sklearn_classifier() - fit(wrapped_classifier, training_set) - - result = SupportVectorMachineClassifier(c=self._c, kernel=self._kernel) - result._wrapped_classifier = wrapped_classifier - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the classifier is fitted.""" - return self._wrapped_classifier is not None - - def _get_sklearn_classifier(self) -> ClassifierMixin: - """ - Return a new wrapped Classifier from sklearn. - - Returns - ------- - wrapped_classifier: - The sklearn Classifier. 
- """ - from sklearn.svm import SVC as sk_SVC # noqa: N811 - - return sk_SVC(C=self._c, **(self._kernel._get_sklearn_arguments())) diff --git a/src/safeds/ml/classical/regression/__init__.py b/src/safeds/ml/classical/regression/__init__.py index fc082d6aa..ed8c2bcbb 100644 --- a/src/safeds/ml/classical/regression/__init__.py +++ b/src/safeds/ml/classical/regression/__init__.py @@ -5,34 +5,34 @@ import apipkg if TYPE_CHECKING: - from ._ada_boost import AdaBoostRegressor + from ._ada_boost_regressor import AdaBoostRegressor from ._arima import ArimaModelRegressor - from ._decision_tree import DecisionTreeRegressor - from ._elastic_net_regression import ElasticNetRegressor - from ._gradient_boosting import GradientBoostingRegressor - from ._k_nearest_neighbors import KNearestNeighborsRegressor - from ._lasso_regression import LassoRegressor - from ._linear_regression import LinearRegressionRegressor - from ._random_forest import RandomForestRegressor + from ._decision_tree_regressor import DecisionTreeRegressor + from ._elastic_net_regressor import ElasticNetRegressor + from ._gradient_boosting_regressor import GradientBoostingRegressor + from ._k_nearest_neighbors_regressor import KNearestNeighborsRegressor + from ._lasso_regressor import LassoRegressor + from ._linear_regressor import LinearRegressor + from ._random_forest_regressor import RandomForestRegressor from ._regressor import Regressor - from ._ridge_regression import RidgeRegressor - from ._support_vector_machine import SupportVectorMachineRegressor + from ._ridge_regressor import RidgeRegressor + from ._support_vector_regressor import SupportVectorRegressor apipkg.initpkg( __name__, { - "AdaBoostRegressor": "._ada_boost:AdaBoostRegressor", + "AdaBoostRegressor": "._ada_boost_regressor:AdaBoostRegressor", "ArimaModelRegressor": "._arima:ArimaModelRegressor", - "DecisionTreeRegressor": "._decision_tree:DecisionTreeRegressor", - "ElasticNetRegressor": "._elastic_net_regression:ElasticNetRegressor", - 
"GradientBoostingRegressor": "._gradient_boosting:GradientBoostingRegressor", - "KNearestNeighborsRegressor": "._k_nearest_neighbors:KNearestNeighborsRegressor", - "LassoRegressor": "._lasso_regression:LassoRegressor", - "LinearRegressionRegressor": "._linear_regression:LinearRegressionRegressor", - "RandomForestRegressor": "._random_forest:RandomForestRegressor", + "DecisionTreeRegressor": "._decision_tree_regressor:DecisionTreeRegressor", + "ElasticNetRegressor": "._elastic_net_regressor:ElasticNetRegressor", + "GradientBoostingRegressor": "._gradient_boosting_regressor:GradientBoostingRegressor", + "KNearestNeighborsRegressor": "._k_nearest_neighbors_regressor:KNearestNeighborsRegressor", + "LassoRegressor": "._lasso_regressor:LassoRegressor", + "LinearRegressor": "._linear_regressor:LinearRegressor", + "RandomForestRegressor": "._random_forest_regressor:RandomForestRegressor", "Regressor": "._regressor:Regressor", - "RidgeRegressor": "._ridge_regression:RidgeRegressor", - "SupportVectorMachineRegressor": "._support_vector_machine:SupportVectorMachineRegressor", + "RidgeRegressor": "._ridge_regressor:RidgeRegressor", + "SupportVectorRegressor": "._support_vector_regressor:SupportVectorRegressor", }, ) @@ -44,9 +44,9 @@ "GradientBoostingRegressor", "KNearestNeighborsRegressor", "LassoRegressor", - "LinearRegressionRegressor", + "LinearRegressor", "RandomForestRegressor", "Regressor", "RidgeRegressor", - "SupportVectorMachineRegressor", + "SupportVectorRegressor", ] diff --git a/src/safeds/ml/classical/regression/_ada_boost.py b/src/safeds/ml/classical/regression/_ada_boost.py deleted file mode 100644 index 3b85df127..000000000 --- a/src/safeds/ml/classical/regression/_ada_boost.py +++ /dev/null @@ -1,207 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - 
-from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.ensemble import AdaBoostRegressor as sk_AdaBoostRegressor - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class AdaBoostRegressor(Regressor): - """ - Ada Boost regression. - - Parameters - ---------- - learner: - The learner from which the boosted ensemble is built. - maximum_number_of_learners: - The maximum number of learners at which boosting is terminated. In case of perfect fit, the learning procedure - is stopped early. Has to be greater than 0. - learning_rate: - Weight applied to each regressor at each boosting iteration. A higher learning rate increases the contribution - of each regressor. Has to be greater than 0. - - Raises - ------ - OutOfBoundsError - If `maximum_number_of_learners` or `learning_rate` are less than or equal to 0. - """ - - def __hash__(self) -> int: - return _structural_hash( - Regressor.__hash__(self), - self._target_name, - self._feature_names, - self._learning_rate, - self._maximum_number_of_learners, - ) - - def __init__( - self, - *, - learner: Regressor | None = None, - maximum_number_of_learners: int = 50, - learning_rate: float = 1.0, - ) -> None: - # Validation - if maximum_number_of_learners < 1: - raise OutOfBoundsError( - maximum_number_of_learners, - name="maximum_number_of_learners", - lower_bound=ClosedBound(1), - ) - if learning_rate <= 0: - raise OutOfBoundsError(learning_rate, name="learning_rate", lower_bound=OpenBound(0)) - - # Hyperparameters - self._learner = learner - self._maximum_number_of_learners = maximum_number_of_learners - self._learning_rate = learning_rate - - # Internal state - self._wrapped_regressor: sk_AdaBoostRegressor | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def learner(self) -> Regressor | None: 
- """ - Get the base learner used for training the ensemble. - - Returns - ------- - result: - The base learner. - """ - return self._learner - - @property - def maximum_number_of_learners(self) -> int: - """ - Get the maximum number of learners in the ensemble. - - Returns - ------- - result: - The maximum number of learners. - """ - return self._maximum_number_of_learners - - @property - def learning_rate(self) -> float: - """ - Get the learning rate. - - Returns - ------- - result: - The learning rate. - """ - return self._learning_rate - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> AdaBoostRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = AdaBoostRegressor( - learner=self._learner, - maximum_number_of_learners=self._maximum_number_of_learners, - learning_rate=self._learning_rate, - ) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
- - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. - """ - from sklearn.ensemble import AdaBoostRegressor as sk_AdaBoostRegressor - - learner = self._learner._get_sklearn_regressor() if self._learner is not None else None - return sk_AdaBoostRegressor( - estimator=learner, - n_estimators=self._maximum_number_of_learners, - learning_rate=self._learning_rate, - ) diff --git a/src/safeds/ml/classical/regression/_ada_boost_regressor.py b/src/safeds/ml/classical/regression/_ada_boost_regressor.py new file mode 100644 index 000000000..ecceb1048 --- /dev/null +++ b/src/safeds/ml/classical/regression/_ada_boost_regressor.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _AdaBoostBase + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class AdaBoostRegressor(Regressor, _AdaBoostBase): + """ + Ada Boost regression. 
+ + Parameters + ---------- + learner: + The learner from which the boosted ensemble is built. + maximum_number_of_learners: + The maximum number of learners at which boosting is terminated. In case of perfect fit, the learning procedure + is stopped early. Has to be greater than 0. + learning_rate: + Weight applied to each regressor at each boosting iteration. A higher learning rate increases the contribution + of each regressor. Has to be greater than 0. + + Raises + ------ + OutOfBoundsError + If `maximum_number_of_learners` or `learning_rate` are less than or equal to 0. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + learner: Regressor | None = None, + maximum_number_of_learners: int = 50, + learning_rate: float = 1.0, + ) -> None: + # Initialize superclasses + Regressor.__init__(self) + _AdaBoostBase.__init__( + self, + maximum_number_of_learners=maximum_number_of_learners, + learning_rate=learning_rate, + ) + + # Hyperparameters + self._learner: Regressor | None = learner + + def __hash__(self) -> int: + return _structural_hash( + Regressor.__hash__(self), + _AdaBoostBase.__hash__(self), + self._learner, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def learner(self) -> Regressor | None: + """The base learner used for training the ensemble.""" + return self._learner + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # 
------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> AdaBoostRegressor: + return AdaBoostRegressor( + learner=self.learner, + maximum_number_of_learners=self._maximum_number_of_learners, + learning_rate=self._learning_rate, + ) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.ensemble import AdaBoostRegressor as SklearnAdaBoostRegressor + + learner = self.learner._get_sklearn_model() if self.learner is not None else None + return SklearnAdaBoostRegressor( + estimator=learner, + n_estimators=self._maximum_number_of_learners, + learning_rate=self._learning_rate, + ) diff --git a/src/safeds/ml/classical/regression/_arima.py b/src/safeds/ml/classical/regression/_arima.py index a5c9de89f..4f14a312a 100644 --- a/src/safeds/ml/classical/regression/_arima.py +++ b/src/safeds/ml/classical/regression/_arima.py @@ -6,7 +6,6 @@ from safeds._utils import _structural_hash from safeds.data.image.containers import Image -from safeds.data.labeled.containers import TimeSeriesDataset from safeds.data.tabular.containers import Column from safeds.exceptions import ( DatasetMissesDataError, @@ -18,6 +17,8 @@ if TYPE_CHECKING: from statsmodels.tsa.arima.model import ARIMA + from safeds.data.labeled.containers import TimeSeriesDataset + class ArimaModelRegressor: """Auto Regressive Integrated Moving Average Model.""" @@ -73,9 +74,9 @@ def fit(self, time_series: TimeSeriesDataset) -> ArimaModelRegressor: table = time_series.to_table() if table.number_of_rows == 0: raise DatasetMissesDataError - if not time_series.target.type.is_numeric(): + if not time_series.target.type.is_numeric: raise NonNumericColumnError(time_series.target.name) - if time_series.target.has_missing_values(): + if time_series.target.missing_value_count() > 0: raise MissingValuesColumnError( time_series.target.name, "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" 
@@ -91,7 +92,7 @@ def fit(self, time_series: TimeSeriesDataset) -> ArimaModelRegressor: best_param = (0, 0, 0) for param in pdq: # Create and fit an ARIMA model with the current parameters - mod = ARIMA(time_series.target._data.values, order=param) + mod = ARIMA(time_series.target._series.to_numpy(), order=param) # I wasnt able to invoke an learning Error # Add try catch when an learning error is found @@ -132,7 +133,7 @@ def predict(self, time_series: TimeSeriesDataset) -> TimeSeriesDataset: If predicting with the given dataset failed. """ # make a table without - forecast_horizon = len(time_series.target._data) + forecast_horizon = len(time_series.target._series.to_numpy()) result_table = time_series.to_table() result_table = result_table.remove_columns([time_series.target.name]) # Validation @@ -145,7 +146,7 @@ def predict(self, time_series: TimeSeriesDataset) -> TimeSeriesDataset: target_column: Column = Column(name=time_series.target.name + " " + "forecasted", data=forecast_results) # create new TimeSeries - result_table = result_table.add_column(target_column) + result_table = result_table.add_columns(target_column) return result_table.to_time_series_dataset( target_name=time_series.target.name + " " + "forecasted", time_name=time_series.time.name, @@ -178,7 +179,7 @@ def plot_predictions(self, test_series: TimeSeriesDataset) -> Image: if not self.is_fitted or self._arima is None: raise ModelNotFittedError - test_data = test_series.target._data.to_numpy() + test_data = test_series.target._series.to_numpy() n_steps = len(test_data) forecast_results = self._arima.forecast(steps=n_steps) diff --git a/src/safeds/ml/classical/regression/_decision_tree.py b/src/safeds/ml/classical/regression/_decision_tree.py deleted file mode 100644 index 33c40d1e6..000000000 --- a/src/safeds/ml/classical/regression/_decision_tree.py +++ /dev/null @@ -1,172 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash 
-from safeds.exceptions import ClosedBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.tree import DecisionTreeRegressor as sk_DecisionTreeRegressor - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class DecisionTreeRegressor(Regressor): - """ - Decision tree regression. - - Parameters - ---------- - maximum_depth: - The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0. - minimum_number_of_samples_in_leaves: - The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0. - - Raises - ------ - OutOfBoundsError - If `maximum_depth` is less than 1. - OutOfBoundsError - If `minimum_number_of_samples_in_leaves` is less than 1. - """ - - def __init__( - self, - *, - maximum_depth: int | None = None, - minimum_number_of_samples_in_leaves: int = 5, - ) -> None: - # Validation - if maximum_depth is not None and maximum_depth < 1: - raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1)) - if minimum_number_of_samples_in_leaves < 1: - raise OutOfBoundsError( - minimum_number_of_samples_in_leaves, - name="minimum_number_of_samples_in_leaves", - lower_bound=ClosedBound(1), - ) - - # Hyperparameters - self._maximum_depth: int | None = maximum_depth - self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves - - # Internal state - self._wrapped_regressor: sk_DecisionTreeRegressor | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - def __hash__(self) -> int: - return _structural_hash( - Regressor.__hash__(self), - self._feature_names, - self._target_name, - ) - - @property - def maximum_depth(self) -> int | None: - """The maximum 
depth of the tree.""" - return self._maximum_depth - - @property - def minimum_number_of_samples_in_leaves(self) -> int: - """The minimum number of samples that must remain in the leaves of the tree.""" - return self._minimum_number_of_samples_in_leaves - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> DecisionTreeRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = DecisionTreeRegressor( - maximum_depth=self._maximum_depth, - minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves, - ) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. 
- DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. - """ - from sklearn.tree import DecisionTreeRegressor as sk_DecisionTreeRegressor - - return sk_DecisionTreeRegressor( - max_depth=self._maximum_depth, - min_samples_leaf=self._minimum_number_of_samples_in_leaves, - ) diff --git a/src/safeds/ml/classical/regression/_decision_tree_regressor.py b/src/safeds/ml/classical/regression/_decision_tree_regressor.py new file mode 100644 index 000000000..fb1ea5664 --- /dev/null +++ b/src/safeds/ml/classical/regression/_decision_tree_regressor.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _DecisionTreeBase + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class DecisionTreeRegressor(Regressor, _DecisionTreeBase): + """ + Decision tree regression. + + Parameters + ---------- + maximum_depth: + The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0. + minimum_number_of_samples_in_leaves: + The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0. + + Raises + ------ + OutOfBoundsError + If `maximum_depth` is less than 1. 
+ OutOfBoundsError + If `minimum_number_of_samples_in_leaves` is less than 1. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + maximum_depth: int | None = None, + minimum_number_of_samples_in_leaves: int = 5, + ) -> None: + # Initialize superclasses + Regressor.__init__(self) + _DecisionTreeBase.__init__( + self, + maximum_depth=maximum_depth, + minimum_number_of_samples_in_leaves=minimum_number_of_samples_in_leaves, + ) + + def __hash__(self) -> int: + return _structural_hash( + Regressor.__hash__(self), + _DecisionTreeBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> DecisionTreeRegressor: + return DecisionTreeRegressor( + maximum_depth=self._maximum_depth, + minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves, + ) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.tree import DecisionTreeRegressor as SklearnDecisionTreeRegressor + + return SklearnDecisionTreeRegressor( + max_depth=self._maximum_depth, + min_samples_leaf=self._minimum_number_of_samples_in_leaves, + ) diff --git a/src/safeds/ml/classical/regression/_elastic_net_regression.py b/src/safeds/ml/classical/regression/_elastic_net_regression.py deleted file mode 100644 index 45b3069f4..000000000 --- a/src/safeds/ml/classical/regression/_elastic_net_regression.py +++ /dev/null @@ -1,203 +0,0 @@ -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING -from warnings import warn - -from safeds._utils import _structural_hash -from 
safeds.exceptions import ClosedBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.linear_model import ElasticNet as sk_ElasticNet - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class ElasticNetRegressor(Regressor): - """Elastic net regression. - - Parameters - ---------- - alpha: - Controls the regularization of the model. The higher the value, the more regularized it becomes. - lasso_ratio: - Number between 0 and 1 that controls the ratio between Lasso and Ridge regularization. If 0, only Ridge - regularization is used. If 1, only Lasso regularization is used. - - Raises - ------ - OutOfBoundsError - If `alpha` is negative or `lasso_ratio` is not between 0 and 1. - """ - - def __hash__(self) -> int: - return _structural_hash( - Regressor.__hash__(self), - self._target_name, - self._feature_names, - self._alpha, - self._lasso_ratio, - ) - - def __init__(self, *, alpha: float = 1.0, lasso_ratio: float = 0.5) -> None: - # Validation - if alpha < 0: - raise OutOfBoundsError(alpha, name="alpha", lower_bound=ClosedBound(0)) - if alpha == 0: - warn( - ( - "Setting alpha to zero makes this model equivalent to LinearRegression. You should use " - "LinearRegression instead for better numerical stability." - ), - UserWarning, - stacklevel=2, - ) - if lasso_ratio < 0 or lasso_ratio > 1: - raise OutOfBoundsError( - lasso_ratio, - name="lasso_ratio", - lower_bound=ClosedBound(0), - upper_bound=ClosedBound(1), - ) - elif lasso_ratio == 0: - warnings.warn( - ( - "ElasticNetRegression with lasso_ratio = 0 is essentially RidgeRegression." - " Use RidgeRegression instead for better numerical stability." 
- ), - stacklevel=2, - ) - elif lasso_ratio == 1: - warnings.warn( - ( - "ElasticNetRegression with lasso_ratio = 0 is essentially LassoRegression." - " Use LassoRegression instead for better numerical stability." - ), - stacklevel=2, - ) - - # Hyperparameters - self._alpha = alpha - self._lasso_ratio = lasso_ratio - - # Internal state - self._wrapped_regressor: sk_ElasticNet | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def alpha(self) -> float: - """ - Get the regularization of the model. - - Returns - ------- - result: - The regularization of the model. - """ - return self._alpha - - @property - def lasso_ratio(self) -> float: - """ - Get the ratio between Lasso and Ridge regularization. - - Returns - ------- - result: - The ratio between Lasso and Ridge regularization. - """ - return self._lasso_ratio - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> ElasticNetRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. 
- """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = ElasticNetRegressor(alpha=self._alpha, lasso_ratio=self._lasso_ratio) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. 
- """ - from sklearn.linear_model import ElasticNet as sk_ElasticNet - - return sk_ElasticNet(alpha=self._alpha, l1_ratio=self._lasso_ratio) diff --git a/src/safeds/ml/classical/regression/_elastic_net_regressor.py b/src/safeds/ml/classical/regression/_elastic_net_regressor.py new file mode 100644 index 000000000..34ce211f4 --- /dev/null +++ b/src/safeds/ml/classical/regression/_elastic_net_regressor.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING +from warnings import warn + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OutOfBoundsError + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class ElasticNetRegressor(Regressor): + """Elastic net regression. + + Parameters + ---------- + alpha: + Controls the regularization of the model. The higher the value, the more regularized it becomes. + lasso_ratio: + Number between 0 and 1 that controls the ratio between Lasso and Ridge regularization. If 0, only Ridge + regularization is used. If 1, only Lasso regularization is used. + + Raises + ------ + OutOfBoundsError + If `alpha` is negative or `lasso_ratio` is not between 0 and 1. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, *, alpha: float = 1.0, lasso_ratio: float = 0.5) -> None: + super().__init__() + + # Validation + if alpha < 0: + raise OutOfBoundsError(alpha, name="alpha", lower_bound=ClosedBound(0)) + if alpha == 0: + warn( + ( + "Setting alpha to zero makes this model equivalent to LinearRegression. You should use " + "LinearRegression instead for better numerical stability." 
+ ), + UserWarning, + stacklevel=2, + ) + if lasso_ratio < 0 or lasso_ratio > 1: + raise OutOfBoundsError( + lasso_ratio, + name="lasso_ratio", + lower_bound=ClosedBound(0), + upper_bound=ClosedBound(1), + ) + elif lasso_ratio == 0: + warnings.warn( + ( + "ElasticNetRegression with lasso_ratio = 0 is essentially RidgeRegression." + " Use RidgeRegression instead for better numerical stability." + ), + stacklevel=2, + ) + elif lasso_ratio == 1: + warnings.warn( + ( + "ElasticNetRegression with lasso_ratio = 1 is essentially LassoRegression." + " Use LassoRegression instead for better numerical stability." + ), + stacklevel=2, + ) + + # Hyperparameters + self._alpha = alpha + self._lasso_ratio = lasso_ratio + + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + self._alpha, + self._lasso_ratio, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def alpha(self) -> float: + """The regularization of the model.""" + return self._alpha + + @property + def lasso_ratio(self) -> float: + """The ratio between Lasso and Ridge regularization.""" + return self._lasso_ratio + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> ElasticNetRegressor: + return ElasticNetRegressor( + alpha=self._alpha, + lasso_ratio=self._lasso_ratio, + ) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.linear_model import ElasticNet as SklearnElasticNet + + return SklearnElasticNet( + alpha=self._alpha, + l1_ratio=self._lasso_ratio, + ) diff --git a/src/safeds/ml/classical/regression/_gradient_boosting.py 
b/src/safeds/ml/classical/regression/_gradient_boosting.py deleted file mode 100644 index 4cf46bc97..000000000 --- a/src/safeds/ml/classical/regression/_gradient_boosting.py +++ /dev/null @@ -1,173 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.ensemble import GradientBoostingRegressor as sk_GradientBoostingRegressor - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class GradientBoostingRegressor(Regressor): - """ - Gradient boosting regression. - - Parameters - ---------- - number_of_trees: - The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large - number usually results in better performance. - learning_rate: - The larger the value, the more the model is influenced by each additional tree. If the learning rate is too - low, the model might underfit. If the learning rate is too high, the model might overfit. - - Raises - ------ - OutOfBoundsError - If `number_of_trees` or `learning_rate` are less than or equal to 0. 
- """ - - def __hash__(self) -> int: - return _structural_hash( - Regressor.__hash__(self), - self._target_name, - self._feature_names, - self._learning_rate, - self._number_of_trees, - ) - - def __init__(self, *, number_of_trees: int = 100, learning_rate: float = 0.1) -> None: - # Validation - if number_of_trees < 1: - raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1)) - if learning_rate <= 0: - raise OutOfBoundsError(learning_rate, name="learning_rate", lower_bound=OpenBound(0)) - - # Hyperparameters - self._number_of_trees = number_of_trees - self._learning_rate = learning_rate - - # Internal state - self._wrapped_regressor: sk_GradientBoostingRegressor | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def number_of_trees(self) -> int: - """ - Get the number of trees (estimators) in the ensemble. - - Returns - ------- - result: - The number of trees. - """ - return self._number_of_trees - - @property - def learning_rate(self) -> float: - """ - Get the learning rate. - - Returns - ------- - result: - The learning rate. - """ - return self._learning_rate - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> GradientBoostingRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. 
- """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = GradientBoostingRegressor(number_of_trees=self._number_of_trees, learning_rate=self._learning_rate) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. 
- """ - from sklearn.ensemble import GradientBoostingRegressor as sk_GradientBoostingRegressor - - return sk_GradientBoostingRegressor(n_estimators=self._number_of_trees, learning_rate=self._learning_rate) diff --git a/src/safeds/ml/classical/regression/_gradient_boosting_regressor.py b/src/safeds/ml/classical/regression/_gradient_boosting_regressor.py new file mode 100644 index 000000000..a88a8da1a --- /dev/null +++ b/src/safeds/ml/classical/regression/_gradient_boosting_regressor.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _GradientBoostingBase + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class GradientBoostingRegressor(Regressor, _GradientBoostingBase): + """ + Gradient boosting regression. + + Parameters + ---------- + number_of_trees: + The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large + number usually results in better performance. + learning_rate: + The larger the value, the more the model is influenced by each additional tree. If the learning rate is too + low, the model might underfit. If the learning rate is too high, the model might overfit. + + Raises + ------ + OutOfBoundsError + If `number_of_trees` or `learning_rate` are less than or equal to 0. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + number_of_trees: int = 100, + learning_rate: float = 0.1, + ) -> None: + # Initialize superclasses + Regressor.__init__(self) + _GradientBoostingBase.__init__( + self, + number_of_trees=number_of_trees, + learning_rate=learning_rate, + ) + + def __hash__(self) -> int: + return _structural_hash( + Regressor.__hash__(self), + _GradientBoostingBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> GradientBoostingRegressor: + return GradientBoostingRegressor( + number_of_trees=self._number_of_trees, + learning_rate=self._learning_rate, + ) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.ensemble import GradientBoostingRegressor as SklearnGradientBoostingRegressor + + return SklearnGradientBoostingRegressor( + n_estimators=self._number_of_trees, + learning_rate=self._learning_rate, + ) diff --git a/src/safeds/ml/classical/regression/_k_nearest_neighbors.py b/src/safeds/ml/classical/regression/_k_nearest_neighbors.py deleted file mode 100644 index aa6198de3..000000000 --- a/src/safeds/ml/classical/regression/_k_nearest_neighbors.py +++ /dev/null @@ -1,168 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset -from safeds.data.tabular.containers import ExperimentalTable, Table -from safeds.exceptions import ClosedBound, DatasetMissesDataError, 
OutOfBoundsError, PlainTableError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.neighbors import KNeighborsRegressor as sk_KNeighborsRegressor - - -class KNearestNeighborsRegressor(Regressor): - """ - K-nearest-neighbors regression. - - Parameters - ---------- - number_of_neighbors: - The number of neighbors to use for interpolation. Has to be greater than 0 (validated in the constructor) and - less than or equal to the sample size (validated when calling `fit`). - - Raises - ------ - OutOfBoundsError - If `number_of_neighbors` is less than 1. - """ - - def __hash__(self) -> int: - return _structural_hash( - Regressor.__hash__(self), - self._target_name, - self._feature_names, - self._number_of_neighbors, - ) - - def __init__(self, number_of_neighbors: int) -> None: - # Validation - if number_of_neighbors < 1: - raise OutOfBoundsError(number_of_neighbors, name="number_of_neighbors", lower_bound=ClosedBound(1)) - - # Hyperparameters - self._number_of_neighbors = number_of_neighbors - - # Internal state - self._wrapped_regressor: sk_KNeighborsRegressor | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def number_of_neighbors(self) -> int: - """ - Get the number of neighbors used for interpolation. - - Returns - ------- - result: - The number of neighbors. - """ - return self._number_of_neighbors - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> KNearestNeighborsRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. 
- - Raises - ------ - ValueError - If `number_of_neighbors` is greater than the sample size. - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - if not isinstance(training_set, TabularDataset) and isinstance(training_set, Table): - raise PlainTableError - - if training_set._table.number_of_rows == 0: - raise DatasetMissesDataError - if self._number_of_neighbors > training_set._table.number_of_rows: - raise ValueError( - ( - f"The parameter 'number_of_neighbors' ({self._number_of_neighbors}) has to be less than or equal to" - f" the sample size ({training_set._table.number_of_rows})." - ), - ) - - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = KNearestNeighborsRegressor(self._number_of_neighbors) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. 
- MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. - """ - from sklearn.neighbors import KNeighborsRegressor as sk_KNeighborsRegressor - - return sk_KNeighborsRegressor(self._number_of_neighbors, n_jobs=-1) diff --git a/src/safeds/ml/classical/regression/_k_nearest_neighbors_regressor.py b/src/safeds/ml/classical/regression/_k_nearest_neighbors_regressor.py new file mode 100644 index 000000000..082d5954f --- /dev/null +++ b/src/safeds/ml/classical/regression/_k_nearest_neighbors_regressor.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _KNearestNeighborsBase + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + from safeds.data.labeled.containers import TabularDataset + + +class KNearestNeighborsRegressor(Regressor, _KNearestNeighborsBase): + """ + K-nearest-neighbors regression. + + Parameters + ---------- + number_of_neighbors: + The number of neighbors to use for interpolation. Has to be greater than 0 (validated in the constructor) and + less than or equal to the sample size (validated when calling `fit`). + + Raises + ------ + OutOfBoundsError + If `number_of_neighbors` is less than 1. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, number_of_neighbors: int) -> None: + # Initialize superclasses + Regressor.__init__(self) + _KNearestNeighborsBase.__init__( + self, + number_of_neighbors=number_of_neighbors, + ) + + def __hash__(self) -> int: + return _structural_hash( + Regressor.__hash__(self), + _KNearestNeighborsBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _check_additional_fit_preconditions(self, training_set: TabularDataset): + if self._number_of_neighbors > training_set._table.number_of_rows: + raise ValueError( + ( + f"The parameter 'number_of_neighbors' ({self._number_of_neighbors}) has to be less than or equal to" + f" the sample size ({training_set._table.number_of_rows})." 
+ ), + ) + + def _clone(self) -> KNearestNeighborsRegressor: + return KNearestNeighborsRegressor( + number_of_neighbors=self._number_of_neighbors, + ) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.neighbors import KNeighborsRegressor as SklearnKNeighborsRegressor + + return SklearnKNeighborsRegressor( + n_neighbors=self._number_of_neighbors, + n_jobs=-1, + ) diff --git a/src/safeds/ml/classical/regression/_lasso_regression.py b/src/safeds/ml/classical/regression/_lasso_regression.py deleted file mode 100644 index 2a74cc244..000000000 --- a/src/safeds/ml/classical/regression/_lasso_regression.py +++ /dev/null @@ -1,157 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING -from warnings import warn - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.linear_model import Lasso as sk_Lasso - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class LassoRegressor(Regressor): - """Lasso regression. - - Parameters - ---------- - alpha: - Controls the regularization of the model. The higher the value, the more regularized it becomes. - - Raises - ------ - OutOfBoundsError - If `alpha` is negative. - """ - - def __hash__(self) -> int: - return _structural_hash(Regressor.__hash__(self), self._target_name, self._feature_names, self._alpha) - - def __init__(self, *, alpha: float = 1.0) -> None: - # Validation - if alpha < 0: - raise OutOfBoundsError(alpha, name="alpha", lower_bound=ClosedBound(0)) - if alpha == 0: - warn( - ( - "Setting alpha to zero makes this model equivalent to LinearRegression. You should use " - "LinearRegression instead for better numerical stability." 
- ), - UserWarning, - stacklevel=2, - ) - - # Hyperparameters - self._alpha = alpha - - # Internal state - self._wrapped_regressor: sk_Lasso | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def alpha(self) -> float: - """ - Get the regularization of the model. - - Returns - ------- - result: - The regularization of the model. - """ - return self._alpha - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LassoRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = LassoRegressor(alpha=self._alpha) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. 
- - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. - """ - from sklearn.linear_model import Lasso as sk_Lasso - - return sk_Lasso(alpha=self._alpha) diff --git a/src/safeds/ml/classical/regression/_lasso_regressor.py b/src/safeds/ml/classical/regression/_lasso_regressor.py new file mode 100644 index 000000000..dc2318a75 --- /dev/null +++ b/src/safeds/ml/classical/regression/_lasso_regressor.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from warnings import warn + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OutOfBoundsError + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class LassoRegressor(Regressor): + """Lasso regression. + + Parameters + ---------- + alpha: + Controls the regularization of the model. The higher the value, the more regularized it becomes. + + Raises + ------ + OutOfBoundsError + If `alpha` is negative. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, *, alpha: float = 1.0) -> None: + super().__init__() + + # Validation + if alpha < 0: + raise OutOfBoundsError(alpha, name="alpha", lower_bound=ClosedBound(0)) + if alpha == 0: + warn( + ( + "Setting alpha to zero makes this model equivalent to LinearRegression. You should use " + "LinearRegression instead for better numerical stability." + ), + UserWarning, + stacklevel=2, + ) + + # Hyperparameters + self._alpha = alpha + + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + self._alpha, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def alpha(self) -> float: + """ + Get the regularization of the model. + + Returns + ------- + result: + The regularization of the model. 
+ """ + return self._alpha + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> LassoRegressor: + return LassoRegressor( + alpha=self._alpha, + ) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.linear_model import Lasso as SklearnLasso + + return SklearnLasso(alpha=self._alpha) diff --git a/src/safeds/ml/classical/regression/_linear_regression.py b/src/safeds/ml/classical/regression/_linear_regression.py deleted file mode 100644 index 8c9d5db4d..000000000 --- a/src/safeds/ml/classical/regression/_linear_regression.py +++ /dev/null @@ -1,117 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.ml.classical._util_sklearn import fit, predict - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.linear_model import LinearRegression as sk_LinearRegression - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -# TODO: rename to linear regressor -class LinearRegressionRegressor(Regressor): - """Linear regression.""" - - def __hash__(self) -> int: - return _structural_hash(Regressor.__hash__(self), self._target_name, self._feature_names) - - def __init__(self) -> None: - # Internal state - self._wrapped_regressor: sk_LinearRegression | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LinearRegressionRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. 
- - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = LinearRegressionRegressor() - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. 
- """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. - """ - from sklearn.linear_model import LinearRegression as sk_LinearRegression - - return sk_LinearRegression(n_jobs=-1) diff --git a/src/safeds/ml/classical/regression/_linear_regressor.py b/src/safeds/ml/classical/regression/_linear_regressor.py new file mode 100644 index 000000000..8a61d13fd --- /dev/null +++ b/src/safeds/ml/classical/regression/_linear_regressor.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class LinearRegressor(Regressor): + """Linear regression.""" + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self) -> None: + super().__init__() + + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> LinearRegressor: + return LinearRegressor() + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.linear_model import LinearRegression as sk_LinearRegression + + return sk_LinearRegression(n_jobs=-1) diff --git 
a/src/safeds/ml/classical/regression/_random_forest.py b/src/safeds/ml/classical/regression/_random_forest.py deleted file mode 100644 index 2d4a8ad98..000000000 --- a/src/safeds/ml/classical/regression/_random_forest.py +++ /dev/null @@ -1,191 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.ensemble import RandomForestRegressor as sk_RandomForestRegressor - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class RandomForestRegressor(Regressor): - """ - Random forest regression. - - Parameters - ---------- - number_of_trees: - The number of trees to be used in the random forest. Has to be greater than 0. - maximum_depth: - The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0. - minimum_number_of_samples_in_leaves: - The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0. - - Raises - ------ - OutOfBoundsError - If `number_of_trees` is less than 1. - OutOfBoundsError - If `maximum_depth` is less than 1. - OutOfBoundsError - If `minimum_number_of_samples_in_leaves` is less than 1. 
- """ - - def __init__( - self, - *, - number_of_trees: int = 100, - maximum_depth: int | None = None, - minimum_number_of_samples_in_leaves: int = 5, - ) -> None: - # Validation - if number_of_trees < 1: - raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1)) - if maximum_depth is not None and maximum_depth < 1: - raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1)) - if minimum_number_of_samples_in_leaves < 1: - raise OutOfBoundsError( - minimum_number_of_samples_in_leaves, - name="minimum_number_of_samples_in_leaves", - lower_bound=ClosedBound(1), - ) - - # Hyperparameters - self._number_of_trees: int = number_of_trees - self._maximum_depth: int | None = maximum_depth - self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves - - # Internal state - self._wrapped_regressor: sk_RandomForestRegressor | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - def __hash__(self) -> int: - return _structural_hash( - Regressor.__hash__(self), - self._feature_names, - self._target_name, - self._number_of_trees, - self._maximum_depth, - self._minimum_number_of_samples_in_leaves, - ) - - @property - def number_of_trees(self) -> int: - """The number of trees used in the random forest.""" - return self._number_of_trees - - @property - def maximum_depth(self) -> int | None: - """The maximum depth of each tree.""" - return self._maximum_depth - - @property - def minimum_number_of_samples_in_leaves(self) -> int: - """The minimum number of samples that must remain in the leaves of each tree.""" - return self._minimum_number_of_samples_in_leaves - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RandomForestRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. 
- - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = RandomForestRegressor( - number_of_trees=self._number_of_trees, - maximum_depth=self._maximum_depth, - minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves, - ) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. 
- """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. - """ - from sklearn.ensemble import RandomForestRegressor as sk_RandomForestRegressor - - return sk_RandomForestRegressor( - n_estimators=self._number_of_trees, - max_depth=self._maximum_depth, - min_samples_leaf=self._minimum_number_of_samples_in_leaves, - n_jobs=-1, - ) diff --git a/src/safeds/ml/classical/regression/_random_forest_regressor.py b/src/safeds/ml/classical/regression/_random_forest_regressor.py new file mode 100644 index 000000000..98fe35fbb --- /dev/null +++ b/src/safeds/ml/classical/regression/_random_forest_regressor.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _get_random_seed, _structural_hash +from safeds.ml.classical._bases import _RandomForestBase + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class RandomForestRegressor(Regressor, _RandomForestBase): + """ + Random forest regression. + + Parameters + ---------- + number_of_trees: + The number of trees to be used in the random forest. Has to be greater than 0. + maximum_depth: + The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0. + minimum_number_of_samples_in_leaves: + The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0. + + Raises + ------ + OutOfBoundsError + If `number_of_trees` is less than 1. + OutOfBoundsError + If `maximum_depth` is less than 1. + OutOfBoundsError + If `minimum_number_of_samples_in_leaves` is less than 1. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + number_of_trees: int = 100, + maximum_depth: int | None = None, + minimum_number_of_samples_in_leaves: int = 5, + ) -> None: + # Initialize superclasses + Regressor.__init__(self) + _RandomForestBase.__init__( + self, + number_of_trees=number_of_trees, + maximum_depth=maximum_depth, + minimum_number_of_samples_in_leaves=minimum_number_of_samples_in_leaves, + ) + + def __hash__(self) -> int: + return _structural_hash( + Regressor.__hash__(self), + _RandomForestBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> RandomForestRegressor: + return RandomForestRegressor( + number_of_trees=self._number_of_trees, + maximum_depth=self._maximum_depth, + minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves, + ) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.ensemble import RandomForestRegressor as SklearnRandomForestRegressor + + return SklearnRandomForestRegressor( + n_estimators=self._number_of_trees, + max_depth=self._maximum_depth, + min_samples_leaf=self._minimum_number_of_samples_in_leaves, + random_state=_get_random_seed(), + n_jobs=-1, + ) diff --git a/src/safeds/ml/classical/regression/_regressor.py b/src/safeds/ml/classical/regression/_regressor.py index d1ac75c2c..2c477a0ad 100644 --- a/src/safeds/ml/classical/regression/_regressor.py +++ b/src/safeds/ml/classical/regression/_regressor.py @@ -1,102 +1,104 @@ from __future__ import annotations -from abc import ABC, 
abstractmethod +from abc import ABC from typing import TYPE_CHECKING -from safeds._utils import _structural_hash -from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset -from safeds.data.tabular.containers import Column, ExperimentalColumn, ExperimentalTable, Table -from safeds.exceptions import ColumnLengthMismatchError, PlainTableError +from safeds.data.labeled.containers import TabularDataset +from safeds.exceptions import ColumnLengthMismatchError, ModelNotFittedError +from safeds.ml.classical import SupervisedModel +from safeds.ml.metrics import RegressionMetrics if TYPE_CHECKING: - from sklearn.base import RegressorMixin + from safeds.data.tabular.containers import Column, Table -class Regressor(ABC): - """Abstract base class for all regressors.""" +class Regressor(SupervisedModel, ABC): + """A model for regression tasks.""" - def __hash__(self) -> int: - """ - Return a deterministic hash value for a regressor. - - Returns - ------- - hash: - The hash value. - """ - return _structural_hash(self.__class__.__qualname__, self.is_fitted) + # ------------------------------------------------------------------------------------------------------------------ + # Metrics + # ------------------------------------------------------------------------------------------------------------------ - @abstractmethod - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> Regressor: + def summarize_metrics(self, validation_or_test_set: Table | TabularDataset) -> Table: """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. + Summarize the regressor's metrics on the given data. Parameters ---------- - training_set: - The training data containing the feature and target vectors. + validation_or_test_set: + The validation or test set. Returns ------- - fitted_regressor: - The fitted regressor. + metrics: + A table containing the regressor's metrics. 
Raises ------ - LearningError - If the training data contains invalid values or if the training failed. + ModelNotFittedError + If the classifier has not been fitted yet. """ + if not self.is_fitted: + raise ModelNotFittedError + + validation_or_test_set = _extract_table(validation_or_test_set) + + return RegressionMetrics.summarize( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + ) - @abstractmethod - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: + def coefficient_of_determination(self, validation_or_test_set: Table | TabularDataset) -> float: """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. + Compute the coefficient of determination (R²) of the regressor on the given data. + + The coefficient of determination compares the regressor's predictions to another model that always predicts the + mean of the target values. It is a measure of how well the regressor explains the variance in the target values. + + The **higher** the coefficient of determination, the better the regressor. Results range from negative infinity + to 1.0. You can interpret the coefficient of determination as follows: + + | R² | Interpretation | + | ---------- | ------------------------------------------------------------------------------------------ | + | 1.0 | The model perfectly predicts the target values. Did you overfit? | + | (0.0, 1.0) | The model is better than predicting the mean of the target values. You should be here. | + | 0.0 | The model is as good as predicting the mean of the target values. Try something else. | + | (-∞, 0.0) | The model is worse than predicting the mean of the target values. Something is very wrong. | + + **Note:** Some other libraries call this metric `r2_score`. Parameters ---------- - dataset: - The dataset containing the feature vectors. 
+ validation_or_test_set: + The validation or test set. Returns ------- - table: - A dataset containing the given feature vectors and the predicted target vector. + coefficient_of_determination: + The coefficient of determination of the regressor. Raises ------ ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. + If the classifier has not been fitted yet. """ + if not self.is_fitted: + raise ModelNotFittedError - @property - @abstractmethod - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" + validation_or_test_set = _extract_table(validation_or_test_set) - @abstractmethod - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. + return RegressionMetrics.coefficient_of_determination( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + ) - Returns - ------- - wrapped_regressor: - The sklearn Regressor. + def mean_absolute_error(self, validation_or_test_set: Table | TabularDataset) -> float: """ + Compute the mean absolute error (MAE) of the regressor on the given data. - # ------------------------------------------------------------------------------------------------------------------ - # Metrics - # ------------------------------------------------------------------------------------------------------------------ - - def summarize_metrics(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> Table: - """ - Summarize the regressor's metrics on the given data. + The mean absolute error is the average of the absolute differences between the predicted and expected target + values. The **lower** the mean absolute error, the better the regressor. Results range from 0.0 to positive + infinity. 
Parameters ---------- @@ -105,27 +107,35 @@ def summarize_metrics(self, validation_or_test_set: TabularDataset | Experimenta Returns ------- - metrics: - A table containing the regressor's metrics. + mean_absolute_error: + The mean absolute error of the regressor. Raises ------ - TypeError - If a table is passed instead of a tabular dataset. + ModelNotFittedError + If the classifier has not been fitted yet. """ - mean_absolute_error = self.mean_absolute_error(validation_or_test_set) - mean_squared_error = self.mean_squared_error(validation_or_test_set) - - return Table( - { - "metric": ["mean_absolute_error", "mean_squared_error"], - "value": [mean_absolute_error, mean_squared_error], - }, + if not self.is_fitted: + raise ModelNotFittedError + + validation_or_test_set = _extract_table(validation_or_test_set) + + return RegressionMetrics.mean_absolute_error( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), ) - def mean_absolute_error(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: + def mean_directional_accuracy(self, validation_or_test_set: Table | TabularDataset) -> float: """ - Compute the mean absolute error (MAE) of the regressor on the given data. + Compute the mean directional accuracy (MDA) of the regressor on the given data. + + This metric compares two consecutive target values and checks if the predicted direction (down/unchanged/up) + matches the expected direction. The mean directional accuracy is the proportion of correctly predicted + directions. The **higher** the mean directional accuracy, the better the regressor. Results range from 0.0 to + 1.0. + + This metric is useful for time series data, where the order of the target values has a meaning. It is not useful + for other types of data. Because of this, it is not included in the `summarize_metrics` method. 
Parameters ---------- @@ -134,38 +144,33 @@ def mean_absolute_error(self, validation_or_test_set: TabularDataset | Experimen Returns ------- - mean_absolute_error: - The calculated mean absolute error (the average of the distance of each individual row). + mean_directional_accuracy: + The mean directional accuracy of the regressor. Raises ------ - TypeError - If a table is passed instead of a tabular dataset. + ModelNotFittedError + If the classifier has not been fitted yet. """ - from sklearn.metrics import mean_absolute_error as sk_mean_absolute_error + if not self.is_fitted: + raise ModelNotFittedError - if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): - raise PlainTableError + validation_or_test_set = _extract_table(validation_or_test_set) - if isinstance(validation_or_test_set, TabularDataset): - expected = validation_or_test_set.target - predicted = self.predict(validation_or_test_set.features).target + return RegressionMetrics.mean_directional_accuracy( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + ) - # TODO: more efficient implementation using polars - _check_metrics_preconditions(predicted, expected) - return sk_mean_absolute_error(expected._data, predicted._data) - elif isinstance(validation_or_test_set, ExperimentalTabularDataset): # pragma: no cover - expected_2 = validation_or_test_set.target - predicted_2 = self.predict(validation_or_test_set.features).target + def mean_squared_error(self, validation_or_test_set: Table | TabularDataset) -> float: + """ + Compute the mean squared error (MSE) of the regressor on the given data. - # TODO: more efficient implementation using polars - _check_metrics_preconditions_experimental(predicted_2, expected_2) - return sk_mean_absolute_error(expected_2._series, predicted_2._data) + The mean squared error is the average of the squared differences between the predicted and expected target + values. 
The **lower** the mean squared error, the better the regressor. Results range from 0.0 to positive + infinity. - # noinspection PyProtectedMember - def mean_squared_error(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: - """ - Compute the mean squared error (MSE) on the given data. + **Note:** To get the root mean squared error (RMSE), take the square root of the result. Parameters ---------- @@ -175,48 +180,59 @@ def mean_squared_error(self, validation_or_test_set: TabularDataset | Experiment Returns ------- mean_squared_error: - The calculated mean squared error (the average of the distance of each individual row squared). + The mean squared error of the regressor. Raises ------ - TypeError - If a table is passed instead of a tabular dataset. + ModelNotFittedError + If the classifier has not been fitted yet. """ - from sklearn.metrics import mean_squared_error as sk_mean_squared_error + if not self.is_fitted: + raise ModelNotFittedError + + validation_or_test_set = _extract_table(validation_or_test_set) + + return RegressionMetrics.mean_squared_error( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), + ) - if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): - raise PlainTableError + def median_absolute_deviation(self, validation_or_test_set: Table | TabularDataset) -> float: + """ + Compute the median absolute deviation (MAD) of the regressor on the given data. - if isinstance(validation_or_test_set, TabularDataset): - expected = validation_or_test_set.target - predicted = self.predict(validation_or_test_set.features).target + The median absolute deviation is the median of the absolute differences between the predicted and expected + target values. The **lower** the median absolute deviation, the better the regressor. Results range from 0.0 to + positive infinity. 
- # TODO: more efficient implementation using polars - _check_metrics_preconditions(predicted, expected) - return sk_mean_squared_error(expected._data, predicted._data) - elif isinstance(validation_or_test_set, ExperimentalTabularDataset): # pragma: no cover - expected_2 = validation_or_test_set.target - predicted_2 = self.predict(validation_or_test_set.features).target + Parameters + ---------- + validation_or_test_set: + The validation or test set. - # TODO: more efficient implementation using polars - _check_metrics_preconditions_experimental(predicted_2, expected_2) - return sk_mean_squared_error(expected_2._series, predicted_2._data) + Returns + ------- + median_absolute_deviation: + The median absolute deviation of the regressor. + Raises + ------ + ModelNotFittedError + If the classifier has not been fitted yet. + """ + if not self.is_fitted: + raise ModelNotFittedError -def _check_metrics_preconditions(actual: Column, expected: Column) -> None: - if not actual.type.is_numeric(): - raise TypeError(f"Column 'actual' is not numerical but {actual.type}.") - if not expected.type.is_numeric(): - raise TypeError(f"Column 'expected' is not numerical but {expected.type}.") + validation_or_test_set = _extract_table(validation_or_test_set) - if actual._data.size != expected._data.size: - raise ColumnLengthMismatchError( - "\n".join([f"{column.name}: {column._data.size}" for column in [actual, expected]]), + return RegressionMetrics.median_absolute_deviation( + self.predict(validation_or_test_set), + validation_or_test_set.get_column(self.get_target_name()), ) -def _check_metrics_preconditions_experimental(actual: Column, expected: ExperimentalColumn) -> None: # pragma: no cover - if not actual.type.is_numeric(): +def _check_metrics_preconditions(actual: Column, expected: Column) -> None: # pragma: no cover + if not actual.type.is_numeric: raise TypeError(f"Column 'actual' is not numerical but {actual.type}.") if not expected.type.is_numeric: raise TypeError(f"Column 
'expected' is not numerical but {expected.type}.") @@ -230,3 +246,11 @@ def _check_metrics_preconditions_experimental(actual: Column, expected: Experime ], ), ) + + +def _extract_table(table_or_dataset: Table | TabularDataset) -> Table: + """Extract the table from the given table or dataset.""" + if isinstance(table_or_dataset, TabularDataset): + return table_or_dataset.to_table() + else: + return table_or_dataset diff --git a/src/safeds/ml/classical/regression/_ridge_regression.py b/src/safeds/ml/classical/regression/_ridge_regression.py deleted file mode 100644 index 9a9b8f706..000000000 --- a/src/safeds/ml/classical/regression/_ridge_regression.py +++ /dev/null @@ -1,158 +0,0 @@ -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.linear_model import Ridge as sk_Ridge - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class RidgeRegressor(Regressor): - """ - Ridge regression. - - Parameters - ---------- - alpha: - Controls the regularization of the model. The higher the value, the more regularized it becomes. - - Raises - ------ - OutOfBoundsError - If `alpha` is negative. - """ - - def __hash__(self) -> int: - return _structural_hash(Regressor.__hash__(self), self._target_name, self._feature_names, self._alpha) - - def __init__(self, *, alpha: float = 1.0) -> None: - # Validation - if alpha < 0: - raise OutOfBoundsError(alpha, name="alpha", lower_bound=ClosedBound(0)) - if alpha == 0.0: - warnings.warn( - ( - "Setting alpha to zero makes this model equivalent to LinearRegression. 
You should use " - "LinearRegression instead for better numerical stability." - ), - UserWarning, - stacklevel=2, - ) - - # Hyperparameters - self._alpha = alpha - - # Internal state - self._wrapped_regressor: sk_Ridge | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - @property - def alpha(self) -> float: - """ - Get the regularization of the model. - - Returns - ------- - result: - The regularization of the model. - """ - return self._alpha - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RidgeRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. - """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = RidgeRegressor(alpha=self._alpha) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. 
- - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. - """ - from sklearn.linear_model import Ridge as sk_Ridge - - return sk_Ridge(alpha=self._alpha) diff --git a/src/safeds/ml/classical/regression/_ridge_regressor.py b/src/safeds/ml/classical/regression/_ridge_regressor.py new file mode 100644 index 000000000..ffb975008 --- /dev/null +++ b/src/safeds/ml/classical/regression/_ridge_regressor.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.exceptions import ClosedBound, OutOfBoundsError + +from ._regressor import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class RidgeRegressor(Regressor): + """ + Ridge regression. + + Parameters + ---------- + alpha: + Controls the regularization of the model. The higher the value, the more regularized it becomes. + + Raises + ------ + OutOfBoundsError + If `alpha` is negative. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, *, alpha: float = 1.0) -> None: + super().__init__() + + # Validation + if alpha < 0: + raise OutOfBoundsError(alpha, name="alpha", lower_bound=ClosedBound(0)) + if alpha == 0.0: + warnings.warn( + ( + "Setting alpha to zero makes this model equivalent to LinearRegression. You should use " + "LinearRegression instead for better numerical stability." + ), + UserWarning, + stacklevel=2, + ) + + # Hyperparameters + self._alpha = alpha + + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + self._alpha, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def alpha(self) -> float: + """ + Get the regularization of the model. + + Returns + ------- + result: + The regularization of the model. 
+ """ + return self._alpha + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> RidgeRegressor: + return RidgeRegressor(alpha=self._alpha) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.linear_model import Ridge as SklearnRidge + + return SklearnRidge(alpha=self._alpha) diff --git a/src/safeds/ml/classical/regression/_support_vector_machine.py b/src/safeds/ml/classical/regression/_support_vector_machine.py deleted file mode 100644 index 56627e978..000000000 --- a/src/safeds/ml/classical/regression/_support_vector_machine.py +++ /dev/null @@ -1,277 +0,0 @@ -from __future__ import annotations - -import sys -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any - -from safeds._utils import _structural_hash -from safeds.exceptions import ClosedBound, OpenBound, OutOfBoundsError -from safeds.ml.classical._util_sklearn import fit, predict -from safeds.ml.classical.regression import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - from sklearn.svm import SVC as sk_SVR # noqa: N811 - - from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset - from safeds.data.tabular.containers import ExperimentalTable, Table - - -class SupportVectorMachineKernel(ABC): - """The abstract base class of the different subclasses supported by the `Kernel`.""" - - @abstractmethod - def _get_sklearn_arguments(self) -> dict[str, Any]: # TODO: use apply pattern (imputer strategy) instead - """Return the arguments to pass to scikit-learn.""" - - @abstractmethod - def __eq__(self, other: object) -> bool: - """ - Compare two kernels. 
- - Parameters - ---------- - other: - other object to compare to - - Returns - ------- - equals: - Whether the two kernels are equal - """ - - def __hash__(self) -> int: - """ - Return a deterministic hash value for this kernel. - - Returns - ------- - hash: - The hash value. - """ - return _structural_hash(self.__class__.__qualname__) - - -class SupportVectorMachineRegressor(Regressor): - """ - Support vector machine. - - Parameters - ---------- - c: - The strength of regularization. Must be strictly positive. - kernel: - The type of kernel to be used. Defaults to None. - - Raises - ------ - OutOfBoundsError - If `c` is less than or equal to 0. - """ - - def __hash__(self) -> int: - return _structural_hash(Regressor.__hash__(self), self._target_name, self._feature_names, self._c, self.kernel) - - def __init__(self, *, c: float = 1.0, kernel: SupportVectorMachineKernel | None = None) -> None: - # Inputs - if c <= 0: - raise OutOfBoundsError(c, name="c", lower_bound=OpenBound(0)) - if kernel is None: - kernel = self.Kernel.RadialBasisFunction() - - # Internal state - self._wrapped_regressor: sk_SVR | None = None - self._feature_names: list[str] | None = None - self._target_name: str | None = None - - # Hyperparameters - self._c: float = c - self._kernel: SupportVectorMachineKernel = kernel - - @property - def c(self) -> float: - """ - Get the regularization strength. - - Returns - ------- - result: - The regularization strength. - """ - return self._c - - @property - def kernel(self) -> SupportVectorMachineKernel: - """ - Get the type of kernel used. - - Returns - ------- - result: - The type of kernel used. 
- """ - return self._kernel - - class Kernel: - class Linear(SupportVectorMachineKernel): - def _get_sklearn_arguments(self) -> dict[str, Any]: - return { - "kernel": "linear", - } - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SupportVectorMachineRegressor.Kernel.Linear): - return NotImplemented - return True - - __hash__ = SupportVectorMachineKernel.__hash__ - - class Polynomial(SupportVectorMachineKernel): - def __init__(self, degree: int): - if degree < 1: - raise OutOfBoundsError(degree, name="degree", lower_bound=ClosedBound(1)) - self._degree = degree - - @property - def degree(self) -> int: - """The degree of the polynomial kernel.""" - return self._degree - - def _get_sklearn_arguments(self) -> dict[str, Any]: - return { - "kernel": "poly", - "degree": self._degree, - } - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SupportVectorMachineRegressor.Kernel.Polynomial): - return NotImplemented - return self._degree == other._degree - - def __hash__(self) -> int: - return _structural_hash(SupportVectorMachineKernel.__hash__(self), self._degree) - - def __sizeof__(self) -> int: - """ - Return the complete size of this object. - - Returns - ------- - size: - Size of this object in bytes. 
- """ - return sys.getsizeof(self._degree) - - class Sigmoid(SupportVectorMachineKernel): - def _get_sklearn_arguments(self) -> dict[str, Any]: - return { - "kernel": "sigmoid", - } - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SupportVectorMachineRegressor.Kernel.Sigmoid): - return NotImplemented - return True - - __hash__ = SupportVectorMachineKernel.__hash__ - - class RadialBasisFunction(SupportVectorMachineKernel): - def _get_sklearn_arguments(self) -> dict[str, Any]: - return { - "kernel": "rbf", - } - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SupportVectorMachineRegressor.Kernel.RadialBasisFunction): - return NotImplemented - return True - - __hash__ = SupportVectorMachineKernel.__hash__ - - def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> SupportVectorMachineRegressor: - """ - Create a copy of this regressor and fit it with the given training data. - - This regressor is not modified. - - Parameters - ---------- - training_set: - The training data containing the feature and target vectors. - - Returns - ------- - fitted_regressor: - The fitted regressor. - - Raises - ------ - LearningError - If the training data contains invalid values or if the training failed. - TypeError - If a table is passed instead of a tabular dataset. - NonNumericColumnError - If the training data contains non-numerical values. - MissingValuesColumnError - If the training data contains missing values. - DatasetMissesDataError - If the training data contains no rows. 
- """ - wrapped_regressor = self._get_sklearn_regressor() - fit(wrapped_regressor, training_set) - - result = SupportVectorMachineRegressor(c=self._c, kernel=self._kernel) - result._wrapped_regressor = wrapped_regressor - result._feature_names = training_set.features.column_names - result._target_name = training_set.target.name - - return result - - def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: - """ - Predict a target vector using a dataset containing feature vectors. The model has to be trained first. - - Parameters - ---------- - dataset: - The dataset containing the feature vectors. - - Returns - ------- - table: - A dataset containing the given feature vectors and the predicted target vector. - - Raises - ------ - ModelNotFittedError - If the model has not been fitted yet. - DatasetMissesFeaturesError - If the dataset misses feature columns. - PredictionError - If predicting with the given dataset failed. - NonNumericColumnError - If the dataset contains non-numerical values. - MissingValuesColumnError - If the dataset contains missing values. - DatasetMissesDataError - If the dataset contains no rows. - """ - return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) - - @property - def is_fitted(self) -> bool: - """Whether the regressor is fitted.""" - return self._wrapped_regressor is not None - - def _get_sklearn_regressor(self) -> RegressorMixin: - """ - Return a new wrapped Regressor from sklearn. - - Returns - ------- - wrapped_regressor: - The sklearn Regressor. 
- """ - from sklearn.svm import SVC as sk_SVR # noqa: N811 - - return sk_SVR(C=self._c, **(self._kernel._get_sklearn_arguments())) diff --git a/src/safeds/ml/classical/regression/_support_vector_regressor.py b/src/safeds/ml/classical/regression/_support_vector_regressor.py new file mode 100644 index 000000000..03c08d664 --- /dev/null +++ b/src/safeds/ml/classical/regression/_support_vector_regressor.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash +from safeds.ml.classical._bases import _SupportVectorMachineBase +from safeds.ml.classical.regression import Regressor + +if TYPE_CHECKING: + from sklearn.base import RegressorMixin + + +class SupportVectorRegressor(Regressor, _SupportVectorMachineBase): + """ + Support vector machine for regression. + + Parameters + ---------- + c: + The strength of regularization. Must be greater than 0. + kernel: + The type of kernel to be used. Defaults to a radial basis function kernel. + + Raises + ------ + OutOfBoundsError + If `c` is less than or equal to 0. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + *, + c: float = 1.0, + kernel: SupportVectorRegressor.Kernel | None = None, + ) -> None: + # Initialize superclasses + Regressor.__init__(self) + _SupportVectorMachineBase.__init__( + self, + c=c, + kernel=kernel, + ) + + def __hash__(self) -> int: + return _structural_hash( + Regressor.__hash__(self), + _SupportVectorMachineBase.__hash__(self), + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def kernel(self) -> SupportVectorRegressor.Kernel: + """The type of kernel used.""" + return self._kernel + + # ------------------------------------------------------------------------------------------------------------------ + # Template methods + # ------------------------------------------------------------------------------------------------------------------ + + def _clone(self) -> SupportVectorRegressor: + return SupportVectorRegressor( + c=self._c, + kernel=self._kernel, + ) + + def _get_sklearn_model(self) -> RegressorMixin: + from sklearn.svm import SVR as SklearnSVR # noqa: N811 + + result = SklearnSVR( + C=self._c, + ) + self._kernel._apply(result) + return result diff --git a/src/safeds/ml/metrics/__init__.py b/src/safeds/ml/metrics/__init__.py new file mode 100644 index 000000000..aa465cff0 --- /dev/null +++ b/src/safeds/ml/metrics/__init__.py @@ -0,0 +1,22 @@ +"""Classes to evaluate the performance of machine learning models.""" + +from typing import TYPE_CHECKING + +import apipkg + +if TYPE_CHECKING: + from ._classification_metrics import 
ClassificationMetrics + from ._regression_metrics import RegressionMetrics + +apipkg.initpkg( + __name__, + { + "ClassificationMetrics": "._classification_metrics:ClassificationMetrics", + "RegressionMetrics": "._regression_metrics:RegressionMetrics", + }, +) + +__all__ = [ + "ClassificationMetrics", + "RegressionMetrics", +] diff --git a/src/safeds/ml/metrics/_classification_metrics.py b/src/safeds/ml/metrics/_classification_metrics.py new file mode 100644 index 000000000..87a51a15a --- /dev/null +++ b/src/safeds/ml/metrics/_classification_metrics.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from safeds.data.labeled.containers import TabularDataset +from safeds.data.tabular.containers import Table +from safeds.exceptions import ColumnLengthMismatchError + +if TYPE_CHECKING: + from safeds.data.tabular.containers import Column + + +class ClassificationMetrics: + """A collection of classification metrics.""" + + @staticmethod + def summarize(predicted: Column | TabularDataset, expected: Column | TabularDataset, positive_class: Any) -> Table: + """ + Summarize classification metrics on the given data. + + Parameters + ---------- + predicted: + The predicted target values produced by the classifier. + expected: + The expected target values. + positive_class: + The class to be considered positive. All other classes are considered negative. + + Returns + ------- + metrics: + A table containing the classification metrics. 
+ """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + accuracy = ClassificationMetrics.accuracy(predicted, expected) + precision = ClassificationMetrics.precision(predicted, expected, positive_class) + recall = ClassificationMetrics.recall(predicted, expected, positive_class) + f1_score = ClassificationMetrics.f1_score(predicted, expected, positive_class) + + return Table( + { + "metric": ["accuracy", "precision", "recall", "f1_score"], + "value": [accuracy, precision, recall, f1_score], + }, + ) + + @staticmethod + def accuracy(predicted: Column | TabularDataset, expected: Column | TabularDataset) -> float: + """ + Compute the accuracy on the given data. + + The accuracy is the proportion of predicted target values that were correct. The **higher** the accuracy, the + better. Results range from 0.0 to 1.0. + + Parameters + ---------- + predicted: + The predicted target values produced by the classifier. + expected: + The expected target values. + + Returns + ------- + accuracy: + The calculated accuracy. + """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + if expected.number_of_rows == 0: + return 1.0 # Everything was predicted correctly (since there is nothing to predict) + + return expected._series.eq(predicted._series).mean() + + @staticmethod + def f1_score(predicted: Column | TabularDataset, expected: Column | TabularDataset, positive_class: Any) -> float: + """ + Compute the F₁ score on the given data. + + The F₁ score is the harmonic mean of precision and recall. The **higher** the F₁ score, the better the + classifier. Results range from 0.0 to 1.0. + + Parameters + ---------- + predicted: + The predicted target values produced by the classifier. + expected: + The expected target values. + positive_class: + The class to be considered positive. All other classes are considered negative. 
+ + Returns + ------- + f1_score: + The calculated F₁ score. + """ + predicted = _extract_target(predicted) + expected = _extract_target(expected) + _check_equal_length(predicted, expected) + + true_positives = (expected._series.eq(positive_class) & predicted._series.eq(positive_class)).sum() + false_positives = (expected._series.ne(positive_class) & predicted._series.eq(positive_class)).sum() + false_negatives = (expected._series.eq(positive_class) & predicted._series.ne(positive_class)).sum() + + if true_positives + false_positives + false_negatives == 0: + return 1.0 # Only true negatives (so all predictions are correct) + + return 2 * true_positives / (2 * true_positives + false_positives + false_negatives) + + @staticmethod + def precision(predicted: Column | TabularDataset, expected: Column | TabularDataset, positive_class: Any) -> float: + """ + Compute the precision on the given data. + + The precision is the proportion of positive predictions that were correct. The **higher** the precision, the + better the classifier. Results range from 0.0 to 1.0. + + Parameters + ---------- + predicted: + The predicted target values produced by the classifier. + expected: + The expected target values. + positive_class: + The class to be considered positive. All other classes are considered negative. + + Returns + ------- + precision: + The calculated precision. 
+ """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + true_positives = (expected._series.eq(positive_class) & predicted._series.eq(positive_class)).sum() + predicted_positives = predicted._series.eq(positive_class).sum() + + if predicted_positives == 0: + return 1.0 # All positive predictions were correct (since there are none) + + return true_positives / predicted_positives + + @staticmethod + def recall(predicted: Column | TabularDataset, expected: Column | TabularDataset, positive_class: Any) -> float: + """ + Compute the recall on the given data. + + The recall is the proportion of actual positives that were predicted correctly. The **higher** the recall, the + better the classifier. Results range from 0.0 to 1.0. + + Parameters + ---------- + predicted: + The predicted target values produced by the classifier. + expected: + The expected target values. + positive_class: + The class to be considered positive. All other classes are considered negative. + + Returns + ------- + recall: + The calculated recall. + """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + true_positives = (expected._series.eq(positive_class) & predicted._series.eq(positive_class)).sum() + actual_positives = expected._series.eq(positive_class).sum() + + if actual_positives == 0: + return 1.0 # All actual positives were predicted correctly (since there are none) + + return true_positives / actual_positives + + +def _extract_target(column_or_dataset: Column | TabularDataset) -> Column: + """Extract the target column from the given column or dataset.""" + if isinstance(column_or_dataset, TabularDataset): + return column_or_dataset.target + else: + return column_or_dataset + + +# TODO: collect validation in one place? 
+def _check_equal_length(column1: Column, column2: Column) -> None: + """ + Check if the columns have the same length and raise an error if they do not. + + Parameters + ---------- + column1: + The first column. + column2: + The second column. + + Raises + ------ + ValueError + If the columns have different lengths. + """ + if column1.number_of_rows != column2.number_of_rows: + raise ColumnLengthMismatchError("") # TODO: pass list of columns to exception, let it handle the formatting diff --git a/src/safeds/ml/metrics/_regression_metrics.py b/src/safeds/ml/metrics/_regression_metrics.py new file mode 100644 index 000000000..e9aea4231 --- /dev/null +++ b/src/safeds/ml/metrics/_regression_metrics.py @@ -0,0 +1,258 @@ +from __future__ import annotations + +from safeds.data.labeled.containers import TabularDataset +from safeds.data.tabular.containers import Column, Table +from safeds.exceptions import ColumnLengthMismatchError + + +class RegressionMetrics: + """A collection of regression metrics.""" + + @staticmethod + def summarize(predicted: Column | TabularDataset, expected: Column | TabularDataset) -> Table: + """ + Summarize regression metrics on the given data. + + Parameters + ---------- + predicted: + The predicted target values produced by the regressor. + expected: + The expected target values. + + Returns + ------- + metrics: + A table containing the regression metrics. 
+ """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + coefficient_of_determination = RegressionMetrics.coefficient_of_determination(expected, predicted) + mean_absolute_error = RegressionMetrics.mean_absolute_error(expected, predicted) + mean_squared_error = RegressionMetrics.mean_squared_error(expected, predicted) + median_absolute_deviation = RegressionMetrics.median_absolute_deviation(expected, predicted) + + return Table( + { + "metric": [ + "coefficient_of_determination", + "mean_absolute_error", + "mean_squared_error", + "median_absolute_deviation", + ], + "value": [ + coefficient_of_determination, + mean_absolute_error, + mean_squared_error, + median_absolute_deviation, + ], + }, + ) + + @staticmethod + def coefficient_of_determination(predicted: Column | TabularDataset, expected: Column | TabularDataset) -> float: + """ + Compute the coefficient of determination (R²) on the given data. + + The coefficient of determination compares the regressor's predictions to another model that always predicts the + mean of the target values. It is a measure of how well the regressor explains the variance in the target values. + + The **higher** the coefficient of determination, the better the regressor. Results range from negative infinity + to 1.0. You can interpret the coefficient of determination as follows: + + | R² | Interpretation | + | ---------- | ------------------------------------------------------------------------------------------ | + | 1.0 | The model perfectly predicts the target values. Did you overfit? | + | (0.0, 1.0) | The model is better than predicting the mean of the target values. You should be here. | + | 0.0 | The model is as good as predicting the mean of the target values. Try something else. | + | (-∞, 0.0) | The model is worse than predicting the mean of the target values. Something is very wrong. | + + **Note:** Some other libraries call this metric `r2_score`. 
+ + Parameters + ---------- + predicted: + The predicted target values produced by the regressor. + expected: + The expected target values. + + Returns + ------- + coefficient_of_determination: + The calculated coefficient of determination. + """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + residual_sum_of_squares = (expected._series - predicted._series).pow(2).sum() + total_sum_of_squares = (expected._series - expected._series.mean()).pow(2).sum() + + if total_sum_of_squares == 0: + if residual_sum_of_squares == 0: + return 1.0 # Everything was predicted correctly + else: + return 0.0 # Model could not even predict constant data + + return 1 - residual_sum_of_squares / total_sum_of_squares + + @staticmethod + def mean_absolute_error(predicted: Column | TabularDataset, expected: Column | TabularDataset) -> float: + """ + Compute the mean absolute error (MAE) on the given data. + + The mean absolute error is the average of the absolute differences between the predicted and expected target + values. The **lower** the mean absolute error, the better the regressor. Results range from 0.0 to positive + infinity. + + Parameters + ---------- + predicted: + The predicted target values produced by the regressor. + expected: + The expected target values. + + Returns + ------- + mean_absolute_error: + The calculated mean absolute error. + """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + if expected.number_of_rows == 0: + return 0.0 # Everything was predicted correctly (since there is nothing to predict) + + return (expected._series - predicted._series).abs().mean() + + @staticmethod + def mean_directional_accuracy(predicted: Column | TabularDataset, expected: Column | TabularDataset) -> float: + """ + Compute the mean directional accuracy (MDA) on the given data. 
+ + This metric compares two consecutive target values and checks if the predicted direction (down/unchanged/up) + matches the expected direction. The mean directional accuracy is the proportion of correctly predicted + directions. The **higher** the mean directional accuracy, the better the regressor. Results range from 0.0 to + 1.0. + + This metric is useful for time series data, where the order of the target values has a meaning. It is not useful + for other types of data. Because of this, it is not included in the `summarize` method. + + Parameters + ---------- + predicted: + The predicted target values produced by the regressor. + expected: + The expected target values. + + Returns + ------- + mean_directional_accuracy: + The calculated mean directional accuracy. + """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + if expected.number_of_rows == 0: + return 1.0 + + # Calculate the differences between the target values + predicted_directions = predicted._series.diff().sign() + expected_directions = expected._series.diff().sign() + + return predicted_directions.eq(expected_directions).mean() + + @staticmethod + def mean_squared_error(predicted: Column | TabularDataset, expected: Column | TabularDataset) -> float: + """ + Compute the mean squared error (MSE) on the given data. + + The mean squared error is the average of the squared differences between the predicted and expected target + values. The **lower** the mean squared error, the better the regressor. Results range from 0.0 to positive + infinity. + + **Note:** To get the root mean squared error (RMSE), take the square root of the result. + + Parameters + ---------- + predicted: + The predicted target values produced by the regressor. + expected: + The expected target values. + + Returns + ------- + mean_squared_error: + The calculated mean squared error. 
+ """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + if expected.number_of_rows == 0: + return 0.0 # Everything was predicted correctly (since there is nothing to predict) + + return (expected._series - predicted._series).pow(2).mean() + + @staticmethod + def median_absolute_deviation(predicted: Column | TabularDataset, expected: Column | TabularDataset) -> float: + """ + Compute the median absolute deviation (MAD) on the given data. + + The median absolute deviation is the median of the absolute differences between the predicted and expected + target values. The **lower** the median absolute deviation, the better the regressor. Results range from 0.0 to + positive infinity. + + Parameters + ---------- + predicted: + The predicted target values produced by the regressor. + expected: + The expected target values. + + Returns + ------- + median_absolute_deviation: + The calculated median absolute deviation. + """ + expected = _extract_target(expected) + predicted = _extract_target(predicted) + _check_equal_length(predicted, expected) + + if expected.number_of_rows == 0: + return 0.0 + + return (expected._series - predicted._series).abs().median() + + +def _extract_target(column_or_dataset: Column | TabularDataset) -> Column: + """Extract the target column from the given column or dataset.""" + if isinstance(column_or_dataset, TabularDataset): + return column_or_dataset.target + else: + return column_or_dataset + + +# TODO: collect validation in one place? +def _check_equal_length(column1: Column, column2: Column) -> None: + """ + Check if the columns have the same length and raise an error if they do not. + + Parameters + ---------- + column1: + The first column. + column2: + The second column. + + Raises + ------ + ValueError + If the columns have different lengths. 
+ """ + if column1.number_of_rows != column2.number_of_rows: + ColumnLengthMismatchError("") # TODO: pass list of columns to exception, let it handle the formatting diff --git a/src/safeds/ml/nn/_output_conversion_time_series.py b/src/safeds/ml/nn/_output_conversion_time_series.py index f3ad6d43f..dadfd03b5 100644 --- a/src/safeds/ml/nn/_output_conversion_time_series.py +++ b/src/safeds/ml/nn/_output_conversion_time_series.py @@ -81,7 +81,7 @@ def _data_conversion(self, input_data: TimeSeriesDataset, output_data: Tensor, * input_data_table = Table.from_rows(input_data_table.to_rows()[window_size + forecast_horizon :]) return input_data_table.add_columns( - [Column(self._prediction_name, output_data.tolist())] + [Column(self._prediction_name, output_data.tolist())], ).to_time_series_dataset( target_name=self._prediction_name, time_name=input_data.time.name, diff --git a/src/safeds/ml/nn/converters/__init__.py b/src/safeds/ml/nn/converters/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/safeds/ml/nn/layers/__init__.py b/src/safeds/ml/nn/layers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/safeds/_utils/test_hashing.py b/tests/safeds/_utils/test_hashing.py index ed1d32d85..b04d8e081 100644 --- a/tests/safeds/_utils/test_hashing.py +++ b/tests/safeds/_utils/test_hashing.py @@ -22,7 +22,7 @@ ({1, "2", 3.0}, 17310946488773236131), (frozenset({1, "2", 3.0}), 17310946488773236131), ({"a": "b", 1: 2}, 17924302838573884393), - (Table({"col1": [1, 2], "col2:": [3, 4]}), 18297321136957342689), + (Table({"col1": [1, 2], "col2:": [3, 4]}), 1702597496720952006), ], ids=[ "none", @@ -63,7 +63,7 @@ def test_structural_hash(value: Any, expected: int) -> None: ({1, "2", 3.0}, b"\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x08@2"), (frozenset({1, "2", 3.0}), b"\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x08@2"), ({"a": "b", 1: 2}, b"\0\0\0\0\0\0\0\x02\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\x02ab"), - (Table({"col1": [1, 
2], "col2:": [3, 4]}), b"\x04P\xbfS$\xaf\xf4W"), + (Table({"col1": [1, 2], "col2:": [3, 4]}), b'"?m\x96\xb6\x9a\xf7\x88'), ], ids=[ "none", diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py index 6c84f2a1a..4826bd846 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py @@ -2,7 +2,7 @@ import pytest from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Row, Table +from safeds.data.tabular.containers import Table @pytest.mark.parametrize( @@ -66,12 +66,10 @@ def test_should_return_whether_two_tabular_datasets_are_equal( ("table", "other"), [ (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), None), - (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), Row()), (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), Table()), ], ids=[ "TabularDataset vs. None", - "TabularDataset vs. Row", "TabularDataset vs. 
Table", ], ) diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_init.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_init.py index 0ac34111e..4f42c6e4f 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_init.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_init.py @@ -250,5 +250,5 @@ def test_should_create_a_tabular_dataset( assert isinstance(tabular_dataset, TabularDataset) assert tabular_dataset._extras.column_names == extra_names assert tabular_dataset._target.name == target_name - assert tabular_dataset._extras == data.keep_only_columns(extra_names) + assert tabular_dataset._extras == data.remove_columns_except(extra_names) assert tabular_dataset._target == data.get_column(target_name) diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_eq.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_eq.py index 7743da63a..d14a1b92d 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_eq.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_eq.py @@ -2,7 +2,7 @@ import pytest from safeds.data.labeled.containers import TimeSeriesDataset -from safeds.data.tabular.containers import Row, Table +from safeds.data.tabular.containers import Table @pytest.mark.parametrize( @@ -66,12 +66,10 @@ def test_should_return_whether_two_tabular_datasets_are_equal( ("table", "other"), [ (TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0, 0, 0]}, "b", "c"), None), - (TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0, 0, 0]}, "b", "c"), Row()), (TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0, 0, 0]}, "b", "c"), Table()), ], ids=[ "TabularDataset vs. None", - "TabularDataset vs. Row", "TabularDataset vs. 
Table", ], ) diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_init.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_init.py index 99719be02..e27e8ba8d 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_init.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_init.py @@ -241,5 +241,5 @@ def test_should_create_a_tabular_dataset( assert isinstance(tabular_dataset, TimeSeriesDataset) assert tabular_dataset._extras.column_names == extra_names assert tabular_dataset._target.name == target_name - assert tabular_dataset._extras == data.keep_only_columns(extra_names) + assert tabular_dataset._extras == data.remove_columns_except(extra_names) assert tabular_dataset._target == data.get_column(target_name) diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_into_dataloader.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_into_dataloader.py index 66816a747..90ba09664 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_into_dataloader.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_into_dataloader.py @@ -1,15 +1,12 @@ -from typing import Type - import pytest -from torch.types import Device - from safeds._config import _get_device -from safeds.data.tabular.containers import Table from safeds.data.labeled.containers import TimeSeriesDataset +from safeds.data.tabular.containers import Table +from safeds.exceptions import OutOfBoundsError +from torch.types import Device from torch.utils.data import DataLoader -from safeds.exceptions import OutOfBoundsError -from tests.helpers import get_devices, get_devices_ids, configure_test_with_device +from tests.helpers import configure_test_with_device, get_devices, get_devices_ids @pytest.mark.parametrize( @@ -93,7 +90,7 @@ def test_should_create_dataloader_predict( "B": [2, 5], "C": [3, 6], "T": [0, 1], - } + }, ).to_time_series_dataset("T", "B"), 1, 2, @@ -107,7 
+104,7 @@ def test_should_create_dataloader_predict( "B": [2, 5], "C": [3, 6], "T": [0, 1], - } + }, ).to_time_series_dataset("T", "B"), 1, 0, @@ -121,7 +118,7 @@ def test_should_create_dataloader_predict( "B": [2, 5], "C": [3, 6], "T": [0, 1], - } + }, ).to_time_series_dataset("T", "B"), 0, 1, @@ -140,7 +137,7 @@ def test_should_create_dataloader_invalid( data: TimeSeriesDataset, window_size: int, forecast_horizon: int, - error_type: Type[ValueError], + error_type: type[ValueError], error_msg: str, device: Device, ) -> None: @@ -159,7 +156,7 @@ def test_should_create_dataloader_invalid( "B": [2, 5], "C": [3, 6], "T": [0, 1], - } + }, ).to_time_series_dataset("T", "B"), 1, 2, @@ -173,7 +170,7 @@ def test_should_create_dataloader_invalid( "B": [2, 5], "C": [3, 6], "T": [0, 1], - } + }, ).to_time_series_dataset("T", "B"), 1, 0, @@ -187,7 +184,7 @@ def test_should_create_dataloader_invalid( "B": [2, 5], "C": [3, 6], "T": [0, 1], - } + }, ).to_time_series_dataset("T", "B"), 0, 1, @@ -206,12 +203,12 @@ def test_should_create_dataloader_predict_invalid( data: TimeSeriesDataset, window_size: int, forecast_horizon: int, - error_type: Type[ValueError], + error_type: type[ValueError], error_msg: str, device: Device, ) -> None: configure_test_with_device(device) with pytest.raises(error_type, match=error_msg): data._into_dataloader_with_window_predict( - window_size=window_size, forecast_horizon=forecast_horizon, batch_size=1 + window_size=window_size, forecast_horizon=forecast_horizon, batch_size=1, ) diff --git a/tests/safeds/data/labeled/containers/test_image_dataset.py b/tests/safeds/data/labeled/containers/test_image_dataset.py index f11af31e1..120e18d80 100644 --- a/tests/safeds/data/labeled/containers/test_image_dataset.py +++ b/tests/safeds/data/labeled/containers/test_image_dataset.py @@ -5,8 +5,6 @@ import pytest import torch -from torch.types import Device - from safeds._config import _get_device from safeds.data.image.containers import ImageList from 
safeds.data.image.containers._empty_image_list import _EmptyImageList @@ -23,15 +21,16 @@ TransformerNotFittedError, ) from torch import Tensor +from torch.types import Device from tests.helpers import ( + configure_test_with_device, + get_devices, + get_devices_ids, images_all, plane_png_path, resolve_resource_path, white_square_png_path, - get_devices, - get_devices_ids, - configure_test_with_device, ) T = TypeVar("T", Column, Table, ImageList) @@ -101,7 +100,7 @@ class TestImageDatasetInit: ], ) def test_should_raise_with_invalid_data( - self, input_data: ImageList, output_data: T, error: type[Exception], error_msg: str, device: Device + self, input_data: ImageList, output_data: T, error: type[Exception], error_msg: str, device: Device, ) -> None: configure_test_with_device(device) with pytest.raises(error, match=error_msg): @@ -257,7 +256,7 @@ class TestSizeOf: ], ) def test_should_size_be_greater_than_normal_object( - self, image_dataset_output: str | Column | Table, device: Device + self, image_dataset_output: str | Column | Table, device: Device, ) -> None: configure_test_with_device(device) image_dataset = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), ImageList.from_files(resolve_resource_path(image_dataset_output)) if isinstance(image_dataset_output, str) else image_dataset_output) # type: ignore[type-var] diff --git a/tests/safeds/data/tabular/containers/_column/test_eq.py b/tests/safeds/data/tabular/containers/_column/test_eq.py index a4e4e04e5..b4b7a2c2b 100644 --- a/tests/safeds/data/tabular/containers/_column/test_eq.py +++ b/tests/safeds/data/tabular/containers/_column/test_eq.py @@ -1,7 +1,7 @@ from typing import Any import pytest -from safeds.data.tabular.containers import Column, Row +from safeds.data.tabular.containers import Column, Table @pytest.mark.parametrize( @@ -44,11 +44,11 @@ def test_should_return_true_if_objects_are_identical(column: Column) -> None: ("column", "other"), [ (Column("a"), None), - (Column("a", 
[1, 2, 3]), Row()), + (Column("a", [1, 2, 3]), Table()), ], ids=[ "Column vs. None", - "Column vs. Row", + "Column vs. Table", ], ) def test_should_return_not_implemented_if_other_is_not_column(column: Column, other: Any) -> None: diff --git a/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py b/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py deleted file mode 100644 index b62e3d1af..000000000 --- a/tests/safeds/data/tabular/containers/_column/test_from_pandas_series.py +++ /dev/null @@ -1,49 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Column -from safeds.data.tabular.typing import Anything, Boolean, ColumnType, Integer, Nothing, RealNumber, String - - -@pytest.mark.parametrize( - ("series", "expected"), - [ - (pd.Series([]), []), - (pd.Series([True, False, True]), [True, False, True]), - (pd.Series([1, 2, 3]), [1, 2, 3]), - (pd.Series([1.0, 2.0, 3.0]), [1.0, 2.0, 3.0]), - (pd.Series(["a", "b", "c"]), ["a", "b", "c"]), - (pd.Series([1, 2.0, "a", True]), [1, 2.0, "a", True]), - ], - ids=["empty", "boolean", "integer", "real number", "string", "mixed"], -) -def test_should_store_the_data(series: pd.Series, expected: Column) -> None: - assert list(Column._from_pandas_series(series)) == expected - - -@pytest.mark.parametrize( - ("series", "type_"), - [ - (pd.Series([True, False, True]), Boolean()), - (pd.Series([1, 2, 3]), Boolean()), - ], - ids=["type is correct", "type is wrong"], -) -def test_should_use_type_if_passed(series: pd.Series, type_: ColumnType) -> None: - assert Column._from_pandas_series(series, type_).type == type_ - - -@pytest.mark.parametrize( - ("series", "expected"), - [ - (pd.Series([]), Nothing()), - (pd.Series([True, False, True]), Boolean()), - (pd.Series([1, 2, 3]), Integer()), - (pd.Series([1.0, 2.0, 3.0]), Integer()), - (pd.Series([1.0, 2.5, 3.0]), RealNumber()), - (pd.Series(["a", "b", "c"]), String()), - (pd.Series([1, 2.0, "a", True]), 
Anything(is_nullable=False)), - ], - ids=["empty", "boolean", "integer", "real number .0", "real number", "string", "mixed"], -) -def test_should_infer_type_if_not_passed(series: pd.Series, expected: ColumnType) -> None: - assert Column._from_pandas_series(series).type == expected diff --git a/tests/safeds/data/tabular/containers/_column/test_from_polars_series.py b/tests/safeds/data/tabular/containers/_column/test_from_polars_series.py new file mode 100644 index 000000000..942dde7bb --- /dev/null +++ b/tests/safeds/data/tabular/containers/_column/test_from_polars_series.py @@ -0,0 +1,47 @@ +import polars as pl +import pytest +from safeds.data.tabular.containers import Column + + +@pytest.mark.parametrize( + ("series", "expected"), + [ + (pl.Series([]), []), + (pl.Series([True, False, True]), [True, False, True]), + (pl.Series([1, 2, 3]), [1, 2, 3]), + (pl.Series([1.0, 2.0, 3.0]), [1.0, 2.0, 3.0]), + (pl.Series(["a", "b", "c"]), ["a", "b", "c"]), + ], + ids=["empty", "boolean", "integer", "real number", "string"], +) +def test_should_store_the_data(series: pl.Series, expected: Column) -> None: + assert list(Column._from_polars_series(series)) == expected + +# TODO +# @pytest.mark.parametrize( +# ("series", "type_"), +# [ +# (pd.Series([True, False, True]), Boolean()), +# (pd.Series([1, 2, 3]), Boolean()), +# ], +# ids=["type is correct", "type is wrong"], +# ) +# def test_should_use_type_if_passed(series: pd.Series, type_: ColumnType) -> None: +# assert Column._from_polars_series(series, type_).type == type_ +# +# +# @pytest.mark.parametrize( +# ("series", "expected"), +# [ +# (pd.Series([]), Nothing()), +# (pd.Series([True, False, True]), Boolean()), +# (pd.Series([1, 2, 3]), Integer()), +# (pd.Series([1.0, 2.0, 3.0]), Integer()), +# (pd.Series([1.0, 2.5, 3.0]), RealNumber()), +# (pd.Series(["a", "b", "c"]), String()), +# (pd.Series([1, 2.0, "a", True]), Anything(is_nullable=False)), +# ], +# ids=["empty", "boolean", "integer", "real number .0", "real number", 
"string", "mixed"], +# ) +# def test_should_infer_type_if_not_passed(series: pd.Series, expected: ColumnType) -> None: +# assert Column._from_polars_series(series).type == expected diff --git a/tests/safeds/data/tabular/containers/_column/test_get_unique_values.py b/tests/safeds/data/tabular/containers/_column/test_get_distinct_values.py similarity index 91% rename from tests/safeds/data/tabular/containers/_column/test_get_unique_values.py rename to tests/safeds/data/tabular/containers/_column/test_get_distinct_values.py index 9a433725f..e40a8a2d5 100644 --- a/tests/safeds/data/tabular/containers/_column/test_get_unique_values.py +++ b/tests/safeds/data/tabular/containers/_column/test_get_distinct_values.py @@ -26,4 +26,4 @@ ) def test_should_list_unique_values(values: list[Any], unique_values: list[Any]) -> None: column: Column = Column("", values) - assert column.get_unique_values() == unique_values + assert column.get_distinct_values() == unique_values diff --git a/tests/safeds/data/tabular/containers/_column/test_has_missing_values.py b/tests/safeds/data/tabular/containers/_column/test_has_missing_values.py deleted file mode 100644 index ae98cd92b..000000000 --- a/tests/safeds/data/tabular/containers/_column/test_has_missing_values.py +++ /dev/null @@ -1,22 +0,0 @@ -import pytest -from safeds.data.tabular.containers import Column - - -@pytest.mark.parametrize( - ("values", "expected"), - [ - ([], False), - ([1, 2, 3], False), - ([1, 2, 3, None], True), - ([None, None, None], True), - ], - ids=[ - "empty", - "no missing values", - "some missing values", - "all missing values", - ], -) -def test_should_return_whether_the_column_has_missing_values(values: list, expected: bool) -> None: - column = Column("A", values) - assert column.has_missing_values() == expected diff --git a/tests/safeds/data/tabular/containers/_column/test_idness.py b/tests/safeds/data/tabular/containers/_column/test_idness.py index b1deba798..02f02aaee 100644 --- 
a/tests/safeds/data/tabular/containers/_column/test_idness.py +++ b/tests/safeds/data/tabular/containers/_column/test_idness.py @@ -1,18 +1,17 @@ -from typing import Any - import pytest from safeds.data.tabular.containers import Column -from safeds.exceptions import ColumnSizeError @pytest.mark.parametrize( ("values", "result"), [ + ([], 1), (["A", "B"], 1), (["A", "A", "A", "B"], 0.5), (["A", "A", "A", "A"], 0.25), ], ids=[ + "empty", "all unique values", "some unique values", "all same values", @@ -21,9 +20,3 @@ def test_should_return_idness_of_column(values: list[str], result: float) -> None: column = Column("A", values) assert column.idness() == result - - -def test_should_raise_if_column_is_empty() -> None: - column: Column[Any] = Column("A", []) - with pytest.raises(ColumnSizeError): - column.idness() diff --git a/tests/safeds/data/tabular/containers/_column/test_init.py b/tests/safeds/data/tabular/containers/_column/test_init.py index d7015b3ff..90fa7b3ef 100644 --- a/tests/safeds/data/tabular/containers/_column/test_init.py +++ b/tests/safeds/data/tabular/containers/_column/test_init.py @@ -3,7 +3,6 @@ import pandas as pd import pytest from safeds.data.tabular.containers import Column -from safeds.data.tabular.typing import Anything, Boolean, ColumnType, Integer, Nothing, RealNumber, String def test_should_store_the_name() -> None: @@ -20,7 +19,7 @@ def test_should_store_the_name() -> None: ids=["data as list", "data as series"], ) def test_should_set_the_name_of_internal_series(column: Column, expected: str) -> None: - assert column._data.name == expected + assert column._series.name == expected @pytest.mark.parametrize( @@ -40,18 +39,19 @@ def test_should_store_the_data(column: Column, expected: list) -> None: assert list(column) == expected -@pytest.mark.parametrize( - ("column", "expected"), - [ - (Column("A", []), Nothing()), - (Column("A", [True, False, True]), Boolean()), - (Column("A", [1, 2, 3]), Integer()), - (Column("A", [1.0, 2.0, 3.0]), 
Integer()), - (Column("A", [1.0, 2.5, 3.0]), RealNumber()), - (Column("A", ["a", "b", "c"]), String()), - (Column("A", [1, 2.0, "a", True]), Anything()), - ], - ids=["empty", "boolean", "integer", "real number .0", "real number", "string", "mixed"], -) -def test_should_infer_type(column: Column, expected: ColumnType) -> None: - assert column.type == expected +# TODO +# @pytest.mark.parametrize( +# ("column", "expected"), +# [ +# (Column("A", []), Nothing()), +# (Column("A", [True, False, True]), Boolean()), +# (Column("A", [1, 2, 3]), Integer()), +# (Column("A", [1.0, 2.0, 3.0]), Integer()), +# (Column("A", [1.0, 2.5, 3.0]), RealNumber()), +# (Column("A", ["a", "b", "c"]), String()), +# (Column("A", [1, 2.0, "a", True]), Anything()), +# ], +# ids=["empty", "boolean", "integer", "real number .0", "real number", "string", "mixed"], +# ) +# def test_should_infer_type(column: Column, expected: ColumnType) -> None: +# assert column.type == expected diff --git a/tests/safeds/data/tabular/containers/_column/test_iter.py b/tests/safeds/data/tabular/containers/_column/test_iter.py index c0568369b..2e3fcc9a9 100644 --- a/tests/safeds/data/tabular/containers/_column/test_iter.py +++ b/tests/safeds/data/tabular/containers/_column/test_iter.py @@ -7,7 +7,7 @@ [ (Column("a", []), []), (Column("a", [0]), [0]), - (Column("a", [0, "1"]), [0, "1"]), + (Column("a", [0, 1]), [0, 1]), ], ids=[ "empty", diff --git a/tests/safeds/data/tabular/containers/_column/test_mean.py b/tests/safeds/data/tabular/containers/_column/test_mean.py index 7b4859be3..2ac35afa0 100644 --- a/tests/safeds/data/tabular/containers/_column/test_mean.py +++ b/tests/safeds/data/tabular/containers/_column/test_mean.py @@ -6,33 +6,20 @@ @pytest.mark.parametrize( ("values", "expected"), [ + ([], None), ([1, 2, 3], 2), ([1, 2, 3, None], 2), + (["a", "b", "c"], None), + ([None, None, None], None), ], ids=[ + "empty", "no missing values", "some missing values", - ], -) -def test_should_return_the_mean_value(values: 
list, expected: int) -> None: - column = Column("A", values) - assert column.mean() == expected - - -@pytest.mark.parametrize( - "values", - [ - [], - ["a", "b", "c"], - [None, None, None], - ], - ids=[ - "empty", "non-numeric", "all missing values", ], ) -def test_should_raise_if_column_is_not_numeric(values: list) -> None: +def test_should_return_the_mean_value(values: list, expected: int) -> None: column = Column("A", values) - with pytest.raises(NonNumericColumnError): - column.mean() + assert column.mean() == expected diff --git a/tests/safeds/data/tabular/containers/_column/test_rename.py b/tests/safeds/data/tabular/containers/_column/test_rename.py index 4c0d38585..beffe0d8e 100644 --- a/tests/safeds/data/tabular/containers/_column/test_rename.py +++ b/tests/safeds/data/tabular/containers/_column/test_rename.py @@ -5,11 +5,9 @@ def test_should_return_new_column_with_new_name() -> None: column = Column("A", [1, 2, 3]) new_column = column.rename("B") assert new_column.name == "B" - assert new_column._data.name == "B" def test_should_not_change_name_of_original_column() -> None: column = Column("A", [1, 2, 3]) column.rename("B") assert column.name == "A" - assert column._data.name == "A" diff --git a/tests/safeds/data/tabular/containers/_column/test_stability.py b/tests/safeds/data/tabular/containers/_column/test_stability.py index abb9f346f..2382daa37 100644 --- a/tests/safeds/data/tabular/containers/_column/test_stability.py +++ b/tests/safeds/data/tabular/containers/_column/test_stability.py @@ -8,12 +8,16 @@ @pytest.mark.parametrize( ("values", "expected"), [ + ([], 1), + ([None], 1), ([1, 2, 3, 4], 1 / 4), ([1, 2, 3, 4, None], 1 / 4), (["b", "a", "abc", "abc", "abc"], 3 / 5), ([1, 1, 3, "abc", None], 2 / 4), ], ids=[ + "empty", + "only missing values", "numeric", "numeric with missing values", "non-numeric", diff --git a/tests/safeds/data/tabular/containers/_table/test_eq.py b/tests/safeds/data/tabular/containers/_table/test_eq.py index 
21f185340..04a5256f3 100644 --- a/tests/safeds/data/tabular/containers/_table/test_eq.py +++ b/tests/safeds/data/tabular/containers/_table/test_eq.py @@ -1,7 +1,7 @@ from typing import Any import pytest -from safeds.data.tabular.containers import Row, Table +from safeds.data.tabular.containers import Column, Table @pytest.mark.parametrize( @@ -43,11 +43,11 @@ def test_should_return_true_if_objects_are_identical(table: Table) -> None: ("table", "other"), [ (Table({"col1": [1]}), None), - (Table({"col1": [1]}), Row()), + (Table({"col1": [1]}), Column("a")), ], ids=[ "Table vs. None", - "Table vs. Row", + "Table vs. Column", ], ) def test_should_return_not_implemented_if_other_is_not_table(table: Table, other: Any) -> None: diff --git a/tests/safeds/data/tabular/containers/_table/test_filter_rows.py b/tests/safeds/data/tabular/containers/_table/test_filter_rows.py deleted file mode 100644 index 75b08e480..000000000 --- a/tests/safeds/data/tabular/containers/_table/test_filter_rows.py +++ /dev/null @@ -1,38 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import ColumnType, Integer, Schema - - -@pytest.mark.parametrize( - ("table1", "filter_column", "filter_value", "table2"), - [ - ( - Table(), - "col1", - 1, - Table._from_pandas_dataframe(pd.DataFrame(), Schema({})), - ), - ( - Table({"col1": [3, 2, 4], "col2": [1, 2, 4]}), - "col1", - 1, - Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Integer(), "col2": Integer()})), - ), - ( - Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - "col1", - 1, - Table({"col1": [1, 1], "col2": [1, 4]}), - ), - ], - ids=[ - "empty table", - "no matches", - "matches", - ], -) -def test_should_filter_rows(table1: Table, filter_column: str, filter_value: ColumnType, table2: Table) -> None: - table1 = table1.filter_rows(lambda row: row.get_value(filter_column) == filter_value) - assert table1.schema == table2.schema - assert table2 == table1 diff --git 
a/tests/safeds/data/tabular/containers/_table/test_from_pandas_dataframe.py b/tests/safeds/data/tabular/containers/_table/test_from_pandas_dataframe.py deleted file mode 100644 index 6ed19a906..000000000 --- a/tests/safeds/data/tabular/containers/_table/test_from_pandas_dataframe.py +++ /dev/null @@ -1,78 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import Integer, RealNumber, Schema, String - - -@pytest.mark.parametrize( - ("dataframe", "schema", "expected", "expected_table"), - [ - (pd.DataFrame({"col1": [0]}), Schema({"col1": Integer()}), Schema({"col1": Integer()}), Table({"col1": [0]})), - ( - pd.DataFrame({"col1": [0], "col2": ["a"]}), - Schema({"col1": Integer(), "col2": String()}), - Schema({"col1": Integer(), "col2": String()}), - Table({"col1": [0], "col2": ["a"]}), - ), - ( - pd.DataFrame({"col1": [0, 1.1]}), - Schema({"col1": String()}), - Schema({"col1": String()}), - Table({"col1": [0, 1.1]}), - ), - ( - pd.DataFrame({"col1": [0, 1.1], "col2": ["a", "b"]}), - Schema({"col1": String(), "col2": String()}), - Schema({"col1": String(), "col2": String()}), - Table({"col1": [0, 1.1], "col2": ["a", "b"]}), - ), - (pd.DataFrame(), Schema({}), Schema({}), Table()), - ], - ids=["one row, one column", "one row, two columns", "two rows, one column", "two rows, two columns", "empty"], -) -def test_should_use_the_schema_if_passed( - dataframe: pd.DataFrame, - schema: Schema, - expected: Schema, - expected_table: Table, -) -> None: - table = Table._from_pandas_dataframe(dataframe, schema) - assert table._schema == expected - assert table == expected_table - - -@pytest.mark.parametrize( - ("dataframe", "expected"), - [ - ( - pd.DataFrame({"col1": [0]}), - Schema({"col1": Integer()}), - ), - ( - pd.DataFrame({"col1": [0], "col2": ["a"]}), - Schema({"col1": Integer(), "col2": String()}), - ), - ( - pd.DataFrame({"col1": [0, 1.1]}), - Schema({"col1": RealNumber()}), - ), - ( - 
pd.DataFrame({"col1": [0, 1.1], "col2": ["a", "b"]}), - Schema({"col1": RealNumber(), "col2": String()}), - ), - ], - ids=[ - "one row, one column", - "one row, two columns", - "two rows, one column", - "two rows, two columns", - ], -) -def test_should_infer_the_schema_if_not_passed(dataframe: pd.DataFrame, expected: Schema) -> None: - table = Table._from_pandas_dataframe(dataframe) - assert table._schema == expected - - -def test_should_be_able_to_handle_empty_dataframe_with_given_schema() -> None: - table = Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Integer(), "col2": Integer()})) - table.get_column("col1") diff --git a/tests/safeds/data/tabular/containers/_table/test_from_polars_dataframe.py b/tests/safeds/data/tabular/containers/_table/test_from_polars_dataframe.py new file mode 100644 index 000000000..c3b880aa0 --- /dev/null +++ b/tests/safeds/data/tabular/containers/_table/test_from_polars_dataframe.py @@ -0,0 +1,79 @@ +# import pandas as pd +# import pytest +# from safeds.data.tabular.containers import Table +# from safeds.data.tabular.typing import Schema + + +# TODO +# @pytest.mark.parametrize( +# ("dataframe", "schema", "expected", "expected_table"), +# [ +# (pd.DataFrame({"col1": [0]}), Schema({"col1": Integer()}), Schema({"col1": Integer()}), Table({"col1": [0]})), +# ( +# pd.DataFrame({"col1": [0], "col2": ["a"]}), +# Schema({"col1": Integer(), "col2": String()}), +# Schema({"col1": Integer(), "col2": String()}), +# Table({"col1": [0], "col2": ["a"]}), +# ), +# ( +# pd.DataFrame({"col1": [0, 1.1]}), +# Schema({"col1": String()}), +# Schema({"col1": String()}), +# Table({"col1": [0, 1.1]}), +# ), +# ( +# pd.DataFrame({"col1": [0, 1.1], "col2": ["a", "b"]}), +# Schema({"col1": String(), "col2": String()}), +# Schema({"col1": String(), "col2": String()}), +# Table({"col1": [0, 1.1], "col2": ["a", "b"]}), +# ), +# (pd.DataFrame(), Schema({}), Schema({}), Table()), +# ], +# ids=["one row, one column", "one row, two columns", "two rows, one 
column", "two rows, two columns", "empty"], +# ) +# def test_should_use_the_schema_if_passed( +# dataframe: pd.DataFrame, +# schema: Schema, +# expected: Schema, +# expected_table: Table, +# ) -> None: +# table = Table._from_pandas_dataframe(dataframe, schema) +# assert table._schema == expected +# assert table == expected_table + +# TODO +# @pytest.mark.parametrize( +# ("dataframe", "expected"), +# [ +# ( +# pd.DataFrame({"col1": [0]}), +# Schema({"col1": Integer()}), +# ), +# ( +# pd.DataFrame({"col1": [0], "col2": ["a"]}), +# Schema({"col1": Integer(), "col2": String()}), +# ), +# ( +# pd.DataFrame({"col1": [0, 1.1]}), +# Schema({"col1": RealNumber()}), +# ), +# ( +# pd.DataFrame({"col1": [0, 1.1], "col2": ["a", "b"]}), +# Schema({"col1": RealNumber(), "col2": String()}), +# ), +# ], +# ids=[ +# "one row, one column", +# "one row, two columns", +# "two rows, one column", +# "two rows, two columns", +# ], +# ) +# def test_should_infer_the_schema_if_not_passed(dataframe: pd.DataFrame, expected: Schema) -> None: +# table = Table._from_pandas_dataframe(dataframe) +# assert table._schema == expected + +# TODO +# def test_should_be_able_to_handle_empty_dataframe_with_given_schema() -> None: +# table = Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Integer(), "col2": Integer()})) +# table.get_column("col1") diff --git a/tests/safeds/data/tabular/containers/_table/test_from_rows.py b/tests/safeds/data/tabular/containers/_table/test_from_rows.py deleted file mode 100644 index c82ee294f..000000000 --- a/tests/safeds/data/tabular/containers/_table/test_from_rows.py +++ /dev/null @@ -1,56 +0,0 @@ -import pytest -from safeds.data.tabular.containers import Row, Table -from safeds.exceptions import UnknownColumnNameError - - -@pytest.mark.parametrize( - ("rows", "expected"), - [ - ( - [], - Table(), - ), - ( - [ - Row({"A": 1, "B": 4, "C": "d"}), - Row({"A": 2, "B": 5, "C": "e"}), - Row({"A": 3, "B": 6, "C": "f"}), - ], - Table( - { - "A": [1, 2, 3], - "B": [4, 
5, 6], - "C": ["d", "e", "f"], - }, - ), - ), - ( - [ - Row({"A": 1, "B": 4, "C": "d"}), - Row({"A": 2, "B": 5, "C": "e"}), - Row({"A": 3, "B": "6", "C": "f"}), - ], - Table( - { - "A": [1, 2, 3], - "B": [4, 5, "6"], - "C": ["d", "e", "f"], - }, - ), - ), - ], - ids=["empty", "non-empty", "different schemas"], -) -def test_should_create_table_from_rows(rows: list[Row], expected: Table) -> None: - table = Table.from_rows(rows) - assert table.schema == expected.schema - assert table == expected - - -@pytest.mark.parametrize( - ("rows", "expected_error_msg"), - [([Row({"A": 1, "B": 2}), Row({"A": 2, "C": 4})], r"Could not find column\(s\) 'B'")], -) -def test_should_raise_error_if_unknown_column_names(rows: list[Row], expected_error_msg: str) -> None: - with pytest.raises(UnknownColumnNameError, match=expected_error_msg): - Table.from_rows(rows) diff --git a/tests/safeds/data/tabular/containers/_table/test_init.py b/tests/safeds/data/tabular/containers/_table/test_init.py index 0f7895715..870ca035c 100644 --- a/tests/safeds/data/tabular/containers/_table/test_init.py +++ b/tests/safeds/data/tabular/containers/_table/test_init.py @@ -1,24 +1,23 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import Integer, Schema from safeds.exceptions import ColumnLengthMismatchError - -@pytest.mark.parametrize( - ("table", "expected"), - [ - (Table(), Schema({})), - (Table({}), Schema({})), - (Table({"col1": [0]}), Schema({"col1": Integer()})), - ], - ids=[ - "empty", - "empty (explicit)", - "one column", - ], -) -def test_should_infer_the_schema(table: Table, expected: Schema) -> None: - assert table.schema == expected +# TODO +# @pytest.mark.parametrize( +# ("table", "expected"), +# [ +# (Table(), Schema({})), +# (Table({}), Schema({})), +# (Table({"col1": [0]}), Schema({"col1": Integer()})), +# ], +# ids=[ +# "empty", +# "empty (explicit)", +# "one column", +# ], +# ) +# def test_should_infer_the_schema(table: Table, expected: 
Schema) -> None: +# assert table.schema == expected def test_should_raise_error_if_columns_have_different_lengths() -> None: diff --git a/tests/safeds/data/tabular/containers/_table/test_keep_only_rows.py b/tests/safeds/data/tabular/containers/_table/test_keep_only_rows.py deleted file mode 100644 index 3c3e24f25..000000000 --- a/tests/safeds/data/tabular/containers/_table/test_keep_only_rows.py +++ /dev/null @@ -1,38 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import ColumnType, Integer, Schema - - -@pytest.mark.parametrize( - ("table1", "filter_column", "filter_value", "table2"), - [ - ( - Table(), - "col1", - 1, - Table._from_pandas_dataframe(pd.DataFrame(), Schema({})), - ), - ( - Table({"col1": [3, 2, 4], "col2": [1, 2, 4]}), - "col1", - 1, - Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Integer(), "col2": Integer()})), - ), - ( - Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - "col1", - 1, - Table({"col1": [1, 1], "col2": [1, 4]}), - ), - ], - ids=[ - "empty table", - "no matches", - "matches", - ], -) -def test_should_keep_only_rows(table1: Table, filter_column: str, filter_value: ColumnType, table2: Table) -> None: - table1 = table1.keep_only_rows(lambda row: row.get_value(filter_column) == filter_value) - assert table1.schema == table2.schema - assert table2 == table1 diff --git a/tests/safeds/data/tabular/containers/_table/test_remove_rows.py b/tests/safeds/data/tabular/containers/_table/test_remove_rows.py index 35399b999..5ca8829a4 100644 --- a/tests/safeds/data/tabular/containers/_table/test_remove_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_remove_rows.py @@ -1,18 +1,12 @@ -import pandas as pd +from typing import Any + import pytest from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import ColumnType, Schema @pytest.mark.parametrize( ("table1", "filter_column", "filter_value", "table2"), [ - ( - Table(), 
- "col1", - 1, - Table._from_pandas_dataframe(pd.DataFrame(), Schema({})), - ), ( Table({"col1": [3, 2, 4], "col2": [1, 2, 4]}), "col1", @@ -27,12 +21,11 @@ ), ], ids=[ - "empty table", "no match", "matches", ], ) -def test_should_remove_rows(table1: Table, filter_column: str, filter_value: ColumnType, table2: Table) -> None: +def test_should_remove_rows(table1: Table, filter_column: str, filter_value: Any, table2: Table) -> None: table1 = table1.remove_rows(lambda row: row.get_value(filter_column) == filter_value) assert table1.schema == table2.schema assert table2 == table1 diff --git a/tests/safeds/data/tabular/containers/_table/test_split_rows.py b/tests/safeds/data/tabular/containers/_table/test_split_rows.py index daa5708a6..e57898804 100644 --- a/tests/safeds/data/tabular/containers/_table/test_split_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_split_rows.py @@ -1,7 +1,5 @@ -import pandas as pd import pytest from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import Integer, Nothing, Schema @pytest.mark.parametrize( @@ -15,14 +13,14 @@ ), ( Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Nothing(), "col2": Nothing()})), + Table(), Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), 0, ), ( Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}), - Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Integer(), "col2": Integer()})), + Table(), 1, ), ], @@ -50,7 +48,7 @@ def test_should_split_table( ) def test_should_raise_if_value_not_in_range(percentage_in_first: float) -> None: table = Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}) - with pytest.raises(ValueError, match=r"The given percentage is not between 0 and 1"): + with pytest.raises(ValueError, match=r"is not inside \[0, 1\]"): table.split_rows(percentage_in_first) diff --git a/tests/safeds/data/tabular/containers/_table/test_to_rows.py 
b/tests/safeds/data/tabular/containers/_table/test_to_rows.py deleted file mode 100644 index 92c695a7a..000000000 --- a/tests/safeds/data/tabular/containers/_table/test_to_rows.py +++ /dev/null @@ -1,39 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Row, Table -from safeds.data.tabular.typing import Integer, Schema, String - - -@pytest.mark.parametrize( - ("table", "rows_expected"), - [ - ( - Table._from_pandas_dataframe( - pd.DataFrame([[1, 4, "d"], [2, 5, "e"], [3, 6, "f"]]), - Schema({"A": Integer(), "B": Integer(), "D": String()}), - ), - [ - Row._from_pandas_dataframe( - pd.DataFrame({"A": [1], "B": [4], "D": ["d"]}), - Schema({"A": Integer(), "B": Integer(), "D": String()}), - ), - Row._from_pandas_dataframe( - pd.DataFrame({"A": [2], "B": [5], "D": ["e"]}), - Schema({"A": Integer(), "B": Integer(), "D": String()}), - ), - Row._from_pandas_dataframe( - pd.DataFrame({"A": [3], "B": [6], "D": ["f"]}), - Schema({"A": Integer(), "B": Integer(), "D": String()}), - ), - ], - ), - (Table(), []), - ], - ids=["normal", "empty"], -) -def test_should_return_list_of_rows(table: Table, rows_expected: list[Row]) -> None: - rows_is = table.to_rows() - - for row_is, row_expected in zip(rows_is, rows_expected, strict=True): - assert row_is.schema == row_expected.schema - assert row_is == row_expected diff --git a/tests/safeds/data/tabular/containers/test_row.py b/tests/safeds/data/tabular/containers/test_row.py index 1c431846b..611523627 100644 --- a/tests/safeds/data/tabular/containers/test_row.py +++ b/tests/safeds/data/tabular/containers/test_row.py @@ -1,517 +1,518 @@ -import re -import sys -from typing import Any - -import pandas as pd -import pytest -from safeds.data.tabular.containers import Row, Table -from safeds.data.tabular.typing import ColumnType, Integer, Schema, String -from safeds.exceptions import UnknownColumnNameError - - -class TestFromDict: - @pytest.mark.parametrize( - ("data", "expected"), - [ - ( - {}, - 
Row({}), - ), - ( - { - "a": 1, - "b": 2, - }, - Row({"a": 1, "b": 2}), - ), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_create_row_from_dict(self, data: dict[str, Any], expected: Row) -> None: - assert Row.from_dict(data) == expected - - -class TestFromPandasDataFrame: - @pytest.mark.parametrize( - ("dataframe", "schema", "expected"), - [ - ( - pd.DataFrame({"col1": [0]}), - Schema({"col1": String()}), - Schema({"col1": String()}), - ), - ( - pd.DataFrame({"col1": [0], "col2": ["a"]}), - Schema({"col1": String(), "col2": String()}), - Schema({"col1": String(), "col2": String()}), - ), - ], - ids=[ - "one column", - "two columns", - ], - ) - def test_should_use_the_schema_if_passed(self, dataframe: pd.DataFrame, schema: Schema, expected: Schema) -> None: - row = Row._from_pandas_dataframe(dataframe, schema) - assert row._schema == expected - - @pytest.mark.parametrize( - ("dataframe", "expected"), - [ - ( - pd.DataFrame({"col1": [0]}), - Schema({"col1": Integer()}), - ), - ( - pd.DataFrame({"col1": [0], "col2": ["a"]}), - Schema({"col1": Integer(), "col2": String()}), - ), - ], - ids=[ - "one column", - "two columns", - ], - ) - def test_should_infer_the_schema_if_not_passed(self, dataframe: pd.DataFrame, expected: Schema) -> None: - row = Row._from_pandas_dataframe(dataframe) - assert row._schema == expected - - @pytest.mark.parametrize( - "dataframe", - [ - pd.DataFrame(), - pd.DataFrame({"col1": [0, 1]}), - ], - ids=[ - "empty", - "two rows", - ], - ) - def test_should_raise_if_dataframe_does_not_contain_exactly_one_row(self, dataframe: pd.DataFrame) -> None: - with pytest.raises(ValueError, match=re.escape("The dataframe has to contain exactly one row.")): - Row._from_pandas_dataframe(dataframe) - - -class TestInit: - @pytest.mark.parametrize( - ("row", "expected"), - [ - (Row(), Schema({})), - (Row({}), Schema({})), - (Row({"col1": 0}), Schema({"col1": Integer()})), - ], - ids=[ - "empty", - "empty (explicit)", - "one column", - ], - ) - 
def test_should_infer_the_schema(self, row: Row, expected: Schema) -> None: - assert row._schema == expected - - -class TestContains: - @pytest.mark.parametrize( - ("row", "column_name", "expected"), - [ - (Row({}), "col1", False), - (Row({"col1": 0}), "col1", True), - (Row({"col1": 0}), "col2", False), - (Row({"col1": 0}), 1, False), - ], - ids=[ - "empty row", - "column exists", - "column does not exist", - "not a string", - ], - ) - def test_should_return_whether_the_row_has_the_column(self, row: Row, column_name: str, expected: bool) -> None: - assert (column_name in row) == expected - - -class TestEq: - @pytest.mark.parametrize( - ("row1", "row2", "expected"), - [ - (Row(), Row(), True), - (Row({"col1": 0}), Row({"col1": 0}), True), - (Row({"col1": 0}), Row({"col1": 1}), False), - (Row({"col1": 0}), Row({"col2": 0}), False), - (Row({"col1": 0}), Row({"col1": "a"}), False), - ], - ids=[ - "empty rows", - "equal rows", - "different values", - "different columns", - "different types", - ], - ) - def test_should_return_whether_two_rows_are_equal(self, row1: Row, row2: Row, expected: bool) -> None: - assert (row1.__eq__(row2)) == expected - - @pytest.mark.parametrize( - "row", - [ - Row(), - Row({"col1": 0}), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_return_true_if_objects_are_identical(self, row: Row) -> None: - assert (row.__eq__(row)) is True - - @pytest.mark.parametrize( - ("row", "other"), - [ - (Row({"col1": 0}), None), - (Row({"col1": 0}), Table()), - ], - ids=[ - "Row vs. None", - "Row vs. 
Table", - ], - ) - def test_should_return_not_implemented_if_other_is_not_row(self, row: Row, other: Any) -> None: - assert (row.__eq__(other)) is NotImplemented - - -class TestHash: - @pytest.mark.parametrize( - ("row1", "row2"), - [ - (Row(), Row()), - (Row({"col1": 0}), Row({"col1": 0})), - ], - ids=[ - "empty rows", - "equal rows", - ], - ) - def test_should_return_same_hash_for_equal_rows(self, row1: Row, row2: Row) -> None: - assert hash(row1) == hash(row2) - - @pytest.mark.parametrize( - ("row1", "row2"), - [ - (Row({"col1": 0}), Row({"col1": 1})), - (Row({"col1": 0}), Row({"col2": 0})), - (Row({"col1": 0}), Row({"col1": "a"})), - ], - ids=[ - "different values", - "different columns", - "different types", - ], - ) - def test_should_return_different_hash_for_unequal_rows(self, row1: Row, row2: Row) -> None: - assert hash(row1) != hash(row2) - - -class TestGetitem: - @pytest.mark.parametrize( - ("row", "column_name", "expected"), - [ - (Row({"col1": 0}), "col1", 0), - (Row({"col1": 0, "col2": "a"}), "col2", "a"), - ], - ids=[ - "one column", - "two columns", - ], - ) - def test_should_return_the_value_in_the_column(self, row: Row, column_name: str, expected: Any) -> None: - assert row[column_name] == expected - - @pytest.mark.parametrize( - ("row", "column_name"), - [ - (Row(), "col1"), - (Row({"col1": 0}), "col2"), - ], - ids=[ - "empty row", - "column does not exist", - ], - ) - def test_should_raise_if_column_does_not_exist(self, row: Row, column_name: str) -> None: - with pytest.raises(UnknownColumnNameError): - # noinspection PyStatementEffect - row[column_name] - - -class TestIter: - @pytest.mark.parametrize( - ("row", "expected"), - [ - (Row(), []), - (Row({"col1": 0}), ["col1"]), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_return_an_iterator_for_the_column_names(self, row: Row, expected: list[str]) -> None: - assert list(row) == expected - - -class TestLen: - @pytest.mark.parametrize( - ("row", "expected"), - [ - (Row(), 0), - 
(Row({"col1": 0, "col2": "a"}), 2), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_return_the_number_of_columns(self, row: Row, expected: int) -> None: - assert len(row) == expected - - -class TestRepr: - @pytest.mark.parametrize( - ("row", "expected"), - [ - (Row(), "Row({})"), - (Row({"col1": 0}), "Row({'col1': 0})"), - (Row({"col1": 0, "col2": "a"}), "Row({\n 'col1': 0,\n 'col2': 'a'\n})"), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_return_a_string_representation(self, row: Row, expected: str) -> None: - assert repr(row) == expected - - -class TestStr: - @pytest.mark.parametrize( - ("row", "expected"), - [ - (Row(), "{}"), - (Row({"col1": 0}), "{'col1': 0}"), - (Row({"col1": 0, "col2": "a"}), "{\n 'col1': 0,\n 'col2': 'a'\n}"), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_return_a_string_representation(self, row: Row, expected: str) -> None: - assert str(row) == expected - - -class TestColumnNames: - @pytest.mark.parametrize( - ("row", "expected"), - [ - (Row(), []), - (Row({"col1": 0}), ["col1"]), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_return_the_column_names(self, row: Row, expected: list[str]) -> None: - assert row.column_names == expected - - -class TestNumberOfColumns: - @pytest.mark.parametrize( - ("row", "expected"), - [ - (Row(), 0), - (Row({"col1": 0, "col2": "a"}), 2), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_return_the_number_of_columns(self, row: Row, expected: int) -> None: - assert row.number_of_columns == expected - - -class TestGetValue: - @pytest.mark.parametrize( - ("row", "column_name", "expected"), - [ - (Row({"col1": 0}), "col1", 0), - (Row({"col1": 0, "col2": "a"}), "col2", "a"), - ], - ids=[ - "one column", - "two columns", - ], - ) - def test_should_return_the_value_in_the_column(self, row: Row, column_name: str, expected: Any) -> None: - assert row.get_value(column_name) == 
expected - - @pytest.mark.parametrize( - ("row", "column_name"), - [ - (Row({}), "col1"), - (Row({"col1": 0}), "col2"), - ], - ids=[ - "empty row", - "column does not exist", - ], - ) - def test_should_raise_if_column_does_not_exist(self, row: Row, column_name: str) -> None: - with pytest.raises(UnknownColumnNameError): - row.get_value(column_name) - - -class TestHasColumn: - @pytest.mark.parametrize( - ("row", "column_name", "expected"), - [ - (Row(), "col1", False), - (Row({"col1": 0}), "col1", True), - (Row({"col1": 0}), "col2", False), - ], - ids=[ - "empty row", - "column exists", - "column does not exist", - ], - ) - def test_should_return_whether_the_row_has_the_column(self, row: Row, column_name: str, expected: bool) -> None: - assert row.has_column(column_name) == expected - - -class TestGetColumnType: - @pytest.mark.parametrize( - ("row", "column_name", "expected"), - [ - (Row({"col1": 0}), "col1", Integer()), - (Row({"col1": 0, "col2": "a"}), "col2", String()), - ], - ids=[ - "one column", - "two columns", - ], - ) - def test_should_return_the_type_of_the_column(self, row: Row, column_name: str, expected: ColumnType) -> None: - assert row.get_column_type(column_name) == expected - - @pytest.mark.parametrize( - ("row", "column_name"), - [ - (Row(), "col1"), - (Row({"col1": 0}), "col2"), - ], - ids=[ - "empty row", - "column does not exist", - ], - ) - def test_should_raise_if_column_does_not_exist(self, row: Row, column_name: str) -> None: - with pytest.raises(UnknownColumnNameError): - row.get_column_type(column_name) - - -class TestToDict: - @pytest.mark.parametrize( - ("row", "expected"), - [ - ( - Row(), - {}, - ), - ( - Row({"a": 1, "b": 2}), - { - "a": 1, - "b": 2, - }, - ), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_return_dict_for_table(self, row: Row, expected: dict[str, Any]) -> None: - assert row.to_dict() == expected - - -class TestReprHtml: - @pytest.mark.parametrize( - "row", - [ - Row(), - Row({"a": 1, "b": 2}), - ], 
- ids=[ - "empty", - "non-empty", - ], - ) - def test_should_contain_table_element(self, row: Row) -> None: - pattern = r".*?" - assert re.search(pattern, row._repr_html_(), flags=re.S) is not None - - @pytest.mark.parametrize( - "row", - [ - Row(), - Row({"a": 1, "b": 2}), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_contain_th_element_for_each_column_name(self, row: Row) -> None: - for column_name in row.column_names: - assert f"{column_name}" in row._repr_html_() - - @pytest.mark.parametrize( - "row", - [ - Row(), - Row({"a": 1, "b": 2}), - ], - ids=[ - "empty", - "non-empty", - ], - ) - def test_should_contain_td_element_for_each_value(self, row: Row) -> None: - for value in row.values(): - assert f"{value}" in row._repr_html_() - - -class TestSizeof: - @pytest.mark.parametrize( - "row", - [ - Row(), - Row({"col1": 0}), - Row({"col1": 0, "col2": "a"}), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_size_be_greater_than_normal_object(self, row: Row) -> None: - assert sys.getsizeof(row) > sys.getsizeof(object()) +# TODO +# import re +# import sys +# from typing import Any +# +# import pandas as pd +# import pytest +# from safeds.data.tabular.containers import Row, Table +# from safeds.data.tabular.typing import ColumnType, Integer, Schema, String +# from safeds.exceptions import UnknownColumnNameError +# +# +# class TestFromDict: +# @pytest.mark.parametrize( +# ("data", "expected"), +# [ +# ( +# {}, +# Row({}), +# ), +# ( +# { +# "a": 1, +# "b": 2, +# }, +# Row({"a": 1, "b": 2}), +# ), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_create_row_from_dict(self, data: dict[str, Any], expected: Row) -> None: +# assert Row.from_dict(data) == expected +# +# +# class TestFromPandasDataFrame: +# @pytest.mark.parametrize( +# ("dataframe", "schema", "expected"), +# [ +# ( +# pd.DataFrame({"col1": [0]}), +# Schema({"col1": String()}), +# Schema({"col1": String()}), +# ), +# ( +# 
pd.DataFrame({"col1": [0], "col2": ["a"]}), +# Schema({"col1": String(), "col2": String()}), +# Schema({"col1": String(), "col2": String()}), +# ), +# ], +# ids=[ +# "one column", +# "two columns", +# ], +# ) +# def test_should_use_the_schema_if_passed(self, dataframe: pd.DataFrame, schema: Schema, expected: Schema) -> None: +# row = Row._from_pandas_dataframe(dataframe, schema) +# assert row._schema == expected +# +# @pytest.mark.parametrize( +# ("dataframe", "expected"), +# [ +# ( +# pd.DataFrame({"col1": [0]}), +# Schema({"col1": Integer()}), +# ), +# ( +# pd.DataFrame({"col1": [0], "col2": ["a"]}), +# Schema({"col1": Integer(), "col2": String()}), +# ), +# ], +# ids=[ +# "one column", +# "two columns", +# ], +# ) +# def test_should_infer_the_schema_if_not_passed(self, dataframe: pd.DataFrame, expected: Schema) -> None: +# row = Row._from_pandas_dataframe(dataframe) +# assert row._schema == expected +# +# @pytest.mark.parametrize( +# "dataframe", +# [ +# pd.DataFrame(), +# pd.DataFrame({"col1": [0, 1]}), +# ], +# ids=[ +# "empty", +# "two rows", +# ], +# ) +# def test_should_raise_if_dataframe_does_not_contain_exactly_one_row(self, dataframe: pd.DataFrame) -> None: +# with pytest.raises(ValueError, match=re.escape("The dataframe has to contain exactly one row.")): +# Row._from_pandas_dataframe(dataframe) +# +# +# class TestInit: +# @pytest.mark.parametrize( +# ("row", "expected"), +# [ +# (Row(), Schema({})), +# (Row({}), Schema({})), +# (Row({"col1": 0}), Schema({"col1": Integer()})), +# ], +# ids=[ +# "empty", +# "empty (explicit)", +# "one column", +# ], +# ) +# def test_should_infer_the_schema(self, row: Row, expected: Schema) -> None: +# assert row._schema == expected +# +# +# class TestContains: +# @pytest.mark.parametrize( +# ("row", "column_name", "expected"), +# [ +# (Row({}), "col1", False), +# (Row({"col1": 0}), "col1", True), +# (Row({"col1": 0}), "col2", False), +# (Row({"col1": 0}), 1, False), +# ], +# ids=[ +# "empty row", +# "column exists", +# 
"column does not exist", +# "not a string", +# ], +# ) +# def test_should_return_whether_the_row_has_the_column(self, row: Row, column_name: str, expected: bool) -> None: +# assert (column_name in row) == expected +# +# +# class TestEq: +# @pytest.mark.parametrize( +# ("row1", "row2", "expected"), +# [ +# (Row(), Row(), True), +# (Row({"col1": 0}), Row({"col1": 0}), True), +# (Row({"col1": 0}), Row({"col1": 1}), False), +# (Row({"col1": 0}), Row({"col2": 0}), False), +# (Row({"col1": 0}), Row({"col1": "a"}), False), +# ], +# ids=[ +# "empty rows", +# "equal rows", +# "different values", +# "different columns", +# "different types", +# ], +# ) +# def test_should_return_whether_two_rows_are_equal(self, row1: Row, row2: Row, expected: bool) -> None: +# assert (row1.__eq__(row2)) == expected +# +# @pytest.mark.parametrize( +# "row", +# [ +# Row(), +# Row({"col1": 0}), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_return_true_if_objects_are_identical(self, row: Row) -> None: +# assert (row.__eq__(row)) is True +# +# @pytest.mark.parametrize( +# ("row", "other"), +# [ +# (Row({"col1": 0}), None), +# (Row({"col1": 0}), Table()), +# ], +# ids=[ +# "Row vs. None", +# "Row vs. 
Table", +# ], +# ) +# def test_should_return_not_implemented_if_other_is_not_row(self, row: Row, other: Any) -> None: +# assert (row.__eq__(other)) is NotImplemented +# +# +# class TestHash: +# @pytest.mark.parametrize( +# ("row1", "row2"), +# [ +# (Row(), Row()), +# (Row({"col1": 0}), Row({"col1": 0})), +# ], +# ids=[ +# "empty rows", +# "equal rows", +# ], +# ) +# def test_should_return_same_hash_for_equal_rows(self, row1: Row, row2: Row) -> None: +# assert hash(row1) == hash(row2) +# +# @pytest.mark.parametrize( +# ("row1", "row2"), +# [ +# (Row({"col1": 0}), Row({"col1": 1})), +# (Row({"col1": 0}), Row({"col2": 0})), +# (Row({"col1": 0}), Row({"col1": "a"})), +# ], +# ids=[ +# "different values", +# "different columns", +# "different types", +# ], +# ) +# def test_should_return_different_hash_for_unequal_rows(self, row1: Row, row2: Row) -> None: +# assert hash(row1) != hash(row2) +# +# +# class TestGetitem: +# @pytest.mark.parametrize( +# ("row", "column_name", "expected"), +# [ +# (Row({"col1": 0}), "col1", 0), +# (Row({"col1": 0, "col2": "a"}), "col2", "a"), +# ], +# ids=[ +# "one column", +# "two columns", +# ], +# ) +# def test_should_return_the_value_in_the_column(self, row: Row, column_name: str, expected: Any) -> None: +# assert row[column_name] == expected +# +# @pytest.mark.parametrize( +# ("row", "column_name"), +# [ +# (Row(), "col1"), +# (Row({"col1": 0}), "col2"), +# ], +# ids=[ +# "empty row", +# "column does not exist", +# ], +# ) +# def test_should_raise_if_column_does_not_exist(self, row: Row, column_name: str) -> None: +# with pytest.raises(UnknownColumnNameError): +# # noinspection PyStatementEffect +# row[column_name] +# +# +# class TestIter: +# @pytest.mark.parametrize( +# ("row", "expected"), +# [ +# (Row(), []), +# (Row({"col1": 0}), ["col1"]), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_return_an_iterator_for_the_column_names(self, row: Row, expected: list[str]) -> None: +# assert list(row) == expected +# +# 
+# class TestLen: +# @pytest.mark.parametrize( +# ("row", "expected"), +# [ +# (Row(), 0), +# (Row({"col1": 0, "col2": "a"}), 2), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_return_the_number_of_columns(self, row: Row, expected: int) -> None: +# assert len(row) == expected +# +# +# class TestRepr: +# @pytest.mark.parametrize( +# ("row", "expected"), +# [ +# (Row(), "Row({})"), +# (Row({"col1": 0}), "Row({'col1': 0})"), +# (Row({"col1": 0, "col2": "a"}), "Row({\n 'col1': 0,\n 'col2': 'a'\n})"), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_return_a_string_representation(self, row: Row, expected: str) -> None: +# assert repr(row) == expected +# +# +# class TestStr: +# @pytest.mark.parametrize( +# ("row", "expected"), +# [ +# (Row(), "{}"), +# (Row({"col1": 0}), "{'col1': 0}"), +# (Row({"col1": 0, "col2": "a"}), "{\n 'col1': 0,\n 'col2': 'a'\n}"), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_return_a_string_representation(self, row: Row, expected: str) -> None: +# assert str(row) == expected +# +# +# class TestColumnNames: +# @pytest.mark.parametrize( +# ("row", "expected"), +# [ +# (Row(), []), +# (Row({"col1": 0}), ["col1"]), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_return_the_column_names(self, row: Row, expected: list[str]) -> None: +# assert row.column_names == expected +# +# +# class TestNumberOfColumns: +# @pytest.mark.parametrize( +# ("row", "expected"), +# [ +# (Row(), 0), +# (Row({"col1": 0, "col2": "a"}), 2), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_return_the_number_of_columns(self, row: Row, expected: int) -> None: +# assert row.number_of_columns == expected +# +# +# class TestGetValue: +# @pytest.mark.parametrize( +# ("row", "column_name", "expected"), +# [ +# (Row({"col1": 0}), "col1", 0), +# (Row({"col1": 0, "col2": "a"}), "col2", "a"), +# ], +# ids=[ +# 
"one column", +# "two columns", +# ], +# ) +# def test_should_return_the_value_in_the_column(self, row: Row, column_name: str, expected: Any) -> None: +# assert row.get_value(column_name) == expected +# +# @pytest.mark.parametrize( +# ("row", "column_name"), +# [ +# (Row({}), "col1"), +# (Row({"col1": 0}), "col2"), +# ], +# ids=[ +# "empty row", +# "column does not exist", +# ], +# ) +# def test_should_raise_if_column_does_not_exist(self, row: Row, column_name: str) -> None: +# with pytest.raises(UnknownColumnNameError): +# row.get_value(column_name) +# +# +# class TestHasColumn: +# @pytest.mark.parametrize( +# ("row", "column_name", "expected"), +# [ +# (Row(), "col1", False), +# (Row({"col1": 0}), "col1", True), +# (Row({"col1": 0}), "col2", False), +# ], +# ids=[ +# "empty row", +# "column exists", +# "column does not exist", +# ], +# ) +# def test_should_return_whether_the_row_has_the_column(self, row: Row, column_name: str, expected: bool) -> None: +# assert row.has_column(column_name) == expected +# +# +# class TestGetColumnType: +# @pytest.mark.parametrize( +# ("row", "column_name", "expected"), +# [ +# (Row({"col1": 0}), "col1", Integer()), +# (Row({"col1": 0, "col2": "a"}), "col2", String()), +# ], +# ids=[ +# "one column", +# "two columns", +# ], +# ) +# def test_should_return_the_type_of_the_column(self, row: Row, column_name: str, expected: ColumnType) -> None: +# assert row.get_column_type(column_name) == expected +# +# @pytest.mark.parametrize( +# ("row", "column_name"), +# [ +# (Row(), "col1"), +# (Row({"col1": 0}), "col2"), +# ], +# ids=[ +# "empty row", +# "column does not exist", +# ], +# ) +# def test_should_raise_if_column_does_not_exist(self, row: Row, column_name: str) -> None: +# with pytest.raises(UnknownColumnNameError): +# row.get_column_type(column_name) +# +# +# class TestToDict: +# @pytest.mark.parametrize( +# ("row", "expected"), +# [ +# ( +# Row(), +# {}, +# ), +# ( +# Row({"a": 1, "b": 2}), +# { +# "a": 1, +# "b": 2, +# }, +# ), +# 
], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_return_dict_for_table(self, row: Row, expected: dict[str, Any]) -> None: +# assert row.to_dict() == expected +# +# +# class TestReprHtml: +# @pytest.mark.parametrize( +# "row", +# [ +# Row(), +# Row({"a": 1, "b": 2}), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_contain_table_element(self, row: Row) -> None: +# pattern = r".*?" +# assert re.search(pattern, row._repr_html_(), flags=re.S) is not None +# +# @pytest.mark.parametrize( +# "row", +# [ +# Row(), +# Row({"a": 1, "b": 2}), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_contain_th_element_for_each_column_name(self, row: Row) -> None: +# for column_name in row.column_names: +# assert f"{column_name}" in row._repr_html_() +# +# @pytest.mark.parametrize( +# "row", +# [ +# Row(), +# Row({"a": 1, "b": 2}), +# ], +# ids=[ +# "empty", +# "non-empty", +# ], +# ) +# def test_should_contain_td_element_for_each_value(self, row: Row) -> None: +# for value in row.values(): +# assert f"{value}" in row._repr_html_() +# +# +# class TestSizeof: +# @pytest.mark.parametrize( +# "row", +# [ +# Row(), +# Row({"col1": 0}), +# Row({"col1": 0, "col2": "a"}), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_size_be_greater_than_normal_object(self, row: Row) -> None: +# assert sys.getsizeof(row) > sys.getsizeof(object()) diff --git a/tests/safeds/data/tabular/transformation/test_imputer.py b/tests/safeds/data/tabular/transformation/test_simple_imputer.py similarity index 74% rename from tests/safeds/data/tabular/transformation/test_imputer.py rename to tests/safeds/data/tabular/transformation/test_simple_imputer.py index 4002fae9c..ee84ae1a9 100644 --- a/tests/safeds/data/tabular/transformation/test_imputer.py +++ b/tests/safeds/data/tabular/transformation/test_simple_imputer.py @@ -3,12 +3,12 @@ import pytest from safeds.data.tabular.containers import Table 
-from safeds.data.tabular.transformation import Imputer -from safeds.data.tabular.transformation._imputer import _Mode +from safeds.data.tabular.transformation import SimpleImputer +from safeds.data.tabular.transformation._simple_imputer import _Mode from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError -def strategies() -> list[Imputer.Strategy]: +def strategies() -> list[SimpleImputer.Strategy]: """ Return the list of imputer strategies to test. @@ -20,26 +20,31 @@ def strategies() -> list[Imputer.Strategy]: strategies : list[Imputer.Strategy] The list of classifiers to test. """ - return [Imputer.Strategy.Constant(2), Imputer.Strategy.Mean(), Imputer.Strategy.Median(), Imputer.Strategy.Mode()] + return [ + SimpleImputer.Strategy.Constant(2), + SimpleImputer.Strategy.Mean(), + SimpleImputer.Strategy.Median(), + SimpleImputer.Strategy.Mode(), + ] class TestStrategyClass: def test_should_be_able_to_get_value_of_constant_strategy(self) -> None: - assert Imputer.Strategy.Constant(1).value == 1 # type: ignore[attr-defined] + assert SimpleImputer.Strategy.Constant(1).value == 1 # type: ignore[attr-defined] @pytest.mark.parametrize( ("strategy", "type_", "expected"), [ - (Imputer.Strategy.Constant(0), Imputer.Strategy.Constant, True), - (Imputer.Strategy.Mean(), Imputer.Strategy.Mean, True), - (Imputer.Strategy.Median(), Imputer.Strategy.Median, True), - (Imputer.Strategy.Mode(), Imputer.Strategy.Mode, True), - (Imputer.Strategy.Mode(), Imputer.Strategy.Mean, False), + (SimpleImputer.Strategy.Constant(0), SimpleImputer.Strategy.Constant, True), + (SimpleImputer.Strategy.Mean(), SimpleImputer.Strategy.Mean, True), + (SimpleImputer.Strategy.Median(), SimpleImputer.Strategy.Median, True), + (SimpleImputer.Strategy.Mode(), SimpleImputer.Strategy.Mode, True), + (SimpleImputer.Strategy.Mode(), SimpleImputer.Strategy.Mean, False), ], ) def test_should_be_able_to_use_strategy_in_isinstance( self, - strategy: Imputer.Strategy, + 
strategy: SimpleImputer.Strategy, type_: type, expected: bool, ) -> None: @@ -53,8 +58,8 @@ class TestEq: ) def test_equal_strategy( self, - strategy1: Imputer.Strategy, - strategy2: Imputer.Strategy, + strategy1: SimpleImputer.Strategy, + strategy2: SimpleImputer.Strategy, ) -> None: assert strategy1 == strategy2 @@ -65,7 +70,7 @@ def test_equal_strategy( ) def test_equal_identity_strategy( self, - strategy: Imputer.Strategy, + strategy: SimpleImputer.Strategy, ) -> None: assert strategy == strategy # noqa: PLR0124 @@ -76,8 +81,8 @@ def test_equal_identity_strategy( ) def test_unequal_strategy( self, - strategy1: Imputer.Strategy, - strategy2: Imputer.Strategy, + strategy1: SimpleImputer.Strategy, + strategy2: SimpleImputer.Strategy, ) -> None: assert strategy1 != strategy2 @@ -89,8 +94,8 @@ class TestHash: ) def test_should_return_same_hash_for_equal_strategy( self, - strategy1: Imputer.Strategy, - strategy2: Imputer.Strategy, + strategy1: SimpleImputer.Strategy, + strategy2: SimpleImputer.Strategy, ) -> None: assert hash(strategy1) == hash(strategy2) @@ -101,20 +106,20 @@ def test_should_return_same_hash_for_equal_strategy( ) def test_should_return_different_hash_for_unequal_strategy( self, - strategy1: Imputer.Strategy, - strategy2: Imputer.Strategy, + strategy1: SimpleImputer.Strategy, + strategy2: SimpleImputer.Strategy, ) -> None: assert hash(strategy1) != hash(strategy2) class TestSizeof: @pytest.mark.parametrize( "strategy", - ([Imputer.Strategy.Constant(1)]), + ([SimpleImputer.Strategy.Constant(1)]), ids=lambda x: x.__class__.__name__, ) def test_sizeof_strategy( self, - strategy: Imputer.Strategy, + strategy: SimpleImputer.Strategy, ) -> None: assert sys.getsizeof(strategy) > sys.getsizeof(object()) @@ -122,14 +127,14 @@ class TestStr: @pytest.mark.parametrize( ("strategy", "expected"), [ - (Imputer.Strategy.Constant(0), "Constant(0)"), - (Imputer.Strategy.Mean(), "Mean"), - (Imputer.Strategy.Median(), "Median"), - (Imputer.Strategy.Mode(), "Mode"), + 
(SimpleImputer.Strategy.Constant(0), "Constant(0)"), + (SimpleImputer.Strategy.Mean(), "Mean"), + (SimpleImputer.Strategy.Median(), "Median"), + (SimpleImputer.Strategy.Mode(), "Mode"), ], ids=lambda x: x.__class__.__name__, ) - def test_should_return_correct_string_representation(self, strategy: Imputer.Strategy, expected: str) -> None: + def test_should_return_correct_string_representation(self, strategy: SimpleImputer.Strategy, expected: str) -> None: assert str(strategy) == expected @@ -139,8 +144,8 @@ class TestStrategyProperty: strategies(), ids=lambda x: x.__class__.__name__, ) - def test_should_return_correct_strategy(self, strategy: Imputer.Strategy) -> None: - assert Imputer(strategy).strategy == strategy + def test_should_return_correct_strategy(self, strategy: SimpleImputer.Strategy) -> None: + assert SimpleImputer(strategy).strategy == strategy class TestValueToReplaceProperty: @@ -149,12 +154,12 @@ class TestValueToReplaceProperty: [0], ) def test_should_return_correct_value_to_replace(self, value_to_replace: float | str | None) -> None: - assert Imputer(Imputer.Strategy.Mode(), value_to_replace=value_to_replace).value_to_replace == value_to_replace + assert SimpleImputer(SimpleImputer.Strategy.Mode(), value_to_replace=value_to_replace).value_to_replace == value_to_replace class TestFit: @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_raise_if_column_not_found(self, strategy: Imputer.Strategy) -> None: + def test_should_raise_if_column_not_found(self, strategy: SimpleImputer.Strategy) -> None: table = Table( { "a": [1, 3, None], @@ -162,18 +167,18 @@ def test_should_raise_if_column_not_found(self, strategy: Imputer.Strategy) -> N ) with pytest.raises(UnknownColumnNameError, match=r"Could not find column\(s\) 'b, c'"): - Imputer(strategy).fit(table, ["b", "c"]) + SimpleImputer(strategy).fit(table, ["b", "c"]) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - 
def test_should_raise_if_table_contains_no_rows(self, strategy: Imputer.Strategy) -> None: - with pytest.raises(ValueError, match=r"The Imputer cannot be fitted because the table contains 0 rows"): - Imputer(strategy).fit(Table({"col1": []}), ["col1"]) + def test_should_raise_if_table_contains_no_rows(self, strategy: SimpleImputer.Strategy) -> None: + with pytest.raises(ValueError, match=r"The SimpleImputer cannot be fitted because the table contains 0 rows"): + SimpleImputer(strategy).fit(Table({"col1": []}), ["col1"]) @pytest.mark.parametrize( ("table", "col_names", "strategy"), [ - (Table({"col1": [1, None, "ok"], "col2": [1, 2, "3"]}), ["col1", "col2"], Imputer.Strategy.Mean()), - (Table({"col1": [1, None, "ok"], "col2": [1, 2, "3"]}), ["col1", "col2"], Imputer.Strategy.Median()), + (Table({"col1": [1, None, "ok"], "col2": [1, 2, "3"]}), ["col1", "col2"], SimpleImputer.Strategy.Mean()), + (Table({"col1": [1, None, "ok"], "col2": [1, 2, "3"]}), ["col1", "col2"], SimpleImputer.Strategy.Median()), ], ids=["Strategy Mean", "Strategy Median"], ) @@ -181,13 +186,13 @@ def test_should_raise_if_table_contains_non_numerical_data( self, table: Table, col_names: list[str], - strategy: Imputer.Strategy, + strategy: SimpleImputer.Strategy, ) -> None: with pytest.raises( NonNumericColumnError, match=r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\['col1', 'col2'\]", ): - Imputer(strategy).fit(table, col_names) + SimpleImputer(strategy).fit(table, col_names) @pytest.mark.parametrize( ("table", "most_frequent"), @@ -210,17 +215,17 @@ def test_should_warn_if_multiple_mode_values(self, table: Table, most_frequent: rf" values:\n{most_frequent}" ), ): - Imputer(Imputer.Strategy.Mode()).fit(table, None) + SimpleImputer(SimpleImputer.Strategy.Mode()).fit(table, None) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_not_change_original_transformer(self, strategy: Imputer.Strategy) -> None: + def 
test_should_not_change_original_transformer(self, strategy: SimpleImputer.Strategy) -> None: table = Table( { "a": [1, 3, 3, None], }, ) - transformer = Imputer(strategy) + transformer = SimpleImputer(strategy) transformer.fit(table, None) assert transformer._wrapped_transformer is None @@ -229,7 +234,7 @@ def test_should_not_change_original_transformer(self, strategy: Imputer.Strategy class TestTransform: @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_raise_if_column_not_found(self, strategy: Imputer.Strategy) -> None: + def test_should_raise_if_column_not_found(self, strategy: SimpleImputer.Strategy) -> None: table_to_fit = Table( { "a": [1, 3, 3, None], @@ -244,9 +249,9 @@ def test_should_raise_if_column_not_found(self, strategy: Imputer.Strategy) -> N message=r"There are multiple most frequent values in a column given to the Imputer\..*", category=UserWarning, ) - transformer = Imputer(strategy).fit(table_to_fit, None) + transformer = SimpleImputer(strategy).fit(table_to_fit, None) else: - transformer = Imputer(strategy).fit(table_to_fit, None) + transformer = SimpleImputer(strategy).fit(table_to_fit, None) table_to_transform = Table( { @@ -258,19 +263,19 @@ def test_should_raise_if_column_not_found(self, strategy: Imputer.Strategy) -> N transformer.transform(table_to_transform) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_raise_if_table_contains_no_rows(self, strategy: Imputer.Strategy) -> None: + def test_should_raise_if_table_contains_no_rows(self, strategy: SimpleImputer.Strategy) -> None: with pytest.raises(ValueError, match=r"The Imputer cannot transform the table because it contains 0 rows"): - Imputer(strategy).fit(Table({"col1": [1, 2, 2]}), ["col1"]).transform(Table({"col1": []})) + SimpleImputer(strategy).fit(Table({"col1": [1, 2, 2]}), ["col1"]).transform(Table({"col1": []})) @pytest.mark.parametrize("strategy", strategies(), 
ids=lambda x: x.__class__.__name__) - def test_should_raise_if_not_fitted(self, strategy: Imputer.Strategy) -> None: + def test_should_raise_if_not_fitted(self, strategy: SimpleImputer.Strategy) -> None: table = Table( { "a": [1, 3, None], }, ) - transformer = Imputer(strategy) + transformer = SimpleImputer(strategy) with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): transformer.transform(table) @@ -278,19 +283,19 @@ def test_should_raise_if_not_fitted(self, strategy: Imputer.Strategy) -> None: class TestIsFitted: @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_return_false_before_fitting(self, strategy: Imputer.Strategy) -> None: - transformer = Imputer(strategy) + def test_should_return_false_before_fitting(self, strategy: SimpleImputer.Strategy) -> None: + transformer = SimpleImputer(strategy) assert not transformer.is_fitted @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_return_true_after_fitting(self, strategy: Imputer.Strategy) -> None: + def test_should_return_true_after_fitting(self, strategy: SimpleImputer.Strategy) -> None: table = Table( { "a": [1, 3, 3, None], }, ) - transformer = Imputer(strategy) + transformer = SimpleImputer(strategy) fitted_transformer = transformer.fit(table, None) assert fitted_transformer.is_fitted @@ -306,7 +311,7 @@ class TestFitAndTransform: }, ), None, - Imputer.Strategy.Constant(0.0), + SimpleImputer.Strategy.Constant(0.0), None, Table( { @@ -321,7 +326,7 @@ class TestFitAndTransform: }, ), None, - Imputer.Strategy.Mean(), + SimpleImputer.Strategy.Mean(), None, Table( { @@ -336,7 +341,7 @@ class TestFitAndTransform: }, ), None, - Imputer.Strategy.Median(), + SimpleImputer.Strategy.Median(), None, Table( { @@ -351,7 +356,7 @@ class TestFitAndTransform: }, ), None, - Imputer.Strategy.Mode(), + SimpleImputer.Strategy.Mode(), None, Table( { @@ -367,7 +372,7 @@ class 
TestFitAndTransform: }, ), ["a"], - Imputer.Strategy.Constant(0.0), + SimpleImputer.Strategy.Constant(0.0), None, Table( { @@ -383,7 +388,7 @@ class TestFitAndTransform: }, ), ["a"], - Imputer.Strategy.Mode(), + SimpleImputer.Strategy.Mode(), None, Table({"a": [1.0, 1.0, 2.0, 2.0, 1.0]}), ), @@ -394,7 +399,7 @@ class TestFitAndTransform: }, ), None, - Imputer.Strategy.Constant(1.0), + SimpleImputer.Strategy.Constant(1.0), 0.0, Table( { @@ -417,7 +422,7 @@ def test_should_return_fitted_transformer_and_transformed_table( self, table: Table, column_names: list[str] | None, - strategy: Imputer.Strategy, + strategy: SimpleImputer.Strategy, value_to_replace: float | str | None, expected: Table, ) -> None: @@ -427,7 +432,7 @@ def test_should_return_fitted_transformer_and_transformed_table( message=r"There are multiple most frequent values in a column given to the Imputer\..*", category=UserWarning, ) - fitted_transformer, transformed_table = Imputer( + fitted_transformer, transformed_table = SimpleImputer( strategy, value_to_replace=value_to_replace, ).fit_and_transform(table, column_names) @@ -436,14 +441,14 @@ def test_should_return_fitted_transformer_and_transformed_table( assert transformed_table == expected @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_not_change_original_table(self, strategy: Imputer.Strategy) -> None: + def test_should_not_change_original_table(self, strategy: SimpleImputer.Strategy) -> None: table = Table( { "a": [1, None, None], }, ) - Imputer(strategy=strategy).fit_and_transform(table) + SimpleImputer(strategy=strategy).fit_and_transform(table) expected = Table( { @@ -454,8 +459,8 @@ def test_should_not_change_original_table(self, strategy: Imputer.Strategy) -> N assert table == expected @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_added_columns(self, strategy: Imputer.Strategy) -> None: - transformer = 
Imputer(strategy=strategy) + def test_get_names_of_added_columns(self, strategy: SimpleImputer.Strategy) -> None: + transformer = SimpleImputer(strategy=strategy) with pytest.raises(TransformerNotFittedError): transformer.get_names_of_added_columns() @@ -469,8 +474,8 @@ def test_get_names_of_added_columns(self, strategy: Imputer.Strategy) -> None: assert transformer.get_names_of_added_columns() == [] @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_changed_columns(self, strategy: Imputer.Strategy) -> None: - transformer = Imputer(strategy=strategy) + def test_get_names_of_changed_columns(self, strategy: SimpleImputer.Strategy) -> None: + transformer = SimpleImputer(strategy=strategy) with pytest.raises(TransformerNotFittedError): transformer.get_names_of_changed_columns() table = Table( @@ -483,8 +488,8 @@ def test_get_names_of_changed_columns(self, strategy: Imputer.Strategy) -> None: assert transformer.get_names_of_changed_columns() == ["a", "b"] @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_removed_columns(self, strategy: Imputer.Strategy) -> None: - transformer = Imputer(strategy=strategy) + def test_get_names_of_removed_columns(self, strategy: SimpleImputer.Strategy) -> None: + transformer = SimpleImputer(strategy=strategy) with pytest.raises(TransformerNotFittedError): transformer.get_names_of_removed_columns() diff --git a/tests/safeds/data/tabular/transformation/test_table_transformer.py b/tests/safeds/data/tabular/transformation/test_table_transformer.py index 3eef559c4..fb4640119 100644 --- a/tests/safeds/data/tabular/transformation/test_table_transformer.py +++ b/tests/safeds/data/tabular/transformation/test_table_transformer.py @@ -4,10 +4,10 @@ from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import ( Discretizer, - Imputer, LabelEncoder, OneHotEncoder, RangeScaler, + SimpleImputer, 
StandardScaler, TableTransformer, ) @@ -66,7 +66,7 @@ def transformers() -> list[TableTransformer]: transformers_numeric() + transformers_non_numeric() + [ - Imputer(strategy=Imputer.Strategy.Mode()), + SimpleImputer(strategy=SimpleImputer.Strategy.Mode()), ] ) @@ -175,6 +175,6 @@ def test_should_return_different_hash_for_imputer_fit( transformer2: TableTransformer, valid_data_imputer: Table, ) -> None: - transformer1 = Imputer(strategy=Imputer.Strategy.Mode()) + transformer1 = SimpleImputer(strategy=SimpleImputer.Strategy.Mode()) transformer1_fit = transformer1.fit(valid_data_imputer, ["col1"]) assert hash(transformer2) != hash(transformer1_fit) diff --git a/tests/safeds/data/tabular/typing/test_column_type.py b/tests/safeds/data/tabular/typing/test_column_type.py deleted file mode 100644 index dfe9c13e6..000000000 --- a/tests/safeds/data/tabular/typing/test_column_type.py +++ /dev/null @@ -1,123 +0,0 @@ -from collections.abc import Iterable -from typing import Any - -import numpy as np -import pandas as pd -import pytest -from safeds.data.tabular.typing import ( - Anything, - Boolean, - ColumnType, - Integer, - Nothing, - RealNumber, - String, -) - - -class TestDataType: - @pytest.mark.parametrize( - ("data", "expected"), - [ - ([1, 2, 3], Integer(is_nullable=False)), - ([1.0, 2.0, 3.0], Integer(is_nullable=False)), - ([1.0, 2.5, 3.0], RealNumber(is_nullable=False)), - ([True, False, True], Boolean(is_nullable=False)), - (["a", "b", "c"], String(is_nullable=False)), - (["a", 1, 2.0], Anything(is_nullable=False)), - ([None, None, None], Nothing()), - ([None, 1, 2], Integer(is_nullable=True)), - ([1.0, 2.0, None], Integer(is_nullable=True)), - ([1.0, 2.5, None], RealNumber(is_nullable=True)), - ([True, False, None], Boolean(is_nullable=True)), - (["a", None, "b"], String(is_nullable=True)), - ], - ids=[ - "Integer", - "Real number .0", - "Real number", - "Boolean", - "String", - "Mixed", - "None", - "Nullable integer", - "Nullable RealNumber .0", - "Nullable 
RealNumber", - "Nullable Boolean", - "Nullable String", - ], - ) - def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) -> None: - assert ColumnType._data_type(pd.Series(data)) == expected - - @pytest.mark.parametrize( - ("data", "error_message"), - [(np.array([1, 2, 3], dtype=np.int16), "Unsupported numpy data type ''.")], - ids=["int16 not supported"], - ) - def test_should_throw_not_implemented_error_when_type_is_not_supported(self, data: Any, error_message: str) -> None: - with pytest.raises(NotImplementedError, match=error_message): - ColumnType._data_type(data) - - -class TestRepr: - @pytest.mark.parametrize( - ("column_type", "expected"), - [ - (Anything(is_nullable=False), "Anything"), - (Anything(is_nullable=True), "Anything?"), - (Boolean(is_nullable=False), "Boolean"), - (Boolean(is_nullable=True), "Boolean?"), - (RealNumber(is_nullable=False), "RealNumber"), - (RealNumber(is_nullable=True), "RealNumber?"), - (Integer(is_nullable=False), "Integer"), - (Integer(is_nullable=True), "Integer?"), - (String(is_nullable=False), "String"), - (String(is_nullable=True), "String?"), - ], - ids=repr, - ) - def test_should_create_a_printable_representation(self, column_type: ColumnType, expected: str) -> None: - assert repr(column_type) == expected - - -class TestIsNullable: - @pytest.mark.parametrize( - ("column_type", "expected"), - [ - (Anything(is_nullable=False), False), - (Anything(is_nullable=True), True), - (Boolean(is_nullable=False), False), - (Boolean(is_nullable=True), True), - (RealNumber(is_nullable=False), False), - (RealNumber(is_nullable=True), True), - (Integer(is_nullable=False), False), - (Integer(is_nullable=True), True), - (String(is_nullable=False), False), - (String(is_nullable=True), True), - ], - ids=repr, - ) - def test_should_return_whether_the_column_type_is_nullable(self, column_type: ColumnType, expected: bool) -> None: - assert column_type.is_nullable() == expected - - -class TestIsNumeric: - 
@pytest.mark.parametrize( - ("column_type", "expected"), - [ - (Anything(is_nullable=False), False), - (Anything(is_nullable=True), False), - (Boolean(is_nullable=False), False), - (Boolean(is_nullable=True), False), - (RealNumber(is_nullable=False), True), - (RealNumber(is_nullable=True), True), - (Integer(is_nullable=False), True), - (Integer(is_nullable=True), True), - (String(is_nullable=False), False), - (String(is_nullable=True), False), - ], - ids=repr, - ) - def test_should_return_whether_the_column_type_is_numeric(self, column_type: ColumnType, expected: bool) -> None: - assert column_type.is_numeric() == expected diff --git a/tests/safeds/data/tabular/typing/test_data_type.py b/tests/safeds/data/tabular/typing/test_data_type.py new file mode 100644 index 000000000..3f8b9663a --- /dev/null +++ b/tests/safeds/data/tabular/typing/test_data_type.py @@ -0,0 +1,124 @@ +# TODO +# from collections.abc import Iterable +# from typing import Any +# +# import numpy as np +# import pandas as pd +# import pytest +# from safeds.data.tabular.typing import ( +# Anything, +# Boolean, +# ColumnType, +# Integer, +# Nothing, +# RealNumber, +# String, +# ) +# +# +# class TestDataType: +# @pytest.mark.parametrize( +# ("data", "expected"), +# [ +# ([1, 2, 3], Integer(is_nullable=False)), +# ([1.0, 2.0, 3.0], Integer(is_nullable=False)), +# ([1.0, 2.5, 3.0], RealNumber(is_nullable=False)), +# ([True, False, True], Boolean(is_nullable=False)), +# (["a", "b", "c"], String(is_nullable=False)), +# (["a", 1, 2.0], Anything(is_nullable=False)), +# ([None, None, None], Nothing()), +# ([None, 1, 2], Integer(is_nullable=True)), +# ([1.0, 2.0, None], Integer(is_nullable=True)), +# ([1.0, 2.5, None], RealNumber(is_nullable=True)), +# ([True, False, None], Boolean(is_nullable=True)), +# (["a", None, "b"], String(is_nullable=True)), +# ], +# ids=[ +# "Integer", +# "Real number .0", +# "Real number", +# "Boolean", +# "String", +# "Mixed", +# "None", +# "Nullable integer", +# "Nullable 
RealNumber .0", +# "Nullable RealNumber", +# "Nullable Boolean", +# "Nullable String", +# ], +# ) +# def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) -> None: +# assert ColumnType._data_type(pd.Series(data)) == expected +# +# @pytest.mark.parametrize( +# ("data", "error_message"), +# [(np.array([1, 2, 3], dtype=np.int16), "Unsupported numpy data type ''.")], +# ids=["int16 not supported"], +# ) +# def test_should_throw_not_implemented_error_when_type_is_not_supported(self, data: Any, error_message: str) -> None: +# with pytest.raises(NotImplementedError, match=error_message): +# ColumnType._data_type(data) +# +# +# class TestRepr: +# @pytest.mark.parametrize( +# ("column_type", "expected"), +# [ +# (Anything(is_nullable=False), "Anything"), +# (Anything(is_nullable=True), "Anything?"), +# (Boolean(is_nullable=False), "Boolean"), +# (Boolean(is_nullable=True), "Boolean?"), +# (RealNumber(is_nullable=False), "RealNumber"), +# (RealNumber(is_nullable=True), "RealNumber?"), +# (Integer(is_nullable=False), "Integer"), +# (Integer(is_nullable=True), "Integer?"), +# (String(is_nullable=False), "String"), +# (String(is_nullable=True), "String?"), +# ], +# ids=repr, +# ) +# def test_should_create_a_printable_representation(self, column_type: ColumnType, expected: str) -> None: +# assert repr(column_type) == expected +# +# +# class TestIsNullable: +# @pytest.mark.parametrize( +# ("column_type", "expected"), +# [ +# (Anything(is_nullable=False), False), +# (Anything(is_nullable=True), True), +# (Boolean(is_nullable=False), False), +# (Boolean(is_nullable=True), True), +# (RealNumber(is_nullable=False), False), +# (RealNumber(is_nullable=True), True), +# (Integer(is_nullable=False), False), +# (Integer(is_nullable=True), True), +# (String(is_nullable=False), False), +# (String(is_nullable=True), True), +# ], +# ids=repr, +# ) +# def test_should_return_whether_the_column_type_is_nullable(self, column_type: ColumnType, expected: bool) -> None: +# 
assert column_type.is_nullable() == expected +# +# +# class TestIsNumeric: +# @pytest.mark.parametrize( +# ("column_type", "expected"), +# [ +# (Anything(is_nullable=False), False), +# (Anything(is_nullable=True), False), +# (Boolean(is_nullable=False), False), +# (Boolean(is_nullable=True), False), +# (RealNumber(is_nullable=False), True), +# (RealNumber(is_nullable=True), True), +# (Integer(is_nullable=False), True), +# (Integer(is_nullable=True), True), +# (String(is_nullable=False), False), +# (String(is_nullable=True), False), +# ], +# ids=repr, +# ) +# def test_should_return_whether_the_column_type_is_numeric(self, column_type: ColumnType, expected: bool) -> None: +# assert column_type.is_numeric() == expected diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index 665359b4a..8660b4da7 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -1,516 +1,517 @@ -from __future__ import annotations - -import sys -from typing import TYPE_CHECKING - -import pandas as pd -import pytest -from safeds.data.tabular.typing import Anything, Boolean, ColumnType, Integer, RealNumber, Schema, String -from safeds.exceptions import UnknownColumnNameError - -if TYPE_CHECKING: - from collections.abc import Iterable - from typing import Any - - -class TestFromPandasDataFrame: - @pytest.mark.parametrize( - ("columns", "expected"), - [ - ( - pd.DataFrame({"A": [True, False, True]}), - Schema({"A": Boolean()}), - ), - ( - pd.DataFrame({"A": [1, 2, 3]}), - Schema({"A": Integer()}), - ), - ( - pd.DataFrame({"A": [1.0, 2.0, 3.0]}), - Schema({"A": Integer()}), - ), - ( - pd.DataFrame({"A": ["a", "b", "c"]}), - Schema({"A": String()}), - ), - ( - pd.DataFrame({"A": [1, 2.0, "a", True]}), - Schema({"A": Anything()}), - ), - ( - pd.DataFrame({"A": [1.0, 2.5, 3.0]}), - Schema({"A": RealNumber()}), - ), - ( - pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}), - 
Schema({"A": Integer(), "B": String()}), - ), - ( - pd.DataFrame({"A": [True, False, None]}), - Schema({"A": Boolean(is_nullable=True)}), - ), - ( - pd.DataFrame({"A": [1, None, 3]}), - Schema({"A": Integer(is_nullable=True)}), - ), - ( - pd.DataFrame({"A": [1.0, None, 3.0]}), - Schema({"A": Integer(is_nullable=True)}), - ), - ( - pd.DataFrame({"A": [1.5, None, 3.0]}), - Schema({"A": RealNumber(is_nullable=True)}), - ), - ( - pd.DataFrame({"A": ["a", None, "c"]}), - Schema({"A": String(is_nullable=True)}), - ), - ( - pd.DataFrame({"A": [1, 2.0, None, True]}), - Schema({"A": Anything(is_nullable=True)}), - ), - ], - ids=[ - "boolean", - "integer", - "real number .0", - "string", - "mixed", - "real number", - "multiple columns", - "boolean?", - "integer?", - "real number? .0", - "real number?", - "string?", - "Anything?", - ], - ) - def test_should_create_schema_from_pandas_dataframe(self, columns: Iterable, expected: Schema) -> None: - assert Schema._from_pandas_dataframe(columns) == expected - - -class TestRepr: - @pytest.mark.parametrize( - ("schema", "expected"), - [ - (Schema({}), "Schema({})"), - (Schema({"A": Integer()}), "Schema({'A': Integer})"), - (Schema({"A": Integer(), "B": String()}), "Schema({\n 'A': Integer,\n 'B': String\n})"), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_create_a_string_representation(self, schema: Schema, expected: str) -> None: - assert repr(schema) == expected - - -class TestStr: - @pytest.mark.parametrize( - ("schema", "expected"), - [ - (Schema({}), "{}"), - (Schema({"A": Integer()}), "{'A': Integer}"), - (Schema({"A": Integer(), "B": String()}), "{\n 'A': Integer,\n 'B': String\n}"), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_create_a_string_representation(self, schema: Schema, expected: str) -> None: - assert str(schema) == expected - - -class TestEq: - @pytest.mark.parametrize( - ("schema1", "schema2", "expected"), - [ - 
(Schema({}), Schema({}), True), - (Schema({"col1": Integer()}), Schema({"col1": Integer()}), True), - (Schema({"col1": Integer()}), Schema({"col1": String()}), False), - (Schema({"col1": Integer()}), Schema({"col2": Integer()}), False), - ( - Schema({"col1": Integer(), "col2": String()}), - Schema({"col2": String(), "col1": Integer()}), - True, - ), - ], - ids=[ - "empty", - "same name and type", - "same name but different type", - "different name but same type", - "flipped columns", - ], - ) - def test_should_return_whether_two_schema_are_equal(self, schema1: Schema, schema2: Schema, expected: bool) -> None: - assert (schema1.__eq__(schema2)) == expected - - @pytest.mark.parametrize( - ("schema", "other"), - [ - (Schema({"col1": Integer()}), None), - (Schema({"col1": Integer()}), {"col1": Integer()}), - ], - ) - def test_should_return_not_implemented_if_other_is_not_schema(self, schema: Schema, other: Any) -> None: - assert (schema.__eq__(other)) is NotImplemented - - -class TestHash: - @pytest.mark.parametrize( - ("schema1", "schema2"), - [ - (Schema({}), Schema({})), - (Schema({"col1": Integer()}), Schema({"col1": Integer()})), - ], - ids=[ - "empty", - "one column", - ], - ) - def test_should_return_same_hash_for_equal_schemas(self, schema1: Schema, schema2: Schema) -> None: - assert hash(schema1) == hash(schema2) - - @pytest.mark.parametrize( - ("schema1", "schema2"), - [ - (Schema({"col1": Integer()}), Schema({"col1": String()})), - (Schema({"col1": Integer()}), Schema({"col2": Integer()})), - ], - ids=[ - "same name but different type", - "different name but same type", - ], - ) - def test_should_return_different_hash_for_unequal_schemas(self, schema1: Schema, schema2: Schema) -> None: - assert hash(schema1) != hash(schema2) - - -class TestHasColumn: - @pytest.mark.parametrize( - ("schema", "column_name", "expected"), - [ - (Schema({}), "A", False), - (Schema({"A": Integer()}), "A", True), - (Schema({"A": Integer()}), "B", False), - ], - ids=[ - "empty", - 
"column exists", - "column does not exist", - ], - ) - def test_should_return_whether_column_exists(self, schema: Schema, column_name: str, expected: bool) -> None: - assert schema.has_column(column_name) == expected - - -class TestGetTypeOfColumn: - @pytest.mark.parametrize( - ("schema", "column_name", "expected"), - [ - (Schema({"A": Integer()}), "A", Integer()), - (Schema({"A": Integer(), "B": String()}), "B", String()), - ], - ids=[ - "one column", - "two columns", - ], - ) - def test_should_return_type_of_existing_column( - self, - schema: Schema, - column_name: str, - expected: ColumnType, - ) -> None: - assert schema.get_column_type(column_name) == expected - - def test_should_raise_if_column_does_not_exist(self) -> None: - schema = Schema({"A": Integer()}) - with pytest.raises(UnknownColumnNameError): - schema.get_column_type("B") - - -class TestGetColumnNames: - @pytest.mark.parametrize( - ("schema", "expected"), - [ - (Schema({}), []), - (Schema({"A": Integer()}), ["A"]), - (Schema({"A": Integer(), "B": RealNumber()}), ["A", "B"]), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_return_column_names(self, schema: Schema, expected: list[str]) -> None: - assert schema.column_names == expected - - -class TestToDict: - @pytest.mark.parametrize( - ("schema", "expected"), - [ - (Schema({}), {}), - (Schema({"A": Integer()}), {"A": Integer()}), - (Schema({"A": Integer(), "B": String()}), {"A": Integer(), "B": String()}), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_return_dict_for_schema(self, schema: Schema, expected: str) -> None: - assert schema.to_dict() == expected - - -class TestMergeMultipleSchemas: - @pytest.mark.parametrize( - ("schemas", "error_msg_regex"), - [([Schema({"Column1": Anything()}), Schema({"Column2": Anything()})], r"Could not find column\(s\) 'Column2'")], - ids=["different_column_names"], - ) - def 
test_should_raise_if_column_names_are_different(self, schemas: list[Schema], error_msg_regex: str) -> None: - with pytest.raises(UnknownColumnNameError, match=error_msg_regex): - Schema._merge_multiple_schemas(schemas) - - @pytest.mark.parametrize( - ("schemas", "expected"), - [ - ([Schema({"Column1": Integer()}), Schema({"Column1": Integer()})], Schema({"Column1": Integer()})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber()})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": Boolean()})], Schema({"Column1": Boolean()})), - ([Schema({"Column1": String()}), Schema({"Column1": String()})], Schema({"Column1": String()})), - ([Schema({"Column1": Anything()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": Integer()}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber()})), - ([Schema({"Column1": Integer()}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": Integer()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": Integer()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": RealNumber()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), - ([Schema({"Column1": String()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), - ( - [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Integer()})], - Schema({"Column1": Integer(is_nullable=True)}), - ), - ( - 
[Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": RealNumber()})], - Schema({"Column1": RealNumber(is_nullable=True)}), - ), - ( - [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Boolean()})], - Schema({"Column1": Boolean(is_nullable=True)}), - ), - ( - [Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": String()})], - Schema({"Column1": String(is_nullable=True)}), - ), - ( - [Schema({"Column1": Anything(is_nullable=True)}), Schema({"Column1": Anything()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": RealNumber()})], - Schema({"Column1": RealNumber(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Boolean()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": String()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Anything()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Boolean()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": String()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Anything()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": String()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Anything()})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": Anything()})], - Schema({"Column1": 
Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer()}), Schema({"Column1": Integer(is_nullable=True)})], - Schema({"Column1": Integer(is_nullable=True)}), - ), - ( - [Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber(is_nullable=True)})], - Schema({"Column1": RealNumber(is_nullable=True)}), - ), - ( - [Schema({"Column1": Boolean()}), Schema({"Column1": Boolean(is_nullable=True)})], - Schema({"Column1": Boolean(is_nullable=True)}), - ), - ( - [Schema({"Column1": String()}), Schema({"Column1": String(is_nullable=True)})], - Schema({"Column1": String(is_nullable=True)}), - ), - ( - [Schema({"Column1": Anything()}), Schema({"Column1": Anything(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer()}), Schema({"Column1": RealNumber(is_nullable=True)})], - Schema({"Column1": RealNumber(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer()}), Schema({"Column1": Boolean(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer()}), Schema({"Column1": String(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Integer()}), Schema({"Column1": Anything(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": RealNumber()}), Schema({"Column1": String(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": RealNumber()}), Schema({"Column1": Anything(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Boolean()}), Schema({"Column1": String(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": Boolean()}), Schema({"Column1": 
Anything(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ( - [Schema({"Column1": String()}), Schema({"Column1": Anything(is_nullable=True)})], - Schema({"Column1": Anything(is_nullable=True)}), - ), - ], - ids=[ - "Integer Integer", - "RealNumber RealNumber", - "Boolean Boolean", - "String String", - "Anything Anything", - "Integer RealNumber", - "Integer Boolean", - "Integer String", - "Integer Anything", - "RealNumber Boolean", - "RealNumber String", - "RealNumber Anything", - "Boolean String", - "Boolean Anything", - "String Anything", - "Integer(null) Integer", - "RealNumber(null) RealNumber", - "Boolean(null) Boolean", - "String(null) String", - "Anything(null) Anything", - "Integer(null) RealNumber", - "Integer(null) Boolean", - "Integer(null) String", - "Integer(null) Anything", - "RealNumber(null) Boolean", - "RealNumber(null) String", - "RealNumber(null) Anything", - "Boolean(null) String", - "Boolean(null) Anything", - "String(null) Anything", - "Integer Integer(null)", - "RealNumber RealNumber(null)", - "Boolean Boolean(null)", - "String String(null)", - "Anything Anything(null)", - "Integer RealNumber(null)", - "Integer Boolean(null)", - "Integer String(null)", - "Integer Anything(null)", - "RealNumber Boolean(null)", - "RealNumber String(null)", - "RealNumber Anything(null)", - "Boolean String(null)", - "Boolean Anything(null)", - "String Anything(null)", - ], - ) - def test_should_return_merged_schema(self, schemas: list[Schema], expected: Schema) -> None: - assert Schema._merge_multiple_schemas(schemas) == expected - schemas.reverse() - assert ( - Schema._merge_multiple_schemas(schemas) == expected - ) # test the reversed list because the first parameter is handled differently - - -class TestReprMarkdown: - @pytest.mark.parametrize( - ("schema", "expected"), - [ - (Schema({}), "Empty Schema"), - (Schema({"A": Integer()}), "| Column Name | Column Type |\n| --- | --- |\n| A | Integer |"), - ( - Schema({"A": Integer(), 
"B": String()}), - "| Column Name | Column Type |\n| --- | --- |\n| A | Integer |\n| B | String |", - ), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_create_a_string_representation(self, schema: Schema, expected: str) -> None: - assert schema._repr_markdown_() == expected - - -class TestSizeof: - @pytest.mark.parametrize( - "schema", - [ - Schema({}), - Schema({"A": Integer()}), - Schema({"A": Integer(), "B": String()}), - ], - ids=[ - "empty", - "single column", - "multiple columns", - ], - ) - def test_should_size_be_greater_than_normal_object(self, schema: Schema) -> None: - assert sys.getsizeof(schema) > sys.getsizeof(object()) +# TODO +# from __future__ import annotations +# +# import sys +# from typing import TYPE_CHECKING +# +# import pandas as pd +# import pytest +# from safeds.data.tabular.typing import Schema +# from safeds.exceptions import UnknownColumnNameError +# +# if TYPE_CHECKING: +# from collections.abc import Iterable +# from typing import Any +# +# +# class TestFromPandasDataFrame: +# @pytest.mark.parametrize( +# ("columns", "expected"), +# [ +# ( +# pd.DataFrame({"A": [True, False, True]}), +# Schema({"A": Boolean()}), +# ), +# ( +# pd.DataFrame({"A": [1, 2, 3]}), +# Schema({"A": Integer()}), +# ), +# ( +# pd.DataFrame({"A": [1.0, 2.0, 3.0]}), +# Schema({"A": Integer()}), +# ), +# ( +# pd.DataFrame({"A": ["a", "b", "c"]}), +# Schema({"A": String()}), +# ), +# ( +# pd.DataFrame({"A": [1, 2.0, "a", True]}), +# Schema({"A": Anything()}), +# ), +# ( +# pd.DataFrame({"A": [1.0, 2.5, 3.0]}), +# Schema({"A": RealNumber()}), +# ), +# ( +# pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}), +# Schema({"A": Integer(), "B": String()}), +# ), +# ( +# pd.DataFrame({"A": [True, False, None]}), +# Schema({"A": Boolean(is_nullable=True)}), +# ), +# ( +# pd.DataFrame({"A": [1, None, 3]}), +# Schema({"A": Integer(is_nullable=True)}), +# ), +# ( +# pd.DataFrame({"A": [1.0, None, 3.0]}), +# Schema({"A": 
Integer(is_nullable=True)}), +# ), +# ( +# pd.DataFrame({"A": [1.5, None, 3.0]}), +# Schema({"A": RealNumber(is_nullable=True)}), +# ), +# ( +# pd.DataFrame({"A": ["a", None, "c"]}), +# Schema({"A": String(is_nullable=True)}), +# ), +# ( +# pd.DataFrame({"A": [1, 2.0, None, True]}), +# Schema({"A": Anything(is_nullable=True)}), +# ), +# ], +# ids=[ +# "boolean", +# "integer", +# "real number .0", +# "string", +# "mixed", +# "real number", +# "multiple columns", +# "boolean?", +# "integer?", +# "real number? .0", +# "real number?", +# "string?", +# "Anything?", +# ], +# ) +# def test_should_create_schema_from_pandas_dataframe(self, columns: Iterable, expected: Schema) -> None: +# assert Schema._from_pandas_dataframe(columns) == expected +# +# +# class TestRepr: +# @pytest.mark.parametrize( +# ("schema", "expected"), +# [ +# (Schema({}), "Schema({})"), +# (Schema({"A": Integer()}), "Schema({'A': Integer})"), +# (Schema({"A": Integer(), "B": String()}), "Schema({\n 'A': Integer,\n 'B': String\n})"), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_create_a_string_representation(self, schema: Schema, expected: str) -> None: +# assert repr(schema) == expected +# +# +# class TestStr: +# @pytest.mark.parametrize( +# ("schema", "expected"), +# [ +# (Schema({}), "{}"), +# (Schema({"A": Integer()}), "{'A': Integer}"), +# (Schema({"A": Integer(), "B": String()}), "{\n 'A': Integer,\n 'B': String\n}"), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_create_a_string_representation(self, schema: Schema, expected: str) -> None: +# assert str(schema) == expected +# +# +# class TestEq: +# @pytest.mark.parametrize( +# ("schema1", "schema2", "expected"), +# [ +# (Schema({}), Schema({}), True), +# (Schema({"col1": Integer()}), Schema({"col1": Integer()}), True), +# (Schema({"col1": Integer()}), Schema({"col1": String()}), False), +# (Schema({"col1": Integer()}), Schema({"col2": 
Integer()}), False), +# ( +# Schema({"col1": Integer(), "col2": String()}), +# Schema({"col2": String(), "col1": Integer()}), +# True, +# ), +# ], +# ids=[ +# "empty", +# "same name and type", +# "same name but different type", +# "different name but same type", +# "flipped columns", +# ], +# ) +# def test_should_return_whether_two_schema_are_equal(self, schema1: Schema, schema2: Schema, expected: bool) -> None: +# assert (schema1.__eq__(schema2)) == expected +# +# @pytest.mark.parametrize( +# ("schema", "other"), +# [ +# (Schema({"col1": Integer()}), None), +# (Schema({"col1": Integer()}), {"col1": Integer()}), +# ], +# ) +# def test_should_return_not_implemented_if_other_is_not_schema(self, schema: Schema, other: Any) -> None: +# assert (schema.__eq__(other)) is NotImplemented +# +# +# class TestHash: +# @pytest.mark.parametrize( +# ("schema1", "schema2"), +# [ +# (Schema({}), Schema({})), +# (Schema({"col1": Integer()}), Schema({"col1": Integer()})), +# ], +# ids=[ +# "empty", +# "one column", +# ], +# ) +# def test_should_return_same_hash_for_equal_schemas(self, schema1: Schema, schema2: Schema) -> None: +# assert hash(schema1) == hash(schema2) +# +# @pytest.mark.parametrize( +# ("schema1", "schema2"), +# [ +# (Schema({"col1": Integer()}), Schema({"col1": String()})), +# (Schema({"col1": Integer()}), Schema({"col2": Integer()})), +# ], +# ids=[ +# "same name but different type", +# "different name but same type", +# ], +# ) +# def test_should_return_different_hash_for_unequal_schemas(self, schema1: Schema, schema2: Schema) -> None: +# assert hash(schema1) != hash(schema2) +# +# +# class TestHasColumn: +# @pytest.mark.parametrize( +# ("schema", "column_name", "expected"), +# [ +# (Schema({}), "A", False), +# (Schema({"A": Integer()}), "A", True), +# (Schema({"A": Integer()}), "B", False), +# ], +# ids=[ +# "empty", +# "column exists", +# "column does not exist", +# ], +# ) +# def test_should_return_whether_column_exists(self, schema: Schema, column_name: str, 
expected: bool) -> None: +# assert schema.has_column(column_name) == expected +# +# +# class TestGetTypeOfColumn: +# @pytest.mark.parametrize( +# ("schema", "column_name", "expected"), +# [ +# (Schema({"A": Integer()}), "A", Integer()), +# (Schema({"A": Integer(), "B": String()}), "B", String()), +# ], +# ids=[ +# "one column", +# "two columns", +# ], +# ) +# def test_should_return_type_of_existing_column( +# self, +# schema: Schema, +# column_name: str, +# expected: ColumnType, +# ) -> None: +# assert schema.get_column_type(column_name) == expected +# +# def test_should_raise_if_column_does_not_exist(self) -> None: +# schema = Schema({"A": Integer()}) +# with pytest.raises(UnknownColumnNameError): +# schema.get_column_type("B") +# +# +# class TestGetColumnNames: +# @pytest.mark.parametrize( +# ("schema", "expected"), +# [ +# (Schema({}), []), +# (Schema({"A": Integer()}), ["A"]), +# (Schema({"A": Integer(), "B": RealNumber()}), ["A", "B"]), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_return_column_names(self, schema: Schema, expected: list[str]) -> None: +# assert schema.column_names == expected +# +# +# class TestToDict: +# @pytest.mark.parametrize( +# ("schema", "expected"), +# [ +# (Schema({}), {}), +# (Schema({"A": Integer()}), {"A": Integer()}), +# (Schema({"A": Integer(), "B": String()}), {"A": Integer(), "B": String()}), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_return_dict_for_schema(self, schema: Schema, expected: str) -> None: +# assert schema.to_dict() == expected +# +# +# class TestMergeMultipleSchemas: +# @pytest.mark.parametrize( +# ("schemas", "error_msg_regex"), +# [([Schema({"Column1": Anything()}), Schema({"Column2": Anything()})], r"Could not find column\(s\) 'Column2'")], +# ids=["different_column_names"], +# ) +# def test_should_raise_if_column_names_are_different(self, schemas: list[Schema], error_msg_regex: str) -> None: +# with 
pytest.raises(UnknownColumnNameError, match=error_msg_regex): +# Schema._merge_multiple_schemas(schemas) +# +# @pytest.mark.parametrize( +# ("schemas", "expected"), +# [ +# ([Schema({"Column1": Integer()}), Schema({"Column1": Integer()})], Schema({"Column1": Integer()})), +# ([Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber()})), +# ([Schema({"Column1": Boolean()}), Schema({"Column1": Boolean()})], Schema({"Column1": Boolean()})), +# ([Schema({"Column1": String()}), Schema({"Column1": String()})], Schema({"Column1": String()})), +# ([Schema({"Column1": Anything()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": Integer()}), Schema({"Column1": RealNumber()})], Schema({"Column1": RealNumber()})), +# ([Schema({"Column1": Integer()}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": Integer()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": Integer()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": RealNumber()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": RealNumber()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": Boolean()}), Schema({"Column1": String()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": Boolean()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), +# ([Schema({"Column1": String()}), Schema({"Column1": Anything()})], Schema({"Column1": Anything()})), +# ( +# [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Integer()})], +# Schema({"Column1": Integer(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": RealNumber()})], +# 
Schema({"Column1": RealNumber(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Boolean()})], +# Schema({"Column1": Boolean(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": String()})], +# Schema({"Column1": String(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Anything(is_nullable=True)}), Schema({"Column1": Anything()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": RealNumber()})], +# Schema({"Column1": RealNumber(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Boolean()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": String()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Integer(is_nullable=True)}), Schema({"Column1": Anything()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Boolean()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": String()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": RealNumber(is_nullable=True)}), Schema({"Column1": Anything()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": String()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Boolean(is_nullable=True)}), Schema({"Column1": Anything()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": String(is_nullable=True)}), Schema({"Column1": Anything()})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# 
[Schema({"Column1": Integer()}), Schema({"Column1": Integer(is_nullable=True)})], +# Schema({"Column1": Integer(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": RealNumber()}), Schema({"Column1": RealNumber(is_nullable=True)})], +# Schema({"Column1": RealNumber(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Boolean()}), Schema({"Column1": Boolean(is_nullable=True)})], +# Schema({"Column1": Boolean(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": String()}), Schema({"Column1": String(is_nullable=True)})], +# Schema({"Column1": String(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Anything()}), Schema({"Column1": Anything(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Integer()}), Schema({"Column1": RealNumber(is_nullable=True)})], +# Schema({"Column1": RealNumber(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Integer()}), Schema({"Column1": Boolean(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Integer()}), Schema({"Column1": String(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Integer()}), Schema({"Column1": Anything(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": RealNumber()}), Schema({"Column1": Boolean(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": RealNumber()}), Schema({"Column1": String(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": RealNumber()}), Schema({"Column1": Anything(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Boolean()}), Schema({"Column1": String(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": Boolean()}), Schema({"Column1": 
Anything(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ( +# [Schema({"Column1": String()}), Schema({"Column1": Anything(is_nullable=True)})], +# Schema({"Column1": Anything(is_nullable=True)}), +# ), +# ], +# ids=[ +# "Integer Integer", +# "RealNumber RealNumber", +# "Boolean Boolean", +# "String String", +# "Anything Anything", +# "Integer RealNumber", +# "Integer Boolean", +# "Integer String", +# "Integer Anything", +# "RealNumber Boolean", +# "RealNumber String", +# "RealNumber Anything", +# "Boolean String", +# "Boolean Anything", +# "String Anything", +# "Integer(null) Integer", +# "RealNumber(null) RealNumber", +# "Boolean(null) Boolean", +# "String(null) String", +# "Anything(null) Anything", +# "Integer(null) RealNumber", +# "Integer(null) Boolean", +# "Integer(null) String", +# "Integer(null) Anything", +# "RealNumber(null) Boolean", +# "RealNumber(null) String", +# "RealNumber(null) Anything", +# "Boolean(null) String", +# "Boolean(null) Anything", +# "String(null) Anything", +# "Integer Integer(null)", +# "RealNumber RealNumber(null)", +# "Boolean Boolean(null)", +# "String String(null)", +# "Anything Anything(null)", +# "Integer RealNumber(null)", +# "Integer Boolean(null)", +# "Integer String(null)", +# "Integer Anything(null)", +# "RealNumber Boolean(null)", +# "RealNumber String(null)", +# "RealNumber Anything(null)", +# "Boolean String(null)", +# "Boolean Anything(null)", +# "String Anything(null)", +# ], +# ) +# def test_should_return_merged_schema(self, schemas: list[Schema], expected: Schema) -> None: +# assert Schema._merge_multiple_schemas(schemas) == expected +# schemas.reverse() +# assert ( +# Schema._merge_multiple_schemas(schemas) == expected +# ) # test the reversed list because the first parameter is handled differently +# +# +# class TestReprMarkdown: +# @pytest.mark.parametrize( +# ("schema", "expected"), +# [ +# (Schema({}), "Empty Schema"), +# (Schema({"A": Integer()}), "| Column Name | Column Type 
|\n| --- | --- |\n| A | Integer |"), +# ( +# Schema({"A": Integer(), "B": String()}), +# "| Column Name | Column Type |\n| --- | --- |\n| A | Integer |\n| B | String |", +# ), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_create_a_string_representation(self, schema: Schema, expected: str) -> None: +# assert schema._repr_markdown_() == expected +# +# +# class TestSizeof: +# @pytest.mark.parametrize( +# "schema", +# [ +# Schema({}), +# Schema({"A": Integer()}), +# Schema({"A": Integer(), "B": String()}), +# ], +# ids=[ +# "empty", +# "single column", +# "multiple columns", +# ], +# ) +# def test_should_size_be_greater_than_normal_object(self, schema: Schema) -> None: +# assert sys.getsizeof(schema) > sys.getsizeof(object()) diff --git a/tests/safeds/ml/classical/classification/test_ada_boost.py b/tests/safeds/ml/classical/classification/test_ada_boost.py index 91fde488b..d5f3a7d5b 100644 --- a/tests/safeds/ml/classical/classification/test_ada_boost.py +++ b/tests/safeds/ml/classical/classification/test_ada_boost.py @@ -20,8 +20,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: learner = AdaBoostClassifier() fitted_model = AdaBoostClassifier(learner=learner).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert isinstance(fitted_model._wrapped_classifier.estimator, type(learner._get_sklearn_classifier())) + assert fitted_model._wrapped_model is not None + assert isinstance(fitted_model._wrapped_model.estimator, type(learner._get_sklearn_model())) class TestMaximumNumberOfLearners: @@ -31,8 +31,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = AdaBoostClassifier(maximum_number_of_learners=2).fit(training_set) - assert 
fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.n_estimators == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.n_estimators == 2 @pytest.mark.parametrize("maximum_number_of_learners", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, maximum_number_of_learners: int) -> None: @@ -50,8 +50,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = AdaBoostClassifier(learning_rate=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.learning_rate == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.learning_rate == 2 @pytest.mark.parametrize("learning_rate", [-1.0, 0.0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float) -> None: diff --git a/tests/safeds/ml/classical/classification/test_classifier.py b/tests/safeds/ml/classical/classification/test_classifier.py index 4eb86da73..2f1097402 100644 --- a/tests/safeds/ml/classical/classification/test_classifier.py +++ b/tests/safeds/ml/classical/classification/test_classifier.py @@ -19,9 +19,9 @@ DecisionTreeClassifier, GradientBoostingClassifier, KNearestNeighborsClassifier, - LogisticRegressionClassifier, + LogisticClassifier, RandomForestClassifier, - SupportVectorMachineClassifier, + SupportVectorClassifier, ) if TYPE_CHECKING: @@ -47,9 +47,9 @@ def classifiers() -> list[Classifier]: DecisionTreeClassifier(), GradientBoostingClassifier(), KNearestNeighborsClassifier(2), - LogisticRegressionClassifier(), + LogisticClassifier(), RandomForestClassifier(), - SupportVectorMachineClassifier(), + SupportVectorClassifier(), ] @@ -332,7 +332,7 @@ def predict(self, dataset: Table) -> TabularDataset: def is_fitted(self) -> bool: return 
True - def _get_sklearn_classifier(self) -> ClassifierMixin: + def _get_sklearn_model(self) -> ClassifierMixin: pass @@ -403,23 +403,6 @@ def test_with_different_types(self) -> None: assert DummyClassifier().accuracy(table) == 0.0 - @pytest.mark.parametrize( - "table", - [ - Table( - { - "a": [1.0, 0.0, 0.0, 0.0], - "b": [0.0, 1.0, 1.0, 0.0], - "c": [0.0, 0.0, 0.0, 1.0], - }, - ), - ], - ids=["table"], - ) - def test_should_raise_if_given_normal_table(self, table: Table) -> None: - with pytest.raises(PlainTableError): - DummyClassifier().accuracy(table) # type: ignore[arg-type] - class TestPrecision: def test_should_compare_result(self) -> None: @@ -452,23 +435,6 @@ def test_should_return_1_if_never_expected_to_be_positive(self) -> None: assert DummyClassifier().precision(table, 1) == 1.0 - @pytest.mark.parametrize( - "table", - [ - Table( - { - "a": [1.0, 0.0, 0.0, 0.0], - "b": [0.0, 1.0, 1.0, 0.0], - "c": [0.0, 0.0, 0.0, 1.0], - }, - ), - ], - ids=["table"], - ) - def test_should_raise_if_given_normal_table(self, table: Table) -> None: - with pytest.raises(PlainTableError): - DummyClassifier().precision(table, 1) # type: ignore[arg-type] - class TestRecall: def test_should_compare_result(self) -> None: @@ -549,20 +515,3 @@ def test_should_return_1_if_never_expected_or_predicted_to_be_positive(self) -> ).to_tabular_dataset(target_name="expected") assert DummyClassifier().f1_score(table, 1) == 1.0 - - @pytest.mark.parametrize( - "table", - [ - Table( - { - "a": [1.0, 0.0, 0.0, 0.0], - "b": [0.0, 1.0, 1.0, 0.0], - "c": [0.0, 0.0, 0.0, 1.0], - }, - ), - ], - ids=["table"], - ) - def test_should_raise_if_given_normal_table(self, table: Table) -> None: - with pytest.raises(PlainTableError): - DummyClassifier().f1_score(table, 1) # type: ignore[arg-type] diff --git a/tests/safeds/ml/classical/classification/test_decision_tree.py b/tests/safeds/ml/classical/classification/test_decision_tree.py index c1ad02dae..fef94a59b 100644 --- 
a/tests/safeds/ml/classical/classification/test_decision_tree.py +++ b/tests/safeds/ml/classical/classification/test_decision_tree.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = DecisionTreeClassifier(maximum_depth=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.max_depth == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.max_depth == 2 @pytest.mark.parametrize("maximum_depth", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, maximum_depth: int) -> None: @@ -37,8 +37,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = DecisionTreeClassifier(minimum_number_of_samples_in_leaves=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.min_samples_leaf == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.min_samples_leaf == 2 @pytest.mark.parametrize("minimum_number_of_samples_in_leaves", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, minimum_number_of_samples_in_leaves: int) -> None: diff --git a/tests/safeds/ml/classical/classification/test_gradient_boosting.py b/tests/safeds/ml/classical/classification/test_gradient_boosting.py index c48ecd15d..c6cae80d3 100644 --- a/tests/safeds/ml/classical/classification/test_gradient_boosting.py +++ b/tests/safeds/ml/classical/classification/test_gradient_boosting.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = 
GradientBoostingClassifier(number_of_trees=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.n_estimators == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.n_estimators == 2 @pytest.mark.parametrize("number_of_trees", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_1(self, number_of_trees: int) -> None: @@ -37,8 +37,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = GradientBoostingClassifier(learning_rate=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.learning_rate == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.learning_rate == 2 @pytest.mark.parametrize("learning_rate", [-1.0, 0.0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float) -> None: diff --git a/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py b/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py index 775ceb51a..ba00c4d12 100644 --- a/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py +++ b/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = KNearestNeighborsClassifier(number_of_neighbors=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.n_neighbors == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.n_neighbors == 2 @pytest.mark.parametrize("number_of_neighbors", [-1, 0], ids=["minus_one", "zero"]) def 
test_should_raise_if_less_than_or_equal_to_0(self, number_of_neighbors: int) -> None: diff --git a/tests/safeds/ml/classical/classification/test_random_forest.py b/tests/safeds/ml/classical/classification/test_random_forest.py index ec44d657f..26aa2a250 100644 --- a/tests/safeds/ml/classical/classification/test_random_forest.py +++ b/tests/safeds/ml/classical/classification/test_random_forest.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = RandomForestClassifier(number_of_trees=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.n_estimators == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.n_estimators == 2 @pytest.mark.parametrize("number_of_trees", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, number_of_trees: int) -> None: @@ -37,8 +37,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = RandomForestClassifier(maximum_depth=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.max_depth == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.max_depth == 2 @pytest.mark.parametrize("maximum_depth", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, maximum_depth: int) -> None: @@ -56,8 +56,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = RandomForestClassifier(minimum_number_of_samples_in_leaves=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - 
assert fitted_model._wrapped_classifier.min_samples_leaf == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.min_samples_leaf == 2 @pytest.mark.parametrize("minimum_number_of_samples_in_leaves", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, minimum_number_of_samples_in_leaves: int) -> None: diff --git a/tests/safeds/ml/classical/classification/test_support_vector_machine.py b/tests/safeds/ml/classical/classification/test_support_vector_machine.py index 2a19fd80a..70f287b68 100644 --- a/tests/safeds/ml/classical/classification/test_support_vector_machine.py +++ b/tests/safeds/ml/classical/classification/test_support_vector_machine.py @@ -4,11 +4,11 @@ from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError -from safeds.ml.classical.classification import SupportVectorMachineClassifier -from safeds.ml.classical.classification._support_vector_machine import SupportVectorMachineKernel +from safeds.ml.classical._bases._support_vector_machine_base import _Linear, _Polynomial, _RadialBasisFunction, _Sigmoid +from safeds.ml.classical.classification import SupportVectorClassifier -def kernels() -> list[SupportVectorMachineKernel]: +def kernels() -> list[SupportVectorClassifier.Kernel]: """ Return the list of kernels to test. @@ -21,10 +21,10 @@ def kernels() -> list[SupportVectorMachineKernel]: The list of kernels to test. 
""" return [ - SupportVectorMachineClassifier.Kernel.Linear(), - SupportVectorMachineClassifier.Kernel.Sigmoid(), - SupportVectorMachineClassifier.Kernel.Polynomial(3), - SupportVectorMachineClassifier.Kernel.RadialBasisFunction(), + SupportVectorClassifier.Kernel.linear(), + SupportVectorClassifier.Kernel.sigmoid(), + SupportVectorClassifier.Kernel.polynomial(3), + SupportVectorClassifier.Kernel.radial_basis_function(), ] @@ -36,35 +36,35 @@ def training_set() -> TabularDataset: class TestC: def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: - fitted_model = SupportVectorMachineClassifier(c=2).fit(training_set=training_set) + fitted_model = SupportVectorClassifier(c=2).fit(training_set=training_set) assert fitted_model.c == 2 def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: - fitted_model = SupportVectorMachineClassifier(c=2).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert fitted_model._wrapped_classifier.C == 2 + fitted_model = SupportVectorClassifier(c=2).fit(training_set) + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.C == 2 @pytest.mark.parametrize("c", [-1.0, 0.0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, c: float) -> None: with pytest.raises(OutOfBoundsError, match=rf"c \(={c}\) is not inside \(0, \u221e\)\."): - SupportVectorMachineClassifier(c=c) + SupportVectorClassifier(c=c) class TestKernel: def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: - kernel = SupportVectorMachineClassifier.Kernel.Linear() - fitted_model = SupportVectorMachineClassifier(c=2, kernel=kernel).fit(training_set=training_set) - assert isinstance(fitted_model.kernel, SupportVectorMachineClassifier.Kernel.Linear) + kernel = SupportVectorClassifier.Kernel.linear() + fitted_model = SupportVectorClassifier(c=2, kernel=kernel).fit(training_set=training_set) + assert 
isinstance(fitted_model.kernel, _Linear) def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: - kernel = SupportVectorMachineClassifier.Kernel.Linear() - fitted_model = SupportVectorMachineClassifier(c=2, kernel=kernel).fit(training_set) - assert fitted_model._wrapped_classifier is not None - assert isinstance(fitted_model.kernel, SupportVectorMachineClassifier.Kernel.Linear) + kernel = SupportVectorClassifier.Kernel.linear() + fitted_model = SupportVectorClassifier(c=2, kernel=kernel).fit(training_set) + assert fitted_model._wrapped_model is not None + assert isinstance(fitted_model.kernel, _Linear) def test_should_get_sklearn_arguments_linear(self) -> None: - svm = SupportVectorMachineClassifier(c=2, kernel=SupportVectorMachineClassifier.Kernel.Linear()) - assert isinstance(svm.kernel, SupportVectorMachineClassifier.Kernel.Linear) + svm = SupportVectorClassifier(c=2, kernel=SupportVectorClassifier.Kernel.linear()) + assert isinstance(svm.kernel, _Linear) linear_kernel = svm.kernel._get_sklearn_arguments() assert linear_kernel == { "kernel": "linear", @@ -73,11 +73,11 @@ def test_should_get_sklearn_arguments_linear(self) -> None: @pytest.mark.parametrize("degree", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_degree_less_than_1(self, degree: int) -> None: with pytest.raises(OutOfBoundsError, match=rf"degree \(={degree}\) is not inside \[1, \u221e\)\."): - SupportVectorMachineClassifier.Kernel.Polynomial(degree=degree) + SupportVectorClassifier.Kernel.polynomial(degree=degree) def test_should_get_sklearn_arguments_polynomial(self) -> None: - svm = SupportVectorMachineClassifier(c=2, kernel=SupportVectorMachineClassifier.Kernel.Polynomial(degree=2)) - assert isinstance(svm.kernel, SupportVectorMachineClassifier.Kernel.Polynomial) + svm = SupportVectorClassifier(c=2, kernel=SupportVectorClassifier.Kernel.polynomial(degree=2)) + assert isinstance(svm.kernel, _Polynomial) poly_kernel = svm.kernel._get_sklearn_arguments() 
assert poly_kernel == { "kernel": "poly", @@ -85,20 +85,20 @@ def test_should_get_sklearn_arguments_polynomial(self) -> None: } def test_should_get_degree(self) -> None: - kernel = SupportVectorMachineClassifier.Kernel.Polynomial(degree=3) + kernel = SupportVectorClassifier.Kernel.polynomial(degree=3) assert kernel.degree == 3 def test_should_get_sklearn_arguments_sigmoid(self) -> None: - svm = SupportVectorMachineClassifier(c=2, kernel=SupportVectorMachineClassifier.Kernel.Sigmoid()) - assert isinstance(svm.kernel, SupportVectorMachineClassifier.Kernel.Sigmoid) + svm = SupportVectorClassifier(c=2, kernel=SupportVectorClassifier.Kernel.sigmoid()) + assert isinstance(svm.kernel, _Sigmoid) sigmoid_kernel = svm.kernel._get_sklearn_arguments() assert sigmoid_kernel == { "kernel": "sigmoid", } def test_should_get_sklearn_arguments_rbf(self) -> None: - svm = SupportVectorMachineClassifier(c=2, kernel=SupportVectorMachineClassifier.Kernel.RadialBasisFunction()) - assert isinstance(svm.kernel, SupportVectorMachineClassifier.Kernel.RadialBasisFunction) + svm = SupportVectorClassifier(c=2, kernel=SupportVectorClassifier.Kernel.radial_basis_function()) + assert isinstance(svm.kernel, _RadialBasisFunction) rbf_kernel = svm.kernel._get_sklearn_arguments() assert rbf_kernel == { "kernel": "rbf", @@ -111,8 +111,8 @@ def test_should_get_sklearn_arguments_rbf(self) -> None: ) def test_should_return_same_hash_for_equal_kernel( self, - kernel1: SupportVectorMachineKernel, - kernel2: SupportVectorMachineKernel, + kernel1: SupportVectorClassifier.Kernel, + kernel2: SupportVectorClassifier.Kernel, ) -> None: assert hash(kernel1) == hash(kernel2) @@ -123,8 +123,8 @@ def test_should_return_same_hash_for_equal_kernel( ) def test_should_return_different_hash_for_unequal_kernel( self, - kernel1: SupportVectorMachineKernel, - kernel2: SupportVectorMachineKernel, + kernel1: SupportVectorClassifier.Kernel, + kernel2: SupportVectorClassifier.Kernel, ) -> None: assert hash(kernel1) != 
hash(kernel2) @@ -135,8 +135,8 @@ def test_should_return_different_hash_for_unequal_kernel( ) def test_equal_kernel( self, - kernel1: SupportVectorMachineKernel, - kernel2: SupportVectorMachineKernel, + kernel1: SupportVectorClassifier.Kernel, + kernel2: SupportVectorClassifier.Kernel, ) -> None: assert kernel1 == kernel2 @@ -147,18 +147,18 @@ def test_equal_kernel( ) def test_unequal_kernel( self, - kernel1: SupportVectorMachineKernel, - kernel2: SupportVectorMachineKernel, + kernel1: SupportVectorClassifier.Kernel, + kernel2: SupportVectorClassifier.Kernel, ) -> None: assert kernel1 != kernel2 @pytest.mark.parametrize( "kernel", - ([SupportVectorMachineClassifier.Kernel.Polynomial(3)]), + ([SupportVectorClassifier.Kernel.polynomial(3)]), ids=lambda x: x.__class__.__name__, ) def test_sizeof_kernel( self, - kernel: SupportVectorMachineKernel, + kernel: SupportVectorClassifier.Kernel, ) -> None: assert sys.getsizeof(kernel) > sys.getsizeof(object()) diff --git a/tests/safeds/ml/classical/regression/test_ada_boost.py b/tests/safeds/ml/classical/regression/test_ada_boost.py index 44cfcbd83..26521dd1d 100644 --- a/tests/safeds/ml/classical/regression/test_ada_boost.py +++ b/tests/safeds/ml/classical/regression/test_ada_boost.py @@ -20,8 +20,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: learner = AdaBoostRegressor() fitted_model = AdaBoostRegressor(learner=learner).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert isinstance(fitted_model._wrapped_regressor.estimator, type(learner._get_sklearn_regressor())) + assert fitted_model._wrapped_model is not None + assert isinstance(fitted_model._wrapped_model.estimator, type(learner._get_sklearn_regressor())) class TestMaximumNumberOfLearners: @@ -31,8 +31,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def 
test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = AdaBoostRegressor(maximum_number_of_learners=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.n_estimators == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.n_estimators == 2 @pytest.mark.parametrize("maximum_number_of_learners", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, maximum_number_of_learners: int) -> None: @@ -50,8 +50,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = AdaBoostRegressor(learning_rate=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.learning_rate == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.learning_rate == 2 @pytest.mark.parametrize("learning_rate", [-1.0, 0.0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float) -> None: diff --git a/tests/safeds/ml/classical/regression/test_decision_tree.py b/tests/safeds/ml/classical/regression/test_decision_tree.py index 883dc5107..7ace3082a 100644 --- a/tests/safeds/ml/classical/regression/test_decision_tree.py +++ b/tests/safeds/ml/classical/regression/test_decision_tree.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = DecisionTreeRegressor(maximum_depth=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.max_depth == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.max_depth == 2 @pytest.mark.parametrize("maximum_depth", 
[-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, maximum_depth: int) -> None: @@ -37,8 +37,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = DecisionTreeRegressor(minimum_number_of_samples_in_leaves=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.min_samples_leaf == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.min_samples_leaf == 2 @pytest.mark.parametrize("minimum_number_of_samples_in_leaves", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, minimum_number_of_samples_in_leaves: int) -> None: diff --git a/tests/safeds/ml/classical/regression/test_elastic_net_regression.py b/tests/safeds/ml/classical/regression/test_elastic_net_regression.py index 66f10699d..e064e3bcc 100644 --- a/tests/safeds/ml/classical/regression/test_elastic_net_regression.py +++ b/tests/safeds/ml/classical/regression/test_elastic_net_regression.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = ElasticNetRegressor(alpha=1).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.alpha == 1 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.alpha == 1 @pytest.mark.parametrize("alpha", [-0.5], ids=["minus_0_point_5"]) def test_should_raise_if_less_than_0(self, alpha: float) -> None: @@ -47,8 +47,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = ElasticNetRegressor(lasso_ratio=0.3).fit(training_set) - assert 
fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.l1_ratio == 0.3 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.l1_ratio == 0.3 @pytest.mark.parametrize("lasso_ratio", [-0.5, 1.5], ids=["minus_zero_point_5", "one_point_5"]) def test_should_raise_if_not_between_0_and_1(self, lasso_ratio: float) -> None: diff --git a/tests/safeds/ml/classical/regression/test_gradient_boosting.py b/tests/safeds/ml/classical/regression/test_gradient_boosting.py index f1ef8549d..2283f32c3 100644 --- a/tests/safeds/ml/classical/regression/test_gradient_boosting.py +++ b/tests/safeds/ml/classical/regression/test_gradient_boosting.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = GradientBoostingRegressor(number_of_trees=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.n_estimators == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.n_estimators == 2 @pytest.mark.parametrize("number_of_trees", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_1(self, number_of_trees: int) -> None: @@ -37,8 +37,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = GradientBoostingRegressor(learning_rate=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.learning_rate == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.learning_rate == 2 @pytest.mark.parametrize("learning_rate", [-1.0, 0.0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float) -> None: diff --git 
a/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py b/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py index a01e27f0b..383e7a7db 100644 --- a/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py +++ b/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = KNearestNeighborsRegressor(number_of_neighbors=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.n_neighbors == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.n_neighbors == 2 @pytest.mark.parametrize("number_of_neighbors", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, number_of_neighbors: int) -> None: diff --git a/tests/safeds/ml/classical/regression/test_lasso_regression.py b/tests/safeds/ml/classical/regression/test_lasso_regression.py index 90d771b16..aa175f975 100644 --- a/tests/safeds/ml/classical/regression/test_lasso_regression.py +++ b/tests/safeds/ml/classical/regression/test_lasso_regression.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = LassoRegressor(alpha=1).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.alpha == 1 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.alpha == 1 @pytest.mark.parametrize("alpha", [-0.5], ids=["minus_zero_point_5"]) def test_should_raise_if_less_than_0(self, alpha: float) -> None: diff --git a/tests/safeds/ml/classical/regression/test_random_forest.py b/tests/safeds/ml/classical/regression/test_random_forest.py index 
a37e2d902..c1ef2cda6 100644 --- a/tests/safeds/ml/classical/regression/test_random_forest.py +++ b/tests/safeds/ml/classical/regression/test_random_forest.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = RandomForestRegressor(number_of_trees=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.n_estimators == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.n_estimators == 2 @pytest.mark.parametrize("number_of_trees", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, number_of_trees: int) -> None: @@ -37,8 +37,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = RandomForestRegressor(maximum_depth=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.max_depth == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.max_depth == 2 @pytest.mark.parametrize("maximum_depth", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, maximum_depth: int) -> None: @@ -56,8 +56,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = RandomForestRegressor(minimum_number_of_samples_in_leaves=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.min_samples_leaf == 2 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.min_samples_leaf == 2 @pytest.mark.parametrize("minimum_number_of_samples_in_leaves", [-1, 0], 
ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, minimum_number_of_samples_in_leaves: int) -> None: diff --git a/tests/safeds/ml/classical/regression/test_regressor.py b/tests/safeds/ml/classical/regression/test_regressor.py index 90af36e63..c43affcaf 100644 --- a/tests/safeds/ml/classical/regression/test_regressor.py +++ b/tests/safeds/ml/classical/regression/test_regressor.py @@ -22,14 +22,12 @@ GradientBoostingRegressor, KNearestNeighborsRegressor, LassoRegressor, - LinearRegressionRegressor, + LinearRegressor, RandomForestRegressor, Regressor, RidgeRegressor, - SupportVectorMachineRegressor, + SupportVectorRegressor, ) - -# noinspection PyProtectedMember from safeds.ml.classical.regression._regressor import _check_metrics_preconditions if TYPE_CHECKING: @@ -57,10 +55,10 @@ def regressors() -> list[Regressor]: GradientBoostingRegressor(), KNearestNeighborsRegressor(2), LassoRegressor(), - LinearRegressionRegressor(), + LinearRegressor(), RandomForestRegressor(), RidgeRegressor(), - SupportVectorMachineRegressor(), + SupportVectorRegressor(), ] @@ -324,11 +322,10 @@ class DummyRegressor(Regressor): `target_name` must be set to `"expected"`. 
""" - def fit(self, training_set: TabularDataset) -> DummyRegressor: # noqa: ARG002 + def fit(self, _training_set: TabularDataset) -> DummyRegressor: return self def predict(self, dataset: Table) -> TabularDataset: - # Needed until https://github.com/Safe-DS/Library/issues/75 is fixed predicted = dataset.get_column("predicted") feature = predicted.rename("feature") dataset = Table.from_columns([feature, predicted]) @@ -371,23 +368,6 @@ def test_valid_data(self, predicted: list[float], expected: list[float], result: assert DummyRegressor().summarize_metrics(table) == result - @pytest.mark.parametrize( - "table", - [ - Table( - { - "a": [1.0, 0.0, 0.0, 0.0], - "b": [0.0, 1.0, 1.0, 0.0], - "c": [0.0, 0.0, 0.0, 1.0], - }, - ), - ], - ids=["table"], - ) - def test_should_raise_if_given_normal_table(self, table: Table) -> None: - with pytest.raises(PlainTableError): - DummyRegressor().summarize_metrics(table) # type: ignore[arg-type] - class TestMeanAbsoluteError: @pytest.mark.parametrize( @@ -412,23 +392,6 @@ def test_valid_data(self, predicted: list[float], expected: list[float], result: assert DummyRegressor().mean_absolute_error(table) == result - @pytest.mark.parametrize( - "table", - [ - Table( - { - "a": [1.0, 0.0, 0.0, 0.0], - "b": [0.0, 1.0, 1.0, 0.0], - "c": [0.0, 0.0, 0.0, 1.0], - }, - ), - ], - ids=["table"], - ) - def test_should_raise_if_given_normal_table(self, table: Table) -> None: - with pytest.raises(PlainTableError): - DummyRegressor().mean_absolute_error(table) # type: ignore[arg-type] - class TestMeanSquaredError: @pytest.mark.parametrize( @@ -443,23 +406,6 @@ def test_valid_data(self, predicted: list[float], expected: list[float], result: assert DummyRegressor().mean_squared_error(table) == result - @pytest.mark.parametrize( - "table", - [ - Table( - { - "a": [1.0, 0.0, 0.0, 0.0], - "b": [0.0, 1.0, 1.0, 0.0], - "c": [0.0, 0.0, 0.0, 1.0], - }, - ), - ], - ids=["table"], - ) - def test_should_raise_if_given_normal_table(self, table: Table) -> None: - 
with pytest.raises(PlainTableError): - DummyRegressor().mean_squared_error(table) # type: ignore[arg-type] - class TestCheckMetricsPreconditions: @pytest.mark.parametrize( diff --git a/tests/safeds/ml/classical/regression/test_ridge_regression.py b/tests/safeds/ml/classical/regression/test_ridge_regression.py index 3dd2054ce..87ea2b8e5 100644 --- a/tests/safeds/ml/classical/regression/test_ridge_regression.py +++ b/tests/safeds/ml/classical/regression/test_ridge_regression.py @@ -18,8 +18,8 @@ def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: fitted_model = RidgeRegressor(alpha=1).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.alpha == 1 + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.alpha == 1 @pytest.mark.parametrize("alpha", [-0.5], ids=["minus_zero_point_5"]) def test_should_raise_if_less_than_0(self, alpha: float) -> None: diff --git a/tests/safeds/ml/classical/regression/test_support_vector_machine.py b/tests/safeds/ml/classical/regression/test_support_vector_machine.py index a2015964c..c96c2110c 100644 --- a/tests/safeds/ml/classical/regression/test_support_vector_machine.py +++ b/tests/safeds/ml/classical/regression/test_support_vector_machine.py @@ -4,11 +4,10 @@ from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError -from safeds.ml.classical.regression import SupportVectorMachineRegressor -from safeds.ml.classical.regression._support_vector_machine import SupportVectorMachineKernel +from safeds.ml.classical.regression import SupportVectorRegressor -def kernels() -> list[SupportVectorMachineKernel]: +def kernels() -> list[SupportVectorRegressor.Kernel]: """ Return the list of kernels to test. 
@@ -21,10 +20,10 @@ def kernels() -> list[SupportVectorMachineKernel]: The list of kernels to test. """ return [ - SupportVectorMachineRegressor.Kernel.Linear(), - SupportVectorMachineRegressor.Kernel.Sigmoid(), - SupportVectorMachineRegressor.Kernel.Polynomial(3), - SupportVectorMachineRegressor.Kernel.RadialBasisFunction(), + SupportVectorRegressor.Kernel.linear(), + SupportVectorRegressor.Kernel.sigmoid(), + SupportVectorRegressor.Kernel.polynomial(3), + SupportVectorRegressor.Kernel.radial_basis_function(), ] @@ -36,35 +35,35 @@ def training_set() -> TabularDataset: class TestC: def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: - fitted_model = SupportVectorMachineRegressor(c=2).fit(training_set=training_set) + fitted_model = SupportVectorRegressor(c=2).fit(training_set=training_set) assert fitted_model.c == 2 def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: - fitted_model = SupportVectorMachineRegressor(c=2).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert fitted_model._wrapped_regressor.C == 2 + fitted_model = SupportVectorRegressor(c=2).fit(training_set) + assert fitted_model._wrapped_model is not None + assert fitted_model._wrapped_model.C == 2 @pytest.mark.parametrize("c", [-1.0, 0.0], ids=["minus_one", "zero"]) def test_should_raise_if_less_than_or_equal_to_0(self, c: float) -> None: with pytest.raises(OutOfBoundsError, match=rf"c \(={c}\) is not inside \(0, \u221e\)\."): - SupportVectorMachineRegressor(c=c) + SupportVectorRegressor(c=c) class TestKernel: def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: - kernel = SupportVectorMachineRegressor.Kernel.Linear() - fitted_model = SupportVectorMachineRegressor(c=2, kernel=kernel).fit(training_set=training_set) - assert isinstance(fitted_model.kernel, SupportVectorMachineRegressor.Kernel.Linear) + kernel = SupportVectorRegressor.Kernel.Linear() + fitted_model = 
SupportVectorRegressor(c=2, kernel=kernel).fit(training_set=training_set) + assert isinstance(fitted_model.kernel, SupportVectorRegressor.Kernel.Linear) def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: - kernel = SupportVectorMachineRegressor.Kernel.Linear() - fitted_model = SupportVectorMachineRegressor(c=2, kernel=kernel).fit(training_set) - assert fitted_model._wrapped_regressor is not None - assert isinstance(fitted_model.kernel, SupportVectorMachineRegressor.Kernel.Linear) + kernel = SupportVectorRegressor.Kernel.Linear() + fitted_model = SupportVectorRegressor(c=2, kernel=kernel).fit(training_set) + assert fitted_model._wrapped_model is not None + assert isinstance(fitted_model.kernel, SupportVectorRegressor.Kernel.Linear) def test_should_get_sklearn_arguments_linear(self) -> None: - svm = SupportVectorMachineRegressor(c=2, kernel=SupportVectorMachineRegressor.Kernel.Linear()) - assert isinstance(svm.kernel, SupportVectorMachineRegressor.Kernel.Linear) + svm = SupportVectorRegressor(c=2, kernel=SupportVectorRegressor.Kernel.Linear()) + assert isinstance(svm.kernel, SupportVectorRegressor.Kernel.Linear) linear_kernel = svm.kernel._get_sklearn_arguments() assert linear_kernel == { "kernel": "linear", @@ -73,11 +72,11 @@ def test_should_get_sklearn_arguments_linear(self) -> None: @pytest.mark.parametrize("degree", [-1, 0], ids=["minus_one", "zero"]) def test_should_raise_if_degree_less_than_1(self, degree: int) -> None: with pytest.raises(OutOfBoundsError, match=rf"degree \(={degree}\) is not inside \[1, \u221e\)\."): - SupportVectorMachineRegressor.Kernel.Polynomial(degree=degree) + SupportVectorRegressor.Kernel.Polynomial(degree=degree) def test_should_get_sklearn_arguments_polynomial(self) -> None: - svm = SupportVectorMachineRegressor(c=2, kernel=SupportVectorMachineRegressor.Kernel.Polynomial(degree=2)) - assert isinstance(svm.kernel, SupportVectorMachineRegressor.Kernel.Polynomial) + svm = SupportVectorRegressor(c=2, 
kernel=SupportVectorRegressor.Kernel.Polynomial(degree=2)) + assert isinstance(svm.kernel, SupportVectorRegressor.Kernel.Polynomial) poly_kernel = svm.kernel._get_sklearn_arguments() assert poly_kernel == { "kernel": "poly", @@ -85,20 +84,20 @@ def test_should_get_sklearn_arguments_polynomial(self) -> None: } def test_should_get_degree(self) -> None: - kernel = SupportVectorMachineRegressor.Kernel.Polynomial(degree=3) + kernel = SupportVectorRegressor.Kernel.Polynomial(degree=3) assert kernel.degree == 3 def test_should_get_sklearn_arguments_sigmoid(self) -> None: - svm = SupportVectorMachineRegressor(c=2, kernel=SupportVectorMachineRegressor.Kernel.Sigmoid()) - assert isinstance(svm.kernel, SupportVectorMachineRegressor.Kernel.Sigmoid) + svm = SupportVectorRegressor(c=2, kernel=SupportVectorRegressor.Kernel.Sigmoid()) + assert isinstance(svm.kernel, SupportVectorRegressor.Kernel.Sigmoid) sigmoid_kernel = svm.kernel._get_sklearn_arguments() assert sigmoid_kernel == { "kernel": "sigmoid", } def test_should_get_sklearn_arguments_rbf(self) -> None: - svm = SupportVectorMachineRegressor(c=2, kernel=SupportVectorMachineRegressor.Kernel.RadialBasisFunction()) - assert isinstance(svm.kernel, SupportVectorMachineRegressor.Kernel.RadialBasisFunction) + svm = SupportVectorRegressor(c=2, kernel=SupportVectorRegressor.Kernel.RadialBasisFunction()) + assert isinstance(svm.kernel, SupportVectorRegressor.Kernel.RadialBasisFunction) rbf_kernel = svm.kernel._get_sklearn_arguments() assert rbf_kernel == { "kernel": "rbf", @@ -111,8 +110,8 @@ def test_should_get_sklearn_arguments_rbf(self) -> None: ) def test_should_return_same_hash_for_equal_kernel( self, - kernel1: SupportVectorMachineKernel, - kernel2: SupportVectorMachineKernel, + kernel1: SupportVectorRegressor.Kernel, + kernel2: SupportVectorRegressor.Kernel, ) -> None: assert hash(kernel1) == hash(kernel2) @@ -123,8 +122,8 @@ def test_should_return_same_hash_for_equal_kernel( ) def 
test_should_return_different_hash_for_unequal_kernel( self, - kernel1: SupportVectorMachineKernel, - kernel2: SupportVectorMachineKernel, + kernel1: SupportVectorRegressor.Kernel, + kernel2: SupportVectorRegressor.Kernel, ) -> None: assert hash(kernel1) != hash(kernel2) @@ -135,8 +134,8 @@ def test_should_return_different_hash_for_unequal_kernel( ) def test_equal_kernel( self, - kernel1: SupportVectorMachineKernel, - kernel2: SupportVectorMachineKernel, + kernel1: SupportVectorRegressor.Kernel, + kernel2: SupportVectorRegressor.Kernel, ) -> None: assert kernel1 == kernel2 @@ -147,18 +146,18 @@ def test_equal_kernel( ) def test_unequal_kernel( self, - kernel1: SupportVectorMachineKernel, - kernel2: SupportVectorMachineKernel, + kernel1: SupportVectorRegressor.Kernel, + kernel2: SupportVectorRegressor.Kernel, ) -> None: assert kernel1 != kernel2 @pytest.mark.parametrize( "kernel", - ([SupportVectorMachineRegressor.Kernel.Polynomial(3)]), + ([SupportVectorRegressor.Kernel.polynomial(3)]), ids=lambda x: x.__class__.__name__, ) def test_sizeof_kernel( self, - kernel: SupportVectorMachineKernel, + kernel: SupportVectorRegressor.Kernel, ) -> None: assert sys.getsizeof(kernel) > sys.getsizeof(object()) diff --git a/tests/safeds/ml/classical/test_util_sklearn.py b/tests/safeds/ml/classical/test_util_sklearn.py index 7c547b2e3..75e5c1fe6 100644 --- a/tests/safeds/ml/classical/test_util_sklearn.py +++ b/tests/safeds/ml/classical/test_util_sklearn.py @@ -1,56 +1,57 @@ -import warnings -from typing import Any - -import pytest -from safeds.data.tabular.containers import Table -from safeds.exceptions import LearningError, PredictionError -from safeds.ml.classical._util_sklearn import fit, predict -from safeds.ml.classical.regression import LinearRegressionRegressor - - -def test_predict_should_not_warn_about_feature_names() -> None: - """See https://github.com/Safe-DS/Library/issues/51.""" - training_set = Table({"a": [1, 2, 3], "b": [2, 4, 
6]}).to_tabular_dataset(target_name="b") - - model = LinearRegressionRegressor() - fitted_model = model.fit(training_set) - - test_set = Table({"a": [4, 5, 6]}) - - # No warning should be emitted - with warnings.catch_warnings(): - warnings.filterwarnings("error", message="X has feature names") - fitted_model.predict(test_set) - - -class MLModelRaiseValueErrorOnFitAndPredict: - x, y = None, None - - def fit(self, x: Any, y: Any) -> None: - # The Linter does not want unnecessary parameters, so we just assign them to the class values - self.x = x - self.y = y - raise ValueError("Raise ValueError (LearningError) in fit for Test") - - def predict(self, x: Any) -> None: - # The Linter does not want unnecessary parameters, so we just assign it to the class value - self.x = x - raise ValueError("Raise ValueError (PredictionError) in predict for Test") - - -def test_should_raise_learning_error() -> None: - tabular_dataset = Table({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]}).to_tabular_dataset("col3") - with pytest.raises( - LearningError, - match=r"Error occurred while learning: Raise ValueError \(LearningError\) in fit for Test", - ): - fit(MLModelRaiseValueErrorOnFitAndPredict(), tabular_dataset) - - -def test_should_raise_prediction_error() -> None: - table = Table({"col1": [1, 2], "col2": [3, 4]}) - with pytest.raises( - PredictionError, - match=r"Error occurred while predicting: Raise ValueError \(PredictionError\) in predict for Test", - ): - predict(MLModelRaiseValueErrorOnFitAndPredict(), table, ["col1", "col2"], "col3") +# TODO +# import warnings +# from typing import Any +# +# import pytest +# from safeds.data.tabular.containers import Table +# from safeds.exceptions import LearningError, PredictionError +# from safeds.ml.classical._util_sklearn import fit, predict +# from safeds.ml.classical.regression import LinearRegressor +# +# +# def test_predict_should_not_warn_about_feature_names() -> None: +# """See https://github.com/Safe-DS/Library/issues/51.""" +# 
training_set = Table({"a": [1, 2, 3], "b": [2, 4, 6]}).to_tabular_dataset(target_name="b") +# +# model = LinearRegressor() +# fitted_model = model.fit(training_set) +# +# test_set = Table({"a": [4, 5, 6]}) +# +# # No warning should be emitted +# with warnings.catch_warnings(): +# warnings.filterwarnings("error", message="X has feature names") +# fitted_model.predict(test_set) +# +# +# class MLModelRaiseValueErrorOnFitAndPredict: +# x, y = None, None +# +# def fit(self, x: Any, y: Any) -> None: +# # The Linter does not want unnecessary parameters, so we just assign them to the class values +# self.x = x +# self.y = y +# raise ValueError("Raise ValueError (LearningError) in fit for Test") +# +# def predict(self, x: Any) -> None: +# # The Linter does not want unnecessary parameters, so we just assign it to the class value +# self.x = x +# raise ValueError("Raise ValueError (PredictionError) in predict for Test") +# +# +# def test_should_raise_learning_error() -> None: +# tabular_dataset = Table({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]}).to_tabular_dataset("col3") +# with pytest.raises( +# LearningError, +# match=r"Error occurred while learning: Raise ValueError \(LearningError\) in fit for Test", +# ): +# fit(MLModelRaiseValueErrorOnFitAndPredict(), tabular_dataset) +# +# +# def test_should_raise_prediction_error() -> None: +# table = Table({"col1": [1, 2], "col2": [3, 4]}) +# with pytest.raises( +# PredictionError, +# match=r"Error occurred while predicting: Raise ValueError \(PredictionError\) in predict for Test", +# ): +# predict(MLModelRaiseValueErrorOnFitAndPredict(), table, ["col1", "col2"], "col3") diff --git a/tests/safeds/ml/nn/test_cnn_workflow.py b/tests/safeds/ml/nn/test_cnn_workflow.py index 95caf6781..0a5751ee6 100644 --- a/tests/safeds/ml/nn/test_cnn_workflow.py +++ b/tests/safeds/ml/nn/test_cnn_workflow.py @@ -3,7 +3,6 @@ import pytest import torch - from safeds._config import _get_device from safeds.data.image.containers import ImageList from 
safeds.data.labeled.containers import ImageDataset @@ -25,7 +24,7 @@ from syrupy import SnapshotAssertion from torch.types import Device -from tests.helpers import device_cpu, device_cuda, images_all, resolve_resource_path, configure_test_with_device +from tests.helpers import configure_test_with_device, device_cpu, device_cuda, images_all, resolve_resource_path if TYPE_CHECKING: from safeds.ml.nn import Layer @@ -92,7 +91,7 @@ def test_should_train_and_predict_model( torch.eq( nn_original._model.state_dict()["_pytorch_layers.3._layer.bias"], nn._model.state_dict()["_pytorch_layers.3._layer.bias"], - ) + ), ).item() prediction: ImageDataset = nn.predict(image_dataset.get_input()) assert one_hot_encoder.inverse_transform(prediction.get_output()) == Table({"class": prediction_label}) @@ -159,7 +158,7 @@ def test_should_train_and_predict_model( torch.eq( nn_original._model.state_dict()["_pytorch_layers.3._layer.bias"], nn._model.state_dict()["_pytorch_layers.3._layer.bias"], - ) + ), ).item() prediction: ImageDataset = nn.predict(image_dataset.get_input()) assert prediction.get_output() == Column("class", prediction_label) @@ -209,7 +208,7 @@ def test_should_train_and_predict_model( torch.eq( nn_original._model.state_dict()["_pytorch_layers.3._layer.bias"], nn._model.state_dict()["_pytorch_layers.3._layer.bias"], - ) + ), ).item() prediction = nn.predict(image_dataset.get_input()) assert isinstance(prediction.get_output(), ImageList)