From 7d46802fae0142a0c680880899659453d86da32e Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Mon, 27 Mar 2023 17:10:26 +0200 Subject: [PATCH 1/2] feat: function to drop columns/rows with missing values --- src/safeds/data/tabular/containers/_table.py | 39 +++++++++++++++---- .../test_drop_columns_with_missing_values.py | 27 +++++++++++++ .../test_drop_rows_with_missing_values.py | 27 +++++++++++++ 3 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 tests/safeds/data/tabular/containers/_table/test_drop_columns_with_missing_values.py create mode 100644 tests/safeds/data/tabular/containers/_table/test_drop_rows_with_missing_values.py diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index f34ee6c02..61e3eb512 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -272,7 +272,7 @@ def get_column(self, column_name: str) -> Column: if self._schema.has_column(column_name): output_column = Column( self._data.iloc[ - :, [self._schema._get_column_index_by_name(column_name)] + :, [self._schema._get_column_index_by_name(column_name)] ].squeeze(), column_name, self._schema.get_type_of_column(column_name), @@ -573,6 +573,21 @@ def drop_columns(self, column_names: list[str]) -> Table: ) return Table(transformed_data) + def drop_columns_with_missing_values(self) -> Table: + """ + Return a table without the columns that contain missing values. + + Returns + ------- + table : Table + A table without the columns that contain missing values. + """ + return Table.from_columns([ + column + for column in self.to_columns() + if not column.has_missing_values() + ]) + def drop_columns_with_non_numerical_values(self) -> Table: """ Return a table without the columns that contain non-numerical values. @@ -593,12 +608,24 @@ def drop_duplicate_rows(self) -> Table: ------- result : Table The table with the duplicate rows removed. - """ df = self._data.drop_duplicates(ignore_index=True) df.columns = self._schema.get_column_names() return Table(df) + def drop_rows_with_missing_values(self) -> Table: + """ + Return a table without the rows that contain missing values. + + Returns + ------- + table : Table + A table without the rows that contain missing values. + """ + result = self._data.copy(deep=True) + result = result.dropna(axis="index") + return Table(result, self._schema) + def drop_rows_with_outliers(self) -> Table: """ Remove all rows from the table that contain at least one outlier defined as having a value that has a distance @@ -828,10 +855,7 @@ def slice( def sort_columns( self, - query: Callable[[Column, Column], int] = lambda col1, col2: ( - col1.name > col2.name - ) - - (col1.name < col2.name), + query: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name) - (col1.name < col2.name), ) -> Table: """ Sort a table with the given lambda function. @@ -868,7 +892,8 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]: Returns ------- result : (Table, Table) - A tuple containing the two resulting tables. The first table has the specified size, the second table contains the rest of the data. + A tuple containing the two resulting tables. The first table has the specified size, the second table + contains the rest of the data. """ diff --git a/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_missing_values.py b/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_missing_values.py new file mode 100644 index 000000000..41ee02574 --- /dev/null +++ b/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_missing_values.py @@ -0,0 +1,27 @@ +import numpy as np +import pandas as pd +from safeds.data.tabular.containers import Table +from safeds.data.tabular.typing import ColumnType, TableSchema + + +def test_drop_columns_with_missing_values_valid() -> None: + table = Table( + pd.DataFrame( + data={ + "col1": [None, None, None, None], + "col2": [1, 2, 3, None], + "col3": [1, 2, 3, 4], + "col4": [2, 3, 1, 4], + } + ) + ) + updated_table = table.drop_columns_with_missing_values() + assert updated_table.get_column_names() == ["col3", "col4"] + + +def test_drop_columns_with_missing_values_empty() -> None: + table = Table( + [], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}) + ) + updated_table = table.drop_columns_with_missing_values() + assert updated_table.get_column_names() == ["col1"] diff --git a/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_missing_values.py b/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_missing_values.py new file mode 100644 index 000000000..80bfe3ea9 --- /dev/null +++ b/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_missing_values.py @@ -0,0 +1,27 @@ +import numpy as np +import pandas as pd +from safeds.data.tabular.containers import Table +from safeds.data.tabular.typing import ColumnType, TableSchema + + +def test_drop_rows_with_missing_values_valid() -> None: + table = Table( + pd.DataFrame( + data={ + "col1": [None, None, "C", "A"], + "col2": [None, "Test1", "Test3", "Test1"], + "col3": [None, 2, 3, 4], + "col4": [None, 3, 1, 4], + } + ) + ) + updated_table = table.drop_rows_with_missing_values() + assert updated_table.count_rows() == 2 + + +def test_drop_rows_with_missing_values_empty() -> None: + table = Table( + [], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}) + ) + updated_table = table.drop_rows_with_missing_values() + assert updated_table.get_column_names() == ["col1"] From 491724fb10e4acde3ac8d8ae8e277f54d18f7860 Mon Sep 17 00:00:00 2001 From: lars-reimann Date: Mon, 27 Mar 2023 15:13:51 +0000 Subject: [PATCH 2/2] style: apply automated linter fixes --- src/safeds/data/tabular/containers/_table.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 61e3eb512..3b12b7dea 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -272,7 +272,7 @@ def get_column(self, column_name: str) -> Column: if self._schema.has_column(column_name): output_column = Column( self._data.iloc[ - :, [self._schema._get_column_index_by_name(column_name)] + :, [self._schema._get_column_index_by_name(column_name)] ].squeeze(), column_name, self._schema.get_type_of_column(column_name), @@ -582,11 +582,9 @@ def drop_columns_with_missing_values(self) -> Table: table : Table A table without the columns that contain missing values. """ - return Table.from_columns([ - column - for column in self.to_columns() - if not column.has_missing_values() - ]) + return Table.from_columns( + [column for column in self.to_columns() if not column.has_missing_values()] + ) def drop_columns_with_non_numerical_values(self) -> Table: """ @@ -855,7 +853,10 @@ def slice( def sort_columns( self, - query: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name) - (col1.name < col2.name), + query: Callable[[Column, Column], int] = lambda col1, col2: ( + col1.name > col2.name + ) + - (col1.name < col2.name), ) -> Table: """ Sort a table with the given lambda function.