From f9f9483bd1d1ff45001098809719545959d3fa7a Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Mon, 27 Mar 2023 16:25:53 +0200 Subject: [PATCH 1/3] style: structure methods of `Table` --- src/safeds/data/tabular/containers/_table.py | 987 +++++++++--------- .../data/tabular/transformation/_imputer.py | 4 +- ..._drop_columns_with_non_numerical_values.py | 30 + 3 files changed, 549 insertions(+), 472 deletions(-) create mode 100644 tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 1eed44683..de05e8fa3 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -12,6 +12,8 @@ import seaborn as sns from IPython.core.display_functions import DisplayHandle, display from pandas import DataFrame, Series +from scipy import stats + from safeds.data.tabular.containers._column import Column from safeds.data.tabular.containers._row import Row from safeds.data.tabular.typing import ColumnType, TableSchema @@ -26,7 +28,6 @@ SchemaMismatchError, UnknownColumnNameError, ) -from scipy import stats # noinspection PyProtectedMember @@ -47,44 +48,39 @@ class Table: If the table is empty and no schema is specified. """ - def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None): - self._data: pd.Dataframe = ( - data if isinstance(data, pd.DataFrame) else pd.DataFrame(data) - ) - if schema is None: - if self.count_columns() == 0: - raise MissingSchemaError() - self.schema: TableSchema = TableSchema._from_dataframe(self._data) - else: - self.schema = schema - if self._data.empty: - self._data = pd.DataFrame(columns=self.schema.get_column_names()) - - self._data = self._data.reset_index(drop=True) - self._data.columns = list(range(self.count_columns())) + # ------------------------------------------------------------------------------------------------------------------ + # Creation + # ------------------------------------------------------------------------------------------------------------------ - def get_row(self, index: int) -> Row: + @staticmethod + def from_csv(path: str) -> Table: """ - Return the row at a specified index. + Read data from a CSV file into a table. Parameters ---------- - index : int - The index. + path : str + The path to the CSV file. Returns ------- - row : Row - The row of the table at the index. + table : Table + The table created from the CSV file. Raises ------ - IndexOutOfBoundsError - If no row at the specified index exists in this table. + FileNotFoundError + If the specified file does not exist. + ValueError + If the file could not be read. """ - if len(self._data.index) - 1 < index or index < 0: - raise IndexOutOfBoundsError(index) - return Row(self._data.iloc[[index]].squeeze(), self.schema) + + try: + return Table(pd.read_csv(path)) + except FileNotFoundError as exception: + raise FileNotFoundError(f'File "{path}" does not exist') from exception + except Exception as exception: + raise ValueError(f'Could not read file from "{path}" as CSV') from exception @staticmethod def from_json(path: str) -> Table: @@ -119,34 +115,42 @@ def from_json(path: str) -> Table: ) from exception @staticmethod - def from_csv(path: str) -> Table: + def from_columns(columns: list[Column]) -> Table: """ - Read data from a CSV file into a table. + Return a table created from a list of columns. Parameters ---------- - path : str - The path to the CSV file. + columns : list[Column] + The columns to be combined. They need to have the same size. Returns ------- table : Table - The table created from the CSV file. + The generated table. Raises ------ - FileNotFoundError - If the specified file does not exist. - ValueError - If the file could not be read. + MissingDataError + If an empty list is given. + ColumnLengthMismatchError + If any of the column sizes does not match with the others. """ + if len(columns) == 0: + raise MissingDataError("This function requires at least one column.") - try: - return Table(pd.read_csv(path)) - except FileNotFoundError as exception: - raise FileNotFoundError(f'File "{path}" does not exist') from exception - except Exception as exception: - raise ValueError(f'Could not read file from "{path}" as CSV') from exception + dataframe: DataFrame = pd.DataFrame() + + for column in columns: + if column._data.size != columns[0]._data.size: + raise ColumnLengthMismatchError( + "\n".join( + [f"{column.name}: {column._data.size}" for column in columns] + ) + ) + dataframe[column.name] = column._data + + return Table(dataframe) @staticmethod def from_rows(rows: list[Row]) -> Table: @@ -185,109 +189,67 @@ def from_rows(rows: list[Row]) -> Table: dataframe.columns = schema_compare.get_column_names() return Table(dataframe) - @staticmethod - def from_columns(columns: list[Column]) -> Table: - """ - Return a table created from a list of columns. - - Parameters - ---------- - columns : list[Column] - The columns to be combined. They need to have the same size. - - Returns - ------- - table : Table - The generated table. - - Raises - ------ - MissingDataError - If an empty list is given. - ColumnLengthMismatchError - If any of the column sizes does not match with the others. - """ - if len(columns) == 0: - raise MissingDataError("This function requires at least one column.") + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ - dataframe: DataFrame = pd.DataFrame() + def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None): + self._data: pd.Dataframe = ( + data if isinstance(data, pd.DataFrame) else pd.DataFrame(data) + ) + if schema is None: + if self.count_columns() == 0: + raise MissingSchemaError() + self._schema: TableSchema = TableSchema._from_dataframe(self._data) + else: + self._schema = schema + if self._data.empty: + self._data = pd.DataFrame(columns=self._schema.get_column_names()) - for column in columns: - if column._data.size != columns[0]._data.size: - raise ColumnLengthMismatchError( - "\n".join( - [f"{column.name}: {column._data.size}" for column in columns] - ) - ) - dataframe[column.name] = column._data + self._data = self._data.reset_index(drop=True) + self._data.columns = list(range(self.count_columns())) - return Table(dataframe) + def __eq__(self, other: typing.Any) -> bool: + if not isinstance(other, Table): + return NotImplemented + if self is other: + return True + table1 = self.sort_columns() + table2 = other.sort_columns() + return table1._data.equals(table2._data) and table1._schema == table2._schema - def to_json(self, path_to_file: str) -> None: - """ - Write the data from the table into a JSON file. - If the file and/or the directories do not exist, they will be created. - If the file already exists it will be overwritten. + def __hash__(self) -> int: + return hash(self._data) - Parameters - ---------- - path_to_file : str - The path to the output file. - """ - Path(os.path.dirname(path_to_file)).mkdir(parents=True, exist_ok=True) - data_to_json = self._data.copy() - data_to_json.columns = self.schema.get_column_names() - data_to_json.to_json(path_to_file) + def __repr__(self) -> str: + tmp = self._data.copy(deep=True) + tmp.columns = self.get_column_names() + return tmp.__repr__() - def to_csv(self, path_to_file: str) -> None: - """ - Write the data from the table into a CSV file. - If the file and/or the directories do not exist they will be created. - If the file already exists it will be overwritten. + def __str__(self) -> str: + tmp = self._data.copy(deep=True) + tmp.columns = self.get_column_names() + return tmp.__str__() - Parameters - ---------- - path_to_file : str - The path to the output file. - """ - Path(os.path.dirname(path_to_file)).mkdir(parents=True, exist_ok=True) - data_to_csv = self._data.copy() - data_to_csv.columns = self.schema.get_column_names() - data_to_csv.to_csv(path_to_file, index=False) + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ - def rename_column(self, old_name: str, new_name: str) -> Table: + @property + def schema(self) -> TableSchema: """ - Rename a single column. - - Parameters - ---------- - old_name : str - The old name of the target column - new_name : str - The new name of the target column + Return the schema of the table. Returns ------- - table : Table - The Table with the renamed column. - - Raises - ------ - ColumnNameError - If the specified old target column name does not exist. - DuplicateColumnNameError - If the specified new target column name already exists. + schema : TableSchema + The schema. """ - if old_name not in self.schema.get_column_names(): - raise UnknownColumnNameError([old_name]) - if old_name == new_name: - return self - if new_name in self.schema.get_column_names(): - raise DuplicateColumnNameError(new_name) + return self._schema - new_df = self._data.copy() - new_df.columns = self.schema.get_column_names() - return Table(new_df.rename(columns={old_name: new_name})) + # ------------------------------------------------------------------------------------------------------------------ + # Getters + # ------------------------------------------------------------------------------------------------------------------ def get_column(self, column_name: str) -> Column: """ @@ -308,120 +270,95 @@ def get_column(self, column_name: str) -> Column: UnknownColumnNameError If the specified target column name does not exist. """ - if self.schema.has_column(column_name): + if self._schema.has_column(column_name): output_column = Column( self._data.iloc[ - :, [self.schema._get_column_index_by_name(column_name)] + :, [self._schema._get_column_index_by_name(column_name)] ].squeeze(), column_name, - self.schema.get_type_of_column(column_name), + self._schema.get_type_of_column(column_name), ) return output_column raise UnknownColumnNameError([column_name]) - def drop_columns(self, column_names: list[str]) -> Table: + def has_column(self, column_name: str) -> bool: """ - Return a table without the given column(s). + Return whether the table contains a given column. + Alias for self.schema.hasColumn(column_name: str) -> bool. Parameters ---------- + column_name : str + The name of the column. + + Returns + ------- + contains : bool + True if the column exists. + """ + return self._schema.has_column(column_name) + + def get_column_names(self) -> list[str]: + """ + Return a list of all column names in this table. + Alias for self.schema.get_column_names() -> list[str]. + + Returns + ------- column_names : list[str] - A list containing all columns to be dropped. + The list of the column names. + """ + return self._schema.get_column_names() + + def get_type_of_column(self, column_name: str) -> ColumnType: + """ + Return the type of the given column. + Alias for self.schema.get_type_of_column(column_name: str) -> ColumnType. + + Parameters + ---------- + column_name : str + The name of the column to be queried. Returns ------- - table : Table - A table without the given columns. + type : ColumnType + The type of the column. Raises ------ ColumnNameError - If any of the given columns do not exist. + If the specified target column name does not exist. """ - invalid_columns = [] - column_indices = [] - for name in column_names: - if not self.schema.has_column(name): - invalid_columns.append(name) - else: - column_indices.append(self.schema._get_column_index_by_name(name)) - if len(invalid_columns) != 0: - raise UnknownColumnNameError(invalid_columns) - transformed_data = self._data.drop(labels=column_indices, axis="columns") - transformed_data.columns = list( - name for name in self.schema.get_column_names() if name not in column_names - ) - return Table(transformed_data) + return self._schema.get_type_of_column(column_name) - def keep_columns(self, column_names: list[str]) -> Table: + def get_row(self, index: int) -> Row: """ - Return a table with only the given column(s). + Return the row at a specified index. Parameters ---------- - column_names : list[str] - A list containing only the columns to be kept. + index : int + The index. Returns ------- - table : Table - A table containing only the given column(s). + row : Row + The row of the table at the index. Raises ------ - ColumnNameError - If any of the given columns do not exist. - """ - invalid_columns = [] - column_indices = [] - for name in column_names: - if not self.schema.has_column(name): - invalid_columns.append(name) - else: - column_indices.append(self.schema._get_column_index_by_name(name)) - if len(invalid_columns) != 0: - raise UnknownColumnNameError(invalid_columns) - transformed_data = self._data[column_indices] - transformed_data.columns = list( - name for name in self.schema.get_column_names() if name in column_names - ) - return Table(transformed_data) - - def to_rows(self) -> list[Row]: - """ - Return a list of the rows. - - Returns - ------- - rows : list[Row] - List of rows. - """ - return [ - Row(series_row, self.schema) for (_, series_row) in self._data.iterrows() - ] - - def filter_rows(self, query: Callable[[Row], bool]) -> Table: - """ - Return a table with rows filtered by Callable (e.g. lambda function). - - Parameters - ---------- - query : lambda function - A Callable that is applied to all rows. - - Returns - ------- - table : Table - A table containing only the rows filtered by the query. + IndexOutOfBoundsError + If no row at the specified index exists in this table. """ + if len(self._data.index) - 1 < index or index < 0: + raise IndexOutOfBoundsError(index) + return Row(self._data.iloc[[index]].squeeze(), self._schema) - rows: list[Row] = [row for row in self.to_rows() if query(row)] - if len(rows) == 0: - result_table = Table([], self.schema) - else: - result_table = self.from_rows(rows) - return result_table + # ------------------------------------------------------------------------------------------------------------------ + # Information + # ------------------------------------------------------------------------------------------------------------------ def count_rows(self) -> int: """ @@ -445,82 +382,53 @@ def count_columns(self) -> int: """ return self._data.shape[1] - def to_columns(self) -> list[Column]: - """ - Return a list of the columns. - - Returns - ------- - columns : list[Columns] - List of columns. - """ - return [self.get_column(name) for name in self.schema.get_column_names()] - - def drop_duplicate_rows(self) -> Table: + def summary(self) -> Table: """ - Return a copy of the table with every duplicate row removed. + Return a table with a number of statistical key values. Returns ------- result : Table - The table with the duplicate rows removed. - - """ - df = self._data.drop_duplicates(ignore_index=True) - df.columns = self.schema.get_column_names() - return Table(df) - - def replace_column(self, old_column_name: str, new_column: Column) -> Table: + The table with statistics. """ - Return a copy of the table with the specified old column replaced by a new column. Keeps the order of columns. - - Parameters - ---------- - old_column_name : str - The name of the column to be replaced. - - new_column : Column - The new column replacing the old column. - Returns - ------- - result : Table - A table with the old column replaced by the new column. - - Raises - ------ - UnknownColumnNameError - If the old column does not exist. - - DuplicateColumnNameError - If the new column already exists and the existing column is not affected by the replacement. + columns = self.to_columns() + result = pd.DataFrame() + statistics = {} - ColumnSizeError - If the size of the column does not match the amount of rows. - """ - if old_column_name not in self.schema.get_column_names(): - raise UnknownColumnNameError([old_column_name]) + for column in columns: + statistics = { + "maximum": column.maximum, + "minimum": column.minimum, + "mean": column.mean, + "mode": column.mode, + "median": column.median, + "sum": column.sum, + "variance": column.variance, + "standard deviation": column.standard_deviation, + "idness": column.idness, + "stability": column.stability, + "row count": column.count, + } + values = [] - if ( - new_column.name in self.schema.get_column_names() - and new_column.name != old_column_name - ): - raise DuplicateColumnNameError(new_column.name) + for function in statistics.values(): + try: + values.append(str(function())) + except NonNumericColumnError: + values.append("-") - if self.count_rows() != new_column._data.size: - raise ColumnSizeError(str(self.count_rows()), str(new_column._data.size)) + result = pd.concat([result, pd.DataFrame(values)], axis=1) - if old_column_name != new_column.name: - renamed_table = self.rename_column(old_column_name, new_column.name) - result = renamed_table._data - result.columns = renamed_table.schema.get_column_names() - else: - result = self._data.copy() - result.columns = self.schema.get_column_names() + result = pd.concat([pd.DataFrame(list(statistics.keys())), result], axis=1) + result.columns = ["metrics"] + self.get_column_names() - result[new_column.name] = new_column._data return Table(result) + # ------------------------------------------------------------------------------------------------------------------ + # Transformations + # ------------------------------------------------------------------------------------------------------------------ + def add_column(self, column: Column) -> Table: """ Return the original table with the provided column attached at the end. @@ -539,14 +447,14 @@ def add_column(self, column: Column) -> Table: If the size of the column does not match the amount of rows. """ - if self.schema.has_column(column.name): + if self._schema.has_column(column.name): raise DuplicateColumnNameError(column.name) if column._data.size != self.count_rows(): raise ColumnSizeError(str(self.count_rows()), str(column._data.size)) result = self._data.copy() - result.columns = self.schema.get_column_names() + result.columns = self._schema.get_column_names() result[column.name] = column._data return Table(result) @@ -574,7 +482,7 @@ def add_columns(self, columns: Union[list[Column], Table]) -> Table: if isinstance(columns, Table): columns = columns.to_columns() result = self._data.copy() - result.columns = self.schema.get_column_names() + result.columns = self._schema.get_column_names() for column in columns: if column.name in result.columns: raise DuplicateColumnNameError(column.name) @@ -600,10 +508,10 @@ def add_row(self, row: Row) -> Table: A new table with the added row at the end. """ - if self.schema != row.schema: + if self._schema != row.schema: raise SchemaMismatchError() new_df = pd.concat([self._data, row._data.to_frame().T]).infer_objects() - new_df.columns = self.schema.get_column_names() + new_df.columns = self._schema.get_column_names() return Table(new_df) def add_rows(self, rows: Union[list[Row], Table]) -> Table: @@ -624,269 +532,240 @@ def add_rows(self, rows: Union[list[Row], Table]) -> Table: rows = rows.to_rows() result = self._data for row in rows: - if self.schema != row.schema: + if self._schema != row.schema: raise SchemaMismatchError() result = pd.concat( [result, *[row._data.to_frame().T for row in rows]] ).infer_objects() - result.columns = self.schema.get_column_names() + result.columns = self._schema.get_column_names() return Table(result) - def has_column(self, column_name: str) -> bool: + def drop_columns(self, column_names: list[str]) -> Table: """ - Return whether the table contains a given column. - Alias for self.schema.hasColumn(column_name: str) -> bool. + Return a table without the given column(s). Parameters ---------- - column_name : str - The name of the column. + column_names : list[str] + A list containing all columns to be dropped. Returns ------- - contains : bool - True if the column exists. + table : Table + A table without the given columns. + + Raises + ------ + ColumnNameError + If any of the given columns do not exist. """ - return self.schema.has_column(column_name) + invalid_columns = [] + column_indices = [] + for name in column_names: + if not self._schema.has_column(name): + invalid_columns.append(name) + else: + column_indices.append(self._schema._get_column_index_by_name(name)) + if len(invalid_columns) != 0: + raise UnknownColumnNameError(invalid_columns) + transformed_data = self._data.drop(labels=column_indices, axis="columns") + transformed_data.columns = list( + name for name in self._schema.get_column_names() if name not in column_names + ) + return Table(transformed_data) - def list_columns_with_missing_values(self) -> list[Column]: + def drop_duplicate_rows(self) -> Table: """ - Return a list of all the columns that have at least one missing value. Returns an empty list if there are none. + Return a copy of the table with every duplicate row removed. Returns ------- - columns_with_missing_values: list[Column] - The list of columns with missing values. + result : Table + The table with the duplicate rows removed. + """ - columns = self.to_columns() - columns_with_missing_values = [] - for column in columns: - if column.has_missing_values(): - columns_with_missing_values.append(column) - return columns_with_missing_values + df = self._data.drop_duplicates(ignore_index=True) + df.columns = self._schema.get_column_names() + return Table(df) - def list_columns_with_non_numerical_values(self) -> list[Column]: + def drop_rows_with_outliers(self) -> Table: """ - Return a list of columns only containing non-numerical values. + Remove all rows from the table that contain at least one outlier defined as having a value that has a distance + of more than 3 standard deviations from the column average. Returns ------- - cols : list[Column] - The list with only non-numerical columns. + new_table : Table + A new table without rows containing outliers. """ - cols = [] - for column_name, data_type in self.schema._schema.items(): - if not data_type.is_numeric(): - cols.append(self.get_column(column_name)) - return cols + result = self._data.copy(deep=True) - def list_columns_with_numerical_values(self) -> list[Column]: - """ - Return a list of columns only containing numerical values. + table_without_nonnumericals = Table.from_columns( + self.list_columns_with_numerical_values() + ) - Returns - ------- - cols : list[Column] - The list with only numerical columns. - """ - cols = [] - for column_name, data_type in self.schema._schema.items(): - if data_type.is_numeric(): - cols.append(self.get_column(column_name)) - return cols + result = result[ + (np.absolute(stats.zscore(table_without_nonnumericals._data)) < 3).all( + axis=1 + ) + ] - def get_column_names(self) -> list[str]: + return Table(result, self._schema) + + def filter_rows(self, query: Callable[[Row], bool]) -> Table: """ - Return a list of all column names in this table. - Alias for self.schema.get_column_names() -> list[str]. + Return a table with rows filtered by Callable (e.g. lambda function). + + Parameters + ---------- + query : lambda function + A Callable that is applied to all rows. Returns ------- - column_names : list[str] - The list of the column names. + table : Table + A table containing only the rows filtered by the query. """ - return self.schema.get_column_names() - def get_type_of_column(self, column_name: str) -> ColumnType: + rows: list[Row] = [row for row in self.to_rows() if query(row)] + if len(rows) == 0: + result_table = Table([], self._schema) + else: + result_table = self.from_rows(rows) + return result_table + + def keep_columns(self, column_names: list[str]) -> Table: """ - Return the type of the given column. - Alias for self.schema.get_type_of_column(column_name: str) -> ColumnType. + Return a table with only the given column(s). Parameters ---------- - column_name : str - The name of the column to be queried. + column_names : list[str] + A list containing only the columns to be kept. Returns ------- - type : ColumnType - The type of the column. + table : Table + A table containing only the given column(s). Raises ------ ColumnNameError - If the specified target column name does not exist. + If any of the given columns do not exist. """ - return self.schema.get_type_of_column(column_name) - - def sort_columns( - self, - query: Callable[[Column, Column], int] = lambda col1, col2: ( - col1.name > col2.name - ) - - (col1.name < col2.name), - ) -> Table: + invalid_columns = [] + column_indices = [] + for name in column_names: + if not self._schema.has_column(name): + invalid_columns.append(name) + else: + column_indices.append(self._schema._get_column_index_by_name(name)) + if len(invalid_columns) != 0: + raise UnknownColumnNameError(invalid_columns) + transformed_data = self._data[column_indices] + transformed_data.columns = list( + name for name in self._schema.get_column_names() if name in column_names + ) + return Table(transformed_data) + + def rename_column(self, old_name: str, new_name: str) -> Table: """ - Sort a table with the given lambda function. - If no function is given the columns will be sorted alphabetically. - This function uses the default python sort algorithm. - The query returns - 0, if both columns are equal. - < 0, if the first column should be ordered after the second column. - > 0, if the first column should be ordered before the second column. + Rename a single column. Parameters ---------- - query : a lambda function - The lambda function used to sort the columns. + old_name : str + The old name of the target column + new_name : str + The new name of the target column Returns ------- - new_table : Table - A new table with sorted columns. - """ - columns = self.to_columns() - columns.sort(key=functools.cmp_to_key(query)) - return Table.from_columns(columns) - - def drop_rows_with_outliers(self) -> Table: - """ - Remove all rows from the table that contain at least one outlier defined as having a value that has a distance - of more than 3 standard deviations from the column average. + table : Table + The Table with the renamed column. - Returns - ------- - new_table : Table - A new table without rows containing outliers. + Raises + ------ + ColumnNameError + If the specified old target column name does not exist. + DuplicateColumnNameError + If the specified new target column name already exists. """ - result = self._data.copy(deep=True) - - table_without_nonnumericals = Table.from_columns( - self.list_columns_with_numerical_values() - ) - - result = result[ - (np.absolute(stats.zscore(table_without_nonnumericals._data)) < 3).all( - axis=1 - ) - ] + if old_name not in self._schema.get_column_names(): + raise UnknownColumnNameError([old_name]) + if old_name == new_name: + return self + if new_name in self._schema.get_column_names(): + raise DuplicateColumnNameError(new_name) - return Table(result, self.schema) + new_df = self._data.copy() + new_df.columns = self._schema.get_column_names() + return Table(new_df.rename(columns={old_name: new_name})) - def __eq__(self, other: typing.Any) -> bool: - if not isinstance(other, Table): - return NotImplemented - if self is other: - return True - table1 = self.sort_columns() - table2 = other.sort_columns() - return table1._data.equals(table2._data) and table1.schema == table2.schema + def replace_column(self, old_column_name: str, new_column: Column) -> Table: + """ + Return a copy of the table with the specified old column replaced by a new column. Keeps the order of columns. - def __hash__(self) -> int: - return hash(self._data) + Parameters + ---------- + old_column_name : str + The name of the column to be replaced. - def transform_column( - self, name: str, transformer: Callable[[Row], typing.Any] - ) -> Table: - """ - Transform provided column by calling provided transformer. + new_column : Column + The new column replacing the old column. Returns ------- result : Table - The table with the transformed column. + A table with the old column replaced by the new column. Raises ------ UnknownColumnNameError - If the column does not exist. - - """ - if self.has_column(name): - items: list = [transformer(item) for item in self.to_rows()] - result: Column = Column(pd.Series(items), name) - return self.replace_column(name, result) - raise UnknownColumnNameError([name]) + If the old column does not exist. - def summary(self) -> Table: - """ - Return a table with a number of statistical key values. + DuplicateColumnNameError + If the new column already exists and the existing column is not affected by the replacement. - Returns - ------- - result : Table - The table with statistics. + ColumnSizeError + If the size of the column does not match the amount of rows. """ + if old_column_name not in self._schema.get_column_names(): + raise UnknownColumnNameError([old_column_name]) - columns = self.to_columns() - result = pd.DataFrame() - statistics = {} - - for column in columns: - statistics = { - "maximum": column.maximum, - "minimum": column.minimum, - "mean": column.mean, - "mode": column.mode, - "median": column.median, - "sum": column.sum, - "variance": column.variance, - "standard deviation": column.standard_deviation, - "idness": column.idness, - "stability": column.stability, - "row count": column.count, - } - values = [] - - for function in statistics.values(): - try: - values.append(str(function())) - except NonNumericColumnError: - values.append("-") + if ( + new_column.name in self._schema.get_column_names() + and new_column.name != old_column_name + ): + raise DuplicateColumnNameError(new_column.name) - result = pd.concat([result, pd.DataFrame(values)], axis=1) + if self.count_rows() != new_column._data.size: + raise ColumnSizeError(str(self.count_rows()), str(new_column._data.size)) - result = pd.concat([pd.DataFrame(list(statistics.keys())), result], axis=1) - result.columns = ["metrics"] + self.get_column_names() + if old_column_name != new_column.name: + renamed_table = self.rename_column(old_column_name, new_column.name) + result = renamed_table._data + result.columns = renamed_table._schema.get_column_names() + else: + result = self._data.copy() + result.columns = self._schema.get_column_names() + result[new_column.name] = new_column._data return Table(result) - def __repr__(self) -> str: - tmp = self._data.copy(deep=True) - tmp.columns = self.get_column_names() - return tmp.__repr__() - - def __str__(self) -> str: - tmp = self._data.copy(deep=True) - tmp.columns = self.get_column_names() - return tmp.__str__() - - def _ipython_display_(self) -> DisplayHandle: + def shuffle(self) -> Table: """ - Return a display object for the column to be used in Jupyter Notebooks. + Shuffle the table randomly. Returns ------- - output : DisplayHandle - Output object. - """ - tmp = self._data.copy(deep=True) - tmp.columns = self.get_column_names() + result : Table + The shuffled Table. - with pd.option_context( - "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1] - ): - return display(tmp) + """ + new_df = self._data.sample(frac=1.0) + new_df.columns = self._schema.get_column_names() + return Table(new_df) def slice( self, @@ -933,9 +812,39 @@ def slice( raise ValueError("the given index is out of bounds") new_df = self._data.iloc[start:end:step] - new_df.columns = self.schema.get_column_names() + new_df.columns = self._schema.get_column_names() return Table(new_df) + def sort_columns( + self, + query: Callable[[Column, Column], int] = lambda col1, col2: ( + col1.name > col2.name + ) + - (col1.name < col2.name), + ) -> Table: + """ + Sort a table with the given lambda function. + If no function is given the columns will be sorted alphabetically. + This function uses the default python sort algorithm. + The query returns + 0, if both columns are equal. + < 0, if the first column should be ordered after the second column. + > 0, if the first column should be ordered before the second column. + + Parameters + ---------- + query : a lambda function + The lambda function used to sort the columns. + + Returns + ------- + new_table : Table + A new table with sorted columns. + """ + columns = self.to_columns() + columns.sort(key=functools.cmp_to_key(query)) + return Table.from_columns(columns) + def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]: """ Split the table into two new tables. @@ -959,19 +868,32 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]: self.slice(round(percentage_in_first * self.count_rows())), ) - def shuffle(self) -> Table: + def transform_column( + self, name: str, transformer: Callable[[Row], typing.Any] + ) -> Table: """ - Shuffle the table randomly. + Transform provided column by calling provided transformer. Returns ------- result : Table - The shuffled Table. + The table with the transformed column. + + Raises + ------ + UnknownColumnNameError + If the column does not exist. """ - new_df = self._data.sample(frac=1.0) - new_df.columns = self.schema.get_column_names() - return Table(new_df) + if self.has_column(name): + items: list = [transformer(item) for item in self.to_rows()] + result: Column = Column(pd.Series(items), name) + return self.replace_column(name, result) + raise UnknownColumnNameError([name]) + + # ------------------------------------------------------------------------------------------------------------------ + # Plotting + # ------------------------------------------------------------------------------------------------------------------ def correlation_heatmap(self) -> None: """ @@ -1015,8 +937,8 @@ def lineplot(self, x_column_name: str, y_column_name: str) -> None: ax = sns.lineplot( data=self._data, - x=self.schema._get_column_index_by_name(x_column_name), - y=self.schema._get_column_index_by_name(y_column_name), + x=self._schema._get_column_index_by_name(x_column_name), + y=self._schema._get_column_index_by_name(y_column_name), ) ax.set(xlabel=x_column_name, ylabel=y_column_name) ax.set_xticks(ax.get_xticks()) @@ -1049,8 +971,8 @@ def scatterplot(self, x_column_name: str, y_column_name: str) -> None: ax = sns.scatterplot( data=self._data, - x=self.schema._get_column_index_by_name(x_column_name), - y=self.schema._get_column_index_by_name(y_column_name), + x=self._schema._get_column_index_by_name(x_column_name), + y=self._schema._get_column_index_by_name(y_column_name), ) ax.set(xlabel=x_column_name, ylabel=y_column_name) ax.set_xticks(ax.get_xticks()) @@ -1059,3 +981,130 @@ def scatterplot(self, x_column_name: str, y_column_name: str) -> None: ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels plt.tight_layout() plt.show() + + # ------------------------------------------------------------------------------------------------------------------ + # Conversion + # ------------------------------------------------------------------------------------------------------------------ + + def to_csv(self, path_to_file: str) -> None: + """ + Write the data from the table into a CSV file. + If the file and/or the directories do not exist they will be created. + If the file already exists it will be overwritten. + + Parameters + ---------- + path_to_file : str + The path to the output file. + """ + Path(os.path.dirname(path_to_file)).mkdir(parents=True, exist_ok=True) + data_to_csv = self._data.copy() + data_to_csv.columns = self._schema.get_column_names() + data_to_csv.to_csv(path_to_file, index=False) + + def to_json(self, path_to_file: str) -> None: + """ + Write the data from the table into a JSON file. + If the file and/or the directories do not exist, they will be created. + If the file already exists it will be overwritten. + + Parameters + ---------- + path_to_file : str + The path to the output file. + """ + Path(os.path.dirname(path_to_file)).mkdir(parents=True, exist_ok=True) + data_to_json = self._data.copy() + data_to_json.columns = self._schema.get_column_names() + data_to_json.to_json(path_to_file) + + def to_columns(self) -> list[Column]: + """ + Return a list of the columns. + + Returns + ------- + columns : list[Columns] + List of columns. + """ + return [self.get_column(name) for name in self._schema.get_column_names()] + + def to_rows(self) -> list[Row]: + """ + Return a list of the rows. + + Returns + ------- + rows : list[Row] + List of rows. + """ + return [ + Row(series_row, self._schema) for (_, series_row) in self._data.iterrows() + ] + + # ------------------------------------------------------------------------------------------------------------------ + # Other + # ------------------------------------------------------------------------------------------------------------------ + + def _ipython_display_(self) -> DisplayHandle: + """ + Return a display object for the column to be used in Jupyter Notebooks. + + Returns + ------- + output : DisplayHandle + Output object. + """ + tmp = self._data.copy(deep=True) + tmp.columns = self.get_column_names() + + with pd.option_context( + "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1] + ): + return display(tmp) + + def list_columns_with_missing_values(self) -> list[Column]: + """ + Return a list of all the columns that have at least one missing value. Returns an empty list if there are none. + + Returns + ------- + columns_with_missing_values: list[Column] + The list of columns with missing values. + """ + columns = self.to_columns() + columns_with_missing_values = [] + for column in columns: + if column.has_missing_values(): + columns_with_missing_values.append(column) + return columns_with_missing_values + + def list_columns_with_non_numerical_values(self) -> list[Column]: + """ + Return a list of columns only containing non-numerical values. + + Returns + ------- + cols : list[Column] + The list with only non-numerical columns. + """ + cols = [] + for column_name, data_type in self._schema._schema.items(): + if not data_type.is_numeric(): + cols.append(self.get_column(column_name)) + return cols + + def list_columns_with_numerical_values(self) -> list[Column]: + """ + Return a list of columns only containing numerical values. + + Returns + ------- + cols : list[Column] + The list with only numerical columns. + """ + cols = [] + for column_name, data_type in self._schema._schema.items(): + if data_type.is_numeric(): + cols.append(self.get_column(column_name)) + return cols diff --git a/src/safeds/data/tabular/transformation/_imputer.py b/src/safeds/data/tabular/transformation/_imputer.py index ef4015b02..45862778f 100644 --- a/src/safeds/data/tabular/transformation/_imputer.py +++ b/src/safeds/data/tabular/transformation/_imputer.py @@ -120,9 +120,7 @@ def transform(self, table: Table) -> Table: data[indices] = pd.DataFrame( self._imp.transform(data[indices]), columns=indices ) - table_imputed = Table(data) - table_imputed.schema = table.schema - return table_imputed + return Table(data, table.schema) def fit_transform( self, table: Table, column_names: Optional[list[str]] = None diff --git a/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py b/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py new file mode 100644 index 000000000..f55d3533e --- /dev/null +++ b/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py @@ -0,0 +1,30 @@ +import numpy as np +import pandas as pd +from safeds.data.tabular.containers import Table +from safeds.data.tabular.typing import ColumnType, TableSchema + + +def test_drop_columns_with_non_numerical_values_valid() -> None: + table = Table( + pd.DataFrame( + data={ + "col1": ["A", "B", "C", "A"], + "col2": ["Test1", "Test1", "Test3", "Test1"], + "col3": [1, 2, 3, 4], + "col4": [2, 3, 1, 4], + } + ) + ) + columns = table.drop_columns_with_non_numerical_values() + assert columns[0] == table.get_column("col3") + assert columns[1] == table.get_column("col4") + assert len(columns) == 2 + + +def test_drop_columns_with_non_numerical_values_invalid() -> None: + table = Table( + [], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}) + ) + columns = table.drop_columns_with_non_numerical_values() + assert columns[0] == table.get_column("col1") + assert len(columns) == 1 From 553cf25a80aea9447aea1bddb4129529b2021ccc Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Mon, 27 Mar 2023 16:40:04 +0200 Subject: [PATCH 2/3] feat: function to drop columns with non-numerical values from `Table` --- src/safeds/data/tabular/containers/_table.py | 12 ++++++++++++ .../test_drop_columns_with_non_numerical_values.py | 13 +++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index de05e8fa3..1e1137a5a 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -574,6 +574,18 @@ def drop_columns(self, column_names: list[str]) -> Table: ) return Table(transformed_data) + def drop_columns_with_non_numerical_values(self) -> Table: + """ + Return a table without the columns that contain non-numerical values. + + Returns + ------- + table : Table + A table without the columns that contain non-numerical values. + + """ + return Table.from_columns(self.list_columns_with_numerical_values()) + def drop_duplicate_rows(self) -> Table: """ Return a copy of the table with every duplicate row removed. diff --git a/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py b/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py index f55d3533e..d847a49d4 100644 --- a/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py +++ b/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py @@ -15,16 +15,13 @@ def test_drop_columns_with_non_numerical_values_valid() -> None: } ) ) - columns = table.drop_columns_with_non_numerical_values() - assert columns[0] == table.get_column("col3") - assert columns[1] == table.get_column("col4") - assert len(columns) == 2 + updated_table = table.drop_columns_with_non_numerical_values() + assert updated_table.get_column_names() == ["col3", "col4"] -def test_drop_columns_with_non_numerical_values_invalid() -> None: +def test_drop_columns_with_non_numerical_values_empty() -> None: table = Table( [], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}) ) - columns = table.drop_columns_with_non_numerical_values() - assert columns[0] == table.get_column("col1") - assert len(columns) == 1 + updated_table = table.drop_columns_with_non_numerical_values() + assert updated_table.get_column_names() == ["col1"] From f44376a3a3057c3b910c8393499db497b32ba631 Mon Sep 17 00:00:00 2001 From: lars-reimann Date: Mon, 27 Mar 2023 14:43:34 +0000 Subject: [PATCH 3/3] style: apply automated linter fixes --- src/safeds/data/tabular/containers/_table.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 1e1137a5a..f34ee6c02 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -12,8 +12,6 @@ import seaborn as sns from IPython.core.display_functions import DisplayHandle, display from pandas import DataFrame, Series -from scipy import stats - from safeds.data.tabular.containers._column import Column from safeds.data.tabular.containers._row import Row from safeds.data.tabular.typing import ColumnType, TableSchema @@ -28,6 +26,7 @@ SchemaMismatchError, UnknownColumnNameError, ) +from scipy import stats # noinspection PyProtectedMember @@ -273,7 +272,7 @@ def get_column(self, column_name: str) -> Column: if self._schema.has_column(column_name): output_column = Column( self._data.iloc[ - :, [self._schema._get_column_index_by_name(column_name)] + :, [self._schema._get_column_index_by_name(column_name)] ].squeeze(), column_name, self._schema.get_type_of_column(column_name), @@ -830,9 +829,9 @@ def slice( def sort_columns( self, query: Callable[[Column, Column], int] = lambda col1, col2: ( - col1.name > col2.name - ) - - (col1.name < col2.name), + col1.name > col2.name + ) + - (col1.name < col2.name), ) -> Table: """ Sort a table with the given lambda function.