diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 1eed44683..f34ee6c02 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -47,44 +47,39 @@ class Table: If the table is empty and no schema is specified. """ - def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None): - self._data: pd.Dataframe = ( - data if isinstance(data, pd.DataFrame) else pd.DataFrame(data) - ) - if schema is None: - if self.count_columns() == 0: - raise MissingSchemaError() - self.schema: TableSchema = TableSchema._from_dataframe(self._data) - else: - self.schema = schema - if self._data.empty: - self._data = pd.DataFrame(columns=self.schema.get_column_names()) - - self._data = self._data.reset_index(drop=True) - self._data.columns = list(range(self.count_columns())) + # ------------------------------------------------------------------------------------------------------------------ + # Creation + # ------------------------------------------------------------------------------------------------------------------ - def get_row(self, index: int) -> Row: + @staticmethod + def from_csv(path: str) -> Table: """ - Return the row at a specified index. + Read data from a CSV file into a table. Parameters ---------- - index : int - The index. + path : str + The path to the CSV file. Returns ------- - row : Row - The row of the table at the index. + table : Table + The table created from the CSV file. Raises ------ - IndexOutOfBoundsError - If no row at the specified index exists in this table. + FileNotFoundError + If the specified file does not exist. + ValueError + If the file could not be read. """ - if len(self._data.index) - 1 < index or index < 0: - raise IndexOutOfBoundsError(index) - return Row(self._data.iloc[[index]].squeeze(), self.schema) + + try: + return Table(pd.read_csv(path)) + except FileNotFoundError as exception: + raise FileNotFoundError(f'File "{path}" does not exist') from exception + except Exception as exception: + raise ValueError(f'Could not read file from "{path}" as CSV') from exception @staticmethod def from_json(path: str) -> Table: @@ -119,34 +114,42 @@ def from_json(path: str) -> Table: ) from exception @staticmethod - def from_csv(path: str) -> Table: + def from_columns(columns: list[Column]) -> Table: """ - Read data from a CSV file into a table. + Return a table created from a list of columns. Parameters ---------- - path : str - The path to the CSV file. + columns : list[Column] + The columns to be combined. They need to have the same size. Returns ------- table : Table - The table created from the CSV file. + The generated table. Raises ------ - FileNotFoundError - If the specified file does not exist. - ValueError - If the file could not be read. + MissingDataError + If an empty list is given. + ColumnLengthMismatchError + If any of the column sizes does not match with the others. """ + if len(columns) == 0: + raise MissingDataError("This function requires at least one column.") - try: - return Table(pd.read_csv(path)) - except FileNotFoundError as exception: - raise FileNotFoundError(f'File "{path}" does not exist') from exception - except Exception as exception: - raise ValueError(f'Could not read file from "{path}" as CSV') from exception + dataframe: DataFrame = pd.DataFrame() + + for column in columns: + if column._data.size != columns[0]._data.size: + raise ColumnLengthMismatchError( + "\n".join( + [f"{column.name}: {column._data.size}" for column in columns] + ) + ) + dataframe[column.name] = column._data + + return Table(dataframe) @staticmethod def from_rows(rows: list[Row]) -> Table: @@ -185,109 +188,67 @@ def from_rows(rows: list[Row]) -> Table: dataframe.columns = schema_compare.get_column_names() return Table(dataframe) - @staticmethod - def from_columns(columns: list[Column]) -> Table: - """ - Return a table created from a list of columns. + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ - Parameters - ---------- - columns : list[Column] - The columns to be combined. They need to have the same size. - - Returns - ------- - table : Table - The generated table. - - Raises - ------ - MissingDataError - If an empty list is given. - ColumnLengthMismatchError - If any of the column sizes does not match with the others. - """ - if len(columns) == 0: - raise MissingDataError("This function requires at least one column.") - - dataframe: DataFrame = pd.DataFrame() + def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None): + self._data: pd.Dataframe = ( + data if isinstance(data, pd.DataFrame) else pd.DataFrame(data) + ) + if schema is None: + if self.count_columns() == 0: + raise MissingSchemaError() + self._schema: TableSchema = TableSchema._from_dataframe(self._data) + else: + self._schema = schema + if self._data.empty: + self._data = pd.DataFrame(columns=self._schema.get_column_names()) - for column in columns: - if column._data.size != columns[0]._data.size: - raise ColumnLengthMismatchError( - "\n".join( - [f"{column.name}: {column._data.size}" for column in columns] - ) - ) - dataframe[column.name] = column._data + self._data = self._data.reset_index(drop=True) + self._data.columns = list(range(self.count_columns())) - return Table(dataframe) + def __eq__(self, other: typing.Any) -> bool: + if not isinstance(other, Table): + return NotImplemented + if self is other: + return True + table1 = self.sort_columns() + table2 = other.sort_columns() + return table1._data.equals(table2._data) and table1._schema == table2._schema - def to_json(self, path_to_file: str) -> None: - """ - Write the data from the table into a JSON file. - If the file and/or the directories do not exist, they will be created. - If the file already exists it will be overwritten. + def __hash__(self) -> int: + return hash(self._data) - Parameters - ---------- - path_to_file : str - The path to the output file. - """ - Path(os.path.dirname(path_to_file)).mkdir(parents=True, exist_ok=True) - data_to_json = self._data.copy() - data_to_json.columns = self.schema.get_column_names() - data_to_json.to_json(path_to_file) + def __repr__(self) -> str: + tmp = self._data.copy(deep=True) + tmp.columns = self.get_column_names() + return tmp.__repr__() - def to_csv(self, path_to_file: str) -> None: - """ - Write the data from the table into a CSV file. - If the file and/or the directories do not exist they will be created. - If the file already exists it will be overwritten. + def __str__(self) -> str: + tmp = self._data.copy(deep=True) + tmp.columns = self.get_column_names() + return tmp.__str__() - Parameters - ---------- - path_to_file : str - The path to the output file. - """ - Path(os.path.dirname(path_to_file)).mkdir(parents=True, exist_ok=True) - data_to_csv = self._data.copy() - data_to_csv.columns = self.schema.get_column_names() - data_to_csv.to_csv(path_to_file, index=False) + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ - def rename_column(self, old_name: str, new_name: str) -> Table: + @property + def schema(self) -> TableSchema: """ - Rename a single column. - - Parameters - ---------- - old_name : str - The old name of the target column - new_name : str - The new name of the target column + Return the schema of the table. Returns ------- - table : Table - The Table with the renamed column. - - Raises - ------ - ColumnNameError - If the specified old target column name does not exist. - DuplicateColumnNameError - If the specified new target column name already exists. + schema : TableSchema + The schema. """ - if old_name not in self.schema.get_column_names(): - raise UnknownColumnNameError([old_name]) - if old_name == new_name: - return self - if new_name in self.schema.get_column_names(): - raise DuplicateColumnNameError(new_name) + return self._schema - new_df = self._data.copy() - new_df.columns = self.schema.get_column_names() - return Table(new_df.rename(columns={old_name: new_name})) + # ------------------------------------------------------------------------------------------------------------------ + # Getters + # ------------------------------------------------------------------------------------------------------------------ def get_column(self, column_name: str) -> Column: """ @@ -308,120 +269,95 @@ def get_column(self, column_name: str) -> Column: UnknownColumnNameError If the specified target column name does not exist. """ - if self.schema.has_column(column_name): + if self._schema.has_column(column_name): output_column = Column( self._data.iloc[ - :, [self.schema._get_column_index_by_name(column_name)] + :, [self._schema._get_column_index_by_name(column_name)] ].squeeze(), column_name, - self.schema.get_type_of_column(column_name), + self._schema.get_type_of_column(column_name), ) return output_column raise UnknownColumnNameError([column_name]) - def drop_columns(self, column_names: list[str]) -> Table: + def has_column(self, column_name: str) -> bool: """ - Return a table without the given column(s). + Return whether the table contains a given column. + Alias for self.schema.hasColumn(column_name: str) -> bool. Parameters ---------- - column_names : list[str] - A list containing all columns to be dropped. + column_name : str + The name of the column. Returns ------- - table : Table - A table without the given columns. + contains : bool + True if the column exists. + """ + return self._schema.has_column(column_name) - Raises - ------ - ColumnNameError - If any of the given columns do not exist. + def get_column_names(self) -> list[str]: """ - invalid_columns = [] - column_indices = [] - for name in column_names: - if not self.schema.has_column(name): - invalid_columns.append(name) - else: - column_indices.append(self.schema._get_column_index_by_name(name)) - if len(invalid_columns) != 0: - raise UnknownColumnNameError(invalid_columns) - transformed_data = self._data.drop(labels=column_indices, axis="columns") - transformed_data.columns = list( - name for name in self.schema.get_column_names() if name not in column_names - ) - return Table(transformed_data) + Return a list of all column names in this table. + Alias for self.schema.get_column_names() -> list[str]. - def keep_columns(self, column_names: list[str]) -> Table: + Returns + ------- + column_names : list[str] + The list of the column names. """ - Return a table with only the given column(s). + return self._schema.get_column_names() + + def get_type_of_column(self, column_name: str) -> ColumnType: + """ + Return the type of the given column. + Alias for self.schema.get_type_of_column(column_name: str) -> ColumnType. Parameters ---------- - column_names : list[str] - A list containing only the columns to be kept. + column_name : str + The name of the column to be queried. Returns ------- - table : Table - A table containing only the given column(s). + type : ColumnType + The type of the column. Raises ------ ColumnNameError - If any of the given columns do not exist. - """ - invalid_columns = [] - column_indices = [] - for name in column_names: - if not self.schema.has_column(name): - invalid_columns.append(name) - else: - column_indices.append(self.schema._get_column_index_by_name(name)) - if len(invalid_columns) != 0: - raise UnknownColumnNameError(invalid_columns) - transformed_data = self._data[column_indices] - transformed_data.columns = list( - name for name in self.schema.get_column_names() if name in column_names - ) - return Table(transformed_data) - - def to_rows(self) -> list[Row]: - """ - Return a list of the rows. - - Returns - ------- - rows : list[Row] - List of rows. + If the specified target column name does not exist. """ - return [ - Row(series_row, self.schema) for (_, series_row) in self._data.iterrows() - ] + return self._schema.get_type_of_column(column_name) - def filter_rows(self, query: Callable[[Row], bool]) -> Table: + def get_row(self, index: int) -> Row: """ - Return a table with rows filtered by Callable (e.g. lambda function). + Return the row at a specified index. Parameters ---------- - query : lambda function - A Callable that is applied to all rows. + index : int + The index. Returns ------- - table : Table - A table containing only the rows filtered by the query. + row : Row + The row of the table at the index. + + Raises + ------ + IndexOutOfBoundsError + If no row at the specified index exists in this table. """ + if len(self._data.index) - 1 < index or index < 0: + raise IndexOutOfBoundsError(index) + return Row(self._data.iloc[[index]].squeeze(), self._schema) - rows: list[Row] = [row for row in self.to_rows() if query(row)] - if len(rows) == 0: - result_table = Table([], self.schema) - else: - result_table = self.from_rows(rows) - return result_table + # ------------------------------------------------------------------------------------------------------------------ + # Information + # ------------------------------------------------------------------------------------------------------------------ def count_rows(self) -> int: """ @@ -445,82 +381,53 @@ def count_columns(self) -> int: """ return self._data.shape[1] - def to_columns(self) -> list[Column]: - """ - Return a list of the columns. - - Returns - ------- - columns : list[Columns] - List of columns. - """ - return [self.get_column(name) for name in self.schema.get_column_names()] - - def drop_duplicate_rows(self) -> Table: + def summary(self) -> Table: """ - Return a copy of the table with every duplicate row removed. + Return a table with a number of statistical key values. Returns ------- result : Table - The table with the duplicate rows removed. - - """ - df = self._data.drop_duplicates(ignore_index=True) - df.columns = self.schema.get_column_names() - return Table(df) - - def replace_column(self, old_column_name: str, new_column: Column) -> Table: + The table with statistics. """ - Return a copy of the table with the specified old column replaced by a new column. Keeps the order of columns. - - Parameters - ---------- - old_column_name : str - The name of the column to be replaced. - - new_column : Column - The new column replacing the old column. - - Returns - ------- - result : Table - A table with the old column replaced by the new column. - - Raises - ------ - UnknownColumnNameError - If the old column does not exist. - DuplicateColumnNameError - If the new column already exists and the existing column is not affected by the replacement. + columns = self.to_columns() + result = pd.DataFrame() + statistics = {} - ColumnSizeError - If the size of the column does not match the amount of rows. - """ - if old_column_name not in self.schema.get_column_names(): - raise UnknownColumnNameError([old_column_name]) + for column in columns: + statistics = { + "maximum": column.maximum, + "minimum": column.minimum, + "mean": column.mean, + "mode": column.mode, + "median": column.median, + "sum": column.sum, + "variance": column.variance, + "standard deviation": column.standard_deviation, + "idness": column.idness, + "stability": column.stability, + "row count": column.count, + } + values = [] - if ( - new_column.name in self.schema.get_column_names() - and new_column.name != old_column_name - ): - raise DuplicateColumnNameError(new_column.name) + for function in statistics.values(): + try: + values.append(str(function())) + except NonNumericColumnError: + values.append("-") - if self.count_rows() != new_column._data.size: - raise ColumnSizeError(str(self.count_rows()), str(new_column._data.size)) + result = pd.concat([result, pd.DataFrame(values)], axis=1) - if old_column_name != new_column.name: - renamed_table = self.rename_column(old_column_name, new_column.name) - result = renamed_table._data - result.columns = renamed_table.schema.get_column_names() - else: - result = self._data.copy() - result.columns = self.schema.get_column_names() + result = pd.concat([pd.DataFrame(list(statistics.keys())), result], axis=1) + result.columns = ["metrics"] + self.get_column_names() - result[new_column.name] = new_column._data return Table(result) + # ------------------------------------------------------------------------------------------------------------------ + # Transformations + # ------------------------------------------------------------------------------------------------------------------ + def add_column(self, column: Column) -> Table: """ Return the original table with the provided column attached at the end. @@ -539,14 +446,14 @@ def add_column(self, column: Column) -> Table: If the size of the column does not match the amount of rows. """ - if self.schema.has_column(column.name): + if self._schema.has_column(column.name): raise DuplicateColumnNameError(column.name) if column._data.size != self.count_rows(): raise ColumnSizeError(str(self.count_rows()), str(column._data.size)) result = self._data.copy() - result.columns = self.schema.get_column_names() + result.columns = self._schema.get_column_names() result[column.name] = column._data return Table(result) @@ -574,7 +481,7 @@ def add_columns(self, columns: Union[list[Column], Table]) -> Table: if isinstance(columns, Table): columns = columns.to_columns() result = self._data.copy() - result.columns = self.schema.get_column_names() + result.columns = self._schema.get_column_names() for column in columns: if column.name in result.columns: raise DuplicateColumnNameError(column.name) @@ -600,10 +507,10 @@ def add_row(self, row: Row) -> Table: A new table with the added row at the end. """ - if self.schema != row.schema: + if self._schema != row.schema: raise SchemaMismatchError() new_df = pd.concat([self._data, row._data.to_frame().T]).infer_objects() - new_df.columns = self.schema.get_column_names() + new_df.columns = self._schema.get_column_names() return Table(new_df) def add_rows(self, rows: Union[list[Row], Table]) -> Table: @@ -624,140 +531,73 @@ def add_rows(self, rows: Union[list[Row], Table]) -> Table: rows = rows.to_rows() result = self._data for row in rows: - if self.schema != row.schema: + if self._schema != row.schema: raise SchemaMismatchError() result = pd.concat( [result, *[row._data.to_frame().T for row in rows]] ).infer_objects() - result.columns = self.schema.get_column_names() + result.columns = self._schema.get_column_names() return Table(result) - def has_column(self, column_name: str) -> bool: + def drop_columns(self, column_names: list[str]) -> Table: """ - Return whether the table contains a given column. - Alias for self.schema.hasColumn(column_name: str) -> bool. + Return a table without the given column(s). Parameters ---------- - column_name : str - The name of the column. - - Returns - ------- - contains : bool - True if the column exists. - """ - return self.schema.has_column(column_name) - - def list_columns_with_missing_values(self) -> list[Column]: - """ - Return a list of all the columns that have at least one missing value. Returns an empty list if there are none. + column_names : list[str] + A list containing all columns to be dropped. Returns ------- - columns_with_missing_values: list[Column] - The list of columns with missing values. - """ - columns = self.to_columns() - columns_with_missing_values = [] - for column in columns: - if column.has_missing_values(): - columns_with_missing_values.append(column) - return columns_with_missing_values - - def list_columns_with_non_numerical_values(self) -> list[Column]: - """ - Return a list of columns only containing non-numerical values. + table : Table + A table without the given columns. - Returns - ------- - cols : list[Column] - The list with only non-numerical columns. + Raises + ------ + ColumnNameError + If any of the given columns do not exist. """ - cols = [] - for column_name, data_type in self.schema._schema.items(): - if not data_type.is_numeric(): - cols.append(self.get_column(column_name)) - return cols + invalid_columns = [] + column_indices = [] + for name in column_names: + if not self._schema.has_column(name): + invalid_columns.append(name) + else: + column_indices.append(self._schema._get_column_index_by_name(name)) + if len(invalid_columns) != 0: + raise UnknownColumnNameError(invalid_columns) + transformed_data = self._data.drop(labels=column_indices, axis="columns") + transformed_data.columns = list( + name for name in self._schema.get_column_names() if name not in column_names + ) + return Table(transformed_data) - def list_columns_with_numerical_values(self) -> list[Column]: + def drop_columns_with_non_numerical_values(self) -> Table: """ - Return a list of columns only containing numerical values. + Return a table without the columns that contain non-numerical values. Returns ------- - cols : list[Column] - The list with only numerical columns. - """ - cols = [] - for column_name, data_type in self.schema._schema.items(): - if data_type.is_numeric(): - cols.append(self.get_column(column_name)) - return cols - - def get_column_names(self) -> list[str]: - """ - Return a list of all column names in this table. - Alias for self.schema.get_column_names() -> list[str]. + table : Table + A table without the columns that contain non-numerical values. - Returns - ------- - column_names : list[str] - The list of the column names. """ - return self.schema.get_column_names() + return Table.from_columns(self.list_columns_with_numerical_values()) - def get_type_of_column(self, column_name: str) -> ColumnType: + def drop_duplicate_rows(self) -> Table: """ - Return the type of the given column. - Alias for self.schema.get_type_of_column(column_name: str) -> ColumnType. - - Parameters - ---------- - column_name : str - The name of the column to be queried. + Return a copy of the table with every duplicate row removed. Returns ------- - type : ColumnType - The type of the column. - - Raises - ------ - ColumnNameError - If the specified target column name does not exist. - """ - return self.schema.get_type_of_column(column_name) - - def sort_columns( - self, - query: Callable[[Column, Column], int] = lambda col1, col2: ( - col1.name > col2.name - ) - - (col1.name < col2.name), - ) -> Table: - """ - Sort a table with the given lambda function. - If no function is given the columns will be sorted alphabetically. - This function uses the default python sort algorithm. - The query returns - 0, if both columns are equal. - < 0, if the first column should be ordered after the second column. - > 0, if the first column should be ordered before the second column. - - Parameters - ---------- - query : a lambda function - The lambda function used to sort the columns. + result : Table + The table with the duplicate rows removed. - Returns - ------- - new_table : Table - A new table with sorted columns. """ - columns = self.to_columns() - columns.sort(key=functools.cmp_to_key(query)) - return Table.from_columns(columns) + df = self._data.drop_duplicates(ignore_index=True) + df.columns = self._schema.get_column_names() + return Table(df) def drop_rows_with_outliers(self) -> Table: """ @@ -781,112 +621,162 @@ def drop_rows_with_outliers(self) -> Table: ) ] - return Table(result, self.schema) + return Table(result, self._schema) - def __eq__(self, other: typing.Any) -> bool: - if not isinstance(other, Table): - return NotImplemented - if self is other: - return True - table1 = self.sort_columns() - table2 = other.sort_columns() - return table1._data.equals(table2._data) and table1.schema == table2.schema + def filter_rows(self, query: Callable[[Row], bool]) -> Table: + """ + Return a table with rows filtered by Callable (e.g. lambda function). - def __hash__(self) -> int: - return hash(self._data) + Parameters + ---------- + query : lambda function + A Callable that is applied to all rows. - def transform_column( - self, name: str, transformer: Callable[[Row], typing.Any] - ) -> Table: + Returns + ------- + table : Table + A table containing only the rows filtered by the query. """ - Transform provided column by calling provided transformer. + + rows: list[Row] = [row for row in self.to_rows() if query(row)] + if len(rows) == 0: + result_table = Table([], self._schema) + else: + result_table = self.from_rows(rows) + return result_table + + def keep_columns(self, column_names: list[str]) -> Table: + """ + Return a table with only the given column(s). + + Parameters + ---------- + column_names : list[str] + A list containing only the columns to be kept. Returns ------- - result : Table - The table with the transformed column. + table : Table + A table containing only the given column(s). Raises ------ - UnknownColumnNameError - If the column does not exist. + ColumnNameError + If any of the given columns do not exist. + """ + invalid_columns = [] + column_indices = [] + for name in column_names: + if not self._schema.has_column(name): + invalid_columns.append(name) + else: + column_indices.append(self._schema._get_column_index_by_name(name)) + if len(invalid_columns) != 0: + raise UnknownColumnNameError(invalid_columns) + transformed_data = self._data[column_indices] + transformed_data.columns = list( + name for name in self._schema.get_column_names() if name in column_names + ) + return Table(transformed_data) + def rename_column(self, old_name: str, new_name: str) -> Table: """ - if self.has_column(name): - items: list = [transformer(item) for item in self.to_rows()] - result: Column = Column(pd.Series(items), name) - return self.replace_column(name, result) - raise UnknownColumnNameError([name]) + Rename a single column. - def summary(self) -> Table: + Parameters + ---------- + old_name : str + The old name of the target column + new_name : str + The new name of the target column + + Returns + ------- + table : Table + The Table with the renamed column. + + Raises + ------ + ColumnNameError + If the specified old target column name does not exist. + DuplicateColumnNameError + If the specified new target column name already exists. + """ + if old_name not in self._schema.get_column_names(): + raise UnknownColumnNameError([old_name]) + if old_name == new_name: + return self + if new_name in self._schema.get_column_names(): + raise DuplicateColumnNameError(new_name) + + new_df = self._data.copy() + new_df.columns = self._schema.get_column_names() + return Table(new_df.rename(columns={old_name: new_name})) + + def replace_column(self, old_column_name: str, new_column: Column) -> Table: """ - Return a table with a number of statistical key values. + Return a copy of the table with the specified old column replaced by a new column. Keeps the order of columns. + + Parameters + ---------- + old_column_name : str + The name of the column to be replaced. + + new_column : Column + The new column replacing the old column. Returns ------- result : Table - The table with statistics. - """ - - columns = self.to_columns() - result = pd.DataFrame() - statistics = {} + A table with the old column replaced by the new column. - for column in columns: - statistics = { - "maximum": column.maximum, - "minimum": column.minimum, - "mean": column.mean, - "mode": column.mode, - "median": column.median, - "sum": column.sum, - "variance": column.variance, - "standard deviation": column.standard_deviation, - "idness": column.idness, - "stability": column.stability, - "row count": column.count, - } - values = [] + Raises + ------ + UnknownColumnNameError + If the old column does not exist. - for function in statistics.values(): - try: - values.append(str(function())) - except NonNumericColumnError: - values.append("-") + DuplicateColumnNameError + If the new column already exists and the existing column is not affected by the replacement. - result = pd.concat([result, pd.DataFrame(values)], axis=1) + ColumnSizeError + If the size of the column does not match the amount of rows. + """ + if old_column_name not in self._schema.get_column_names(): + raise UnknownColumnNameError([old_column_name]) - result = pd.concat([pd.DataFrame(list(statistics.keys())), result], axis=1) - result.columns = ["metrics"] + self.get_column_names() + if ( + new_column.name in self._schema.get_column_names() + and new_column.name != old_column_name + ): + raise DuplicateColumnNameError(new_column.name) - return Table(result) + if self.count_rows() != new_column._data.size: + raise ColumnSizeError(str(self.count_rows()), str(new_column._data.size)) - def __repr__(self) -> str: - tmp = self._data.copy(deep=True) - tmp.columns = self.get_column_names() - return tmp.__repr__() + if old_column_name != new_column.name: + renamed_table = self.rename_column(old_column_name, new_column.name) + result = renamed_table._data + result.columns = renamed_table._schema.get_column_names() + else: + result = self._data.copy() + result.columns = self._schema.get_column_names() - def __str__(self) -> str: - tmp = self._data.copy(deep=True) - tmp.columns = self.get_column_names() - return tmp.__str__() + result[new_column.name] = new_column._data + return Table(result) - def _ipython_display_(self) -> DisplayHandle: + def shuffle(self) -> Table: """ - Return a display object for the column to be used in Jupyter Notebooks. + Shuffle the table randomly. Returns ------- - output : DisplayHandle - Output object. - """ - tmp = self._data.copy(deep=True) - tmp.columns = self.get_column_names() + result : Table + The shuffled Table. - with pd.option_context( - "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1] - ): - return display(tmp) + """ + new_df = self._data.sample(frac=1.0) + new_df.columns = self._schema.get_column_names() + return Table(new_df) def slice( self, @@ -933,9 +823,39 @@ def slice( raise ValueError("the given index is out of bounds") new_df = self._data.iloc[start:end:step] - new_df.columns = self.schema.get_column_names() + new_df.columns = self._schema.get_column_names() return Table(new_df) + def sort_columns( + self, + query: Callable[[Column, Column], int] = lambda col1, col2: ( + col1.name > col2.name + ) + - (col1.name < col2.name), + ) -> Table: + """ + Sort a table with the given lambda function. + If no function is given the columns will be sorted alphabetically. + This function uses the default python sort algorithm. + The query returns + 0, if both columns are equal. + < 0, if the first column should be ordered after the second column. + > 0, if the first column should be ordered before the second column. + + Parameters + ---------- + query : a lambda function + The lambda function used to sort the columns. + + Returns + ------- + new_table : Table + A new table with sorted columns. + """ + columns = self.to_columns() + columns.sort(key=functools.cmp_to_key(query)) + return Table.from_columns(columns) + def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]: """ Split the table into two new tables. @@ -959,19 +879,32 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]: self.slice(round(percentage_in_first * self.count_rows())), ) - def shuffle(self) -> Table: + def transform_column( + self, name: str, transformer: Callable[[Row], typing.Any] + ) -> Table: """ - Shuffle the table randomly. + Transform provided column by calling provided transformer. Returns ------- result : Table - The shuffled Table. + The table with the transformed column. + + Raises + ------ + UnknownColumnNameError + If the column does not exist. """ - new_df = self._data.sample(frac=1.0) - new_df.columns = self.schema.get_column_names() - return Table(new_df) + if self.has_column(name): + items: list = [transformer(item) for item in self.to_rows()] + result: Column = Column(pd.Series(items), name) + return self.replace_column(name, result) + raise UnknownColumnNameError([name]) + + # ------------------------------------------------------------------------------------------------------------------ + # Plotting + # ------------------------------------------------------------------------------------------------------------------ def correlation_heatmap(self) -> None: """ @@ -1015,8 +948,8 @@ def lineplot(self, x_column_name: str, y_column_name: str) -> None: ax = sns.lineplot( data=self._data, - x=self.schema._get_column_index_by_name(x_column_name), - y=self.schema._get_column_index_by_name(y_column_name), + x=self._schema._get_column_index_by_name(x_column_name), + y=self._schema._get_column_index_by_name(y_column_name), ) ax.set(xlabel=x_column_name, ylabel=y_column_name) ax.set_xticks(ax.get_xticks()) @@ -1049,8 +982,8 @@ def scatterplot(self, x_column_name: str, y_column_name: str) -> None: ax = sns.scatterplot( data=self._data, - x=self.schema._get_column_index_by_name(x_column_name), - y=self.schema._get_column_index_by_name(y_column_name), + x=self._schema._get_column_index_by_name(x_column_name), + y=self._schema._get_column_index_by_name(y_column_name), ) ax.set(xlabel=x_column_name, ylabel=y_column_name) ax.set_xticks(ax.get_xticks()) @@ -1059,3 +992,130 @@ def scatterplot(self, x_column_name: str, y_column_name: str) -> None: ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels plt.tight_layout() plt.show() + + # ------------------------------------------------------------------------------------------------------------------ + # Conversion + # ------------------------------------------------------------------------------------------------------------------ + + def to_csv(self, path_to_file: str) -> None: + """ + Write the data from the table into a CSV file. + If the file and/or the directories do not exist they will be created. + If the file already exists it will be overwritten. + + Parameters + ---------- + path_to_file : str + The path to the output file. + """ + Path(os.path.dirname(path_to_file)).mkdir(parents=True, exist_ok=True) + data_to_csv = self._data.copy() + data_to_csv.columns = self._schema.get_column_names() + data_to_csv.to_csv(path_to_file, index=False) + + def to_json(self, path_to_file: str) -> None: + """ + Write the data from the table into a JSON file. + If the file and/or the directories do not exist, they will be created. + If the file already exists it will be overwritten. + + Parameters + ---------- + path_to_file : str + The path to the output file. + """ + Path(os.path.dirname(path_to_file)).mkdir(parents=True, exist_ok=True) + data_to_json = self._data.copy() + data_to_json.columns = self._schema.get_column_names() + data_to_json.to_json(path_to_file) + + def to_columns(self) -> list[Column]: + """ + Return a list of the columns. + + Returns + ------- + columns : list[Columns] + List of columns. + """ + return [self.get_column(name) for name in self._schema.get_column_names()] + + def to_rows(self) -> list[Row]: + """ + Return a list of the rows. + + Returns + ------- + rows : list[Row] + List of rows. + """ + return [ + Row(series_row, self._schema) for (_, series_row) in self._data.iterrows() + ] + + # ------------------------------------------------------------------------------------------------------------------ + # Other + # ------------------------------------------------------------------------------------------------------------------ + + def _ipython_display_(self) -> DisplayHandle: + """ + Return a display object for the column to be used in Jupyter Notebooks. + + Returns + ------- + output : DisplayHandle + Output object. + """ + tmp = self._data.copy(deep=True) + tmp.columns = self.get_column_names() + + with pd.option_context( + "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1] + ): + return display(tmp) + + def list_columns_with_missing_values(self) -> list[Column]: + """ + Return a list of all the columns that have at least one missing value. Returns an empty list if there are none. + + Returns + ------- + columns_with_missing_values: list[Column] + The list of columns with missing values. + """ + columns = self.to_columns() + columns_with_missing_values = [] + for column in columns: + if column.has_missing_values(): + columns_with_missing_values.append(column) + return columns_with_missing_values + + def list_columns_with_non_numerical_values(self) -> list[Column]: + """ + Return a list of columns only containing non-numerical values. + + Returns + ------- + cols : list[Column] + The list with only non-numerical columns. + """ + cols = [] + for column_name, data_type in self._schema._schema.items(): + if not data_type.is_numeric(): + cols.append(self.get_column(column_name)) + return cols + + def list_columns_with_numerical_values(self) -> list[Column]: + """ + Return a list of columns only containing numerical values. + + Returns + ------- + cols : list[Column] + The list with only numerical columns. + """ + cols = [] + for column_name, data_type in self._schema._schema.items(): + if data_type.is_numeric(): + cols.append(self.get_column(column_name)) + return cols diff --git a/src/safeds/data/tabular/transformation/_imputer.py b/src/safeds/data/tabular/transformation/_imputer.py index ef4015b02..45862778f 100644 --- a/src/safeds/data/tabular/transformation/_imputer.py +++ b/src/safeds/data/tabular/transformation/_imputer.py @@ -120,9 +120,7 @@ def transform(self, table: Table) -> Table: data[indices] = pd.DataFrame( self._imp.transform(data[indices]), columns=indices ) - table_imputed = Table(data) - table_imputed.schema = table.schema - return table_imputed + return Table(data, table.schema) def fit_transform( self, table: Table, column_names: Optional[list[str]] = None diff --git a/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py b/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py new file mode 100644 index 000000000..d847a49d4 --- /dev/null +++ b/tests/safeds/data/tabular/containers/_table/test_drop_columns_with_non_numerical_values.py @@ -0,0 +1,27 @@ +import numpy as np +import pandas as pd +from safeds.data.tabular.containers import Table +from safeds.data.tabular.typing import ColumnType, TableSchema + + +def test_drop_columns_with_non_numerical_values_valid() -> None: + table = Table( + pd.DataFrame( + data={ + "col1": ["A", "B", "C", "A"], + "col2": ["Test1", "Test1", "Test3", "Test1"], + "col3": [1, 2, 3, 4], + "col4": [2, 3, 1, 4], + } + ) + ) + updated_table = table.drop_columns_with_non_numerical_values() + assert updated_table.get_column_names() == ["col3", "col4"] + + +def test_drop_columns_with_non_numerical_values_empty() -> None: + table = Table( + [], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}) + ) + updated_table = table.drop_columns_with_non_numerical_values() + assert updated_table.get_column_names() == ["col1"]