21 changes: 7 additions & 14 deletions docs/tutorials/classification.ipynb
@@ -29,7 +29,7 @@
"\n",
"titanic = Table.from_csv_file(\"data/titanic.csv\")\n",
"#For visualisation purposes we only print out the first 15 rows.\n",
"titanic.slice_rows(0,15)"
"titanic.slice_rows(0, 15)"
],
"metadata": {
"collapsed": false
@@ -77,7 +77,6 @@
"source": [
"from safeds.data.tabular.transformation import OneHotEncoder\n",
"\n",
"old_column_names = train_table.column_names\n",
"encoder = OneHotEncoder().fit(train_table, [\"sex\"])"
],
"metadata": {
@@ -97,18 +96,14 @@
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"transformed_table = encoder.transform(train_table)\n",
"new_column_names = transformed_table.column_names\n",
"new_columns= set(new_column_names) - set(old_column_names)"
],
"source": "transformed_table = encoder.transform(train_table)",
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": "5. Mark the `survived` `Column` as the target variable to be predicted. Use the new names of the fitted `Column`s as features, which will be used to make predictions based on the target variable.",
"source": "5. Mark the `survived` `Column` as the target variable to be predicted. Include some columns only as extra columns, which are completely ignored by the model:",
"metadata": {
"collapsed": false
}
@@ -118,9 +113,9 @@
"execution_count": null,
"outputs": [],
"source": [
"train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", feature_names=[\n",
" *new_columns\n",
"])"
"extra_names = [\"id\", \"name\", \"ticket\", \"cabin\", \"port_embarked\", \"age\", \"fare\"]\n",
"\n",
"train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names)"
],
"metadata": {
"collapsed": false
@@ -192,9 +187,7 @@
"encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n",
"testing_table = encoder.transform(testing_table)\n",
"\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", feature_names=[\n",
" *new_columns\n",
"])\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names)\n",
"fitted_model.accuracy(test_tabular_dataset)\n"
],
"metadata": {
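For context, the updated classification flow in one place — a minimal sketch assuming only the Safe-DS API visible in this diff (`Table.from_csv_file`, `OneHotEncoder`, `to_tabular_dataset`). The train/test split and model-fitting cells are elided above, so the classifier is left abstract here:

```python
from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation import OneHotEncoder

# Load the data and one-hot encode the categorical "sex" column.
titanic = Table.from_csv_file("data/titanic.csv")
encoder = OneHotEncoder().fit(titanic, ["sex"])
transformed_table = encoder.transform(titanic)

# Columns listed in extra_names are carried along but ignored by the model;
# everything else except the target automatically becomes a feature.
extra_names = ["id", "name", "ticket", "cabin", "port_embarked", "age", "fare"]
train_tabular_dataset = transformed_table.to_tabular_dataset("survived", extra_names)

# A classifier fitted in the elided cells would then be evaluated with:
# fitted_model.accuracy(test_tabular_dataset)
```

Note there is no longer any bookkeeping of old versus new column names around the encoder: the one-hot-encoded columns become features simply because they are neither the target nor listed in `extra_names`.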
11 changes: 4 additions & 7 deletions docs/tutorials/regression.ipynb
@@ -60,7 +60,7 @@
},
{
"cell_type": "markdown",
"source": "3. Mark the `price` `Column` as the target variable to be predicted. Use the new names of the fitted `Column`s as features, which will be used to make predictions based on the target variable.\n",
"source": "3. Mark the `price` `Column` as the target variable to be predicted. Include the `id` column only as an extra column, which is completely ignored by the model:",
"metadata": {
"collapsed": false
}
@@ -70,10 +70,9 @@
"execution_count": null,
"outputs": [],
"source": [
"feature_columns = set(train_table.column_names) - set([\"price\", \"id\"])\n",
"extra_names = [\"id\"]\n",
"\n",
"train_tabular_dataset = train_table.to_tabular_dataset(\"price\", feature_names=[\n",
" *feature_columns])\n"
"train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names)\n"
],
"metadata": {
"collapsed": false
@@ -147,9 +146,7 @@
}
],
"source": [
"test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", feature_names=[\n",
" *feature_columns\n",
"])\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names)\n",
"\n",
"fitted_model.mean_absolute_error(test_tabular_dataset)\n"
],
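The regression notebook changes the same way: instead of computing feature names by set subtraction, callers now name the columns to exclude. A sketch contrasting the two styles, with hypothetical house-price columns standing in for the real dataset:

```python
from safeds.data.tabular.containers import Table

train_table = Table({
    "id": [1, 2, 3],                       # identifier, not a feature
    "sqft": [1200.0, 800.0, 1500.0],       # hypothetical feature column
    "price": [250_000, 180_000, 320_000],  # target
})

# Old style (removed above): derive features by set subtraction. Sets are
# unordered, so the resulting feature order was not deterministic.
feature_columns = set(train_table.column_names) - {"price", "id"}

# New style: list the ignored columns; everything else becomes a feature.
train_tabular_dataset = train_table.to_tabular_dataset("price", extra_names=["id"])
```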
169 changes: 39 additions & 130 deletions src/safeds/data/labeled/containers/_tabular_dataset.py
@@ -5,9 +5,6 @@

from safeds._utils import _structural_hash
from safeds.data.tabular.containers import Column, Table
from safeds.exceptions import (
UnknownColumnNameError,
)

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
@@ -22,150 +19,67 @@ class TabularDataset:
"""
A tabular dataset maps feature columns to a target column.

Create a tabular dataset from a mapping of column names to their values.

Parameters
----------
data:
The data.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.
extra_names:
Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
the target column are used as features.

Raises
------
ColumnLengthMismatchError
If columns have different lengths.
ValueError
If the target column is also a feature column.
If the target column is also an extra column.
ValueError
If no feature columns are specified.
If no feature columns remain.

Examples
--------
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"col1": ["a", "b"], "col2": [1, 2]})
>>> tabular_dataset = table.to_tabular_dataset("col2", ["col1"])
>>> from safeds.data.labeled.containers import TabularDataset
>>> dataset = TabularDataset(
... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3]},
... target_name="target",
... extra_names=["id"]
... )
"""

# ------------------------------------------------------------------------------------------------------------------
# Creation
# ------------------------------------------------------------------------------------------------------------------

@staticmethod
def _from_table(
table: Table,
target_name: str,
feature_names: list[str] | None = None,
) -> TabularDataset:
"""
Create a tabular dataset from a table.

Parameters
----------
table:
The table.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.

Returns
-------
tabular_dataset:
The created tabular dataset.

Raises
------
UnknownColumnNameError
If target_name matches none of the column names.
ValueError
If the target column is also a feature column.
ValueError
If no feature columns are specified.

Examples
--------
>>> from safeds.data.labeled.containers import TabularDataset
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"col1": ["a", "b", "c", "a"], "col2": [1, 2, 3, 4]})
>>> tabular_dataset = TabularDataset._from_table(table, "col2", ["col1"])
"""
table = table._as_table()
if target_name not in table.column_names:
raise UnknownColumnNameError([target_name])

# If no feature names are specified, use all columns except the target column
if feature_names is None:
feature_names = table.column_names
feature_names.remove(target_name)

# Validate inputs
if target_name in feature_names:
raise ValueError(f"Column '{target_name}' cannot be both feature and target.")
if len(feature_names) == 0:
raise ValueError("At least one feature column must be specified.")

# Create result
result = object.__new__(TabularDataset)

result._table = table
result._features = table.keep_only_columns(feature_names)
result._target = table.get_column(target_name)

return result

# ------------------------------------------------------------------------------------------------------------------
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(
self,
data: Mapping[str, Sequence[Any]],
data: Table | Mapping[str, Sequence[Any]],
target_name: str,
feature_names: list[str] | None = None,
extra_names: list[str] | None = None,
):
"""
Create a tabular dataset from a mapping of column names to their values.

Parameters
----------
data:
The data.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.

Raises
------
ColumnLengthMismatchError
If columns have different lengths.
ValueError
If the target column is also a feature column.
ValueError
If no feature columns are specified.

Examples
--------
>>> from safeds.data.labeled.containers import TabularDataset
>>> table = TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"])
"""
self._table = Table(data)
# Preprocess inputs
if not isinstance(data, Table):
data = Table(data)
if extra_names is None:
extra_names = []

# If no feature names are specified, use all columns except the target column
if feature_names is None:
feature_names = self._table.column_names
if target_name in feature_names:
feature_names.remove(target_name)
# Derive feature names
feature_names = [name for name in data.column_names if name not in {target_name, *extra_names}]

# Validate inputs
if target_name in feature_names:
raise ValueError(f"Column '{target_name}' cannot be both feature and target.")
if target_name in extra_names:
raise ValueError(f"Column '{target_name}' cannot be both target and extra.")
if len(feature_names) == 0:
raise ValueError("At least one feature column must be specified.")
raise ValueError("At least one feature column must remain.")

self._features: Table = self._table.keep_only_columns(feature_names)
self._target: Column = self._table.get_column(target_name)
# Set attributes
self._table: Table = data
self._features: Table = data.keep_only_columns(feature_names)
self._target: Column = data.get_column(target_name)
self._extras: Table = data.keep_only_columns(extra_names)

def __eq__(self, other: object) -> bool:
"""
@@ -210,27 +124,22 @@ def __sizeof__(self) -> int:

@property
def features(self) -> Table:
"""
Get the feature columns of the tabular dataset.

Returns
-------
features:
The table containing the feature columns.
"""
"""The feature columns of the tabular dataset."""
return self._features

@property
def target(self) -> Column:
"""The target column of the tabular dataset."""
return self._target

@property
def extras(self) -> Table:
"""
Get the target column of the tabular dataset.
Additional columns of the tabular dataset that are neither features nor target.

Returns
-------
target:
The target column.
These can be used to store additional information about instances, such as IDs.
"""
return self._target
return self._extras

# ------------------------------------------------------------------------------------------------------------------
# Conversion
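To make the new attribute layout concrete, a minimal sketch using only the constructor and properties defined above (the column values are made up):

```python
from safeds.data.labeled.containers import TabularDataset

dataset = TabularDataset(
    {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [7, 8, 9]},
    target_name="target",
    extra_names=["id"],
)

dataset.features  # Table containing only the "feature" column
dataset.target    # Column "target"
dataset.extras    # Table containing only the "id" column, e.g. row IDs

# Validation per the constructor above: naming the target as an extra column
# raises a ValueError, as does a call that leaves no feature columns, e.g.
# TabularDataset({"a": [1, 2]}, target_name="a")
```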
13 changes: 7 additions & 6 deletions src/safeds/data/tabular/containers/_table.py
@@ -2412,7 +2412,7 @@ def to_rows(self) -> list[Row]:
for (_, series_row) in self._data.iterrows()
]

def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None = None) -> TabularDataset:
def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset:
"""
Return a new `TabularDataset` with columns marked as a target column, feature columns, or extra columns.

@@ -2422,12 +2422,13 @@ def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None =
----------
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.
extra_names:
Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
the target column are used as features.

Returns
-------
tabular_dataset:
dataset:
A new tabular dataset with the given target and extra names.

Raises
@@ -2441,11 +2442,11 @@ def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None =
--------
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"item": ["apple", "milk", "beer"], "price": [1.10, 1.19, 1.79], "amount_bought": [74, 72, 51]})
>>> tabular_dataset = table.to_tabular_dataset(target_name="amount_bought", feature_names=["item", "price"])
>>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"])
"""
from safeds.data.labeled.containers import TabularDataset

return TabularDataset._from_table(self, target_name, feature_names)
return TabularDataset(self, target_name, extra_names)

# ------------------------------------------------------------------------------------------------------------------
# IPython integration
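Since `Table.to_tabular_dataset` now forwards straight to the `TabularDataset` constructor (which accepts a `Table` directly), the two calls below should be equivalent — a sketch reusing the docstring example:

```python
from safeds.data.labeled.containers import TabularDataset
from safeds.data.tabular.containers import Table

table = Table({
    "item": ["apple", "milk", "beer"],
    "price": [1.10, 1.19, 1.79],
    "amount_bought": [74, 72, 51],
})

# Convenience method on Table ...
dataset_a = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"])

# ... and the constructor it delegates to.
dataset_b = TabularDataset(table, target_name="amount_bought", extra_names=["item"])
```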