From 8681389f19988de73d88edd3a478f0a890115246 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Fri, 17 May 2024 11:29:17 +0200 Subject: [PATCH 1/2] fix: explicit `None` check instead of `or` --- src/safeds/data/tabular/containers/_column.py | 7 ++- .../_column/test_summarize_statistics.py | 52 +++++++++++++++---- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py index 3dc1781e1..bbd189a2f 100644 --- a/src/safeds/data/tabular/containers/_column.py +++ b/src/safeds/data/tabular/containers/_column.py @@ -688,9 +688,12 @@ def summarize_statistics(self) -> Table: self.stability(), ] else: + min_ = self.min() + max_ = self.max() + values = [ - str(self.min() or "-"), - str(self.max() or "-"), + str("-" if min_ is None else min_), + str("-" if max_ is None else max_), "-", "-", "-", diff --git a/tests/safeds/data/tabular/containers/_column/test_summarize_statistics.py b/tests/safeds/data/tabular/containers/_column/test_summarize_statistics.py index ee598a0e2..2ec858b6d 100644 --- a/tests/safeds/data/tabular/containers/_column/test_summarize_statistics.py +++ b/tests/safeds/data/tabular/containers/_column/test_summarize_statistics.py @@ -7,7 +7,36 @@ @pytest.mark.parametrize( ("column", "expected"), [ - ( + ( # boolean + Column("col", [True, False, True]), + Table( + { + "metric": [ + "min", + "max", + "mean", + "median", + "standard deviation", + "distinct value count", + "idness", + "missing value ratio", + "stability", + ], + "col": [ + "False", + "True", + "-", + "-", + "-", + "2", + str(2 / 3), + "0.0", + str(2 / 3), + ], + }, + ), + ), + ( # ints Column("col", [1, 2, 1]), Table( { @@ -25,18 +54,18 @@ "col": [ 1, 2, - 4.0 / 3, - 1.0, + 4 / 3, + 1, stdev([1, 2, 1]), 2, - 2.0 / 3, - 0.0, - 2.0 / 3, + 2 / 3, + 0, + 2 / 3, ], }, ), ), - ( + ( # strings Column("col", ["a", "b", "c"]), Table( { @@ -65,7 +94,7 @@ }, ), ), - ( + ( # only missing Column("col", [None, None]), Table( { @@ -96,9 +125,10 @@ ), ], ids=[ - "Column of ints", - "Column of strings", - "Column of None", + "boolean", + "ints", + "strings", + "only missing", ], ) def test_should_summarize_statistics(column: Column, expected: Table) -> None: From 93fbed965c5d9597f52e38e556a1fd4f8fff6750 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Fri, 17 May 2024 11:35:54 +0200 Subject: [PATCH 2/2] fix: `stability` cannot be computed for boolean columns Co-authored-by: Saman Hushalsadat --- src/safeds/data/tabular/containers/_column.py | 3 ++- tests/safeds/data/tabular/containers/_column/test_stability.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py index bbd189a2f..72db3d35e 100644 --- a/src/safeds/data/tabular/containers/_column.py +++ b/src/safeds/data/tabular/containers/_column.py @@ -1058,7 +1058,8 @@ def stability(self) -> float: if non_missing.len() == 0: return 1.0 # All non-null values are the same (since there is are none) - mode_count = non_missing.unique_counts().max() + # `unique_counts` crashes in polars for boolean columns + mode_count = non_missing.value_counts().get_column("count").max() return mode_count / non_missing.len() diff --git a/tests/safeds/data/tabular/containers/_column/test_stability.py b/tests/safeds/data/tabular/containers/_column/test_stability.py index 0e6df25ef..611e1c025 100644 --- a/tests/safeds/data/tabular/containers/_column/test_stability.py +++ b/tests/safeds/data/tabular/containers/_column/test_stability.py @@ -12,6 +12,7 @@ ([None], 1), ([1, 2, 3, 4], 1 / 4), (["b", "a", "abc", "abc", "abc"], 3 / 5), + ([True, False, True, True, None], 3 / 4), ], ids=[ "empty", @@ -19,6 +20,7 @@ "only missing values", "numeric", "non-numeric", + "boolean", # caused a crash in previous implementation ], ) def test_should_return_stability_of_column(values: list[Any], expected: float) -> None: