diff --git a/ClashBehaviour.cs b/ClashBehaviour.cs index 743e4a0..da745c5 100644 --- a/ClashBehaviour.cs +++ b/ClashBehaviour.cs @@ -1,8 +1,22 @@ namespace Dimension.DataFrame.Extensions; +/// +/// Defines the behavior when adding a column to a DataFrame and a column with the same name already exists +/// public enum ClashBehaviour { + /// + /// Keep the existing column and do not add the new column + /// KeepOriginal, + + /// + /// Remove the existing column and add the new column in its place + /// ReplaceOriginal, + + /// + /// Throw an InvalidOperationException when a name clash occurs (default behavior) + /// Exception } \ No newline at end of file diff --git a/DataFrameExtensions.cs b/DataFrameExtensions.cs index 429477f..7ee665f 100644 --- a/DataFrameExtensions.cs +++ b/DataFrameExtensions.cs @@ -47,7 +47,7 @@ public static class DataFrameExtensionsCalculations public static PrimitiveDataFrameColumn Apply(this PrimitiveDataFrameColumn column, Func operation, string name = "") where T : unmanaged, INumber { - if (operation == null) + if (operation is null) { throw new ArgumentNullException(nameof(operation)); } diff --git a/DataFrameExtensionsArithmetic.cs b/DataFrameExtensionsArithmetic.cs index 8811c42..3f5d568 100644 --- a/DataFrameExtensionsArithmetic.cs +++ b/DataFrameExtensionsArithmetic.cs @@ -63,7 +63,7 @@ public static PrimitiveDataFrameColumn Minus(this PrimitiveDataFrameColumn if (string.IsNullOrEmpty(name)) { - name = $"{column.Name}_Minus_{columnToSubtract.Name}"; + name = $"{column.Name}-{columnToSubtract.Name}"; } return new PrimitiveDataFrameColumn(name, result); @@ -99,8 +99,8 @@ public static PrimitiveDataFrameColumn Times(this PrimitiveDataFrameColumn if (string.IsNullOrEmpty(name)) { - var otherNames = otherColumns.Select(c => c.Name); - name = $"{column.Name}_Times_{string.Join("_", otherNames)}"; + var namesToConcat = new[] {column.Name}.Concat(otherColumns.Select(c => c.Name)); + name = string.Join("*", namesToConcat); } return new 
PrimitiveDataFrameColumn(name, result); diff --git a/DataFrameExtensionsFilters.cs b/DataFrameExtensionsFilters.cs index 84a500f..d33d19f 100644 --- a/DataFrameExtensionsFilters.cs +++ b/DataFrameExtensionsFilters.cs @@ -49,75 +49,7 @@ public static Microsoft.Data.Analysis.DataFrame Filter(this Microsoft.Data.Analy var newColumns = new List(); foreach (var column in df.Columns) { - DataFrameColumn newColumn; - - // Support common numeric types - if (column.DataType == typeof(int)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(long)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(float)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(double)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(decimal)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - // Support other common types - else if (column.DataType == typeof(bool)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(byte)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(sbyte)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(short)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(ushort)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(uint)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(ulong)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(char)) - { - newColumn = new PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(DateTime)) - { - newColumn = new 
PrimitiveDataFrameColumn(column.Name); - } - else if (column.DataType == typeof(string)) - { - newColumn = new StringDataFrameColumn(column.Name); - } - else - { - throw new NotSupportedException($"Column type {column.DataType.Name} is not supported. Supported types: int, long, float, double, decimal, bool, byte, sbyte, short, ushort, uint, ulong, char, DateTime, string"); - } - + var newColumn = CreateColumnByType(column.DataType, column.Name); newColumns.Add(newColumn); } @@ -137,4 +69,35 @@ public static Microsoft.Data.Analysis.DataFrame Filter(this Microsoft.Data.Analy return newDf; } + + /// + /// Creates a new DataFrame column based on the specified type + /// + /// The type of data the column will hold + /// The name for the new column + /// A new DataFrameColumn of the appropriate type + /// Thrown when the data type is not supported + private static DataFrameColumn CreateColumnByType(Type dataType, string columnName) + { + // Compare the requested type against each supported type and create the matching column + if (dataType == typeof(int)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(long)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(float)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(double)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(decimal)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(bool)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(byte)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(sbyte)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(short)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(ushort)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(uint)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(ulong)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == 
typeof(char)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(DateTime)) return new PrimitiveDataFrameColumn(columnName); + if (dataType == typeof(string)) return new StringDataFrameColumn(columnName); + + throw new NotSupportedException( + $"Column type {dataType.Name} is not supported. " + + "Supported types: int, long, float, double, decimal, bool, byte, sbyte, short, ushort, uint, ulong, char, DateTime, string"); + } } \ No newline at end of file diff --git a/DataFrameExtensionsNullsNaNs.cs b/DataFrameExtensionsNullsNaNs.cs index 85ea02d..903949c 100644 --- a/DataFrameExtensionsNullsNaNs.cs +++ b/DataFrameExtensionsNullsNaNs.cs @@ -13,17 +13,17 @@ public static class DataFrameExtensionsNullsNaNs public static PrimitiveDataFrameColumn DropNulls(this PrimitiveDataFrameColumn column) where T : unmanaged, INumber { - var newColumn = new PrimitiveDataFrameColumn(column.Name, column.Length); + var validValues = new List(); foreach (var value in column) { var shouldAddValue = value != null && !(value is float f && float.IsNaN(f)) && !(value is double d && double.IsNaN(d)); if (shouldAddValue) { - newColumn.Append(value); + validValues.Add(value); } } - return newColumn; + return new PrimitiveDataFrameColumn(column.Name, validValues); } public static Microsoft.Data.Analysis.DataFrame DropNulls(this Microsoft.Data.Analysis.DataFrame df) diff --git a/DataFrameExtensionsRolling.cs b/DataFrameExtensionsRolling.cs index cde5484..04bd63f 100644 --- a/DataFrameExtensionsRolling.cs +++ b/DataFrameExtensionsRolling.cs @@ -24,6 +24,10 @@ public static PrimitiveDataFrameColumn Rolling(this PrimitiveDataFrameColu where T : unmanaged, INumber { var result = new PrimitiveDataFrameColumn(column.Name + "_Rolling", column.Length); + + // Pre-allocate a reusable buffer to avoid repeated allocations + var windowBuffer = new T?[windowSize]; + for (var i = 0; i < column.Length; i++) { if (i < windowSize - 1) @@ -32,19 +36,20 @@ public static 
PrimitiveDataFrameColumn Rolling(this PrimitiveDataFrameColu continue; } - var window = new List(); + // Reuse the buffer instead of creating new List + var windowCount = 0; for (var j = i - windowSize + 1; j <= i; j++) { - if (!column[j].HasValue) + if (column[j].HasValue) { - continue; + windowBuffer[windowCount++] = column[j]; } - - window.Add(column[j]); } - if (window.Count > 0) + if (windowCount > 0) { + // Create a span/array view of only the valid values + var window = new ArraySegment(windowBuffer, 0, windowCount); var opResult = operation(window); result[i] = opResult; } diff --git a/DataFrameExtensionsStatistics.cs b/DataFrameExtensionsStatistics.cs index ee4b999..89f366f 100644 --- a/DataFrameExtensionsStatistics.cs +++ b/DataFrameExtensionsStatistics.cs @@ -50,8 +50,8 @@ public static class DataFrameExtensionsStatistics /// /// Numeric type /// Column to calculate median for - /// Median value, or null if column is empty or all values are null - public static T? Median(this PrimitiveDataFrameColumn column) + /// Median value as double, or null if column is empty or all values are null + public static double? 
Median(this PrimitiveDataFrameColumn column) where T : unmanaged, INumber { if (column == null || column.Length == 0) @@ -59,7 +59,7 @@ public static class DataFrameExtensionsStatistics return null; } - var values = column.Where(v => v.HasValue).Select(v => v!.Value).OrderBy(v => v).ToList(); + var values = column.Where(v => v.HasValue).Select(v => Convert.ToDouble(v!.Value)).OrderBy(v => v).ToList(); if (values.Count == 0) { @@ -71,7 +71,7 @@ public static class DataFrameExtensionsStatistics if (values.Count % 2 == 0) { // Even number of elements - average the two middle values - return (values[middleIndex - 1] + values[middleIndex]) / T.CreateChecked(2); + return (values[middleIndex - 1] + values[middleIndex]) / 2.0; } else { @@ -95,7 +95,7 @@ public static class DataFrameExtensionsStatistics } /// - /// Calculates the variance of a column + /// Calculates the variance of a column using Welford's online algorithm for numerical stability /// /// Numeric type /// Column to calculate variance for @@ -109,18 +109,32 @@ public static class DataFrameExtensionsStatistics return null; } - var values = column.Where(v => v.HasValue).Select(v => Convert.ToDouble(v!.Value)).ToList(); + // Single-pass variance calculation using Welford's algorithm + var count = 0; + var mean = 0.0; + var m2 = 0.0; - if (values.Count < (sample ? 2 : 1)) + for (var i = 0; i < column.Length; i++) { - return null; + var value = column[i]; + if (value.HasValue) + { + count++; + var doubleValue = Convert.ToDouble(value.Value); + var delta = doubleValue - mean; + mean += delta / count; + var delta2 = doubleValue - mean; + m2 += delta * delta2; + } } - var mean = values.Average(); - var sumOfSquaredDifferences = values.Sum(v => Math.Pow(v - mean, 2)); - var divisor = sample ? values.Count - 1 : values.Count; + if (count < (sample ? 2 : 1)) + { + return null; + } - return sumOfSquaredDifferences / divisor; + var divisor = sample ? 
count - 1 : count; + return m2 / divisor; } /// @@ -212,7 +226,7 @@ public static long Count(this PrimitiveDataFrameColumn column) /// Numeric type /// Column to calculate statistics for /// Tuple containing (count, mean, stddev, min, 25th percentile, median, 75th percentile, max) - public static (long Count, T? Mean, double? StdDev, T? Min, T? Q25, T? Median, T? Q75, T? Max) Describe(this PrimitiveDataFrameColumn column) + public static (long Count, T? Mean, double? StdDev, T? Min, double? Q25, double? Median, double? Q75, T? Max) Describe(this PrimitiveDataFrameColumn column) where T : unmanaged, INumber { var count = column.Count(); @@ -233,8 +247,8 @@ public static (long Count, T? Mean, double? StdDev, T? Min, T? Q25, T? Median, T /// Numeric type /// Column to calculate quantile for /// Quantile to calculate (0.0 to 1.0, e.g., 0.25 for 25th percentile) - /// Quantile value, or null if column is empty - public static T? Quantile(this PrimitiveDataFrameColumn column, double quantile) + /// Quantile value as double, or null if column is empty + public static double? Quantile(this PrimitiveDataFrameColumn column, double quantile) where T : unmanaged, INumber { if (column == null || column.Length == 0 || quantile < 0 || quantile > 1) @@ -242,7 +256,7 @@ public static (long Count, T? Mean, double? StdDev, T? Min, T? Q25, T? Median, T return null; } - var values = column.Where(v => v.HasValue).Select(v => v!.Value).OrderBy(v => v).ToList(); + var values = column.Where(v => v.HasValue).Select(v => Convert.ToDouble(v!.Value)).OrderBy(v => v).ToList(); if (values.Count == 0) { @@ -258,7 +272,7 @@ public static (long Count, T? Mean, double? StdDev, T? Min, T? Q25, T? 
Median, T return values[lowerIndex]; } - var weight = T.CreateChecked(index - lowerIndex); + var weight = index - lowerIndex; return values[lowerIndex] + weight * (values[upperIndex] - values[lowerIndex]); } } diff --git a/DataFrameExtensionsSugar.cs b/DataFrameExtensionsSugar.cs index ee4d21e..ee01664 100644 --- a/DataFrameExtensionsSugar.cs +++ b/DataFrameExtensionsSugar.cs @@ -78,89 +78,4 @@ public static PrimitiveDataFrameColumn AddTo(this PrimitiveDataFrameColumn df.Columns.Add(column); return column; } - - private static bool ValuesAreEqual(T? a, T? b, T relativeTolerance) - where T : struct, INumber - { - if (!a.HasValue && !b.HasValue) - { - return true; // Both are null/missing - } - - if (!a.HasValue || !b.HasValue) - { - return false; // One is null/missing, the other isn't - } - - // Special handling for NaN values for floating-point types - if (typeof(T) == typeof(float)) - { - // Explicitly handle float NaN comparisons - if (float.IsNaN((float) (object) a) && float.IsNaN((float) (object) b)) - { - return true; - } - } - else if (typeof(T) == typeof(double)) - { - // Explicitly handle double NaN comparisons - if (double.IsNaN((double) (object) a) && double.IsNaN((double) (object) b)) - { - return true; - } - } - - // Calculate the absolute difference - var absoluteDifference = a.Value - b.Value; - if (absoluteDifference == T.Zero) - { - return true; - } - - if (absoluteDifference < T.Zero) - { - absoluteDifference *= -T.One; - } - - // Calculate the absolute maximum of the two numbers - var maxAbsolute = a.Value > b.Value ? 
a.Value : b.Value; - if (maxAbsolute == T.Zero) - { - // avoid DBZ error - return a.Value == b.Value; - } - - // Calculate the relative difference based on the maximum absolute value - var relativeDifference = absoluteDifference / maxAbsolute; - - // Check if the relative difference is within the relative tolerance - return relativeDifference <= relativeTolerance; - } - - private static T GetTolerance() where T : struct, INumber - { - // Define tolerance based on type - if (typeof(T) == typeof(float)) - { - return (T) (object) (float) 1e-6f; // Example tolerance for float - } - else if (typeof(T) == typeof(double)) - { - return (T) (object) (double) 1e-15; // Example tolerance for double - } - else if (typeof(T) == typeof(decimal)) - { - return (T) (object) (decimal) 1e-28M; // Lower tolerance for decimal - } - else if (typeof(T) == typeof(int) || typeof(T) == typeof(long)) - { - // For integral types, exact match is expected, so tolerance is zero - return T.Zero; - } - else - { - // Default tolerance for other types, adjust as necessary - return T.One / T.CreateChecked(1000000); - } - } } \ No newline at end of file diff --git a/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsIOTests.cs b/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsIOTests.cs new file mode 100644 index 0000000..58fe35b --- /dev/null +++ b/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsIOTests.cs @@ -0,0 +1,274 @@ +using FluentAssertions; +using Microsoft.Data.Analysis; +using System; +using System.IO; +using Xunit; + +namespace Dimension.DataFrame.Extensions.Tests; + +public class DataFrameExtensionsIOTests +{ + [Fact] + public void SaveToCsv_BasicDataFrame_CreatesValidCsv() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new[] { 1, 2, 3 }), + new StringDataFrameColumn("Name", new[] { "Alice", "Bob", "Charlie" }), + new PrimitiveDataFrameColumn("Score", new[] { 95.5, 87.3, 92.1 }) + ); + var tempFile = 
Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile); + + // Assert + var content = File.ReadAllText(tempFile); + content.Should().Contain("ID,Name,Score"); + content.Should().Contain("1,Alice,95.5"); + content.Should().Contain("2,Bob,87.3"); + content.Should().Contain("3,Charlie,92.1"); + } + finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } + + [Fact] + public void SaveToCsv_WithCustomSeparator_UsesCorrectSeparator() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("A", new[] { 1, 2 }), + new PrimitiveDataFrameColumn("B", new[] { 3, 4 }) + ); + var tempFile = Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile, sep: ";"); + + // Assert + var content = File.ReadAllText(tempFile); + content.Should().Contain("A;B"); + content.Should().Contain("1;3"); + content.Should().Contain("2;4"); + } + finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } + + [Fact] + public void SaveToCsv_WithoutHeader_DoesNotIncludeColumnNames() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new[] { 1, 2 }) + ); + var tempFile = Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile, includeHeader: false); + + // Assert + var content = File.ReadAllText(tempFile); + content.Should().NotContain("ID"); + content.Should().Contain("1"); + content.Should().Contain("2"); + } + finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } + + [Fact] + public void SaveToCsv_WithQuotesInData_EscapesCorrectly() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new StringDataFrameColumn("Text", new[] { "Hello \"World\"", "Simple text" }) + ); + var tempFile = Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile); + + // Assert + var content = File.ReadAllText(tempFile); + content.Should().Contain("\"Hello \"\"World\"\"\""); // RFC 4180: quotes doubled + } + 
finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } + + [Fact] + public void SaveToCsv_WithCommaInData_QuotesField() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new StringDataFrameColumn("Text", new[] { "Hello, World", "Simple" }) + ); + var tempFile = Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile); + + // Assert + var content = File.ReadAllText(tempFile); + content.Should().Contain("\"Hello, World\""); // RFC 4180: field with comma must be quoted + content.Should().Contain("Simple"); // Simple text not quoted + } + finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } + + [Fact] + public void SaveToCsv_WithNewlineInData_QuotesField() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new StringDataFrameColumn("Text", new[] { "Line1\nLine2", "Simple" }) + ); + var tempFile = Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile); + + // Assert + var content = File.ReadAllText(tempFile); + content.Should().Contain("\"Line1\nLine2\""); // RFC 4180: field with newline must be quoted + } + finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } + + [Fact] + public void SaveToCsv_WithFormulaInjectionAttempt_SanitizesData() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new StringDataFrameColumn("Text", new[] { "=SUM(A1:A10)", "+cmd", "-cmd", "@cmd", "\tcmd", "\rcmd" }) + ); + var tempFile = Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile); + + // Assert + var content = File.ReadAllText(tempFile); + // CSV injection prevention: formula characters should be prefixed with single quote + content.Should().Contain("'=SUM"); + content.Should().Contain("'+cmd"); + content.Should().Contain("'-cmd"); + content.Should().Contain("'@cmd"); + content.Should().Contain("'\tcmd"); + content.Should().Contain("'\rcmd"); + } + finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } + + 
[Fact] + public void SaveToCsv_WithNullValues_HandlesCorrectly() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("Num", new int?[] { 1, null, 3 }), + new StringDataFrameColumn("Text", new[] { "A", null, "C" }) + ); + var tempFile = Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile); + + // Assert + File.Exists(tempFile).Should().BeTrue(); + var content = File.ReadAllText(tempFile); + content.Should().Contain("Num,Text"); + // Nulls should be represented as empty strings + var lines = content.Split(Environment.NewLine, StringSplitOptions.RemoveEmptyEntries); + lines.Should().HaveCountGreaterOrEqualTo(3); // Header + 3 data rows + } + finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } + + [Fact] + public void SaveToCsv_ToInvalidPath_ThrowsIOException() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("A", new[] { 1 }) + ); + var invalidPath = "/invalid/path/that/does/not/exist/file.csv"; + + // Act & Assert + var act = () => df.SaveToCsv(invalidPath); + act.Should().Throw(); + } + + [Fact] + public void SaveToCsv_EmptyDataFrame_CreatesFileWithHeaderOnly() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID"), + new StringDataFrameColumn("Name") + ); + var tempFile = Path.GetTempFileName(); + + try + { + // Act + df.SaveToCsv(tempFile); + + // Assert + var content = File.ReadAllText(tempFile); + content.Should().Contain("ID,Name"); + var lines = content.Split(Environment.NewLine, StringSplitOptions.RemoveEmptyEntries); + lines.Should().HaveCount(1); // Only header line + } + finally + { + if (File.Exists(tempFile)) + File.Delete(tempFile); + } + } +} diff --git a/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsRowsTests.cs b/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsRowsTests.cs new file mode 100644 index 0000000..10fcc3e --- /dev/null +++ 
b/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsRowsTests.cs @@ -0,0 +1,236 @@ +using FluentAssertions; +using Microsoft.Data.Analysis; +using System; +using Xunit; + +namespace Dimension.DataFrame.Extensions.Tests; + +public class DataFrameExtensionsRowsTests +{ + [Fact] + public void AddRow_WithMatchingTypes_AddsRowSuccessfully() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new[] { 1, 2 }), + new StringDataFrameColumn("Name", new[] { "Alice", "Bob" }) + ); + + // Act + df.AddRow(3, "Charlie"); + + // Assert + df.Rows.Count.Should().Be(3); + df["ID"][2].Should().Be(3); + df["Name"][2].Should().Be("Charlie"); + } + + [Fact] + public void AddRow_WithNullableInt_HandlesNullCorrectly() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new int?[] { 1, 2 }), + new StringDataFrameColumn("Name", new[] { "Alice", "Bob" }) + ); + + // Act + df.AddRow(null, "Charlie"); + + // Assert + df.Rows.Count.Should().Be(3); + df["ID"][2].Should().BeNull(); + df["Name"][2].Should().Be("Charlie"); + } + + [Fact] + public void AddRow_WithMultipleNumericTypes_AddsCorrectly() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("IntCol", new[] { 1 }), + new PrimitiveDataFrameColumn("LongCol", new[] { 100L }), + new PrimitiveDataFrameColumn("FloatCol", new[] { 1.5f }), + new PrimitiveDataFrameColumn("DoubleCol", new[] { 2.5 }), + new PrimitiveDataFrameColumn("DecimalCol", new[] { 3.5m }) + ); + + // Act + df.AddRow(2, 200L, 2.5f, 3.5, 4.5m); + + // Assert + df.Rows.Count.Should().Be(2); + df["IntCol"][1].Should().Be(2); + df["LongCol"][1].Should().Be(200L); + df["FloatCol"][1].Should().Be(2.5f); + df["DoubleCol"][1].Should().Be(3.5); + df["DecimalCol"][1].Should().Be(4.5m); + } + + [Fact] + public void AddRow_WithBooleanColumn_AddsCorrectly() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new 
PrimitiveDataFrameColumn("ID", new[] { 1 }), + new PrimitiveDataFrameColumn("Active", new[] { true }) + ); + + // Act + df.AddRow(2, false); + + // Assert + df.Rows.Count.Should().Be(2); + df["Active"][1].Should().Be(false); + } + + [Fact] + public void AddRow_WithDateTimeColumn_AddsCorrectly() + { + // Arrange + var date1 = new DateTime(2024, 1, 1); + var date2 = new DateTime(2024, 1, 2); + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new[] { 1 }), + new PrimitiveDataFrameColumn("Date", new[] { date1 }) + ); + + // Act + df.AddRow(2, date2); + + // Assert + df.Rows.Count.Should().Be(2); + df["Date"][1].Should().Be(date2); + } + + [Fact] + public void AddRow_WithWrongNumberOfValues_ThrowsArgumentException() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new[] { 1 }), + new StringDataFrameColumn("Name", new[] { "Alice" }) + ); + + // Act & Assert + var act = () => df.AddRow(2); // Missing Name value + act.Should().Throw() + .WithMessage("*number of provided values must match*"); + } + + [Fact] + public void AddRow_WithIncompatibleType_ThrowsInvalidOperationException() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new[] { 1 }), + new StringDataFrameColumn("Name", new[] { "Alice" }) + ); + + // Act & Assert + var act = () => df.AddRow("NotAnInt", "Bob"); // String instead of int + act.Should().Throw() + .WithMessage("*not compatible*"); + } + + [Fact] + public void AddRow_WithIEnumerable_AddsRowSuccessfully() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new[] { 1, 2 }), + new StringDataFrameColumn("Name", new[] { "Alice", "Bob" }) + ); + var values = new object[] { 3, "Charlie" }; + + // Act + df.AddRow((System.Collections.Generic.IEnumerable)values); + + // Assert + df.Rows.Count.Should().Be(3); + df["ID"][2].Should().Be(3); + 
df["Name"][2].Should().Be("Charlie"); + } + + [Fact] + public void AddRow_ToEmptyDataFrame_CreatesFirstRow() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID"), + new StringDataFrameColumn("Name") + ); + + // Act + df.AddRow(1, "Alice"); + + // Assert + df.Rows.Count.Should().Be(1); + df["ID"][0].Should().Be(1); + df["Name"][0].Should().Be("Alice"); + } + + [Fact] + public void AddRow_MultipleRows_MaintainsOrder() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID"), + new StringDataFrameColumn("Name") + ); + + // Act + df.AddRow(1, "Alice"); + df.AddRow(2, "Bob"); + df.AddRow(3, "Charlie"); + + // Assert + df.Rows.Count.Should().Be(3); + df["ID"][0].Should().Be(1); + df["ID"][1].Should().Be(2); + df["ID"][2].Should().Be(3); + df["Name"][0].Should().Be("Alice"); + df["Name"][1].Should().Be("Bob"); + df["Name"][2].Should().Be("Charlie"); + } + + [Fact] + public void AddRow_WithAllNullValues_AddsRowWithNulls() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ID", new int?[] { 1 }), + new StringDataFrameColumn("Name", new[] { "Alice" }) + ); + + // Act + df.AddRow(null, null); + + // Assert + df.Rows.Count.Should().Be(2); + df["ID"][1].Should().BeNull(); + df["Name"][1].Should().BeNull(); + } + + [Fact] + public void AddRow_WithUnsignedIntegerTypes_AddsCorrectly() + { + // Arrange + var df = new Microsoft.Data.Analysis.DataFrame( + new PrimitiveDataFrameColumn("ByteCol", new byte[] { 1 }), + new PrimitiveDataFrameColumn("UShortCol", new ushort[] { 100 }), + new PrimitiveDataFrameColumn("UIntCol", new uint[] { 1000 }), + new PrimitiveDataFrameColumn("ULongCol", new ulong[] { 10000 }) + ); + + // Act + df.AddRow((byte)2, (ushort)200, (uint)2000, (ulong)20000); + + // Assert + df.Rows.Count.Should().Be(2); + df["ByteCol"][1].Should().Be((byte)2); + df["UShortCol"][1].Should().Be((ushort)200); + 
df["UIntCol"][1].Should().Be((uint)2000); + df["ULongCol"][1].Should().Be((ulong)20000); + } +} diff --git a/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsStatisticsTests.cs b/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsStatisticsTests.cs index e07d55c..3885241 100644 --- a/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsStatisticsTests.cs +++ b/Dimension.DataFrame.Extensions.Tests/DataFrameExtensionsStatisticsTests.cs @@ -55,7 +55,7 @@ public void Median_EvenCount_ReturnsAverageOfMiddleTwo() var result = column.Median(); // Assert - result.Should().Be(2); // Average of 2 and 3 = 2.5, but integer division gives 2 + result.Should().Be(2.5); // Average of 2 and 3 = 2.5 (now returns double for precision) } [Fact] diff --git a/README.md b/README.md index 3bd1b5d..d5ca775 100644 --- a/README.md +++ b/README.md @@ -309,6 +309,31 @@ var roundedInt = decimals.Round(); // [1.0, 6.0, 10.0] - Microsoft.Data.Analysis 0.21.1 or later - MathNet.Numerics 5.0.0 or later +## Null Handling + +Different operations handle null values in different ways: + +### Arithmetic Operations (Plus, Minus, Times, Divide) +- Null values are treated as `default(T)` (typically 0 for numeric types) +- Example: `1 + null = 1 + 0 = 1` + +### Statistical Operations (Mean, Median, StdDev, Variance, etc.) +- Null values are **skipped** and excluded from calculations +- Example: `Mean([1, null, 3]) = (1 + 3) / 2 = 2.0` + +### Shift Operations +- Null values are **preserved** in their new positions +- Fill values can be specified for positions vacated by the shift + +### Rolling Window Operations +- Null values are **skipped** within each window +- The operation is applied only to non-null values + +### Filtering Operations +- `DropNulls()` - Removes rows containing null values (the column-level `DropNulls()` also drops NaN values in float/double columns) +- `DropNAs()` - Removes rows containing NaN values (for float/double) +- `DropNullsOrNAs()` - Removes rows containing either nulls or NaNs + ## Contributing Contributions are welcome! 
Please feel free to submit a Pull Request.