From c527c218b93f83bc47b7cae60f8d037e72b9c6c5 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Tue, 27 Apr 2021 01:12:40 +0300 Subject: [PATCH 1/2] fix #5767 issue with DataFrame Merge method --- src/Microsoft.Data.Analysis/DataFrame.Join.cs | 17 ++++++++--------- .../DataFrameTests.cs | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.Join.cs b/src/Microsoft.Data.Analysis/DataFrame.Join.cs index d5a1278371..381268dee2 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.Join.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.Join.cs @@ -252,9 +252,9 @@ public DataFrame Merge(DataFrame other, string leftJoinColumn, string righ // Hash the column with the smaller RowCount long leftRowCount = Rows.Count; long rightRowCount = other.Rows.Count; - DataFrame longerDataFrame = leftRowCount <= rightRowCount ? other : this; - DataFrame shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this; - DataFrameColumn hashColumn = (leftRowCount <= rightRowCount) ? Columns[leftJoinColumn] : other.Columns[rightJoinColumn]; + + var leftColumnIsSmaller = (leftRowCount <= rightRowCount); + DataFrameColumn hashColumn = leftColumnIsSmaller ? Columns[leftJoinColumn] : other.Columns[rightJoinColumn]; DataFrameColumn otherColumn = ReferenceEquals(hashColumn, Columns[leftJoinColumn]) ? other.Columns[rightJoinColumn] : Columns[leftJoinColumn]; Dictionary> multimap = hashColumn.GroupColumnValues(); @@ -270,23 +270,21 @@ public DataFrame Merge(DataFrame other, string leftJoinColumn, string righ { if (hashColumn[row] == null) { - leftRowIndices.Append(row); - rightRowIndices.Append(i); + leftRowIndices.Append(leftColumnIsSmaller ? row : i); + rightRowIndices.Append(leftColumnIsSmaller ? i : row); } } else { if (hashColumn[row] != null) { - leftRowIndices.Append(row); - rightRowIndices.Append(i); + leftRowIndices.Append(leftColumnIsSmaller ? row : i); + rightRowIndices.Append(leftColumnIsSmaller ? i : row); } } } } } - leftDataFrame = shorterDataFrame; - rightDataFrame = longerDataFrame; } else if (joinAlgorithm == JoinAlgorithm.FullOuter) { @@ -366,4 +364,5 @@ public DataFrame Merge(DataFrame other, string leftJoinColumn, string righ } } + } diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 300babbffb..72072fd533 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1579,6 +1579,25 @@ public void TestSample() Assert.Throws(()=> df.Sample(13)); } + [Theory] + [InlineData(1, 2)] + [InlineData(2, 1)] + public void TestDataCorrectnessForInnerMerge(int leftCount, int rightCount) + { + DataFrame left = MakeDataFrameWithNumericColumns(leftCount, false); + DataFrameColumn leftStringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, leftCount).Select(x => "Left")); + left.Columns.Insert(left.Columns.Count, leftStringColumn); + + DataFrame right = MakeDataFrameWithNumericColumns(rightCount, false); + DataFrameColumn rightStringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, rightCount).Select(x => "Right")); + right.Columns.Insert(right.Columns.Count, rightStringColumn); + + DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); + + Assert.Equal("Left", (string)merge.Columns["String_left"][0]); + Assert.Equal("Right", (string)merge.Columns["String_right"][0]); + } + [Fact] public void TestMerge() { From d4310286d36f3c6286f7473beb487f3576d3ed67 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Fri, 28 May 2021 22:20:21 +0300 Subject: [PATCH 2/2] #5813 Improve exception text on grouping by invalid column name --- src/Microsoft.Data.Analysis/DataFrame.cs | 4 ++-- src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs | 2 +- src/Microsoft.Data.Analysis/Strings.resx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 8eb04797aa..79f16e4f74 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -362,7 +362,7 @@ public GroupBy GroupBy(string columnName) { int columnIndex = _columnCollection.IndexOf(columnName); if (columnIndex == -1) - throw new ArgumentException(Strings.InvalidColumnName, nameof(columnName)); + throw new ArgumentException(String.Format(Strings.InvalidColumnName, columnName), nameof(columnName)); DataFrameColumn column = _columnCollection[columnIndex]; return column.GroupBy(columnIndex, this); @@ -573,7 +573,7 @@ public DataFrame Append(IEnumerable> row, bool inPl int index = ret.Columns.IndexOf(columnName); if (index == -1) { - throw new ArgumentException(Strings.InvalidColumnName, nameof(columnName)); + throw new ArgumentException(String.Format(Strings.InvalidColumnName, columnName), nameof(columnName)); } DataFrameColumn column = ret.Columns[index]; diff --git a/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs b/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs index 2c3b72b28f..45f40b4696 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs @@ -161,7 +161,7 @@ public DataFrameColumn this[string columnName] int columnIndex = IndexOf(columnName); if (columnIndex == -1) { - throw new ArgumentException(Strings.InvalidColumnName, nameof(columnName)); + throw new ArgumentException(String.Format(Strings.InvalidColumnName, columnName), nameof(columnName)); } return this[columnIndex]; } diff --git a/src/Microsoft.Data.Analysis/Strings.resx b/src/Microsoft.Data.Analysis/Strings.resx index de91078cec..7821c7a118 100644 --- a/src/Microsoft.Data.Analysis/Strings.resx +++ b/src/Microsoft.Data.Analysis/Strings.resx @@ -184,7 +184,7 @@ Inconsistent null bitmaps and NullCounts - Column does not exist + Column '{0}' does not exist All field widths, except the last element, must be greater than zero. A field width less than or equal to zero in the last element indicates the last field is of variable length.