From 2c6341538d39f171595ff6a26647d598d49bc4de Mon Sep 17 00:00:00 2001
From: Will Jones <willjones127@gmail.com>
Date: Thu, 18 Dec 2025 11:29:32 -0800
Subject: [PATCH] fix: merge_insert uses full schema path for reordered columns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When merge_insert received columns in a different order than the dataset
schema, it would fall back to the slower partial schema path. This adds
`ignore_field_order: true` to the schema comparison so that reordered
columns can use the more efficient full schema path.

Fixes #5323

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 rust/lance/src/dataset/write/merge_insert.rs | 78 ++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs
index 9b102c6bf6a..c4709624bc7 100644
--- a/rust/lance/src/dataset/write/merge_insert.rs
+++ b/rust/lance/src/dataset/write/merge_insert.rs
@@ -1429,6 +1429,8 @@ impl MergeInsertJob {
                 compare_metadata: false,
                 // Allow nullable source fields for non-nullable targets.
                 compare_nullability: NullabilityComparison::Ignore,
+                // Allow columns to be in a different order; they will be matched by name.
+                ignore_field_order: true,
                 ..Default::default()
             },
         );
@@ -5236,4 +5238,80 @@ MergeInsert: on=[id], when_matched=UpdateAll, when_not_matched=InsertAll, when_n
             }
         }
     }
+
+    /// Test case for Issue #5323: merge_insert should use the full schema path
+    /// when columns are provided in a different order than the dataset schema.
+    #[tokio::test]
+    async fn test_merge_insert_reordered_columns() {
+        use arrow_array::record_batch;
+
+        let initial_data = record_batch!(
+            ("id", Int32, [1, 2, 3]),
+            ("value", Float64, [1.1, 2.2, 3.3]),
+            ("extra", Utf8, ["a", "b", "c"])
+        )
+        .unwrap();
+
+        let dataset = Dataset::write(
+            RecordBatchIterator::new(vec![Ok(initial_data.clone())], initial_data.schema()),
+            "memory://test_issue_5323",
+            None,
+        )
+        .await
+        .unwrap();
+
+        // Source data with reordered columns: [extra, id, value] instead of [id, value, extra]
+        let new_data = record_batch!(
+            ("extra", Utf8, ["x", "y"]),
+            ("id", Int32, [2, 4]), // id 2 exists, 4 is new
+            ("value", Float64, [22.2, 44.4])
+        )
+        .unwrap();
+
+        // Verify reordered columns can use the fast path
+        let job = MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()])
+            .unwrap()
+            .when_matched(WhenMatched::UpdateAll)
+            .when_not_matched(WhenNotMatched::InsertAll)
+            .try_build()
+            .unwrap();
+        assert!(
+            job.can_use_create_plan(&new_data.schema()).await.unwrap(),
+            "Reordered schema should be able to use fast path"
+        );
+
+        // Execute and verify data correctness
+        let (merged_dataset, _) =
+            MergeInsertBuilder::try_new(Arc::new(dataset), vec!["id".to_string()])
+                .unwrap()
+                .when_matched(WhenMatched::UpdateAll)
+                .when_not_matched(WhenNotMatched::InsertAll)
+                .try_build()
+                .unwrap()
+                .execute_reader(Box::new(RecordBatchIterator::new(
+                    vec![Ok(new_data.clone())],
+                    new_data.schema(),
+                )))
+                .await
+                .unwrap();
+
+        let result = merged_dataset
+            .scan()
+            .order_by(Some(vec![ColumnOrdering::asc_nulls_first(
+                "id".to_string(),
+            )]))
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap();
+
+        let expected = record_batch!(
+            ("id", Int32, [1, 2, 3, 4]),
+            ("value", Float64, [1.1, 22.2, 3.3, 44.4]),
+            ("extra", Utf8, ["a", "x", "c", "y"])
+        )
+        .unwrap();
+
+        assert_eq!(result, expected);
+    }
 }