From 99eb2dc9b07e953249b4ece6ea3a44ecebc9287b Mon Sep 17 00:00:00 2001 From: Budde Date: Fri, 10 Mar 2017 13:04:22 -0800 Subject: [PATCH] [SPARK-19611][SQL] Preserve metastore field order when merging inferred schema The ```HiveMetastoreCatalog.mergeWithMetastoreSchema()``` method added in #16944 may not preserve the same field order as the metastore schema in some cases, which can cause queries to fail. This change ensures that the metastore field order is preserved. A test for ensuring that metastore order is preserved was added to ```HiveSchemaInferenceSuite.``` The particular failure usecase from #16944 was tested manually as well. --- .../spark/sql/hive/HiveMetastoreCatalog.scala | 5 +---- .../sql/hive/HiveSchemaInferenceSuite.scala | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 056af495590f7..9f0d1ceb28fca 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -356,13 +356,10 @@ private[hive] object HiveMetastoreCatalog { .filterKeys(!inferredSchema.map(_.name.toLowerCase).contains(_)) .values .filter(_.nullable) - // Merge missing nullable fields to inferred schema and build a case-insensitive field map. val inferredFields = StructType(inferredSchema ++ missingNullables) .map(f => f.name.toLowerCase -> f).toMap - StructType(metastoreFields.map { case(name, field) => - field.copy(name = inferredFields(name).name) - }.toSeq) + StructType(metastoreSchema.map(f => f.copy(name = inferredFields(f.name).name))) } catch { case NonFatal(_) => val msg = s"""Detected conflicting schemas when merging the schema obtained from the Hive diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala index 78955803819cf..e48ce2304d086 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala @@ -293,6 +293,27 @@ class HiveSchemaInferenceSuite StructField("firstField", StringType, nullable = true), StructField("secondField", StringType, nullable = true)))) }.getMessage.contains("Detected conflicting schemas")) + + // Schema merge should maintain metastore order. + assertResult( + StructType(Seq( + StructField("first_field", StringType, nullable = true), + StructField("second_field", StringType, nullable = true), + StructField("third_field", StringType, nullable = true), + StructField("fourth_field", StringType, nullable = true), + StructField("fifth_field", StringType, nullable = true)))) { + HiveMetastoreCatalog.mergeWithMetastoreSchema( + StructType(Seq( + StructField("first_field", StringType, nullable = true), + StructField("second_field", StringType, nullable = true), + StructField("third_field", StringType, nullable = true), + StructField("fourth_field", StringType, nullable = true), + StructField("fifth_field", StringType, nullable = true))), + StructType(Seq( + StructField("fifth_field", StringType, nullable = true), + StructField("third_field", StringType, nullable = true), + StructField("second_field", StringType, nullable = true)))) + } } }