From ef5db42b6630c7c891c9f0e5252daf4a37ddca91 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 27 Jun 2016 22:53:22 -0700 Subject: [PATCH 1/5] [SPARK-16248][SQL] Whitelist the list of Hive fallback functions --- .../spark/sql/hive/HiveSessionCatalog.scala | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 2f6a2207855ec..30486e7319e4c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -162,17 +162,6 @@ private[sql] class HiveSessionCatalog( } } - // We have a list of Hive built-in functions that we do not support. So, we will check - // Hive's function registry and lazily load needed functions into our own function registry. - // Those Hive built-in functions are - // assert_true, collect_list, collect_set, compute_stats, context_ngrams, create_union, - // current_user ,elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field, - // histogram_numeric, in_file, index, inline, java_method, map_keys, map_values, - // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming, - // parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2, - // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean, - // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number, - // xpath_short, and xpath_string. override def lookupFunction(name: FunctionIdentifier, children: Seq[Expression]): Expression = { // TODO: Once lookupFunction accepts a FunctionIdentifier, we should refactor this method to // if (super.functionExists(name)) { @@ -196,6 +185,10 @@ private[sql] class HiveSessionCatalog( // built-in function. // Hive is case insensitive. val functionName = funcName.unquotedString.toLowerCase + if (!hiveFunctions.contains(functionName)) { + failFunctionLookup(funcName.unquotedString) + } + // TODO: This may not really work for current_user because current_user is not evaluated // with session info. // We do not need to use executionHive at here because we only load @@ -221,4 +214,18 @@ private[sql] class HiveSessionCatalog( } } } + + /** List of functions we pass over to Hive. Note that over time this list should go to 0. */ + // We have a list of Hive built-in functions that we do not support. So, we will check + // Hive's function registry and lazily load needed functions into our own function registry. + // Those Hive built-in functions are + // compute_stats, context_ngrams, create_union, + // current_user ,elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field, + // histogram_numeric, in_file, index, inline, java_method, map_keys, map_values, + // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming, + // parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2, + // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean, + // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number, + // xpath_short, and xpath_string. + private val hiveFunctions = Seq("percentile", "percentile_approx") } From 263964401fa5c7fd3de80aad50720f9f77a08043 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 28 Jun 2016 01:00:03 -0700 Subject: [PATCH 2/5] Tests --- .../execution/HiveCompatibilitySuite.scala | 22 ++++++++++--------- .../HiveWindowFunctionQuerySuite.scala | 4 ++++ .../spark/sql/hive/HiveSessionCatalog.scala | 3 ++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 2d5a970c12006..13d18fdec0e9d 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -517,6 +517,18 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // This test uses CREATE EXTERNAL TABLE without specifying LOCATION "alter2", + // [SPARK-16248][SQL] Whitelist the list of Hive fallback functions + "udf_field", + "udf_reflect2", + "udf_xpath", + "udf_xpath_boolean", + "udf_xpath_double", + "udf_xpath_float", + "udf_xpath_int", + "udf_xpath_long", + "udf_xpath_short", + "udf_xpath_string", + // These tests DROP TABLE that don't exist (but do not specify IF EXISTS) "alter_rename_partition1", "date_1", @@ -1004,7 +1016,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "udf_elt", "udf_equal", "udf_exp", - "udf_field", "udf_find_in_set", "udf_float", "udf_floor", @@ -1049,7 +1060,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "udf_power", "udf_radians", "udf_rand", - "udf_reflect2", "udf_regexp", "udf_regexp_extract", "udf_regexp_replace", @@ -1090,14 +1100,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "udf_variance", "udf_weekofyear", "udf_when", - "udf_xpath", - "udf_xpath_boolean", - "udf_xpath_double", - "udf_xpath_float", - "udf_xpath_int", - "udf_xpath_long", - "udf_xpath_short", - "udf_xpath_string", "union10", "union11", "union13", diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala index 6c3978154d4b6..91dd491790e18 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala @@ -559,6 +559,9 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte |order by p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp """.stripMargin, reset = false) */ + + /* + [SPARK-16248][SQL] Whitelist the list of Hive fallback functions createQueryTest("windowing.q -- 21. testDISTs", """ |select p_mfgr,p_name, p_size, @@ -569,6 +572,7 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name | rows between 2 preceding and 2 following) """.stripMargin, reset = false) + */ createQueryTest("windowing.q -- 24. testLateralViews", """ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 30486e7319e4c..40dc654757469 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -227,5 +227,6 @@ private[sql] class HiveSessionCatalog( // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean, // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number, // xpath_short, and xpath_string. - private val hiveFunctions = Seq("percentile", "percentile_approx") + private val hiveFunctions = Seq( + "elt", "hash", "java_method", "parse_url", "percentile", "percentile_approx", "stack") } From 392bdba3ded26c86e05f87a3259ed6c2ed7ad938 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 28 Jun 2016 15:45:26 -0700 Subject: [PATCH 3/5] push --- .../catalyst/analysis/FunctionRegistry.scala | 1 + .../spark/sql/hive/HiveSessionCatalog.scala | 22 ++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 42a8faa412a34..0bde48ce57c86 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -248,6 +248,7 @@ object FunctionRegistry { expression[Average]("mean"), expression[Min]("min"), expression[Skewness]("skewness"), + expression[StddevSamp]("std"), expression[StddevSamp]("stddev"), expression[StddevPop]("stddev_pop"), expression[StddevSamp]("stddev_samp"), diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 40dc654757469..d092ab626dd16 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -218,15 +218,21 @@ private[sql] class HiveSessionCatalog( /** List of functions we pass over to Hive. Note that over time this list should go to 0. */ // We have a list of Hive built-in functions that we do not support. So, we will check // Hive's function registry and lazily load needed functions into our own function registry. - // Those Hive built-in functions are + // List of functions we are explicitly not supporting are: // compute_stats, context_ngrams, create_union, - // current_user ,elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field, - // histogram_numeric, in_file, index, inline, java_method, map_keys, map_values, + // current_user, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field, + // in_file, index, java_method, // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming, - // parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2, - // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean, - // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number, - // xpath_short, and xpath_string. + // parse_url_tuple, posexplode, reflect2, + // str_to_map, windowingtablefunction. private val hiveFunctions = Seq( - "elt", "hash", "java_method", "parse_url", "percentile", "percentile_approx", "stack") + "elt", "hash", "java_method", "histogram_numeric", + "map_keys", "map_values", + "parse_url", "percentile", "percentile_approx", "reflect", "sentences", "stack", "str_to_map", + "xpath", "xpath_boolean", "xpath_double", "xpath_float", "xpath_int", "xpath_long", + "xpath_number", "xpath_short", "xpath_string", + + // table generating function + "inline", "posexplode" + ) } From eaef9c537bf5637c8f7be86a525847fd0ef537f4 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 28 Jun 2016 15:46:21 -0700 Subject: [PATCH 4/5] code review --- .../org/apache/spark/sql/hive/HiveSessionCatalog.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index d092ab626dd16..8a47dcf908030 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -189,10 +189,8 @@ private[sql] class HiveSessionCatalog( failFunctionLookup(funcName.unquotedString) } - // TODO: This may not really work for current_user because current_user is not evaluated - // with session info. - // We do not need to use executionHive at here because we only load - // Hive's builtin functions, which do not need current db. + // TODO: Remove this fallback path once we implement the list of fallback functions + // defined below in hiveFunctions. val functionInfo = { try { Option(HiveFunctionRegistry.getFunctionInfo(functionName)).getOrElse( From 251b484dadcabf6f89dfc95162db0aab4da778ce Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 28 Jun 2016 17:35:12 -0700 Subject: [PATCH 5/5] michael comment --- .../HiveWindowFunctionQuerySuite.scala | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala index 91dd491790e18..7ba5790c2979d 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala @@ -534,34 +534,6 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte | rows between 2 preceding and 2 following); """.stripMargin, reset = false) - // collect_set() output array in an arbitrary order, hence causes different result - // when running this test suite under Java 7 and 8. - // We change the original sql query a little bit for making the test suite passed - // under different JDK - /* Disabled because: - - Spark uses a different default stddev. - - Tiny numerical differences in stddev results. - createQueryTest("windowing.q -- 20. testSTATs", - """ - |select p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp - |from ( - |select p_mfgr,p_name, p_size, - |stddev(p_retailprice) over w1 as sdev, - |stddev_pop(p_retailprice) over w1 as sdev_pop, - |collect_set(p_size) over w1 as uniq_size, - |variance(p_retailprice) over w1 as var, - |corr(p_size, p_retailprice) over w1 as cor, - |covar_pop(p_size, p_retailprice) over w1 as covarp - |from part - |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name - | rows between 2 preceding and 2 following) - |) t lateral view explode(uniq_size) d as uniq_data - |order by p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp - """.stripMargin, reset = false) - */ - - /* - [SPARK-16248][SQL] Whitelist the list of Hive fallback functions createQueryTest("windowing.q -- 21. testDISTs", """ |select p_mfgr,p_name, p_size, @@ -572,7 +544,6 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name | rows between 2 preceding and 2 following) """.stripMargin, reset = false) - */ createQueryTest("windowing.q -- 24. testLateralViews", """