diff --git a/sql/core/src/test/resources/sql-tests/inputs/left-semi-join.sql b/sql/core/src/test/resources/sql-tests/inputs/left-semi-join.sql new file mode 100644 index 0000000000000..3c28e3da0d374 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/left-semi-join.sql @@ -0,0 +1,24 @@ +-- A data set with repeated column values (every row itself is distinct) +CREATE OR REPLACE TEMPORARY VIEW duplicateColumnValueData AS SELECT * FROM VALUES +(1, 1), +(1, 2), +(2, 1), +(2, 2), +(3, 1), +(3, 2) +as duplicateRowData(a, b); + +-- left semi greater than predicate +SELECT * +FROM duplicateColumnValueData x LEFT SEMI JOIN duplicateColumnValueData y +ON x.a >= y.a + 2; + +-- left semi greater than predicate and equal operator #1 +SELECT * +FROM duplicateColumnValueData x LEFT SEMI JOIN duplicateColumnValueData y +ON x.b = y.b and x.a >= y.a + 2; + +-- left semi greater than predicate and equal operator #2 +SELECT * +FROM duplicateColumnValueData x LEFT SEMI JOIN duplicateColumnValueData y +ON x.b = y.a and x.a >= y.b + 1; diff --git a/sql/core/src/test/resources/sql-tests/inputs/using-join.sql b/sql/core/src/test/resources/sql-tests/inputs/using-join.sql new file mode 100644 index 0000000000000..aca6c153d5e50 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/using-join.sql @@ -0,0 +1,38 @@ +create temporary view ut1 as select * from values + ("r1c1", "r1c2", "t1r1c3"), + ("r2c1", "r2c2", "t1r2c3"), + ("r3c1x", "r3c2", "t1r3c3") + as ut1(c1, c2, c3); + +create temporary view ut2 as select * from values + ("r1c1", "r1c2", "t2r1c3"), + ("r2c1", "r2c2", "t2r2c3"), + ("r3c1y", "r3c2", "t2r3c3") + as ut2(c1, c2, c3); + +create temporary view ut3 as select * from values + (CAST(null as String), "r1c2", "t3r1c3"), + ("r2c1", "r2c2", "t3r2c3"), + ("r3c1y", "r3c2", "t3r3c3") + as ut3(c1, c2, c3); + +-- inner join with one using column +SELECT * FROM ut1 join ut2 using (c1); + +-- inner join with two using columns +SELECT * FROM ut1 join ut2 using (c1, c2); + +-- left outer join with one
using column. +SELECT * FROM ut1 left join ut2 using (c1); + +-- right outer join with one using column. +SELECT * FROM ut1 right join ut2 using (c1); + +-- full outer join with one using column. +SELECT * FROM ut1 full outer join ut2 using (c1); + +-- full outer join with null value in join column. +SELECT * FROM ut1 full outer join ut3 using (c1); + +-- self join with using columns. +SELECT * FROM ut1 join ut1 using (c1); diff --git a/sql/core/src/test/resources/sql-tests/results/left-semi-join.sql.out b/sql/core/src/test/resources/sql-tests/results/left-semi-join.sql.out new file mode 100644 index 0000000000000..e6abacdbf1491 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/left-semi-join.sql.out @@ -0,0 +1,52 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 4 + + +-- !query 0 +CREATE OR REPLACE TEMPORARY VIEW duplicateColumnValueData AS SELECT * FROM VALUES +(1, 1), +(1, 2), +(2, 1), +(2, 2), +(3, 1), +(3, 2) +as duplicateRowData(a, b) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT * +FROM duplicateColumnValueData x LEFT SEMI JOIN duplicateColumnValueData y +ON x.a >= y.a + 2 +-- !query 1 schema +struct<a:int,b:int> +-- !query 1 output +3 1 +3 2 + + +-- !query 2 +SELECT * +FROM duplicateColumnValueData x LEFT SEMI JOIN duplicateColumnValueData y +ON x.b = y.b and x.a >= y.a + 2 +-- !query 2 schema +struct<a:int,b:int> +-- !query 2 output +3 1 +3 2 + + +-- !query 3 +SELECT * +FROM duplicateColumnValueData x LEFT SEMI JOIN duplicateColumnValueData y +ON x.b = y.a and x.a >= y.b + 1 +-- !query 3 schema +struct<a:int,b:int> +-- !query 3 output +2 1 +2 2 +3 1 +3 2 diff --git a/sql/core/src/test/resources/sql-tests/results/using-join.sql.out b/sql/core/src/test/resources/sql-tests/results/using-join.sql.out new file mode 100644 index 0000000000000..cd63bb1ec0742 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/using-join.sql.out @@ -0,0 +1,109 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries:
10 + + +-- !query 0 +create temporary view ut1 as select * from values + ("r1c1", "r1c2", "t1r1c3"), + ("r2c1", "r2c2", "t1r2c3"), + ("r3c1x", "r3c2", "t1r3c3") + as ut1(c1, c2, c3) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +create temporary view ut2 as select * from values + ("r1c1", "r1c2", "t2r1c3"), + ("r2c1", "r2c2", "t2r2c3"), + ("r3c1y", "r3c2", "t2r3c3") + as ut2(c1, c2, c3) +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +create temporary view ut3 as select * from values + (CAST(null as String), "r1c2", "t3r1c3"), + ("r2c1", "r2c2", "t3r2c3"), + ("r3c1y", "r3c2", "t3r3c3") + as ut3(c1, c2, c3) +-- !query 2 schema +struct<> +-- !query 2 output + + + +-- !query 3 +SELECT * FROM ut1 join ut2 using (c1) +-- !query 3 schema +struct<c1:string,c2:string,c3:string,c2:string,c3:string> +-- !query 3 output +r1c1 r1c2 t1r1c3 r1c2 t2r1c3 +r2c1 r2c2 t1r2c3 r2c2 t2r2c3 + + +-- !query 4 +SELECT * FROM ut1 join ut2 using (c1, c2) +-- !query 4 schema +struct<c1:string,c2:string,c3:string,c3:string> +-- !query 4 output +r1c1 r1c2 t1r1c3 t2r1c3 +r2c1 r2c2 t1r2c3 t2r2c3 + + +-- !query 5 +SELECT * FROM ut1 left join ut2 using (c1) +-- !query 5 schema +struct<c1:string,c2:string,c3:string,c2:string,c3:string> +-- !query 5 output +r1c1 r1c2 t1r1c3 r1c2 t2r1c3 +r2c1 r2c2 t1r2c3 r2c2 t2r2c3 +r3c1x r3c2 t1r3c3 NULL NULL + + +-- !query 6 +SELECT * FROM ut1 right join ut2 using (c1) +-- !query 6 schema +struct<c1:string,c2:string,c3:string,c2:string,c3:string> +-- !query 6 output +r1c1 r1c2 t1r1c3 r1c2 t2r1c3 +r2c1 r2c2 t1r2c3 r2c2 t2r2c3 +r3c1y NULL NULL r3c2 t2r3c3 + + +-- !query 7 +SELECT * FROM ut1 full outer join ut2 using (c1) +-- !query 7 schema +struct<c1:string,c2:string,c3:string,c2:string,c3:string> +-- !query 7 output +r1c1 r1c2 t1r1c3 r1c2 t2r1c3 +r2c1 r2c2 t1r2c3 r2c2 t2r2c3 +r3c1x r3c2 t1r3c3 NULL NULL +r3c1y NULL NULL r3c2 t2r3c3 + + +-- !query 8 +SELECT * FROM ut1 full outer join ut3 using (c1) +-- !query 8 schema +struct<c1:string,c2:string,c3:string,c2:string,c3:string> +-- !query 8 output +NULL NULL NULL r1c2 t3r1c3 +r1c1 r1c2 t1r1c3 NULL NULL +r2c1 r2c2 t1r2c3 r2c2 t3r2c3 +r3c1x r3c2 t1r3c3 NULL NULL +r3c1y NULL NULL r3c2 t3r3c3 + + +-- !query 9 +SELECT * FROM ut1 join ut1 using (c1) +-- !query 9 schema
+struct<c1:string,c2:string,c3:string,c2:string,c3:string> +-- !query 9 output +r1c1 r1c2 t1r1c3 r1c2 t1r1c3 +r2c1 r2c2 t1r2c3 r2c2 t1r2c3 +r3c1x r3c2 t1r3c3 r3c2 t1r3c3 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 4abf5e42b9c34..5b2fec4343c96 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter, RightOuter} import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class DataFrameJoinSuite extends QueryTest with SharedSQLContext { @@ -226,6 +227,42 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext { ) } + test("cartesian product join") { + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { + checkAnswer( + testData3.join(testData3), + Row(1, null, 1, null) :: + Row(1, null, 2, 2) :: + Row(2, 2, 1, null) :: + Row(2, 2, 2, 2) :: Nil) + } + } + + test("SortMergeJoin returns wrong results when using UnsafeRows") { + // This test is for the fix of https://issues.apache.org/jira/browse/SPARK-10737. + // This bug will be triggered when Tungsten is enabled and there are multiple + // SortMergeJoin operators executed in the same task.
+ val confs = SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "1" :: Nil + withSQLConf(confs: _*) { + val df1 = (1 to 50).map(i => (s"str_$i", i)).toDF("i", "j") + val df2 = + df1 + .join(df1.select(df1("i")), "i") + .select(df1("i"), df1("j")) + + val df3 = df2.withColumnRenamed("i", "i1").withColumnRenamed("j", "j1") + val df4 = + df2 + .join(df3, df2("i") === df3("i1")) + .withColumn("diff", $"j" - $"j1") + .select(df2("i"), df2("j"), $"diff") + + checkAnswer( + df4, + df1.withColumn("diff", lit(0))) + } + } + test("SPARK-16991: Full outer join followed by inner join produces wrong results") { val a = Seq((1, 2), (2, 3)).toDF("a", "b") val b = Seq((2, 5), (3, 4)).toDF("a", "c") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index eac266cba55b8..7f8d68e3a0d6c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -445,27 +445,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Nil) } - test("left semi greater than predicate") { - withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { - checkAnswer( - sql("SELECT * FROM testData2 x LEFT SEMI JOIN testData2 y ON x.a >= y.a + 2"), - Seq(Row(3, 1), Row(3, 2)) - ) - } - } - - test("left semi greater than predicate and equal operator") { - checkAnswer( - sql("SELECT * FROM testData2 x LEFT SEMI JOIN testData2 y ON x.b = y.b and x.a >= y.a + 2"), - Seq(Row(3, 1), Row(3, 2)) - ) - - checkAnswer( - sql("SELECT * FROM testData2 x LEFT SEMI JOIN testData2 y ON x.b = y.a and x.a >= y.b + 1"), - Seq(Row(2, 1), Row(2, 2), Row(3, 1), Row(3, 2)) - ) - } - test("agg") { checkAnswer( sql("SELECT a, SUM(b) FROM testData2 GROUP BY a"), @@ -723,17 +702,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { row => Seq.fill(16)(Row.merge(row, row))).collect().toSeq) } - test("cartesian product join") { - 
withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { - checkAnswer( - testData3.join(testData3), - Row(1, null, 1, null) :: - Row(1, null, 2, 2) :: - Row(2, 2, 1, null) :: - Row(2, 2, 2, 2) :: Nil) - } - } - test("left outer join") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { checkAnswer( @@ -1679,31 +1647,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { "org.apache.spark.sql.execution.datasources.jdbc")) } - test("SortMergeJoin returns wrong results when using UnsafeRows") { - // This test is for the fix of https://issues.apache.org/jira/browse/SPARK-10737. - // This bug will be triggered when Tungsten is enabled and there are multiple - // SortMergeJoin operators executed in the same task. - val confs = SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "1" :: Nil - withSQLConf(confs: _*) { - val df1 = (1 to 50).map(i => (s"str_$i", i)).toDF("i", "j") - val df2 = - df1 - .join(df1.select(df1("i")), "i") - .select(df1("i"), df1("j")) - - val df3 = df2.withColumnRenamed("i", "i1").withColumnRenamed("j", "j1") - val df4 = - df2 - .join(df3, df2("i") === df3("i1")) - .withColumn("diff", $"j" - $"j1") - .select(df2("i"), df2("j"), $"diff") - - checkAnswer( - df4, - df1.withColumn("diff", lit(0))) - } - } - test("SPARK-11303: filter should not be pushed down into sample") { val df = spark.range(100) List(true, false).foreach { withReplacement => @@ -2203,70 +2146,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } - test("join with using clause") { - val df1 = Seq(("r1c1", "r1c2", "t1r1c3"), - ("r2c1", "r2c2", "t1r2c3"), ("r3c1x", "r3c2", "t1r3c3")).toDF("c1", "c2", "c3") - val df2 = Seq(("r1c1", "r1c2", "t2r1c3"), - ("r2c1", "r2c2", "t2r2c3"), ("r3c1y", "r3c2", "t2r3c3")).toDF("c1", "c2", "c3") - val df3 = Seq((null, "r1c2", "t3r1c3"), - ("r2c1", "r2c2", "t3r2c3"), ("r3c1y", "r3c2", "t3r3c3")).toDF("c1", "c2", "c3") - withTempView("t1", "t2", "t3") { - df1.createOrReplaceTempView("t1") - df2.createOrReplaceTempView("t2") - 
df3.createOrReplaceTempView("t3") - // inner join with one using column - checkAnswer( - sql("SELECT * FROM t1 join t2 using (c1)"), - Row("r1c1", "r1c2", "t1r1c3", "r1c2", "t2r1c3") :: - Row("r2c1", "r2c2", "t1r2c3", "r2c2", "t2r2c3") :: Nil) - - // inner join with two using columns - checkAnswer( - sql("SELECT * FROM t1 join t2 using (c1, c2)"), - Row("r1c1", "r1c2", "t1r1c3", "t2r1c3") :: - Row("r2c1", "r2c2", "t1r2c3", "t2r2c3") :: Nil) - - // Left outer join with one using column. - checkAnswer( - sql("SELECT * FROM t1 left join t2 using (c1)"), - Row("r1c1", "r1c2", "t1r1c3", "r1c2", "t2r1c3") :: - Row("r2c1", "r2c2", "t1r2c3", "r2c2", "t2r2c3") :: - Row("r3c1x", "r3c2", "t1r3c3", null, null) :: Nil) - - // Right outer join with one using column. - checkAnswer( - sql("SELECT * FROM t1 right join t2 using (c1)"), - Row("r1c1", "r1c2", "t1r1c3", "r1c2", "t2r1c3") :: - Row("r2c1", "r2c2", "t1r2c3", "r2c2", "t2r2c3") :: - Row("r3c1y", null, null, "r3c2", "t2r3c3") :: Nil) - - // Full outer join with one using column. - checkAnswer( - sql("SELECT * FROM t1 full outer join t2 using (c1)"), - Row("r1c1", "r1c2", "t1r1c3", "r1c2", "t2r1c3") :: - Row("r2c1", "r2c2", "t1r2c3", "r2c2", "t2r2c3") :: - Row("r3c1x", "r3c2", "t1r3c3", null, null) :: - Row("r3c1y", null, - null, "r3c2", "t2r3c3") :: Nil) - - // Full outer join with null value in join column. - checkAnswer( - sql("SELECT * FROM t1 full outer join t3 using (c1)"), - Row("r1c1", "r1c2", "t1r1c3", null, null) :: - Row("r2c1", "r2c2", "t1r2c3", "r2c2", "t3r2c3") :: - Row("r3c1x", "r3c2", "t1r3c3", null, null) :: - Row("r3c1y", null, null, "r3c2", "t3r3c3") :: - Row(null, null, null, "r1c2", "t3r1c3") :: Nil) - - // Self join with using columns. 
- checkAnswer( - sql("SELECT * FROM t1 join t1 using (c1)"), - Row("r1c1", "r1c2", "t1r1c3", "r1c2", "t1r1c3") :: - Row("r2c1", "r2c2", "t1r2c3", "r2c2", "t1r2c3") :: - Row("r3c1x", "r3c2", "t1r3c3", "r3c2", "t1r3c3") :: Nil) - } - } - test("SPARK-15327: fail to compile generated code with complex data structure") { withTempDir{ dir => val json =