From 440f76bdbf4d720a361e0afde3599027ff6e7be2 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 14 Jan 2018 00:20:11 -0800 Subject: [PATCH 1/4] [SPARK-21783][SQL] Turn on ORC filter push-down by default --- .../apache/spark/sql/internal/SQLConf.scala | 2 +- .../spark/sql/FilterPushdownBenchmark.scala | 195 ++++++++++++++++++ 2 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 6746fbcaf2483..16fbb0c3e9e21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -410,7 +410,7 @@ object SQLConf { val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val HIVE_VERIFY_PARTITION_PATH = buildConf("spark.sql.hive.verifyPartitionPath") .doc("When true, check all the partition paths under the table\'s root directory " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala new file mode 100644 index 0000000000000..44770a02de086 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.io.File + +import scala.util.{Random, Try} + +import org.apache.spark.SparkConf +import org.apache.spark.sql.functions.monotonically_increasing_id +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.util.{Benchmark, Utils} + + +/** + * Benchmark to measure read performance with Filter pushdown. + */ +// scalastyle:off line.size.limit +object FilterPushdownBenchmark { + val conf = new SparkConf() + conf.set("orc.compression", "snappy") + conf.set("spark.sql.parquet.compression.codec", "snappy") + + private val spark = SparkSession.builder() + .master("local[1]") + .appName("FilterPushdownBenchmark") + .config(conf) + .getOrCreate() + + // Set default configs. Individual cases will change them if necessary. + spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") + spark.conf.set(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key, "true") + + def withTempPath(f: File => Unit): Unit = { + val path = Utils.createTempDir() + path.delete() + try f(path) finally Utils.deleteRecursively(path) + } + + def withTempTable(tableNames: String*)(f: => Unit): Unit = { + try f finally tableNames.foreach(spark.catalog.dropTempView) + } + + def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val (keys, values) = pairs.unzip + val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) + (keys, values).zipped.foreach(spark.conf.set) + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } + } + } + + private def prepareTable(dir: File, df: DataFrame): Unit = { + val dirORC = dir.getCanonicalPath + "/orc" + val dirParquet = dir.getCanonicalPath + "/parquet" + + df.write.mode("overwrite").orc(dirORC) + df.write.mode("overwrite").parquet(dirParquet) + + spark.read.orc(dirORC).createOrReplaceTempView("orcTable") + spark.read.parquet(dirParquet).createOrReplaceTempView("parquetTable") + } + + def filterPushDownBenchmark(values: Int, width: Int, expr: String): Unit = { + val benchmark = new Benchmark(s"Filter Pushdown ($expr)", values) + + withTempPath { dir => + withTempTable("t1", "orcTable", "patquetTable") { + import spark.implicits._ + val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i") + val df = spark.range(values).map(_ => Random.nextLong).selectExpr(selectExpr: _*) + .withColumn("id", monotonically_increasing_id()) + + df.createOrReplaceTempView("t1") + prepareTable(dir, spark.sql("SELECT * FROM t1")) + + Seq(false, true).foreach { value => + benchmark.addCase(s"Parquet Vectorized ${if (value) s"(Pushdown)" else ""}") { _ => + withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> s"$value") { + spark.sql(s"SELECT * FROM parquetTable WHERE $expr").collect() + } + } + } + + Seq(false, true).foreach { value => + benchmark.addCase(s"Native ORC Vectorized ${if (value) s"(Pushdown)" else ""}") { _ => + withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$value") { + spark.sql(s"SELECT * FROM orcTable WHERE $expr").collect() + } + } + } + + // Positive cases: Select one or no rows + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Filter Pushdown (id = 0): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Parquet Vectorized 2267 / 2287 0.5 2162.0 1.0X + Parquet Vectorized (Pushdown) 735 / 803 1.4 701.1 3.1X + Native ORC Vectorized 1708 / 1718 0.6 1629.1 1.3X + Native ORC Vectorized (Pushdown) 83 / 88 12.7 79.0 27.4X + + Filter Pushdown (id == 0): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Parquet Vectorized 2005 / 2123 0.5 1911.7 1.0X + Parquet Vectorized (Pushdown) 701 / 773 1.5 668.1 2.9X + Native ORC Vectorized 1618 / 1632 0.6 1543.3 1.2X + Native ORC Vectorized (Pushdown) 77 / 80 13.6 73.6 26.0X + + Filter Pushdown (id <= 0): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Parquet Vectorized 2085 / 2165 0.5 1988.0 1.0X + Parquet Vectorized (Pushdown) 704 / 769 1.5 671.1 3.0X + Native ORC Vectorized 1637 / 1638 0.6 1561.1 1.3X + Native ORC Vectorized (Pushdown) 76 / 79 13.8 72.4 27.4X + + Filter Pushdown (id < 1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Parquet Vectorized 2069 / 2133 0.5 1972.7 1.0X + Parquet Vectorized (Pushdown) 705 / 764 1.5 672.7 2.9X + Native ORC Vectorized 1637 / 1651 0.6 1561.3 1.3X + Native ORC Vectorized (Pushdown) 75 / 77 14.0 71.4 27.6X + + Filter Pushdown (id IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Parquet Vectorized 2081 / 2123 0.5 1984.4 1.0X + Parquet Vectorized (Pushdown) 36 / 37 29.3 34.1 58.1X + Native ORC Vectorized 1616 / 1645 0.6 1540.7 1.3X + Native ORC Vectorized (Pushdown) 41 / 43 25.7 39.0 50.9X + */ + + // Negative cases: Select all rows which means the predicate is always true. + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Filter Pushdown (id > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Parquet Vectorized 8346 / 8516 0.1 7959.8 1.0X + Parquet Vectorized (Pushdown) 8611 / 8630 0.1 8212.4 1.0X + Native ORC Vectorized 7700 / 7940 0.1 7343.2 1.1X + Native ORC Vectorized (Pushdown) 7572 / 7635 0.1 7221.5 1.1X + + Filter Pushdown (id != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Parquet Vectorized 8088 / 8297 0.1 7713.2 1.0X + Parquet Vectorized (Pushdown) 7110 / 8674 0.1 6780.8 1.1X + Native ORC Vectorized 7430 / 7567 0.1 7086.0 1.1X + Native ORC Vectorized (Pushdown) 7739 / 7832 0.1 7380.9 1.0X + + Filter Pushdown (id IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Parquet Vectorized 7927 / 8284 0.1 7560.3 1.0X + Parquet Vectorized (Pushdown) 7329 / 7332 0.1 6989.6 1.1X + Native ORC Vectorized 7928 / 7971 0.1 7560.5 1.0X + Native ORC Vectorized (Pushdown) 7392 / 7502 0.1 7049.9 1.1X + */ + benchmark.run() + } + } + } + + def main(args: Array[String]): Unit = { + // Positive cases: Select one or no rows + Seq("id = 0", "id == 0", "id <= 0", "id < 1", "id IS NULL").foreach { expr => + filterPushDownBenchmark(1024 * 1024 * 1, 20, expr) + } + + // Negative cases: Select all rows which means the predicate is always true. + Seq("id > -1", "id != -1", "id IS NOT NULL").foreach { expr => + filterPushDownBenchmark(1024 * 1024 * 1, 20, expr) + } + } +} +// scalastyle:on line.size.limit From 87af693a82f9591a256c55a5eca65041f330a225 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 16 Jan 2018 13:13:29 -0800 Subject: [PATCH 2/4] Address comments --- .../spark/sql/FilterPushdownBenchmark.scala | 249 ++++++++++-------- 1 file changed, 142 insertions(+), 107 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala index 44770a02de086..ef7f2fe277e9c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala @@ -30,7 +30,6 @@ import org.apache.spark.util.{Benchmark, Utils} /** * Benchmark to measure read performance with Filter pushdown. */ -// scalastyle:off line.size.limit object FilterPushdownBenchmark { val conf = new SparkConf() conf.set("orc.compression", "snappy") @@ -42,10 +41,6 @@ object FilterPushdownBenchmark { .config(conf) .getOrCreate() - // Set default configs. Individual cases will change them if necessary. - spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") - spark.conf.set(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key, "true") - def withTempPath(f: File => Unit): Unit = { val path = Utils.createTempDir() path.delete() @@ -68,7 +63,12 @@ object FilterPushdownBenchmark { } } - private def prepareTable(dir: File, df: DataFrame): Unit = { + private def prepareTable(dir: File, numRows: Int, width: Int): Unit = { + import spark.implicits._ + val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i") + val df = spark.range(numRows).map(_ => Random.nextLong).selectExpr(selectExpr: _*) + .withColumn("id", monotonically_increasing_id()) + val dirORC = dir.getCanonicalPath + "/orc" val dirParquet = dir.getCanonicalPath + "/parquet" @@ -79,117 +79,152 @@ object FilterPushdownBenchmark { spark.read.parquet(dirParquet).createOrReplaceTempView("parquetTable") } - def filterPushDownBenchmark(values: Int, width: Int, expr: String): Unit = { - val benchmark = new Benchmark(s"Filter Pushdown ($expr)", values) + def filterPushDownBenchmark(values: Int, title: String, expr: String): Unit = { + val benchmark = new Benchmark(title, values, minNumIters = 5) - withTempPath { dir => - withTempTable("t1", "orcTable", "patquetTable") { - import spark.implicits._ - val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i") - val df = spark.range(values).map(_ => Random.nextLong).selectExpr(selectExpr: _*) - .withColumn("id", monotonically_increasing_id()) - - df.createOrReplaceTempView("t1") - prepareTable(dir, spark.sql("SELECT * FROM t1")) - - Seq(false, true).foreach { value => - benchmark.addCase(s"Parquet Vectorized ${if (value) s"(Pushdown)" else ""}") { _ => - withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> s"$value") { - spark.sql(s"SELECT * FROM parquetTable WHERE $expr").collect() - } - } + Seq(false, true).foreach { pushDownEnabled => + val name = s"Parquet Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" + benchmark.addCase(name) { _ => + withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { + spark.sql(s"SELECT * FROM parquetTable WHERE $expr").collect() } + } + } - Seq(false, true).foreach { value => - benchmark.addCase(s"Native ORC Vectorized ${if (value) s"(Pushdown)" else ""}") { _ => - withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$value") { - spark.sql(s"SELECT * FROM orcTable WHERE $expr").collect() - } - } + Seq(false, true).foreach { pushDownEnabled => + val name = s"Native ORC Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" + benchmark.addCase(name) { _ => + withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { + spark.sql(s"SELECT * FROM orcTable WHERE $expr").collect() } - - // Positive cases: Select one or no rows - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz - - Filter Pushdown (id = 0): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 2267 / 2287 0.5 2162.0 1.0X - Parquet Vectorized (Pushdown) 735 / 803 1.4 701.1 3.1X - Native ORC Vectorized 1708 / 1718 0.6 1629.1 1.3X - Native ORC Vectorized (Pushdown) 83 / 88 12.7 79.0 27.4X - - Filter Pushdown (id == 0): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 2005 / 2123 0.5 1911.7 1.0X - Parquet Vectorized (Pushdown) 701 / 773 1.5 668.1 2.9X - Native ORC Vectorized 1618 / 1632 0.6 1543.3 1.2X - Native ORC Vectorized (Pushdown) 77 / 80 13.6 73.6 26.0X - - Filter Pushdown (id <= 0): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 2085 / 2165 0.5 1988.0 1.0X - Parquet Vectorized (Pushdown) 704 / 769 1.5 671.1 3.0X - Native ORC Vectorized 1637 / 1638 0.6 1561.1 1.3X - Native ORC Vectorized (Pushdown) 76 / 79 13.8 72.4 27.4X - - Filter Pushdown (id < 1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 2069 / 2133 0.5 1972.7 1.0X - Parquet Vectorized (Pushdown) 705 / 764 1.5 672.7 2.9X - Native ORC Vectorized 1637 / 1651 0.6 1561.3 1.3X - Native ORC Vectorized (Pushdown) 75 / 77 14.0 71.4 27.6X - - Filter Pushdown (id IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 2081 / 2123 0.5 1984.4 1.0X - Parquet Vectorized (Pushdown) 36 / 37 29.3 34.1 58.1X - Native ORC Vectorized 1616 / 1645 0.6 1540.7 1.3X - Native ORC Vectorized (Pushdown) 41 / 43 25.7 39.0 50.9X - */ - - // Negative cases: Select all rows which means the predicate is always true. - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz - - Filter Pushdown (id > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8346 / 8516 0.1 7959.8 1.0X - Parquet Vectorized (Pushdown) 8611 / 8630 0.1 8212.4 1.0X - Native ORC Vectorized 7700 / 7940 0.1 7343.2 1.1X - Native ORC Vectorized (Pushdown) 7572 / 7635 0.1 7221.5 1.1X - - Filter Pushdown (id != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8088 / 8297 0.1 7713.2 1.0X - Parquet Vectorized (Pushdown) 7110 / 8674 0.1 6780.8 1.1X - Native ORC Vectorized 7430 / 7567 0.1 7086.0 1.1X - Native ORC Vectorized (Pushdown) 7739 / 7832 0.1 7380.9 1.0X - - Filter Pushdown (id IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 7927 / 8284 0.1 7560.3 1.0X - Parquet Vectorized (Pushdown) 7329 / 7332 0.1 6989.6 1.1X - Native ORC Vectorized 7928 / 7971 0.1 7560.5 1.0X - Native ORC Vectorized (Pushdown) 7392 / 7502 0.1 7049.9 1.1X - */ - benchmark.run() } } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Select 0 row (id IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 2091 / 2258 0.5 1993.9 1.0X + Parquet Vectorized (Pushdown) 41 / 44 25.6 39.0 51.1X + Native ORC Vectorized 1625 / 1648 0.6 1549.6 1.3X + Native ORC Vectorized (Pushdown) 45 / 47 23.5 42.5 46.9X + + Select 0 row (524288 < id < 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 2202 / 2294 0.5 2099.7 1.0X + Parquet Vectorized (Pushdown) 734 / 844 1.4 699.9 3.0X + Native ORC Vectorized 1632 / 1659 0.6 1556.0 1.3X + Native ORC Vectorized (Pushdown) 94 / 98 11.2 89.6 23.4X + + Select 1 row (id = 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 2113 / 2160 0.5 2015.3 1.0X + Parquet Vectorized (Pushdown) 711 / 790 1.5 677.7 3.0X + Native ORC Vectorized 1612 / 1657 0.7 1537.2 1.3X + Native ORC Vectorized (Pushdown) 92 / 95 11.4 87.7 23.0X + + Select 1 row (id <=> 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 2105 / 2149 0.5 2007.9 1.0X + Parquet Vectorized (Pushdown) 712 / 794 1.5 679.2 3.0X + Native ORC Vectorized 1619 / 1655 0.6 1543.7 1.3X + Native ORC Vectorized (Pushdown) 90 / 93 11.6 85.9 23.4X + + Select 1 row (524288 <= id <= 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 2081 / 2120 0.5 1984.8 1.0X + Parquet Vectorized (Pushdown) 700 / 793 1.5 667.5 3.0X + Native ORC Vectorized 1618 / 1653 0.6 1542.7 1.3X + Native ORC Vectorized (Pushdown) 91 / 94 11.5 86.6 22.9X + + Select 1 row (524287 < id < 524289): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 2094 / 2127 0.5 1997.3 1.0X + Parquet Vectorized (Pushdown) 714 / 792 1.5 680.8 2.9X + Native ORC Vectorized 1621 / 1644 0.6 1546.3 1.3X + Native ORC Vectorized (Pushdown) 90 / 94 11.6 86.1 23.2X + + Select 10% rows (id < 104857): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 2498 / 2591 0.4 2381.9 1.0X + Parquet Vectorized (Pushdown) 1047 / 1082 1.0 998.2 2.4X + Native ORC Vectorized 1986 / 2119 0.5 1893.8 1.3X + Native ORC Vectorized (Pushdown) 552 / 582 1.9 526.1 4.5X + + Select 50% rows (id < 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 4321 / 5021 0.2 4121.3 1.0X + Parquet Vectorized (Pushdown) 3967 / 4183 0.3 3783.6 1.1X + Native ORC Vectorized 4107 / 4565 0.3 3916.9 1.1X + Native ORC Vectorized (Pushdown) 2983 / 3861 0.4 2844.5 1.4X + + Select 90% rows (id < 943718): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 6815 / 7287 0.2 6499.0 1.0X + Parquet Vectorized (Pushdown) 6891 / 7220 0.2 6571.5 1.0X + Native ORC Vectorized 7337 / 7565 0.1 6997.1 0.9X + Native ORC Vectorized (Pushdown) 7274 / 7523 0.1 6936.6 0.9X + + Select all rows (id IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7321 / 7380 0.1 6981.5 1.0X + Parquet Vectorized (Pushdown) 7352 / 7398 0.1 7011.2 1.0X + Native ORC Vectorized 7386 / 7660 0.1 7043.9 1.0X + Native ORC Vectorized (Pushdown) 7629 / 7705 0.1 7275.9 1.0X + + Select all rows (id > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7125 / 7384 0.1 6795.2 1.0X + Parquet Vectorized (Pushdown) 7334 / 7390 0.1 6994.3 1.0X + Native ORC Vectorized 7517 / 7642 0.1 7168.7 0.9X + Native ORC Vectorized (Pushdown) 7323 / 7601 0.1 6983.7 1.0X + + Select all rows (id != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7281 / 7850 0.1 6944.0 1.0X + Parquet Vectorized (Pushdown) 7311 / 7939 0.1 6972.7 1.0X + Native ORC Vectorized 7530 / 7748 0.1 7181.4 1.0X + Native ORC Vectorized (Pushdown) 7309 / 7667 0.1 6970.2 1.0X + */ + benchmark.run() } def main(args: Array[String]): Unit = { - // Positive cases: Select one or no rows - Seq("id = 0", "id == 0", "id <= 0", "id < 1", "id IS NULL").foreach { expr => - filterPushDownBenchmark(1024 * 1024 * 1, 20, expr) - } + val numRows = 1024 * 1024 + val width = 20 + val mid = numRows / 2 + + withTempPath { dir => + withTempTable("orcTable", "patquetTable") { + prepareTable(dir, numRows, width) + + Seq("id IS NULL", s"$mid < id AND id < $mid").foreach { expr => + val title = s"Select 0 row ($expr)".replace("id AND id", "id") + filterPushDownBenchmark(numRows, title, expr) + } + + Seq( + s"id = $mid", + s"id <=> $mid", + s"$mid <= id AND id <= $mid", + s"${mid - 1} < id AND id < ${mid + 1}" + ).foreach { expr => + val title = s"Select 1 row ($expr)".replace("id AND id", "id") + filterPushDownBenchmark(numRows, title, expr) + } - // Negative cases: Select all rows which means the predicate is always true. - Seq("id > -1", "id != -1", "id IS NOT NULL").foreach { expr => - filterPushDownBenchmark(1024 * 1024 * 1, 20, expr) + Seq(10, 50, 90).foreach { percent => + filterPushDownBenchmark(numRows, + s"Select $percent% rows (id < ${numRows * percent / 100})", + s"id < ${numRows * percent / 100}") + } + + Seq("id IS NOT NULL", "id > -1", "id != -1").foreach { expr => + filterPushDownBenchmark(numRows, s"Select all rows ($expr)", expr) + } + } } } } -// scalastyle:on line.size.limit From a5561697314527938a0cff085be33b215a746c4a Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 17 Jan 2018 00:10:05 -0800 Subject: [PATCH 3/4] Increase the number of rows and reduce the number of columns --- .../spark/sql/FilterPushdownBenchmark.scala | 155 ++++++++++-------- 1 file changed, 84 insertions(+), 71 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala index ef7f2fe277e9c..94fb06dc6bd11 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala @@ -79,14 +79,18 @@ object FilterPushdownBenchmark { spark.read.parquet(dirParquet).createOrReplaceTempView("parquetTable") } - def filterPushDownBenchmark(values: Int, title: String, expr: String): Unit = { + def filterPushDownBenchmark( + values: Int, + title: String, + whereExpr: String, + selectExpr: String = "*"): Unit = { val benchmark = new Benchmark(title, values, minNumIters = 5) Seq(false, true).foreach { pushDownEnabled => val name = s"Parquet Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" benchmark.addCase(name) { _ => withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { - spark.sql(s"SELECT * FROM parquetTable WHERE $expr").collect() + spark.sql(s"SELECT $selectExpr FROM parquetTable WHERE $whereExpr").collect() } } } @@ -95,7 +99,7 @@ object FilterPushdownBenchmark { val name = s"Native ORC Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" benchmark.addCase(name) { _ => withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { - spark.sql(s"SELECT * FROM orcTable WHERE $expr").collect() + spark.sql(s"SELECT $selectExpr FROM orcTable WHERE $whereExpr").collect() } } } @@ -106,103 +110,103 @@ object FilterPushdownBenchmark { Select 0 row (id IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 2091 / 2258 0.5 1993.9 1.0X - Parquet Vectorized (Pushdown) 41 / 44 25.6 39.0 51.1X - Native ORC Vectorized 1625 / 1648 0.6 1549.6 1.3X - Native ORC Vectorized (Pushdown) 45 / 47 23.5 42.5 46.9X + Parquet Vectorized 7906 / 7955 2.0 502.6 1.0X + Parquet Vectorized (Pushdown) 56 / 60 281.1 3.6 141.3X + Native ORC Vectorized 5655 / 5700 2.8 359.5 1.4X + Native ORC Vectorized (Pushdown) 68 / 71 233.0 4.3 117.1X - Select 0 row (524288 < id < 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Select 0 row (7864320 < id < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 2202 / 2294 0.5 2099.7 1.0X - Parquet Vectorized (Pushdown) 734 / 844 1.4 699.9 3.0X - Native ORC Vectorized 1632 / 1659 0.6 1556.0 1.3X - Native ORC Vectorized (Pushdown) 94 / 98 11.2 89.6 23.4X + Parquet Vectorized 7891 / 7922 2.0 501.7 1.0X + Parquet Vectorized (Pushdown) 746 / 769 21.1 47.5 10.6X + Native ORC Vectorized 5645 / 5686 2.8 358.9 1.4X + Native ORC Vectorized (Pushdown) 82 / 84 192.9 5.2 96.8X - Select 1 row (id = 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Select 1 row (id = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 2113 / 2160 0.5 2015.3 1.0X - Parquet Vectorized (Pushdown) 711 / 790 1.5 677.7 3.0X - Native ORC Vectorized 1612 / 1657 0.7 1537.2 1.3X - Native ORC Vectorized (Pushdown) 92 / 95 11.4 87.7 23.0X + Parquet Vectorized 7963 / 8069 2.0 506.3 1.0X + Parquet Vectorized (Pushdown) 752 / 778 20.9 47.8 10.6X + Native ORC Vectorized 5726 / 5789 2.7 364.1 1.4X + Native ORC Vectorized (Pushdown) 78 / 81 201.4 5.0 102.0X - Select 1 row (id <=> 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Select 1 row (id <=> 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 2105 / 2149 0.5 2007.9 1.0X - Parquet Vectorized (Pushdown) 712 / 794 1.5 679.2 3.0X - Native ORC Vectorized 1619 / 1655 0.6 1543.7 1.3X - Native ORC Vectorized (Pushdown) 90 / 93 11.6 85.9 23.4X + Parquet Vectorized 7983 / 8015 2.0 507.5 1.0X + Parquet Vectorized (Pushdown) 753 / 774 20.9 47.9 10.6X + Native ORC Vectorized 5772 / 5814 2.7 367.0 1.4X + Native ORC Vectorized (Pushdown) 76 / 78 207.3 4.8 105.2X - Select 1 row (524288 <= id <= 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Select 1 row (7864320 <= id <= 7864320):Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 2081 / 2120 0.5 1984.8 1.0X - Parquet Vectorized (Pushdown) 700 / 793 1.5 667.5 3.0X - Native ORC Vectorized 1618 / 1653 0.6 1542.7 1.3X - Native ORC Vectorized (Pushdown) 91 / 94 11.5 86.6 22.9X + Parquet Vectorized 7929 / 7999 2.0 504.1 1.0X + Parquet Vectorized (Pushdown) 747 / 770 21.1 47.5 10.6X + Native ORC Vectorized 5756 / 5810 2.7 366.0 1.4X + Native ORC Vectorized (Pushdown) 76 / 79 206.4 4.8 104.0X - Select 1 row (524287 < id < 524289): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Select 1 row (7864319 < id < 7864321): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 2094 / 2127 0.5 1997.3 1.0X - Parquet Vectorized (Pushdown) 714 / 792 1.5 680.8 2.9X - Native ORC Vectorized 1621 / 1644 0.6 1546.3 1.3X - Native ORC Vectorized (Pushdown) 90 / 94 11.6 86.1 23.2X + Parquet Vectorized 7968 / 8027 2.0 506.6 1.0X + Parquet Vectorized (Pushdown) 750 / 771 21.0 47.7 10.6X + Native ORC Vectorized 5776 / 5811 2.7 367.2 1.4X + Native ORC Vectorized (Pushdown) 75 / 78 208.5 4.8 105.6X - Select 10% rows (id < 104857): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Select 10% rows (id < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 2498 / 2591 0.4 2381.9 1.0X - Parquet Vectorized (Pushdown) 1047 / 1082 1.0 998.2 2.4X - Native ORC Vectorized 1986 / 2119 0.5 1893.8 1.3X - Native ORC Vectorized (Pushdown) 552 / 582 1.9 526.1 4.5X + Parquet Vectorized 8156 / 8257 1.9 518.5 1.0X + Parquet Vectorized (Pushdown) 1620 / 1684 9.7 103.0 5.0X + Native ORC Vectorized 5951 / 5990 2.6 378.3 1.4X + Native ORC Vectorized (Pushdown) 803 / 810 19.6 51.0 10.2X - Select 50% rows (id < 524288): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Select 50% rows (id < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 4321 / 5021 0.2 4121.3 1.0X - Parquet Vectorized (Pushdown) 3967 / 4183 0.3 3783.6 1.1X - Native ORC Vectorized 4107 / 4565 0.3 3916.9 1.1X - Native ORC Vectorized (Pushdown) 2983 / 3861 0.4 2844.5 1.4X + Parquet Vectorized 8690 / 8717 1.8 552.5 1.0X + Parquet Vectorized (Pushdown) 5067 / 5099 3.1 322.2 1.7X + Native ORC Vectorized 6530 / 6552 2.4 415.1 1.3X + Native ORC Vectorized (Pushdown) 3630 / 3670 4.3 230.8 2.4X - Select 90% rows (id < 943718): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Select 90% rows (id < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 6815 / 7287 0.2 6499.0 1.0X - Parquet Vectorized (Pushdown) 6891 / 7220 0.2 6571.5 1.0X - Native ORC Vectorized 7337 / 7565 0.1 6997.1 0.9X - Native ORC Vectorized (Pushdown) 7274 / 7523 0.1 6936.6 0.9X + Parquet Vectorized 9241 / 9293 1.7 587.5 1.0X + Parquet Vectorized (Pushdown) 8474 / 8505 1.9 538.8 1.1X + Native ORC Vectorized 7080 / 7107 2.2 450.1 1.3X + Native ORC Vectorized (Pushdown) 6507 / 6552 2.4 413.7 1.4X Select all rows (id IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7321 / 7380 0.1 6981.5 1.0X - Parquet Vectorized (Pushdown) 7352 / 7398 0.1 7011.2 1.0X - Native ORC Vectorized 7386 / 7660 0.1 7043.9 1.0X - Native ORC Vectorized (Pushdown) 7629 / 7705 0.1 7275.9 1.0X + Parquet Vectorized 9317 / 9366 1.7 592.4 1.0X + Parquet Vectorized (Pushdown) 9316 / 9367 1.7 592.3 1.0X + Native ORC Vectorized 7148 / 7210 2.2 454.5 1.3X + Native ORC Vectorized (Pushdown) 7092 / 7152 2.2 450.9 1.3X Select all rows (id > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7125 / 7384 0.1 6795.2 1.0X - Parquet Vectorized (Pushdown) 7334 / 7390 0.1 6994.3 1.0X - Native ORC Vectorized 7517 / 7642 0.1 7168.7 0.9X - Native ORC Vectorized (Pushdown) 7323 / 7601 0.1 6983.7 1.0X + Parquet Vectorized 9307 / 9353 1.7 591.7 1.0X + Parquet Vectorized (Pushdown) 9303 / 9340 1.7 591.5 1.0X + Native ORC Vectorized 7192 / 7249 2.2 457.2 1.3X + Native ORC Vectorized (Pushdown) 7182 / 7216 2.2 456.6 1.3X Select all rows (id != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7281 / 7850 0.1 6944.0 1.0X - Parquet Vectorized (Pushdown) 7311 / 7939 0.1 6972.7 1.0X - Native ORC Vectorized 7530 / 7748 0.1 7181.4 1.0X - Native ORC Vectorized (Pushdown) 7309 / 7667 0.1 6970.2 1.0X + Parquet Vectorized 9145 / 9328 1.7 581.4 1.0X + Parquet Vectorized (Pushdown) 9320 / 9368 1.7 592.5 1.0X + Native ORC Vectorized 7202 / 7230 2.2 457.9 1.3X + Native ORC Vectorized (Pushdown) 7170 / 7206 2.2 455.9 1.3X */ benchmark.run() } def main(args: Array[String]): Unit = { - val numRows = 1024 * 1024 - val width = 20 + val numRows = 1024 * 1024 * 15 + val width = 5 val mid = numRows / 2 withTempPath { dir => withTempTable("orcTable", "patquetTable") { prepareTable(dir, numRows, width) - Seq("id IS NULL", s"$mid < id AND id < $mid").foreach { expr => - val title = s"Select 0 row ($expr)".replace("id AND id", "id") - filterPushDownBenchmark(numRows, title, expr) + Seq("id IS NULL", s"$mid < id AND id < $mid").foreach { whereExpr => + val title = s"Select 0 row ($whereExpr)".replace("id AND id", "id") + filterPushDownBenchmark(numRows, title, whereExpr) } Seq( @@ -210,19 +214,28 @@ object FilterPushdownBenchmark { s"id <=> $mid", s"$mid <= id AND id <= $mid", s"${mid - 1} < id AND id < ${mid + 1}" - ).foreach { expr => - val title = s"Select 1 row ($expr)".replace("id AND id", "id") - filterPushDownBenchmark(numRows, title, expr) + ).foreach { whereExpr => + val title = s"Select 1 row ($whereExpr)".replace("id AND id", "id") + filterPushDownBenchmark(numRows, title, whereExpr) } + val selectExpr = (1 to width).map(i => s"LENGTH(c$i)").mkString("SUM(", "+", ")") + Seq(10, 50, 90).foreach { percent => - filterPushDownBenchmark(numRows, + filterPushDownBenchmark( + numRows, s"Select $percent% rows (id < ${numRows * percent / 100})", - s"id < ${numRows * percent / 100}") + s"id < ${numRows * percent / 100}", + selectExpr + ) } - Seq("id IS NOT NULL", "id > -1", "id != -1").foreach { expr => - filterPushDownBenchmark(numRows, s"Select all rows ($expr)", expr) + Seq("id IS NOT NULL", "id > -1", "id != -1").foreach { whereExpr => + filterPushDownBenchmark( + numRows, + s"Select all rows ($whereExpr)", + whereExpr, + selectExpr) } } } From eb7035defe225c53ac8e43d63d6e3e6a974f4b1c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 17 Jan 2018 01:17:29 -0800 Subject: [PATCH 4/4] Address comments --- .../spark/sql/FilterPushdownBenchmark.scala | 98 +++++++++---------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala index 94fb06dc6bd11..c6dd7dadc9d93 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala @@ -110,87 +110,87 @@ object FilterPushdownBenchmark { Select 0 row (id IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7906 / 7955 2.0 502.6 1.0X - Parquet Vectorized (Pushdown) 56 / 60 281.1 3.6 141.3X - Native ORC Vectorized 5655 / 5700 2.8 359.5 1.4X - Native ORC Vectorized (Pushdown) 68 / 71 233.0 4.3 117.1X + Parquet Vectorized 7882 / 7957 2.0 501.1 1.0X + Parquet Vectorized (Pushdown) 55 / 60 285.2 3.5 142.9X + Native ORC Vectorized 5592 / 5627 2.8 355.5 1.4X + Native ORC Vectorized (Pushdown) 66 / 70 237.2 4.2 118.9X Select 0 row (7864320 < id < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7891 / 7922 2.0 501.7 1.0X - Parquet Vectorized (Pushdown) 746 / 769 21.1 47.5 10.6X - Native ORC Vectorized 5645 / 5686 2.8 358.9 1.4X - Native ORC Vectorized (Pushdown) 82 / 84 192.9 5.2 96.8X + Parquet Vectorized 7884 / 7909 2.0 501.2 1.0X + Parquet Vectorized (Pushdown) 739 / 752 21.3 47.0 10.7X + Native ORC Vectorized 5614 / 5646 2.8 356.9 1.4X + Native ORC Vectorized (Pushdown) 81 / 83 195.2 5.1 97.8X Select 1 row (id = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7963 / 8069 2.0 506.3 1.0X - Parquet Vectorized (Pushdown) 752 / 778 20.9 47.8 10.6X - Native ORC Vectorized 5726 / 5789 2.7 364.1 1.4X - Native ORC Vectorized (Pushdown) 78 / 81 201.4 5.0 102.0X + Parquet Vectorized 7905 / 8027 2.0 502.6 1.0X + Parquet Vectorized (Pushdown) 740 / 766 21.2 47.1 10.7X + Native ORC Vectorized 5684 / 5738 2.8 361.4 1.4X + Native ORC Vectorized (Pushdown) 78 / 81 202.4 4.9 101.7X Select 1 row (id <=> 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7983 / 8015 2.0 507.5 1.0X - Parquet Vectorized (Pushdown) 753 / 774 20.9 47.9 10.6X - Native ORC Vectorized 5772 / 5814 2.7 367.0 1.4X - Native ORC Vectorized (Pushdown) 76 / 78 207.3 4.8 105.2X + Parquet Vectorized 7928 / 7993 2.0 504.1 1.0X + Parquet Vectorized (Pushdown) 747 / 772 21.0 47.5 10.6X + Native ORC Vectorized 5728 / 5753 2.7 364.2 1.4X + Native ORC Vectorized (Pushdown) 76 / 78 207.9 4.8 104.8X Select 1 row (7864320 <= id <= 7864320):Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7929 / 7999 2.0 504.1 1.0X - Parquet Vectorized (Pushdown) 747 / 770 21.1 47.5 10.6X - Native ORC Vectorized 5756 / 5810 2.7 366.0 1.4X - Native ORC Vectorized (Pushdown) 76 / 79 206.4 4.8 104.0X + Parquet Vectorized 7939 / 8021 2.0 504.8 1.0X + Parquet Vectorized (Pushdown) 746 / 770 21.1 47.4 10.6X + Native ORC Vectorized 5690 / 5734 2.8 361.7 1.4X + Native ORC Vectorized (Pushdown) 76 / 79 206.7 4.8 104.3X Select 1 row (7864319 < id < 7864321): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 7968 / 8027 2.0 506.6 1.0X - Parquet Vectorized (Pushdown) 750 / 771 21.0 47.7 10.6X - Native ORC Vectorized 5776 / 5811 2.7 367.2 1.4X - Native ORC Vectorized (Pushdown) 75 / 78 208.5 4.8 105.6X + Parquet Vectorized 7972 / 8019 2.0 506.9 1.0X + Parquet Vectorized (Pushdown) 742 / 764 21.2 47.2 10.7X + Native ORC Vectorized 5704 / 5743 2.8 362.6 1.4X + Native ORC Vectorized (Pushdown) 76 / 78 207.9 4.8 105.4X Select 10% rows (id < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 8156 / 8257 1.9 518.5 1.0X - Parquet Vectorized (Pushdown) 1620 / 1684 9.7 103.0 5.0X - Native ORC Vectorized 5951 / 5990 2.6 378.3 1.4X - Native ORC Vectorized (Pushdown) 803 / 810 19.6 51.0 10.2X + Parquet Vectorized 8733 / 8808 1.8 555.2 1.0X + Parquet Vectorized (Pushdown) 2213 / 2267 7.1 140.7 3.9X + Native ORC Vectorized 6420 / 6463 2.4 408.2 1.4X + Native ORC Vectorized (Pushdown) 1313 / 1331 12.0 83.5 6.7X Select 50% rows (id < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 8690 / 8717 1.8 552.5 1.0X - Parquet Vectorized (Pushdown) 5067 / 5099 3.1 322.2 1.7X - Native ORC Vectorized 6530 / 6552 2.4 415.1 1.3X - Native ORC Vectorized (Pushdown) 3630 / 3670 4.3 230.8 2.4X + Parquet Vectorized 11518 / 11591 1.4 732.3 1.0X + Parquet Vectorized (Pushdown) 7962 / 7991 2.0 506.2 1.4X + Native ORC Vectorized 8927 / 8985 1.8 567.6 1.3X + Native ORC Vectorized (Pushdown) 6102 / 6160 2.6 387.9 1.9X Select 90% rows (id < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 9241 / 9293 1.7 587.5 1.0X - Parquet Vectorized (Pushdown) 8474 / 8505 1.9 538.8 1.1X - Native ORC Vectorized 7080 / 7107 2.2 450.1 1.3X - Native ORC Vectorized (Pushdown) 6507 / 6552 2.4 413.7 1.4X + Parquet Vectorized 14255 / 14389 1.1 906.3 1.0X + Parquet Vectorized (Pushdown) 13564 / 13594 1.2 862.4 1.1X + Native ORC Vectorized 11442 / 11608 1.4 727.5 1.2X + Native ORC Vectorized (Pushdown) 10991 / 11029 1.4 698.8 1.3X Select all rows (id IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 9317 / 9366 1.7 592.4 1.0X - Parquet Vectorized (Pushdown) 9316 / 9367 1.7 592.3 1.0X - Native ORC Vectorized 7148 / 7210 2.2 454.5 1.3X - Native ORC Vectorized (Pushdown) 7092 / 7152 2.2 450.9 1.3X + Parquet Vectorized 14917 / 14938 1.1 948.4 1.0X + Parquet Vectorized (Pushdown) 14910 / 14964 1.1 948.0 1.0X + Native ORC Vectorized 11986 / 12069 1.3 762.0 1.2X + Native ORC Vectorized (Pushdown) 12037 / 12123 1.3 765.3 1.2X Select all rows (id > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 9307 / 9353 1.7 591.7 1.0X - Parquet Vectorized (Pushdown) 9303 / 9340 1.7 591.5 1.0X - Native ORC Vectorized 7192 / 7249 2.2 457.2 1.3X - Native ORC Vectorized (Pushdown) 7182 / 7216 2.2 456.6 1.3X + Parquet Vectorized 14951 / 14976 1.1 950.6 1.0X + Parquet Vectorized (Pushdown) 14934 / 15016 1.1 949.5 1.0X + Native ORC Vectorized 12000 / 12156 1.3 763.0 1.2X + Native ORC Vectorized (Pushdown) 12079 / 12113 1.3 767.9 1.2X Select all rows (id != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------- - Parquet Vectorized 9145 / 9328 1.7 581.4 1.0X - Parquet Vectorized (Pushdown) 9320 / 9368 1.7 592.5 1.0X - Native ORC Vectorized 7202 / 7230 2.2 457.9 1.3X - Native ORC Vectorized (Pushdown) 7170 / 7206 2.2 455.9 1.3X + Parquet Vectorized 14930 / 14972 1.1 949.3 1.0X + Parquet Vectorized (Pushdown) 15015 / 15047 1.0 954.6 1.0X + Native ORC Vectorized 12090 / 12259 1.3 768.7 1.2X + Native ORC Vectorized (Pushdown) 12021 / 12096 1.3 764.2 1.2X */ benchmark.run() } @@ -219,7 +219,7 @@ object FilterPushdownBenchmark { filterPushDownBenchmark(numRows, title, whereExpr) } - val selectExpr = (1 to width).map(i => s"LENGTH(c$i)").mkString("SUM(", "+", ")") + val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(id)") Seq(10, 50, 90).foreach { percent => filterPushDownBenchmark(