From 411687987ec5618f23b6ba24c0c322b3aeef04bb Mon Sep 17 00:00:00 2001 From: caoxuewen Date: Fri, 26 Oct 2018 11:52:31 +0800 Subject: [PATCH 1/3] Refactor JSONBenchmarks to use main method --- .../benchmarks/JSONBenchmarks-results.txt | 33 +++++++ .../datasources/json/JsonBenchmarks.scala | 86 ++++++------------- 2 files changed, 59 insertions(+), 60 deletions(-) create mode 100644 sql/core/benchmarks/JSONBenchmarks-results.txt diff --git a/sql/core/benchmarks/JSONBenchmarks-results.txt b/sql/core/benchmarks/JSONBenchmarks-results.txt new file mode 100644 index 0000000000000..c1d1457a80d7b --- /dev/null +++ b/sql/core/benchmarks/JSONBenchmarks-results.txt @@ -0,0 +1,33 @@ +================================================================================================ +Benchmark for performance of JSON parsing +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_163-b01 on Windows 7 6.1 +Intel64 Family 6 Model 94 Stepping 3, GenuineIntel +JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +No encoding 48088 / 48180 2.1 480.9 1.0X +UTF-8 is set 71881 / 71992 1.4 718.8 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_163-b01 on Windows 7 6.1 +Intel64 Family 6 Model 94 Stepping 3, GenuineIntel +JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +No encoding 12107 / 12246 8.3 121.1 1.0X +UTF-8 is set 12375 / 12475 8.1 123.8 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_163-b01 on Windows 7 6.1 +Intel64 Family 6 Model 94 Stepping 3, GenuineIntel +JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +No encoding 168172 / 199309 0.1 16817.2 1.0X +UTF-8 is 
set 167959 / 211007 0.1 16795.9 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_163-b01 on Windows 7 6.1 +Intel64 Family 6 Model 94 Stepping 3, GenuineIntel +Count a dataset with 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Select 10 columns + count() 11828 / 12138 0.8 1182.8 1.0X +Select 1 column + count() 10049 / 10056 1.0 1004.9 1.2X +count() 2611 / 2617 3.8 261.1 4.5X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala index 3c4a5ab32724b..c2995ef561762 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala @@ -16,32 +16,31 @@ */ package org.apache.spark.sql.execution.datasources.json -import java.io.File - -import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark -import org.apache.spark.sql.{Row, SparkSession} -import org.apache.spark.sql.catalyst.plans.SQLHelper +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types._ /** * The benchmarks aims to measure performance of JSON parsing when encoding is set and isn't. - * To run this: - * spark-submit --class <this class> --jars <spark sql test jar> + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar>, + * <spark catalyst test jar> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/JSONBenchmarks-results.txt". 
+ * }}} */ -object JSONBenchmarks extends SQLHelper { - val conf = new SparkConf() - - val spark = SparkSession.builder - .master("local[1]") - .appName("benchmark-json-datasource") - .config(conf) - .getOrCreate() + +object JSONBenchmarks extends SqlBasedBenchmark { import spark.implicits._ def schemaInferring(rowsNum: Int): Unit = { - val benchmark = new Benchmark("JSON schema inferring", rowsNum) + val benchmark = new Benchmark("JSON schema inferring", rowsNum, output = output) withTempPath { path => // scalastyle:off println @@ -65,21 +64,12 @@ object JSONBenchmarks extends SQLHelper { .json(path.getAbsolutePath) } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5 - Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - - JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - --------------------------------------------------------------------------------------------- - No encoding 45908 / 46480 2.2 459.1 1.0X - UTF-8 is set 68469 / 69762 1.5 684.7 0.7X - */ benchmark.run() } } def perlineParsing(rowsNum: Int): Unit = { - val benchmark = new Benchmark("JSON per-line parsing", rowsNum) + val benchmark = new Benchmark("JSON per-line parsing", rowsNum, output = output) withTempPath { path => // scalastyle:off println @@ -107,21 +97,12 @@ object JSONBenchmarks extends SQLHelper { .count() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5 - Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - - JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - --------------------------------------------------------------------------------------------- - No encoding 9982 / 10237 10.0 99.8 1.0X - UTF-8 is set 16373 / 16806 6.1 163.7 0.6X - */ benchmark.run() } } def perlineParsingOfWideColumn(rowsNum: Int): Unit = { - val benchmark = new Benchmark("JSON parsing of wide lines", rowsNum) + val benchmark = new Benchmark("JSON parsing of wide lines", rowsNum, output = output) withTempPath { path => // 
scalastyle:off println @@ -156,22 +137,14 @@ object JSONBenchmarks extends SQLHelper { .count() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5 - Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - - JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - --------------------------------------------------------------------------------------------- - No encoding 26038 / 26386 0.4 2603.8 1.0X - UTF-8 is set 28343 / 28557 0.4 2834.3 0.9X - */ benchmark.run() } } def countBenchmark(rowsNum: Int): Unit = { val colsNum = 10 - val benchmark = new Benchmark(s"Count a dataset with $colsNum columns", rowsNum) + val benchmark = + new Benchmark(s"Count a dataset with $colsNum columns", rowsNum, output = output) withTempPath { path => val fields = Seq.tabulate(colsNum)(i => StructField(s"col$i", IntegerType)) @@ -195,23 +168,16 @@ object JSONBenchmarks extends SQLHelper { ds.count() } - /* - Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz - - Count a dataset with 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - --------------------------------------------------------------------------------------------- - Select 10 columns + count() 9961 / 10006 1.0 996.1 1.0X - Select 1 column + count() 8355 / 8470 1.2 835.5 1.2X - count() 2104 / 2156 4.8 210.4 4.7X - */ benchmark.run() } } - def main(args: Array[String]): Unit = { - schemaInferring(100 * 1000 * 1000) - perlineParsing(100 * 1000 * 1000) - perlineParsingOfWideColumn(10 * 1000 * 1000) - countBenchmark(10 * 1000 * 1000) + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("Benchmark for performance of JSON parsing") { + schemaInferring(100 * 1000 * 1000) + perlineParsing(100 * 1000 * 1000) + perlineParsingOfWideColumn(10 * 1000 * 1000) + countBenchmark(10 * 1000 * 1000) + } } } From c1cde631c8aaaa55c61d877d12e5cb1948b94d55 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 30 Oct 2018 08:24:16 +0000 Subject: [PATCH 2/3] Update result --- 
.../benchmarks/JSONBenchmarks-results.txt | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/sql/core/benchmarks/JSONBenchmarks-results.txt b/sql/core/benchmarks/JSONBenchmarks-results.txt index c1d1457a80d7b..99937309a4145 100644 --- a/sql/core/benchmarks/JSONBenchmarks-results.txt +++ b/sql/core/benchmarks/JSONBenchmarks-results.txt @@ -2,32 +2,36 @@ Benchmark for performance of JSON parsing ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_163-b01 on Windows 7 6.1 -Intel64 Family 6 Model 94 Stepping 3, GenuineIntel +Preparing data for benchmarking ... +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -No encoding 48088 / 48180 2.1 480.9 1.0X -UTF-8 is set 71881 / 71992 1.4 718.8 0.7X +No encoding 62946 / 63310 1.6 629.5 1.0X +UTF-8 is set 112814 / 112866 0.9 1128.1 0.6X -OpenJDK 64-Bit Server VM 1.8.0_163-b01 on Windows 7 6.1 -Intel64 Family 6 Model 94 Stepping 3, GenuineIntel +Preparing data for benchmarking ... +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -No encoding 12107 / 12246 8.3 121.1 1.0X -UTF-8 is set 12375 / 12475 8.1 123.8 1.0X +No encoding 16468 / 16553 6.1 164.7 1.0X +UTF-8 is set 16420 / 16441 6.1 164.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_163-b01 on Windows 7 6.1 -Intel64 Family 6 Model 94 Stepping 3, GenuineIntel +Preparing data for benchmarking ... 
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -No encoding 168172 / 199309 0.1 16817.2 1.0X -UTF-8 is set 167959 / 211007 0.1 16795.9 1.0X +No encoding 39789 / 40053 0.3 3978.9 1.0X +UTF-8 is set 39505 / 39584 0.3 3950.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_163-b01 on Windows 7 6.1 -Intel64 Family 6 Model 94 Stepping 3, GenuineIntel +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Count a dataset with 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Select 10 columns + count() 11828 / 12138 0.8 1182.8 1.0X -Select 1 column + count() 10049 / 10056 1.0 1004.9 1.2X -count() 2611 / 2617 3.8 261.1 4.5X +Select 10 columns + count() 15997 / 16015 0.6 1599.7 1.0X +Select 1 column + count() 13280 / 13326 0.8 1328.0 1.2X +count() 3006 / 3021 3.3 300.6 5.3X + From 422df479aae55b52e17eccf1f46675c9b7369ddd Mon Sep 17 00:00:00 2001 From: caoxuewen Date: Tue, 30 Oct 2018 17:26:18 +0800 Subject: [PATCH 3/3] fix file name --- .../{JSONBenchmarks-results.txt => JSONBenchmark-results.txt} | 0 .../json/{JsonBenchmarks.scala => JsonBenchmark.scala} | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename sql/core/benchmarks/{JSONBenchmarks-results.txt => JSONBenchmark-results.txt} (100%) rename sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/{JsonBenchmarks.scala => JsonBenchmark.scala} (97%) diff --git a/sql/core/benchmarks/JSONBenchmarks-results.txt b/sql/core/benchmarks/JSONBenchmark-results.txt similarity index 100% rename from sql/core/benchmarks/JSONBenchmarks-results.txt rename to sql/core/benchmarks/JSONBenchmark-results.txt diff 
--git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala similarity index 97% rename from sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala index c2995ef561762..04f724ec8638f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala @@ -32,11 +32,11 @@ import org.apache.spark.sql.types._ * 2. build/sbt "sql/test:runMain <this class>" * 3. generate result: * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" - * Results will be written to "benchmarks/JSONBenchmarks-results.txt". + * Results will be written to "benchmarks/JSONBenchmark-results.txt". * }}} */ -object JSONBenchmarks extends SqlBasedBenchmark { +object JSONBenchmark extends SqlBasedBenchmark { import spark.implicits._ def schemaInferring(rowsNum: Int): Unit = {