From 4a74328ee77f56ba9fbd61aa20b0f7259664c5a5 Mon Sep 17 00:00:00 2001
From: zero323
Date: Wed, 10 May 2017 11:56:28 +0200
Subject: [PATCH 01/20] Add Scala examples

---
 .../apache/spark/examples/sql/SQLDataSourceExample.scala | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
index ad74da72bd5e6..6e58f7b7fb322 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
@@ -52,6 +52,14 @@ object SQLDataSourceExample {
     // $example on:direct_sql$
     val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
     // $example off:direct_sql$
+    // $example on:write_sorting_and_bucketing$
+    peopleDF.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
+    // $example off:write_sorting_and_bucketing$
+    // $example on:write_partitioning$
+    usersDF.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")
+    // $example on:write_partitioning$
+
+    spark.sql("DROP TABLE IF EXISTS people_bucketed")
   }
 
   private def runBasicParquetExample(spark: SparkSession): Unit = {

From 573b0b907393e93228abc713cb6017b1d1fddaf7 Mon Sep 17 00:00:00 2001
From: zero323
Date: Wed, 10 May 2017 11:56:50 +0200
Subject: [PATCH 02/20] Add Python examples

---
 examples/src/main/python/sql/datasource.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py
index e4abb0933345d..baafe696b7c02 100644
--- a/examples/src/main/python/sql/datasource.py
+++ b/examples/src/main/python/sql/datasource.py
@@ -35,11 +35,21 @@ def basic_datasource_example(spark):
     df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
     # $example off:generic_load_save_functions$
 
+    # $example on:write_partitioning$
+    df.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")
+    # $example off:write_partitioning$
+
     # $example on:manual_load_options$
     df = spark.read.load("examples/src/main/resources/people.json", format="json")
     df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")
     # $example off:manual_load_options$
 
+    # $example on:write_sorting_and_bucketing$
+    df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
+    # $example off:write_sorting_and_bucketing$
+
+    spark.sql("DROP TABLE IF EXISTS people_bucketed")
+
     # $example on:direct_sql$
     df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
     # $example off:direct_sql$

From 90ad3f321f633f80ebca3526a6a84277478318b0 Mon Sep 17 00:00:00 2001
From: zero323
Date: Wed, 10 May 2017 12:22:27 +0200
Subject: [PATCH 03/20] Add Java examples

---
 .../spark/examples/sql/JavaSQLDataSourceExample.java     | 8 ++++++++
 .../apache/spark/examples/sql/SQLDataSourceExample.scala | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
index b66abaed66000..08257d9e44332 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
@@ -120,6 +120,14 @@ private static void runBasicDataSourceExample(SparkSession spark) {
     Dataset<Row> sqlDF =
       spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`");
     // $example off:direct_sql$
+    // $example on:write_sorting_and_bucketing$
+    peopleDF.write().bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed");
+    // $example off:write_sorting_and_bucketing$
+    // $example on:write_partitioning$
+    usersDF.write().partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet");
+    // $example off:write_partitioning$
+
+    spark.sql("DROP TABLE IF EXISTS people_bucketed");
   }
 
   private static void runBasicParquetExample(SparkSession spark) {
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
index 6e58f7b7fb322..624ac54710b7e 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
@@ -57,7 +57,7 @@ object SQLDataSourceExample {
     // $example off:write_sorting_and_bucketing$
     // $example on:write_partitioning$
     usersDF.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")
-    // $example on:write_partitioning$
+    // $example off:write_partitioning$
 
     spark.sql("DROP TABLE IF EXISTS people_bucketed")
   }

From 563a7e8fadce2c2d076be0fcb240f5b14a5d9692 Mon Sep 17 00:00:00 2001
From: zero323
Date: Wed, 10 May 2017 12:32:47 +0200
Subject: [PATCH 04/20] Add examples to sql guide

---
 docs/sql-programming-guide.md | 40 +++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 490c1ce8a7cc5..1a6159b43c67e 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -581,6 +581,46 @@ Starting from Spark 2.1, persistent datasource tables have per-partition metadat
 
 Note that partition information is not gathered by default when creating external datasource tables (those with a `path` option). To sync the partition information in the metastore, you can invoke `MSCK REPAIR TABLE`.
 
+### Bucketing, Sorting and Partitioning
+
+For file-based data source it is also possible to bucket and and sort or partition the output.
+Bucketing and sorting is applicable only to persistent tables:
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+{% include_example write_sorting_and_bucketing scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% include_example write_sorting_and_bucketing java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% include_example write_sorting_and_bucketing python/sql/datasource.py %}
+</div>
+
+</div>
+
+while partitioning can be used with both `save` and `saveAsTable`:
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+{% include_example write_partitioning scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% include_example write_partitioning java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% include_example write_partitioning python/sql/datasource.py %}
+</div>
+
+</div>
+
 ## Parquet Files
 
 [Parquet](http://parquet.io) is a columnar format that is supported by many other data processing systems.

From f9621d91451637c82367ae5fe8c6ea2f7de1fb95 Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 10:53:47 +0200
Subject: [PATCH 05/20] Remove duplicated and

---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 1a6159b43c67e..848e7416e075e 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -583,7 +583,7 @@ Note that partition information is not gathered by default when creating externa
 
 ### Bucketing, Sorting and Partitioning
 
-For file-based data source it is also possible to bucket and and sort or partition the output.
+For file-based data source it is also possible to bucket and sort or partition the output.
 Bucketing and sorting is applicable only to persistent tables:
 
 <div class="codetabs">
From 01cbfadc9dd728675a4bcff4d1a3c93280b00abf Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 20:09:22 +0200
Subject: [PATCH 06/20] Add Python example for partitionBy + bucketBy

---
 examples/src/main/python/sql/datasource.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py
index baafe696b7c02..358534cf2040e 100644
--- a/examples/src/main/python/sql/datasource.py
+++ b/examples/src/main/python/sql/datasource.py
@@ -39,6 +39,17 @@ def basic_datasource_example(spark):
     df.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")
     # $example off:write_partitioning$
 
+    # $example on:write_partition_and_bucket$
+    df = spark.read.parquet("examples/src/main/resources/users.parquet")
+    (df
+        .write
+        .partitionBy("favorite_color")
+        .bucketBy(42, "name")
+        .saveAsTable("people_partitioned_bucketed"))
+    # $example off:write_partition_and_bucket$
+
+    spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed")
+
     # $example on:manual_load_options$
     df = spark.read.load("examples/src/main/resources/people.json", format="json")
     df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")

From 72806f1740a904dee3861eb81875a879381f698b Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 20:11:34 +0200
Subject: [PATCH 07/20] Add Java example for partitionBy + bucketBy

---
 .../spark/examples/sql/JavaSQLDataSourceExample.java | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
index 08257d9e44332..dbe57147b7a42 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
@@ -126,6 +126,13 @@ private static void runBasicDataSourceExample(SparkSession spark) {
     // $example on:write_partitioning$
     usersDF.write().partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet");
     // $example off:write_partitioning$
+    // $example on:write_partition_and_bucket$
+    peopleDF
+      .write()
+      .partitionBy("favorite_color")
+      .bucketBy(42, "name")
+      .saveAsTable("people_partitioned_bucketed");
+    // $example off:write_partition_and_bucket$
 
     spark.sql("DROP TABLE IF EXISTS people_bucketed");
   }

From 0294e475ff3f1284b3acb430b2a3dba561a54dee Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 20:13:17 +0200
Subject: [PATCH 08/20] Add Scala example for partitionBy + bucketBy

---
 .../apache/spark/examples/sql/SQLDataSourceExample.scala | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
index 624ac54710b7e..613357f0b1806 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
@@ -58,6 +58,13 @@ object SQLDataSourceExample {
     // $example on:write_partitioning$
     usersDF.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")
     // $example off:write_partitioning$
+    //$example on:write_partition_and_bucket$
+    peopleDF
+      .write
+      .partitionBy("favorite_color")
+      .bucketBy(42, "name")
+      .saveAsTable("people_partitioned_bucketed")
+    // $example off:write_partition_and_bucket$
 
     spark.sql("DROP TABLE IF EXISTS people_bucketed")
   }

From f76b113d5c7fdcb28fef5c43ecf8f308f08f1023 Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 20:21:21 +0200
Subject: [PATCH 09/20] Add partitionBy + bucketBy to SQL Guide

---
 docs/sql-programming-guide.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 848e7416e075e..7f9b08bb2f0a2 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -621,6 +621,24 @@ while partitioning can be used with both `save` and `saveAsTable`:
 
 </div>
 
+It is possible to use both partitions and buckets for a single table:
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+{% include_example write_partition_and_bucket scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% include_example write_partition_and_bucket java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% include_example write_partition_and_bucket python/sql/datasource.py %}
+</div>
+
+</div>
+
 ## Parquet Files
 
 [Parquet](http://parquet.io) is a columnar format that is supported by many other data processing systems.

From 7bf4bbc30a6fa821d85285519c035be0a4f66b0c Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 21:21:25 +0200
Subject: [PATCH 10/20] Add cardinality note

---
 docs/sql-programming-guide.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 7f9b08bb2f0a2..4df6030e05b50 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -639,6 +639,10 @@ It is possible to use both partitions and buckets for a single table:
 
 </div>
 
+`partitionBy` creates a directory structure as described in the [Partition Discovery](#partition-discovery) section.
+Because of that it has limited applicability to columns with high cardinality. In contrast `bucketBy` distributes
+data across fixed number of buckets and can be used if a number of unique values is unbounded.
+
 ## Parquet Files
 
 [Parquet](http://parquet.io) is a columnar format that is supported by many other data processing systems.

From cc1bfcf281b32860113215c3f34cbacf3bb47cbb Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 21:57:34 +0200
Subject: [PATCH 11/20] Fix scala style

---
 .../org/apache/spark/examples/sql/SQLDataSourceExample.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
index 613357f0b1806..b81eafcadba00 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
@@ -58,7 +58,7 @@ object SQLDataSourceExample {
     // $example on:write_partitioning$
     usersDF.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")
     // $example off:write_partitioning$
-    //$example on:write_partition_and_bucket$
+    // $example on:write_partition_and_bucket$
     peopleDF
       .write
       .partitionBy("favorite_color")
       .bucketBy(42, "name")
       .saveAsTable("people_partitioned_bucketed")

From 606f1e3a5f672d8f7a7dc98fe041e347e65a2d03 Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 22:11:51 +0200
Subject: [PATCH 12/20] Missing drop

---
 .../apache/spark/examples/sql/JavaSQLDataSourceExample.java | 1 +
 examples/src/main/python/sql/datasource.py                  | 6 ++----
 .../apache/spark/examples/sql/SQLDataSourceExample.scala    | 1 +
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
index dbe57147b7a42..706856b5215e4 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
@@ -135,6 +135,7 @@ private static void runBasicDataSourceExample(SparkSession spark) {
     // $example off:write_partition_and_bucket$
 
     spark.sql("DROP TABLE IF EXISTS people_bucketed");
+    spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed");
   }
diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py
index 358534cf2040e..3ef2b2606d815 100644
--- a/examples/src/main/python/sql/datasource.py
+++ b/examples/src/main/python/sql/datasource.py
@@ -48,8 +48,6 @@ def basic_datasource_example(spark):
         .saveAsTable("people_partitioned_bucketed"))
     # $example off:write_partition_and_bucket$
 
-    spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed")
-
     # $example on:manual_load_options$
     df = spark.read.load("examples/src/main/resources/people.json", format="json")
     df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")
     # $example off:manual_load_options$
@@ -59,12 +57,12 @@ def basic_datasource_example(spark):
     df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
     # $example off:write_sorting_and_bucketing$
 
-    spark.sql("DROP TABLE IF EXISTS people_bucketed")
-
     # $example on:direct_sql$
     df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
     # $example off:direct_sql$
 
+    spark.sql("DROP TABLE IF EXISTS people_bucketed")
+    spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed")
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
index b81eafcadba00..6ff03bdb22129 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
@@ -67,6 +67,7 @@ object SQLDataSourceExample {
     // $example off:write_partition_and_bucket$
 
     spark.sql("DROP TABLE IF EXISTS people_bucketed")
+    spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed")
   }
 
   private def runBasicParquetExample(spark: SparkSession): Unit = {

From a7aff811aa88b1f93364aa51ab95b6b64fa63d8d Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 11 May 2017 22:17:58 +0200
Subject: [PATCH 13/20] Python style

---
 examples/src/main/python/sql/datasource.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py
index 3ef2b2606d815..8777cca66bfe9 100644
--- a/examples/src/main/python/sql/datasource.py
+++ b/examples/src/main/python/sql/datasource.py
@@ -64,6 +64,7 @@ def basic_datasource_example(spark):
     spark.sql("DROP TABLE IF EXISTS people_bucketed")
     spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed")
 
+
 def parquet_example(spark):
     # $example on:basic_parquet_example$
     peopleDF = spark.read.json("examples/src/main/resources/people.json")

From c4d7856c82aab845cf9cef4460302461db7e1384 Mon Sep 17 00:00:00 2001
From: zero323
Date: Fri, 12 May 2017 12:43:54 +0200
Subject: [PATCH 14/20] Add SQL partitionBy example

---
 docs/sql-programming-guide.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 4df6030e05b50..44cd4b124fab3 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -619,6 +619,19 @@ while partitioning can be used with both `save` and `saveAsTable`:
 {% include_example write_partitioning python/sql/datasource.py %}
 </div>
 
+<div data-lang="sql" markdown="1">
+
+{% highlight sql %}
+
+CREATE TABLE users_by_favorite_color(
+    name STRING,
+    favorite_NUMBERS array<integer>
+) PARTITIONED BY(favorite_color STRING);
+
+{% endhighlight %}
+
+</div>
+
 </div>
 
 It is possible to use both partitions and buckets for a single table:

From b5babf65571661ca45880cd80a950959f66523a1 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sat, 13 May 2017 13:01:58 +0200
Subject: [PATCH 15/20] Add SQL examples for CLUSTERED BY

---
 docs/sql-programming-guide.md | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 44cd4b124fab3..dbc019e2e9e28 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -600,6 +600,21 @@ Bucketing and sorting is applicable only to persistent tables:
 {% include_example write_sorting_and_bucketing python/sql/datasource.py %}
 </div>
 
+<div data-lang="sql" markdown="1">
+
+{% highlight sql %}
+
+CREATE TABLE users_bucketed_by_name(
+    name STRING,
+    favorite_color STRING,
+    favorite_NUMBERS array<integer>
+) USING parquet
+CLUSTERED BY(name) INTO 42 BUCKETS;
+
+{% endhighlight %}
+
+</div>
+
 </div>
 
 while partitioning can be used with both `save` and `saveAsTable`:
@@ -665,6 +665,22 @@ It is possible to use both partitions and buckets for a single table:
 {% include_example write_partition_and_bucket python/sql/datasource.py %}
 </div>
 
+<div data-lang="sql" markdown="1">
+
+{% highlight sql %}
+
+CREATE TABLE users_bucketed_and_partitioned(
+    name STRING,
+    favorite_color STRING,
+    favorite_NUMBERS array<integer>
+) USING parquet
+PARTITIONED BY (favorite_color)
+CLUSTERED BY(name) INTO 42 BUCKETS;
+
+{% endhighlight %}
+
+</div>
+
 </div>
 
 `partitionBy` creates a directory structure as described in the [Partition Discovery](#partition-discovery) section.

From 92fb3b3e00a666ff3bd1eca4e5dee0cefcca2d55 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sat, 13 May 2017 13:26:25 +0200
Subject: [PATCH 16/20] Update PARTITION BY example to Spark syntax

---
 docs/sql-programming-guide.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index dbc019e2e9e28..ba753c253cffe 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -640,8 +640,9 @@ while partitioning can be used with both `save` and `saveAsTable`:
 
 CREATE TABLE users_by_favorite_color(
     name STRING,
+    favorite_color STRING,
     favorite_NUMBERS array<integer>
-) PARTITIONED BY(favorite_color STRING);
+) USING csv PARTITIONED BY(favorite_color);
 
 {% endhighlight %}

From 65ac310787927e4180b93863e361d87265c16ce5 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sun, 14 May 2017 13:58:00 +0200
Subject: [PATCH 17/20] Include changes requested by gatorsmile

---
 docs/sql-programming-guide.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index ba753c253cffe..fdb954870ce14 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -583,8 +583,8 @@ Note that partition information is not gathered by default when creating externa
 
 ### Bucketing, Sorting and Partitioning
 
-For file-based data source it is also possible to bucket and sort or partition the output.
-Bucketing and sorting is applicable only to persistent tables:
+For file-based data source, it is also possible to bucket and sort or partition the output.
+Bucketing and sorting are applicable only to persistent tables:
 
 <div class="codetabs">
 
@@ -617,7 +617,7 @@ CLUSTERED BY(name) INTO 42 BUCKETS;
 
 </div>
 
-while partitioning can be used with both `save` and `saveAsTable`:
+while partitioning can be used with both `save` and `saveAsTable` when using the Dataset APIs.
 
 <div class="codetabs">
 
@@ -650,7 +650,7 @@ CREATE TABLE users_by_favorite_color(
 
 </div>
 
-It is possible to use both partitions and buckets for a single table:
+It is possible to use both partitioning and bucketing for a single table:
 
 <div class="codetabs">
 
@@ -685,8 +685,9 @@ CLUSTERED BY(name) INTO 42 BUCKETS;
 
 </div>
 
 `partitionBy` creates a directory structure as described in the [Partition Discovery](#partition-discovery) section.
-Because of that it has limited applicability to columns with high cardinality. In contrast `bucketBy` distributes
-data across fixed number of buckets and can be used if a number of unique values is unbounded.
+Thus, it has limited applicability to columns with high cardinality. In contrast
+ `bucketBy` distributes
+data across fixed number of buckets and can be used when a number of unique values is unbounded.
 
 ## Parquet Files

From f7b6f43456bac435251a4826a7d9352adc44f9d0 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sun, 14 May 2017 20:18:51 +0200
Subject: [PATCH 18/20] Add SORTED BY

---
 docs/sql-programming-guide.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index fdb954870ce14..b9deb7522f737 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -673,10 +673,10 @@ It is possible to use both partitioning and bucketing for a single table:
 CREATE TABLE users_bucketed_and_partitioned(
     name STRING,
     favorite_color STRING,
-    favorite_NUMBERS array<integer>
+    favorite_numbers array<integer>
 ) USING parquet
 PARTITIONED BY (favorite_color)
-CLUSTERED BY(name) INTO 42 BUCKETS;
+CLUSTERED BY(name) SORTED BY (favorite_numbers) INTO 42 BUCKETS;
 
 {% endhighlight %}

From 3a8b6e94dd40372704aa4e1cdce015bcc1c3b893 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sun, 14 May 2017 20:19:35 +0200
Subject: [PATCH 19/20] Consistent case for favorite_numbers

---
 docs/sql-programming-guide.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index b9deb7522f737..f483cfb23a52d 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -607,7 +607,7 @@ Bucketing and sorting are applicable only to persistent tables:
 CREATE TABLE users_bucketed_by_name(
     name STRING,
     favorite_color STRING,
-    favorite_NUMBERS array<integer>
+    favorite_numbers array<integer>
 ) USING parquet
 CLUSTERED BY(name) INTO 42 BUCKETS;
 
@@ -641,7 +641,7 @@ while partitioning can be used with both `save` and `saveAsTable` when using the
 CREATE TABLE users_by_favorite_color(
     name STRING,
     favorite_color STRING,
-    favorite_NUMBERS array<integer>
+    favorite_numbers array<integer>
 ) USING csv PARTITIONED BY(favorite_color);
 
 {% endhighlight %}

From bea0676088dadbc5af544f581aa8a2ed49355acc Mon Sep 17 00:00:00 2001
From: zero323
Date: Fri, 26 May 2017 23:01:17 +0200
Subject: [PATCH 20/20] Missing article

---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index f483cfb23a52d..69fca3b6be5a7 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -687,7 +687,7 @@ CLUSTERED BY(name) SORTED BY (favorite_numbers) INTO 42 BUCKETS;
 
 `partitionBy` creates a directory structure as described in the [Partition Discovery](#partition-discovery) section.
 Thus, it has limited applicability to columns with high cardinality. In contrast
  `bucketBy` distributes
-data across fixed number of buckets and can be used when a number of unique values is unbounded.
+data across a fixed number of buckets and can be used when a number of unique values is unbounded.
 
 ## Parquet Files
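
To try the feature this series documents without building the Spark examples module, a minimal self-contained PySpark sketch follows. It only assumes a local Spark installation; the DataFrame contents, the output path `users_by_color.parquet`, and the table name `users_bucketed_demo` are illustrative and not part of the patch set:

{% highlight python %}
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("bucketing-partitioning-sketch").getOrCreate()

# Illustrative stand-in for examples/src/main/resources/users.parquet.
df = spark.createDataFrame(
    [("Alyssa", "green", 3), ("Ben", "red", 7)],
    ["name", "favorite_color", "favorite_number"])

# partitionBy works with plain save(): one directory per favorite_color
# value, laid out as described in the Partition Discovery section.
df.write.mode("overwrite").partitionBy("favorite_color") \
    .format("parquet").save("users_by_color.parquet")

# bucketBy (optionally combined with sortBy) requires saveAsTable(),
# because the bucket spec is recorded in the session catalog, not in files.
df.write.bucketBy(4, "name").sortBy("favorite_number") \
    .mode("overwrite").saveAsTable("users_bucketed_demo")

spark.sql("DROP TABLE IF EXISTS users_bucketed_demo")
spark.stop()
{% endhighlight %}

Before the final `DROP`, running `spark.sql("DESCRIBE EXTENDED users_bucketed_demo")` should list the bucket columns, confirming why bucketing is restricted to persistent tables: the metadata has nowhere to live with a path-only `save`.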