From a57eb0629a275dd44915191c1c67082334f572d2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 27 Sep 2022 17:09:40 -0600 Subject: [PATCH 1/6] Add documentation for querying S3 data with CLI --- datafusion-cli/README.md | 31 +++++++++++++++++++++++++++++++ docs/source/user-guide/cli.md | 31 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index 05029a0183633..0413311635872 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -65,6 +65,37 @@ DataFusion CLI v12.0.0 1 row in set. Query took 0.017 seconds. ``` +## Querying S3 Data Sources + +The CLI can query data in S3 if the following environment variables are defined: + +- `AWS_DEFAULT_REGION` +- `AWS_ACCESS_KEY_ID` +- `AWS_SECRET_ACCESS_KEY` + +Example: + +```bash +$ aws s3 cp test.csv s3://my-bucket/ +upload: ./test.csv to s3://my-bucket/test.csv + +$ export AWS_DEFAULT_REGION=us-east-2 +$ export AWS_SECRET_ACCESS_KEY=*************************** +$ export AWS_ACCESS_KEY_ID=************** + +$ ./target/release/datafusion-cli +DataFusion CLI v12.0.0 +❯ create external table test stored as csv location 's3://my-bucket/test.csv'; +0 rows in set. Query took 0.374 seconds. +❯ select * from test; ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +1 row in set. Query took 0.171 seconds. +``` + ## DataFusion-Cli Build the `datafusion-cli` by `cd` into the sub-directory: diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md index 4299990c0903b..2d98b43a31658 100644 --- a/docs/source/user-guide/cli.md +++ b/docs/source/user-guide/cli.md @@ -151,6 +151,37 @@ STORED AS CSV LOCATION '/path/to/aggregate_test_100.csv'; ``` +## Querying S3 Data Sources + +The CLI can query data in S3 if the following environment variables are defined: + +- `AWS_DEFAULT_REGION` +- `AWS_ACCESS_KEY_ID` +- `AWS_SECRET_ACCESS_KEY` + +Example: + +```bash +$ aws s3 cp test.csv s3://my-bucket/ +upload: ./test.csv to s3://my-bucket/test.csv + +$ export AWS_DEFAULT_REGION=us-east-2 +$ export AWS_SECRET_ACCESS_KEY=*************************** +$ export AWS_ACCESS_KEY_ID=************** + +$ ./target/release/datafusion-cli +DataFusion CLI v12.0.0 +❯ create external table test stored as csv location 's3://my-bucket/test.csv'; +0 rows in set. Query took 0.374 seconds. +❯ select * from test; ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +1 row in set. Query took 0.171 seconds. +``` + ## Commands Available commands inside DataFusion CLI are: From da500cf1082d435a53c1f590e46501bcfa7f721c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 27 Sep 2022 17:22:13 -0600 Subject: [PATCH 2/6] add s3 example --- datafusion-examples/examples/query-s3.rs | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 datafusion-examples/examples/query-s3.rs diff --git a/datafusion-examples/examples/query-s3.rs b/datafusion-examples/examples/query-s3.rs new file mode 100644 index 0000000000000..cc319e0acf4b4 --- /dev/null +++ b/datafusion-examples/examples/query-s3.rs @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::error::Result; +use datafusion::prelude::*; + +/// This example demonstrates querying data in an S3 bucket. +/// +/// The following environment variables must be defined: +/// +/// - AWS_DEFAULT_REGION +/// - AWS_ACCESS_KEY_ID +/// - AWS_SECRET_ACCESS_KEY +/// +#[tokio::main] +async fn main() -> Result<()> { + // read AWS configs from the environment + let config = SessionConfig::from_env(); + + let ctx = SessionContext::with_config(config); + + ctx.register_parquet( + "trips", + "s3://nyc-tlc/trip data/yellow_tripdata_2022-06.parquet", + ParquetReadOptions::default(), + ) + .await?; + + // execute the query + let df = ctx.sql("SELECT * FROM trips LIMIT 10").await?; + + // print the results + df.show().await?; + + Ok(()) +} From 459711c9fcd2f284018c452e380a5e94c0a33654 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 27 Sep 2022 17:38:52 -0600 Subject: [PATCH 3/6] update test --- datafusion-cli/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 1 + datafusion-examples/examples/query-s3.rs | 23 +++++++++++++++++++---- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index e74b81a36baf7..cf0d2f4f043c3 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -38,4 +38,4 @@ mimalloc = { version = "0.1", default-features = false } object_store = { version = "0.5.0", features = ["aws", "gcp"] } rustyline = "10.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot"] } -url = "2.2" +url = "2.2" \ No newline at end of file diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 5d2150848afb4..c1feb1542fbd8 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -39,6 +39,7 @@ async-trait = "0.1.41" datafusion = { path = "../datafusion/core" } futures = "0.3" num_cpus = "1.13.0" +object_store = { version = "0.5.0", features = ["aws"] } prost = "0.11.0" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.82" diff --git a/datafusion-examples/examples/query-s3.rs b/datafusion-examples/examples/query-s3.rs index cc319e0acf4b4..d97663f9ba618 100644 --- a/datafusion-examples/examples/query-s3.rs +++ b/datafusion-examples/examples/query-s3.rs @@ -17,6 +17,9 @@ use datafusion::error::Result; use datafusion::prelude::*; +use object_store::aws::AmazonS3Builder; +use std::env; +use std::sync::Arc; /// This example demonstrates querying data in an S3 bucket. 
/// @@ -28,14 +31,26 @@ use datafusion::prelude::*; /// #[tokio::main] async fn main() -> Result<()> { - // read AWS configs from the environment - let config = SessionConfig::from_env(); + let ctx = SessionContext::new(); - let ctx = SessionContext::with_config(config); + let bucket_name = "nyc-tlc"; + + let s3 = AmazonS3Builder::new() + .with_bucket_name(bucket_name) + .with_region(env::var("AWS_DEFAULT_REGION").unwrap()) + .with_access_key_id(env::var("AWS_ACCESS_KEY_ID").unwrap()) + .with_secret_access_key(env::var("AWS_SECRET_ACCESS_KEY").unwrap()) + .build()?; + + ctx.runtime_env() + .register_object_store("s3", bucket_name, Arc::new(s3)); ctx.register_parquet( "trips", - "s3://nyc-tlc/trip data/yellow_tripdata_2022-06.parquet", + &format!( + "s3://{}/trip data/yellow_tripdata_2022-06.parquet", + bucket_name + ), ParquetReadOptions::default(), ) .await?; From 9f805eb488d74ddbe0280e7b1a35496d383a6a56 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 28 Sep 2022 07:23:49 -0600 Subject: [PATCH 4/6] fix example, use AWS_REGION --- datafusion-cli/README.md | 7 ++++-- datafusion-cli/src/object_storage.rs | 4 +-- .../examples/{query-s3.rs => query-aws-s3.rs} | 25 +++++++++++-------- docs/source/user-guide/cli.md | 7 ++++-- 4 files changed, 26 insertions(+), 17 deletions(-) rename datafusion-examples/examples/{query-s3.rs => query-aws-s3.rs} (74%) diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index 0413311635872..a61088362c18a 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -69,17 +69,20 @@ DataFusion CLI v12.0.0 The CLI can query data in S3 if the following environment variables are defined: -- `AWS_DEFAULT_REGION` +- `AWS_REGION` - `AWS_ACCESS_KEY_ID` - `AWS_SECRET_ACCESS_KEY` +Note that the region must be set to the region where the bucket exists until the following issue is resolved: +- https://github.com/apache/arrow-rs/issues/2795 + Example: ```bash $ aws s3 cp test.csv s3://my-bucket/ upload: ./test.csv to s3://my-bucket/test.csv -$ export AWS_DEFAULT_REGION=us-east-2 +$ export AWS_REGION=us-east-1 $ export AWS_SECRET_ACCESS_KEY=*************************** $ export AWS_ACCESS_KEY_ID=************** diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs index 4d21e84a17c3d..19993e751c26d 100644 --- a/datafusion-cli/src/object_storage.rs +++ b/datafusion-cli/src/object_storage.rs @@ -138,8 +138,8 @@ mod tests { .unwrap_err(); assert!(err.to_string().contains("Generic S3 error: Missing region")); - env::set_var("AWS_DEFAULT_REGION", "us-east-1"); + env::set_var("AWS_REGION", "us-east-1"); assert!(provider.get_by_url(&Url::from_str(s3).unwrap()).is_ok()); - env::remove_var("AWS_DEFAULT_REGION"); + env::remove_var("AWS_REGION"); } } diff --git a/datafusion-examples/examples/query-s3.rs b/datafusion-examples/examples/query-aws-s3.rs similarity index 74% rename from datafusion-examples/examples/query-s3.rs rename to datafusion-examples/examples/query-aws-s3.rs index d97663f9ba618..5969eb73e0269 100644 --- a/datafusion-examples/examples/query-s3.rs +++ b/datafusion-examples/examples/query-aws-s3.rs @@ -25,7 +25,6 @@ use std::sync::Arc; /// /// The following environment variables must be defined: /// -/// - AWS_DEFAULT_REGION /// - AWS_ACCESS_KEY_ID /// - AWS_SECRET_ACCESS_KEY /// @@ -33,11 +32,15 @@ use std::sync::Arc; async fn main() -> Result<()> { let ctx = SessionContext::new(); + // the region must be set to the region where the bucket exists until the following + // issue is resolved + // 
https://github.com/apache/arrow-rs/issues/2795 + let region = "us-east-1"; let bucket_name = "nyc-tlc"; let s3 = AmazonS3Builder::new() .with_bucket_name(bucket_name) - .with_region(env::var("AWS_DEFAULT_REGION").unwrap()) + .with_region(region) .with_access_key_id(env::var("AWS_ACCESS_KEY_ID").unwrap()) .with_secret_access_key(env::var("AWS_SECRET_ACCESS_KEY").unwrap()) .build()?; @@ -45,15 +48,15 @@ async fn main() -> Result<()> { ctx.runtime_env() .register_object_store("s3", bucket_name, Arc::new(s3)); - ctx.register_parquet( - "trips", - &format!( - "s3://{}/trip data/yellow_tripdata_2022-06.parquet", - bucket_name - ), - ParquetReadOptions::default(), - ) - .await?; + // cannot query the parquet files from this bucket because the path contains a whitespace + // and we don't support that yet + // https://github.com/apache/arrow-rs/issues/2799 + let path = format!( + "s3://{}/csv_backup/yellow_tripdata_2022-02.csv", + bucket_name + ); + ctx.register_csv("trips", &path, CsvReadOptions::default()) + .await?; // execute the query let df = ctx.sql("SELECT * FROM trips LIMIT 10").await?; diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md index 2d98b43a31658..ea41ea50d8f47 100644 --- a/docs/source/user-guide/cli.md +++ b/docs/source/user-guide/cli.md @@ -155,17 +155,20 @@ LOCATION '/path/to/aggregate_test_100.csv'; The CLI can query data in S3 if the following environment variables are defined: -- `AWS_DEFAULT_REGION` +- `AWS_REGION` - `AWS_ACCESS_KEY_ID` - `AWS_SECRET_ACCESS_KEY` +Note that the region must be set to the region where the bucket exists until the following issue is resolved: +- https://github.com/apache/arrow-rs/issues/2795 + Example: ```bash $ aws s3 cp test.csv s3://my-bucket/ upload: ./test.csv to s3://my-bucket/test.csv -$ export AWS_DEFAULT_REGION=us-east-2 +$ export AWS_REGION=us-east-2 $ export AWS_SECRET_ACCESS_KEY=*************************** $ export AWS_ACCESS_KEY_ID=************** From d2fe8cdc34f4d308e58fae12771a26a0d41b320b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 28 Sep 2022 08:44:36 -0600 Subject: [PATCH 5/6] prettier --- datafusion-cli/README.md | 1 + docs/source/user-guide/cli.md | 1 + 2 files changed, 2 insertions(+) diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index a61088362c18a..6c0cf63ae08f7 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -74,6 +74,7 @@ The CLI can query data in S3 if the following environment variables are defined: - `AWS_SECRET_ACCESS_KEY` Note that the region must be set to the region where the bucket exists until the following issue is resolved: + - https://github.com/apache/arrow-rs/issues/2795 Example: diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md index ea41ea50d8f47..e692f4adc7bd1 100644 --- a/docs/source/user-guide/cli.md +++ b/docs/source/user-guide/cli.md @@ -160,6 +160,7 @@ The CLI can query data in S3 if the following environment variables are defined: - `AWS_SECRET_ACCESS_KEY` Note that the region must be set to the region where the bucket exists until the following issue is resolved: + - https://github.com/apache/arrow-rs/issues/2795 Example: From 30bc3f891a57da90113abce53dc9220ec3586839 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 28 Sep 2022 09:26:50 -0600 Subject: [PATCH 6/6] toml fmt --- datafusion-cli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index cf0d2f4f043c3..e74b81a36baf7 100644 --- a/datafusion-cli/Cargo.toml 
+++ b/datafusion-cli/Cargo.toml
@@ -38,4 +38,4 @@ mimalloc = { version = "0.1", default-features = false }
 object_store = { version = "0.5.0", features = ["aws", "gcp"] }
 rustyline = "10.0"
 tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot"] }
-url = "2.2"
\ No newline at end of file
+url = "2.2"
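
As a companion to the example added in this series, here is a minimal sketch of reusing the same object store registration to read a Parquet file from S3. The bucket name `my-bucket`, the key `data/example.parquet`, the table name `example`, and the hard-coded region are illustrative assumptions rather than part of the patches; the builder and registration calls mirror `query-aws-s3.rs`.

```rust
use std::env;
use std::sync::Arc;

use datafusion::error::Result;
use datafusion::prelude::*;
use object_store::aws::AmazonS3Builder;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // hypothetical bucket and key, used for illustration only
    let bucket_name = "my-bucket";

    // build the S3 object store from explicit credentials, as in query-aws-s3.rs;
    // the region must match the bucket's region (see arrow-rs issue #2795)
    let s3 = AmazonS3Builder::new()
        .with_bucket_name(bucket_name)
        .with_region("us-east-1")
        .with_access_key_id(env::var("AWS_ACCESS_KEY_ID").unwrap())
        .with_secret_access_key(env::var("AWS_SECRET_ACCESS_KEY").unwrap())
        .build()?;

    // make s3://my-bucket/... paths resolvable from this SessionContext
    ctx.runtime_env()
        .register_object_store("s3", bucket_name, Arc::new(s3));

    // register a Parquet file whose key contains no whitespace
    let path = format!("s3://{}/data/example.parquet", bucket_name);
    ctx.register_parquet("example", &path, ParquetReadOptions::default())
        .await?;

    // run a query against the registered table and print the results
    let df = ctx.sql("SELECT * FROM example LIMIT 10").await?;
    df.show().await?;

    Ok(())
}
```

Registering the store on the context's `RuntimeEnv` is what makes the `s3://{bucket}` scheme resolvable; without that step, `register_parquet` (or `register_csv`, as the final version of the example uses) has no way to reach the bucket.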