From 9db0c1b8490a8556be459e17277fc6aa6d829aa8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 16 Aug 2023 14:11:15 -0400 Subject: [PATCH 1/4] Document and `scratch` directory for sqllogictest and make test specific --- datafusion/sqllogictest/README.md | 30 ++++++++++++---- datafusion/sqllogictest/bin/sqllogictests.rs | 13 ++++--- datafusion/sqllogictest/test_files/copy.slt | 36 ++++++++++---------- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md index 1f69bb864e715..cbdd5e82d53cb 100644 --- a/datafusion/sqllogictest/README.md +++ b/datafusion/sqllogictest/README.md @@ -177,14 +177,32 @@ You can update the tests / generate expected output by passing the `--complete` cargo test --test sqllogictests -- ddl --complete ``` -#### sqllogictests +#### Running tests: `scratchdir` -sqllogictest is a program originally written for SQLite to verify the correctness of SQL queries against the SQLite -engine. The program is engine-agnostic and can parse sqllogictest files (`.slt`), runs queries against an SQL engine and -compare the output to the expected output. +The DataFusion sqllogictest runner automatically creates a directory +named `test_files/scratch/`, creating it if needed and +clearing any file contents if it exists. -Tests in the `.slt` file are a sequence of query record generally starting with `CREATE` statements to populate tables -and then further queries to test the populated data (arrow-datafusion exception). +For example, the `test_files/copy.slt` file should use scratch +directory `test_files/scratch/copy`. + +Tests that need to write temporary files should write (only) to this +directory to ensure they do not interfere with other concurrently +running tests. + +#### `.slt` file format + +[`sqllogictest`] was originally written for SQLite to verify the +correctness of SQL queries against the SQLite engine. The format is designed +engine-agnostic and can parse sqllogictest files (`.slt`), runs +queries against an SQL engine and compares the output to the expected +output. + +[`sqllogictest`]: https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki + +Tests in the `.slt` file are a sequence of query record generally +starting with `CREATE` statements to populate tables and then further +queries to test the populated data. Each `.slt` file runs in its own, isolated `SessionContext`, to make the test setup explicit and so they can run in parallel. Thus it important to keep the tests from having externally visible side effects (like writing to a global diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs index d097d97fb7a98..095d5bdcff9da 100644 --- a/datafusion/sqllogictest/bin/sqllogictests.rs +++ b/datafusion/sqllogictest/bin/sqllogictests.rs @@ -55,17 +55,17 @@ pub async fn main() -> Result<()> { run_tests().await } -/// Sets up an empty directory at test_files/scratch +/// Sets up an empty directory at test_files/scratch/ /// creating it if needed and clearing any file contents if it exists /// This allows tests for inserting to external tables or copy to /// to persist data to disk and have consistent state when running /// a new test -fn setup_scratch_dir() -> Result<()> { - let path = std::path::Path::new("test_files/scratch"); +fn setup_scratch_dir(name: &Path) -> Result<()> { + let path = PathBuf::from("test_files").join("scratch").join(name); if path.exists() { - fs::remove_dir_all(path)?; + fs::remove_dir_all(&path)?; } - fs::create_dir(path)?; + fs::create_dir_all(&path)?; Ok(()) } @@ -73,8 +73,6 @@ async fn run_tests() -> Result<()> { // Enable logging (e.g. set RUST_LOG=debug to see debug logs) env_logger::init(); - setup_scratch_dir()?; - let options = Options::new(); // Run all tests in parallel, reporting failures at the end @@ -138,6 +136,7 @@ async fn run_test_file(test_file: TestFile) -> Result<()> { info!("Skipping: {}", path.display()); return Ok(()); }; + setup_scratch_dir(&relative_path)?; let mut runner = sqllogictest::Runner::new(|| async { Ok(DataFusion::new( test_ctx.session_ctx().clone(), diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 364459fa2df1a..061607c48809d 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -21,26 +21,26 @@ create table source_table(col1 integer, col2 varchar) as values (1, 'Foo'), (2, # Copy to directory as multiple files query IT -COPY source_table TO 'test_files/scratch/table' (format parquet, per_thread_output true); +COPY source_table TO 'test_files/scratch/copy/table' (format parquet, per_thread_output true); ---- 2 #Explain copy queries not currently working -query error DataFusion error: This feature is not implemented: Unsupported SQL statement: Some\("COPY source_table TO 'test_files/scratch/table'"\) -EXPLAIN COPY source_table to 'test_files/scratch/table' +query error DataFusion error: This feature is not implemented: Unsupported SQL statement: Some\("COPY source_table TO 'test_files/scratch/copy/table'"\) +EXPLAIN COPY source_table to 'test_files/scratch/copy/table' query error DataFusion error: SQL error: ParserError\("Expected end of statement, found: source_table"\) -EXPLAIN COPY source_table to 'test_files/scratch/table' (format parquet, per_thread_output true) +EXPLAIN COPY source_table to 'test_files/scratch/copy/table' (format parquet, per_thread_output true) # Copy more files to directory via query query IT -COPY (select * from source_table UNION ALL select * from source_table) to 'test_files/scratch/table' (format parquet, per_thread_output true); +COPY (select * from source_table UNION ALL select * from source_table) to 'test_files/scratch/copy/table' (format parquet, per_thread_output true); ---- 4 # validate multiple parquet file output statement ok -CREATE EXTERNAL TABLE validate_parquet STORED AS PARQUET LOCATION 'test_files/scratch/table/'; +CREATE EXTERNAL TABLE validate_parquet STORED AS PARQUET LOCATION 'test_files/scratch/copy/table/'; query IT select * from validate_parquet; @@ -54,13 +54,13 @@ select * from validate_parquet; # Copy from table to single file query IT -COPY source_table to 'test_files/scratch/table.parquet'; +COPY source_table to 'test_files/scratch/copy/table.parquet'; ---- 2 # validate single parquet file output statement ok -CREATE EXTERNAL TABLE validate_parquet_single STORED AS PARQUET LOCATION 'test_files/scratch/table.parquet'; +CREATE EXTERNAL TABLE validate_parquet_single STORED AS PARQUET LOCATION 'test_files/scratch/copy/table.parquet'; query IT select * from validate_parquet_single; @@ -70,13 +70,13 @@ select * from validate_parquet_single; # copy from table to folder of csv files query IT -COPY source_table to 'test_files/scratch/table_csv' (format csv, per_thread_output true); +COPY source_table to 'test_files/scratch/copy/table_csv' (format csv, per_thread_output true); ---- 2 # validate folder of csv files statement ok -CREATE EXTERNAL TABLE validate_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/table_csv'; +CREATE EXTERNAL TABLE validate_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/copy/table_csv'; query IT select * from validate_csv; @@ -86,13 +86,13 @@ select * from validate_csv; # Copy from table to single csv query IT -COPY source_table to 'test_files/scratch/table.csv'; +COPY source_table to 'test_files/scratch/copy/table.csv'; ---- 2 # Validate single csv output statement ok -CREATE EXTERNAL TABLE validate_single_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/table.csv'; +CREATE EXTERNAL TABLE validate_single_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/copy/table.csv'; query IT select * from validate_single_csv; @@ -102,13 +102,13 @@ select * from validate_single_csv; # Copy from table to folder of json query IT -COPY source_table to 'test_files/scratch/table_json' (format json, per_thread_output true); +COPY source_table to 'test_files/scratch/copy/table_json' (format json, per_thread_output true); ---- 2 # Validate json output statement ok -CREATE EXTERNAL TABLE validate_json STORED AS json LOCATION 'test_files/scratch/table_json'; +CREATE EXTERNAL TABLE validate_json STORED AS json LOCATION 'test_files/scratch/copy/table_json'; query IT select * from validate_json; @@ -118,13 +118,13 @@ select * from validate_json; # Copy from table to single json file query IT -COPY source_table to 'test_files/scratch/table.json'; +COPY source_table to 'test_files/scratch/copy/table.json'; ---- 2 # Validate single JSON file` statement ok -CREATE EXTERNAL TABLE validate_single_json STORED AS json LOCATION 'test_files/scratch/table_json'; +CREATE EXTERNAL TABLE validate_single_json STORED AS json LOCATION 'test_files/scratch/copy/table_json'; query IT select * from validate_single_json; @@ -134,13 +134,13 @@ select * from validate_single_json; # Copy from table with options query IT -COPY source_table to 'test_files/scratch/table.json' (row_group_size 55); +COPY source_table to 'test_files/scratch/copy/table.json' (row_group_size 55); ---- 2 # Copy from table with options (and trailing comma) query IT -COPY source_table to 'test_files/scratch/table.json' (row_group_size 55, row_group_limit_bytes 9,); +COPY source_table to 'test_files/scratch/copy/table.json' (row_group_size 55, row_group_limit_bytes 9,); ---- 2 From e41bdc415df896ed21b65886956bb54172e45991 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 16 Aug 2023 14:17:14 -0400 Subject: [PATCH 2/4] Fix name --- datafusion/sqllogictest/bin/sqllogictests.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs index 095d5bdcff9da..207efdd27d9e9 100644 --- a/datafusion/sqllogictest/bin/sqllogictests.rs +++ b/datafusion/sqllogictest/bin/sqllogictests.rs @@ -61,7 +61,11 @@ pub async fn main() -> Result<()> { /// to persist data to disk and have consistent state when running /// a new test fn setup_scratch_dir(name: &Path) -> Result<()> { - let path = PathBuf::from("test_files").join("scratch").join(name); + // go from copy.slt --> copy + let file_stem = name.file_stem().expect("File should have a stem"); + let path = PathBuf::from("test_files").join("scratch").join(file_stem); + + info!("Creating scratch dir in {path:?}"); if path.exists() { fs::remove_dir_all(&path)?; } From c0e897ede615c18edee632dfd3151b42c1ce7ee0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 17 Aug 2023 09:19:35 -0400 Subject: [PATCH 3/4] Update test pats --- .../test_files/insert_to_external.slt | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index 6c9c6fdd47e46..4e14120aad1d7 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -49,7 +49,7 @@ statement ok CREATE EXTERNAL TABLE single_file_test(a bigint, b bigint) STORED AS csv -LOCATION 'test_files/scratch/single_csv_table.csv' +LOCATION 'test_files/scratch/insert_to_external/single_csv_table.csv' OPTIONS( create_local_path 'true', single_file 'true', @@ -70,7 +70,7 @@ statement ok CREATE EXTERNAL TABLE directory_test(a bigint, b bigint) STORED AS parquet -LOCATION 'test_files/scratch/external_parquet_table_q0' +LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q0' OPTIONS( create_local_path 'true', ); @@ -87,10 +87,10 @@ select * from directory_test; 3 4 statement ok -CREATE EXTERNAL TABLE +CREATE EXTERNAL TABLE table_without_values(field1 BIGINT NULL, field2 BIGINT NULL) STORED AS parquet -LOCATION 'test_files/scratch/external_parquet_table_q1' +LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q1' OPTIONS (create_local_path 'true'); query TT @@ -153,10 +153,10 @@ drop table table_without_values; # test_insert_into_as_select_multi_partitioned statement ok -CREATE EXTERNAL TABLE +CREATE EXTERNAL TABLE table_without_values(field1 BIGINT NULL, field2 BIGINT NULL) STORED AS parquet -LOCATION 'test_files/scratch/external_parquet_table_q2' +LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q2' OPTIONS (create_local_path 'true'); query TT @@ -197,10 +197,10 @@ drop table table_without_values; # test_insert_into_with_sort statement ok -CREATE EXTERNAL TABLE +CREATE EXTERNAL TABLE table_without_values(c1 varchar NULL) STORED AS parquet -LOCATION 'test_files/scratch/external_parquet_table_q3' +LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q3' OPTIONS (create_local_path 'true'); # verify that the sort order of the insert query is maintained into the @@ -237,10 +237,10 @@ drop table table_without_values; # test insert with column names statement ok -CREATE EXTERNAL TABLE +CREATE EXTERNAL TABLE table_without_values(id BIGINT, name varchar) STORED AS parquet -LOCATION 'test_files/scratch/external_parquet_table_q4' +LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q4' OPTIONS (create_local_path 'true'); query IT @@ -276,10 +276,10 @@ drop table table_without_values; # test insert with non-nullable column statement ok -CREATE EXTERNAL TABLE +CREATE EXTERNAL TABLE table_without_values(field1 BIGINT NOT NULL, field2 BIGINT NULL) STORED AS parquet -LOCATION 'test_files/scratch/external_parquet_table_q5' +LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q5' OPTIONS (create_local_path 'true'); query II From d381ba51d63825ecc7899a29b828e28e763b6379 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 18 Aug 2023 12:44:45 -0400 Subject: [PATCH 4/4] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Metehan Yıldırım <100111937+metesynnada@users.noreply.github.com> --- datafusion/sqllogictest/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md index cbdd5e82d53cb..3e94859d35a79 100644 --- a/datafusion/sqllogictest/README.md +++ b/datafusion/sqllogictest/README.md @@ -187,7 +187,7 @@ For example, the `test_files/copy.slt` file should use scratch directory `test_files/scratch/copy`. Tests that need to write temporary files should write (only) to this -directory to ensure they do not interfere with other concurrently +directory to ensure they do not interfere with others concurrently running tests. #### `.slt` file format @@ -200,7 +200,7 @@ output. [`sqllogictest`]: https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki -Tests in the `.slt` file are a sequence of query record generally +Tests in the `.slt` file are a sequence of query records generally starting with `CREATE` statements to populate tables and then further queries to test the populated data.