diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md
index 1f69bb864e715..3e94859d35a79 100644
--- a/datafusion/sqllogictest/README.md
+++ b/datafusion/sqllogictest/README.md
@@ -177,14 +177,32 @@ You can update the tests / generate expected output by passing the `--complete`
 cargo test --test sqllogictests -- ddl --complete
 ```

-#### sqllogictests
+#### Running tests: `scratchdir`

-sqllogictest is a program originally written for SQLite to verify the correctness of SQL queries against the SQLite
-engine. The program is engine-agnostic and can parse sqllogictest files (`.slt`), runs queries against an SQL engine and
-compare the output to the expected output.
+The DataFusion sqllogictest runner automatically creates a directory
+named `test_files/scratch/<test file name>`, creating it if needed and
+clearing any file contents if it exists.

-Tests in the `.slt` file are a sequence of query record generally starting with `CREATE` statements to populate tables
-and then further queries to test the populated data (arrow-datafusion exception).
+For example, the `test_files/copy.slt` file should use the scratch
+directory `test_files/scratch/copy`.
+
+Tests that need to write temporary files should write (only) to this
+directory to ensure they do not interfere with other concurrently
+running tests.
+
+#### `.slt` file format
+
+[`sqllogictest`] was originally written for SQLite to verify the
+correctness of SQL queries against the SQLite engine. The format is
+engine-agnostic: a runner parses sqllogictest files (`.slt`), runs the
+queries against an SQL engine, and compares the output to the expected
+output.
+
+[`sqllogictest`]: https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki
+
+Tests in the `.slt` file are a sequence of query records, generally
+starting with `CREATE` statements to populate tables followed by
+further queries to test the populated data.

 Each `.slt` file runs in its own, isolated `SessionContext`, to make the test setup explicit and so they can run in
 parallel. Thus it is important to keep the tests from having externally visible side effects (like writing to a global
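For reference, an `.slt` file is a sequence of records like the following minimal sketch (with a hypothetical table `t`): `statement ok` asserts the statement succeeds, `query I` declares one integer result column, and the lines after `----` are the expected output:

```
# create and populate a table, then verify a query against it
statement ok
create table t(a int) as values (1), (2);

query I
select a from t order by a;
----
1
2
```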
diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs
index d097d97fb7a98..207efdd27d9e9 100644
--- a/datafusion/sqllogictest/bin/sqllogictests.rs
+++ b/datafusion/sqllogictest/bin/sqllogictests.rs
@@ -55,17 +55,21 @@ pub async fn main() -> Result<()> {
     run_tests().await
 }

-/// Sets up an empty directory at test_files/scratch
+/// Sets up an empty directory at test_files/scratch/<name>
 /// creating it if needed and clearing any file contents if it exists
 /// This allows tests for inserting to external tables or copy to
 /// to persist data to disk and have consistent state when running
 /// a new test
-fn setup_scratch_dir() -> Result<()> {
-    let path = std::path::Path::new("test_files/scratch");
+fn setup_scratch_dir(name: &Path) -> Result<()> {
+    // go from copy.slt --> copy
+    let file_stem = name.file_stem().expect("File should have a stem");
+    let path = PathBuf::from("test_files").join("scratch").join(file_stem);
+
+    info!("Creating scratch dir in {path:?}");
     if path.exists() {
-        fs::remove_dir_all(path)?;
+        fs::remove_dir_all(&path)?;
     }
-    fs::create_dir(path)?;
+    fs::create_dir_all(&path)?;
     Ok(())
 }

@@ -73,8 +77,6 @@ async fn run_tests() -> Result<()> {
     // Enable logging (e.g. set RUST_LOG=debug to see debug logs)
     env_logger::init();

-    setup_scratch_dir()?;
-
     let options = Options::new();

     // Run all tests in parallel, reporting failures at the end
@@ -138,6 +140,7 @@ async fn run_test_file(test_file: TestFile) -> Result<()> {
         info!("Skipping: {}", path.display());
         return Ok(());
     };
+    setup_scratch_dir(&relative_path)?;
    let mut runner = sqllogictest::Runner::new(|| async {
        Ok(DataFusion::new(
            test_ctx.session_ctx().clone(),
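As a standalone sketch of the path derivation above (using only `std`, with a hypothetical `scratch_dir_for` helper rather than the harness's `setup_scratch_dir`), mapping `copy.slt` to `test_files/scratch/copy`:

```rust
use std::fs;
use std::path::{Path, PathBuf};

/// Hypothetical helper mirroring the logic above: map a test file name
/// to its per-file scratch directory and (re)create it empty,
/// e.g. "copy.slt" --> "test_files/scratch/copy".
fn scratch_dir_for(name: &Path) -> std::io::Result<PathBuf> {
    let file_stem = name.file_stem().expect("File should have a stem");
    let path = PathBuf::from("test_files").join("scratch").join(file_stem);
    if path.exists() {
        // clear any contents left over from a previous run
        fs::remove_dir_all(&path)?;
    }
    // create parents as needed, since nothing else pre-creates them
    fs::create_dir_all(&path)?;
    Ok(path)
}

fn main() -> std::io::Result<()> {
    let dir = scratch_dir_for(Path::new("copy.slt"))?;
    assert_eq!(dir, Path::new("test_files/scratch/copy"));
    Ok(())
}
```

The switch from `fs::create_dir` to `fs::create_dir_all` matters here: since `run_tests` no longer sets up `test_files/scratch` up front, each per-file call must be able to create the parent directory as well.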
diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt
index a44a662ada239..d13caa47c17d4 100644
--- a/datafusion/sqllogictest/test_files/copy.slt
+++ b/datafusion/sqllogictest/test_files/copy.slt
@@ -21,19 +21,19 @@ create table source_table(col1 integer, col2 varchar) as values (1, 'Foo'), (2,

 # Copy to directory as multiple files
 query IT
-COPY source_table TO 'test_files/scratch/table' (format parquet, per_thread_output true);
+COPY source_table TO 'test_files/scratch/copy/table' (format parquet, per_thread_output true);
 ----
 2

 # Error case
 query error DataFusion error: Error during planning: Copy To format not explicitly set and unable to get file extension!
-EXPLAIN COPY source_table to 'test_files/scratch/table'
+EXPLAIN COPY source_table to 'test_files/scratch/copy/table'

 query TT
-EXPLAIN COPY source_table to 'test_files/scratch/table' (format parquet, per_thread_output true)
+EXPLAIN COPY source_table to 'test_files/scratch/copy/table' (format parquet, per_thread_output true)
 ----
 logical_plan
-CopyTo: format=parquet output_url=test_files/scratch/table per_thread_output=true options: (format parquet, per_thread_output true)
+CopyTo: format=parquet output_url=test_files/scratch/copy/table per_thread_output=true options: (format parquet, per_thread_output true)
 --TableScan: source_table projection=[col1, col2]
 physical_plan
 InsertExec: sink=ParquetSink(writer_mode=PutMultipart, file_groups=[])
@@ -41,13 +41,13 @@ InsertExec: sink=ParquetSink(writer_mode=PutMultipart, file_groups=[])

 # Copy more files to directory via query
 query IT
-COPY (select * from source_table UNION ALL select * from source_table) to 'test_files/scratch/table' (format parquet, per_thread_output true);
+COPY (select * from source_table UNION ALL select * from source_table) to 'test_files/scratch/copy/table' (format parquet, per_thread_output true);
 ----
 4

 # validate multiple parquet file output
 statement ok
-CREATE EXTERNAL TABLE validate_parquet STORED AS PARQUET LOCATION 'test_files/scratch/table/';
+CREATE EXTERNAL TABLE validate_parquet STORED AS PARQUET LOCATION 'test_files/scratch/copy/table/';

 query IT
 select * from validate_parquet;
@@ -61,13 +61,13 @@ select * from validate_parquet;

 # Copy from table to single file
 query IT
-COPY source_table to 'test_files/scratch/table.parquet';
+COPY source_table to 'test_files/scratch/copy/table.parquet';
 ----
 2

 # validate single parquet file output
 statement ok
-CREATE EXTERNAL TABLE validate_parquet_single STORED AS PARQUET LOCATION 'test_files/scratch/table.parquet';
+CREATE EXTERNAL TABLE validate_parquet_single STORED AS PARQUET LOCATION 'test_files/scratch/copy/table.parquet';

 query IT
 select * from validate_parquet_single;
@@ -77,13 +77,13 @@ select * from validate_parquet_single;

 # copy from table to folder of csv files
 query IT
-COPY source_table to 'test_files/scratch/table_csv' (format csv, per_thread_output true);
+COPY source_table to 'test_files/scratch/copy/table_csv' (format csv, per_thread_output true);
 ----
 2

 # validate folder of csv files
 statement ok
-CREATE EXTERNAL TABLE validate_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/table_csv';
+CREATE EXTERNAL TABLE validate_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/copy/table_csv';

 query IT
 select * from validate_csv;
@@ -93,13 +93,13 @@ select * from validate_csv;

 # Copy from table to single csv
 query IT
-COPY source_table to 'test_files/scratch/table.csv';
+COPY source_table to 'test_files/scratch/copy/table.csv';
 ----
 2

 # Validate single csv output
 statement ok
-CREATE EXTERNAL TABLE validate_single_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/table.csv';
+CREATE EXTERNAL TABLE validate_single_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/copy/table.csv';

 query IT
 select * from validate_single_csv;
@@ -109,13 +109,13 @@ select * from validate_single_csv;

 # Copy from table to folder of json
 query IT
-COPY source_table to 'test_files/scratch/table_json' (format json, per_thread_output true);
+COPY source_table to 'test_files/scratch/copy/table_json' (format json, per_thread_output true);
 ----
 2

 # Validate json output
 statement ok
-CREATE EXTERNAL TABLE validate_json STORED AS json LOCATION 'test_files/scratch/table_json';
+CREATE EXTERNAL TABLE validate_json STORED AS json LOCATION 'test_files/scratch/copy/table_json';

 query IT
 select * from validate_json;
@@ -125,13 +125,13 @@ select * from validate_json;

 # Copy from table to single json file
 query IT
-COPY source_table to 'test_files/scratch/table.json';
+COPY source_table to 'test_files/scratch/copy/table.json';
 ----
 2

 # Validate single JSON file
 statement ok
-CREATE EXTERNAL TABLE validate_single_json STORED AS json LOCATION 'test_files/scratch/table_json';
+CREATE EXTERNAL TABLE validate_single_json STORED AS json LOCATION 'test_files/scratch/copy/table_json';

 query IT
 select * from validate_single_json;
@@ -141,13 +141,13 @@ select * from validate_single_json;

 # Copy from table with options
 query IT
-COPY source_table to 'test_files/scratch/table.json' (row_group_size 55);
+COPY source_table to 'test_files/scratch/copy/table.json' (row_group_size 55);
 ----
 2

 # Copy from table with options (and trailing comma)
 query IT
-COPY source_table to 'test_files/scratch/table.json' (row_group_size 55, row_group_limit_bytes 9,);
+COPY source_table to 'test_files/scratch/copy/table.json' (row_group_size 55, row_group_limit_bytes 9,);
 ----
 2
diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt
index 6c9c6fdd47e46..4e14120aad1d7 100644
--- a/datafusion/sqllogictest/test_files/insert_to_external.slt
+++ b/datafusion/sqllogictest/test_files/insert_to_external.slt
@@ -49,7 +49,7 @@ statement ok
 CREATE EXTERNAL TABLE
 single_file_test(a bigint, b bigint)
 STORED AS csv
-LOCATION 'test_files/scratch/single_csv_table.csv'
+LOCATION 'test_files/scratch/insert_to_external/single_csv_table.csv'
 OPTIONS(
 create_local_path 'true',
 single_file 'true',
@@ -70,7 +70,7 @@ statement ok
 CREATE EXTERNAL TABLE
 directory_test(a bigint, b bigint)
 STORED AS parquet
-LOCATION 'test_files/scratch/external_parquet_table_q0'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q0'
 OPTIONS(
 create_local_path 'true',
 );
@@ -87,10 +87,10 @@ select * from directory_test;
 3 4

 statement ok
-CREATE EXTERNAL TABLE 
+CREATE EXTERNAL TABLE
 table_without_values(field1 BIGINT NULL, field2 BIGINT NULL)
 STORED AS parquet
-LOCATION 'test_files/scratch/external_parquet_table_q1'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q1'
 OPTIONS (create_local_path 'true');

 query TT
@@ -153,10 +153,10 @@ drop table table_without_values;

 # test_insert_into_as_select_multi_partitioned
 statement ok
-CREATE EXTERNAL TABLE 
+CREATE EXTERNAL TABLE
 table_without_values(field1 BIGINT NULL, field2 BIGINT NULL)
 STORED AS parquet
-LOCATION 'test_files/scratch/external_parquet_table_q2'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q2'
 OPTIONS (create_local_path 'true');

 query TT
@@ -197,10 +197,10 @@ drop table table_without_values;

 # test_insert_into_with_sort
 statement ok
-CREATE EXTERNAL TABLE 
+CREATE EXTERNAL TABLE
 table_without_values(c1 varchar NULL)
 STORED AS parquet
-LOCATION 'test_files/scratch/external_parquet_table_q3'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q3'
 OPTIONS (create_local_path 'true');

 # verify that the sort order of the insert query is maintained into the
@@ -237,10 +237,10 @@ drop table table_without_values;

 # test insert with column names
 statement ok
-CREATE EXTERNAL TABLE 
+CREATE EXTERNAL TABLE
 table_without_values(id BIGINT, name varchar)
 STORED AS parquet
-LOCATION 'test_files/scratch/external_parquet_table_q4'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q4'
 OPTIONS (create_local_path 'true');

 query IT
@@ -276,10 +276,10 @@ drop table table_without_values;

 # test insert with non-nullable column
 statement ok
-CREATE EXTERNAL TABLE 
+CREATE EXTERNAL TABLE
 table_without_values(field1 BIGINT NOT NULL, field2 BIGINT NULL)
 STORED AS parquet
-LOCATION 'test_files/scratch/external_parquet_table_q5'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q5'
 OPTIONS (create_local_path 'true');

 query II
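As a usage note, a hypothetical new test file `foo.slt` would follow the same convention, writing only beneath its own scratch directory `test_files/scratch/foo` (table and path names here are illustrative):

```
# hypothetical foo.slt: write only to this file's scratch directory
statement ok
CREATE EXTERNAL TABLE foo_sink(a bigint)
STORED AS parquet
LOCATION 'test_files/scratch/foo/sink'
OPTIONS (create_local_path 'true');
```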