From b9b7c9b98d958aa6dd24e9068f510901843e2e5f Mon Sep 17 00:00:00 2001 From: BePPPower Date: Wed, 11 Oct 2023 10:33:02 +0800 Subject: [PATCH 1/7] fix --- be/src/vec/exec/format/csv/csv_reader.cpp | 9 ++++----- .../ExternalFileTableValuedFunction.java | 14 +++++++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 2382cb92de407f..630124852da751 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -528,6 +528,9 @@ Status CsvReader::get_parsed_schema(std::vector* col_names, bool is_parse_name = false; RETURN_IF_ERROR(_prepare_parse(&read_line, &is_parse_name)); + if (_file_reader->size() == 0) { + return Status::OK(); + } if (read_line == 1) { if (!is_parse_name) { //parse csv file without names and types size_t col_nums = 0; @@ -833,10 +836,6 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool* is_parse_name) { reader_options, &_file_system, &_file_reader)); } - if (_file_reader->size() == 0 && _params.file_type != TFileType::FILE_STREAM && - _params.file_type != TFileType::FILE_BROKER) { - return Status::EndOfFile("get parsed schema failed, empty csv file: " + _range.path); - } // get column_separator and line_delimiter _value_separator = _params.file_attributes.text_params.column_separator; @@ -927,7 +926,7 @@ Status CsvReader::_parse_col_names(std::vector* col_names) { // no use of _line_reader_eof RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); if (size == 0) { - return Status::InternalError("The first line is empty, can not parse column names"); + return Status::OK(); } if (!validate_utf8(const_cast(reinterpret_cast(ptr)), size)) { return Status::InternalError("Only support csv data in utf8 codec"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index 648011ff6dcc3f..03d30a1c529e88 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -344,6 +344,10 @@ public List getTableColumns() throws AnalysisException { TNetworkAddress address = new TNetworkAddress(be.getHost(), be.getBrpcPort()); try { PFetchTableSchemaRequest request = getFetchTableStructureRequest(); + if (request == null) { + columns.add(new Column("__dummy_col", ScalarType.createStringType(), true)); + return columns; + } Future future = BackendServiceProxy.getInstance() .fetchTableStructureAsync(address, request); @@ -431,10 +435,10 @@ private Pair getColumnType(List typeNodes, int start) return Pair.of(type, parsedNodes); } - private void fillColumns(InternalService.PFetchTableSchemaResult result) - throws AnalysisException { + private void fillColumns(InternalService.PFetchTableSchemaResult result) { if (result.getColumnNums() == 0) { - throw new AnalysisException("The amount of column is 0"); + columns.add(new Column("__dummy_col", ScalarType.createStringType(), true)); + return; } // add fetched file columns for (int idx = 0; idx < result.getColumnNums(); ++idx) { @@ -450,7 +454,7 @@ private void fillColumns(InternalService.PFetchTableSchemaResult result) } } - private PFetchTableSchemaRequest getFetchTableStructureRequest() throws AnalysisException, TException { + private PFetchTableSchemaRequest getFetchTableStructureRequest() throws TException { // set TFileScanRangeParams TFileScanRangeParams fileScanRangeParams = new TFileScanRangeParams(); fileScanRangeParams.setFormatType(fileFormatType); @@ -482,7 +486,7 @@ private PFetchTableSchemaRequest getFetchTableStructureRequest() throws Analysis break; } if (firstFile == null) { - throw new AnalysisException("Can not get first file, please check 
uri."); + return null; } // set TFileRangeDesc From 9a5ff4be2dc92d4d0bf581337f18c8c786f3856c Mon Sep 17 00:00:00 2001 From: BePPPower Date: Wed, 11 Oct 2023 11:42:46 +0800 Subject: [PATCH 2/7] add test --- .../data/load_p0/tvf/test_tvf_empty_file.out | 6 ++ .../data/load_p0/tvf/test_tvf_error_url.out | 11 ++++ .../load_p0/tvf/test_tvf_empty_file.groovy | 44 +++++++++++++ .../load_p0/tvf/test_tvf_error_url.groovy | 61 +++++++++++++++++++ 4 files changed, 122 insertions(+) create mode 100644 regression-test/data/load_p0/tvf/test_tvf_empty_file.out create mode 100644 regression-test/data/load_p0/tvf/test_tvf_error_url.out create mode 100644 regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy create mode 100644 regression-test/suites/load_p0/tvf/test_tvf_error_url.groovy diff --git a/regression-test/data/load_p0/tvf/test_tvf_empty_file.out b/regression-test/data/load_p0/tvf/test_tvf_empty_file.out new file mode 100644 index 00000000000000..58e48d2b7c3904 --- /dev/null +++ b/regression-test/data/load_p0/tvf/test_tvf_empty_file.out @@ -0,0 +1,6 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- + +-- !desc -- +__dummy_col TEXT Yes false \N NONE + diff --git a/regression-test/data/load_p0/tvf/test_tvf_error_url.out b/regression-test/data/load_p0/tvf/test_tvf_error_url.out new file mode 100644 index 00000000000000..468a50ff85d073 --- /dev/null +++ b/regression-test/data/load_p0/tvf/test_tvf_error_url.out @@ -0,0 +1,11 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select -- + +-- !desc -- +__dummy_col TEXT Yes false \N NONE + +-- !select2 -- + +-- !desc2 -- +__dummy_col TEXT Yes false \N NONE + diff --git a/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy b/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy new file mode 100644 index 00000000000000..0a15d872ff59a0 --- /dev/null +++ b/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_tvf_empty_file", "p0") { + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String region = getS3Region() + String bucket = context.config.otherConfigs.get("s3BucketName"); + + String path = "regression/datalake" + + order_qt_select """ SELECT * FROM S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/empty_file_test.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + order_qt_desc """ desc function S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/empty_file_test.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ +} \ No newline at end of file diff --git a/regression-test/suites/load_p0/tvf/test_tvf_error_url.groovy b/regression-test/suites/load_p0/tvf/test_tvf_error_url.groovy new file mode 100644 index 00000000000000..d1dcff4d5300b4 --- /dev/null +++ b/regression-test/suites/load_p0/tvf/test_tvf_error_url.groovy @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_tvf_error_url", "p0") { + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String region = getS3Region() + String bucket = context.config.otherConfigs.get("s3BucketName"); + + String path = "select_tvf/no_exists_file_test" + order_qt_select """ SELECT * FROM S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/no_exist_file1.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + order_qt_desc """ desc function S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/no_exist_file1.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + order_qt_select2 """ SELECT * FROM S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/*.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + order_qt_desc2 """ desc function S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/*.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ +} From 3e8dbed26411506e5ee58e0dffb2dea21e9abb6c Mon Sep 17 00:00:00 2001 From: BePPPower Date: Thu, 12 Oct 2023 11:26:02 +0800 Subject: [PATCH 3/7] fix code --- be/src/vec/exec/format/csv/csv_reader.cpp | 9 ++-- .../ExternalFileTableValuedFunction.java | 48 +++++++++++-------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 630124852da751..2382cb92de407f 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -528,9 +528,6 @@ Status CsvReader::get_parsed_schema(std::vector* col_names, bool is_parse_name = false; RETURN_IF_ERROR(_prepare_parse(&read_line, &is_parse_name)); - if (_file_reader->size() == 0) { - return Status::OK(); - } if (read_line == 1) { if (!is_parse_name) { //parse csv file 
without names and types size_t col_nums = 0; @@ -836,6 +833,10 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool* is_parse_name) { reader_options, &_file_system, &_file_reader)); } + if (_file_reader->size() == 0 && _params.file_type != TFileType::FILE_STREAM && + _params.file_type != TFileType::FILE_BROKER) { + return Status::EndOfFile("get parsed schema failed, empty csv file: " + _range.path); + } // get column_separator and line_delimiter _value_separator = _params.file_attributes.text_params.column_separator; @@ -926,7 +927,7 @@ Status CsvReader::_parse_col_names(std::vector* col_names) { // no use of _line_reader_eof RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); if (size == 0) { - return Status::OK(); + return Status::InternalError("The first line is empty, can not parse column names"); } if (!validate_utf8(const_cast(reinterpret_cast(ptr)), size)) { return Status::InternalError("Only support csv data in utf8 codec"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index 03d30a1c529e88..5a7558422af8d3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -344,26 +344,27 @@ public List getTableColumns() throws AnalysisException { TNetworkAddress address = new TNetworkAddress(be.getHost(), be.getBrpcPort()); try { PFetchTableSchemaRequest request = getFetchTableStructureRequest(); - if (request == null) { - columns.add(new Column("__dummy_col", ScalarType.createStringType(), true)); - return columns; - } - Future future = BackendServiceProxy.getInstance() - .fetchTableStructureAsync(address, request); - - InternalService.PFetchTableSchemaResult result = future.get(); - TStatusCode code = 
TStatusCode.findByValue(result.getStatus().getStatusCode()); - String errMsg; - if (code != TStatusCode.OK) { - if (!result.getStatus().getErrorMsgsList().isEmpty()) { - errMsg = result.getStatus().getErrorMsgsList().get(0); - } else { - errMsg = "fetchTableStructureAsync failed. backend address: " - + address.getHostname() + ":" + address.getPort(); + InternalService.PFetchTableSchemaResult result = null; + + // `request == null` means we don't need to get schemas from BE, + // and we fill a dummy col for this table. + if (request != null) { + Future future = BackendServiceProxy.getInstance() + .fetchTableStructureAsync(address, request); + + result = future.get(); + TStatusCode code = TStatusCode.findByValue(result.getStatus().getStatusCode()); + String errMsg; + if (code != TStatusCode.OK) { + if (!result.getStatus().getErrorMsgsList().isEmpty()) { + errMsg = result.getStatus().getErrorMsgsList().get(0); + } else { + errMsg = "fetchTableStructureAsync failed. backend address: " + + address.getHostname() + ":" + address.getPort(); + } + throw new AnalysisException(errMsg); } - throw new AnalysisException(errMsg); } - fillColumns(result); } catch (RpcException e) { throw new AnalysisException("fetchTableStructureResult rpc exception", e); @@ -436,7 +437,9 @@ private Pair getColumnType(List typeNodes, int start) } private void fillColumns(InternalService.PFetchTableSchemaResult result) { - if (result.getColumnNums() == 0) { + // `result == null` means we don't need to get schemas from BE, + // and we fill a dummy col for this table. 
+ if (result == null) { columns.add(new Column("__dummy_col", ScalarType.createStringType(), true)); return; } @@ -479,12 +482,17 @@ private PFetchTableSchemaRequest getFetchTableStructureRequest() throws TExcepti // get first file, used to parse table schema TBrokerFileStatus firstFile = null; for (TBrokerFileStatus fileStatus : fileStatuses) { - if (fileStatus.isIsDir()) { + if (fileStatus.isIsDir() || fileStatus.size == 0) { continue; } firstFile = fileStatus; break; } + + // `firstFile == null` means: + // 1. No matching file path exists + // 2. All matched files have a size of 0 + // For these two situations, we don't need to get schema from BE if (firstFile == null) { return null; } From d612d432eed7eb1ad7982087eb43c0ae4b9a177a Mon Sep 17 00:00:00 2001 From: BePPPower Date: Thu, 12 Oct 2023 13:14:49 +0800 Subject: [PATCH 4/7] modify docs and add tests --- .../sql-functions/table-functions/hdfs.md | 2 +- .../sql-functions/table-functions/s3.md | 1 + .../sql-functions/table-functions/hdfs.md | 2 +- .../sql-functions/table-functions/s3.md | 1 + .../data/load_p0/tvf/test_tvf_empty_file.out | 11 +++ .../load_p0/tvf/test_tvf_empty_file.groovy | 71 +++++++++++++------ 6 files changed, 63 insertions(+), 25 deletions(-) diff --git a/docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md b/docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md index 7cbd21366a207b..6e319421988385 100644 --- a/docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md +++ b/docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md @@ -51,7 +51,7 @@ hdfs( Related parameters for accessing hdfs: -- `uri`: (required) hdfs uri. +- `uri`: (required) hdfs uri. If the uri path does not exist or the files are empty files, hdfs tvf will return an empty result set. - `fs.defaultFS`: (required) - `hadoop.username`: (required) Can be any string, but cannot be empty. 
- `hadoop.security.authentication`: (optional) diff --git a/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md b/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md index d089c981558bcc..c482542595da71 100644 --- a/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md +++ b/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md @@ -63,6 +63,7 @@ Related parameters for accessing S3: > Note: URI currently supports three SCHEMA: http://, https:// and s3://. > 1. If you use http:// or https://, you will decide whether to use the 'path style' to access s3 based on the 'use_path_style' parameter > 2. If you use s3://, you will use the "virtual-hosted style' to access the s3, 'use_path_style' parameter is invalid. +> 3. If the uri path does not exist or the files are empty files, s3 tvf will return an empty result set. > > For detailed use cases, you can refer to Best Practice at the bottom. diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/hdfs.md b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/hdfs.md index c7faaa7a86f46f..709f3ec80d56cf 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/hdfs.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/hdfs.md @@ -53,7 +53,7 @@ hdfs( **参数说明** 访问hdfs相关参数: -- `uri`:(必填) 访问hdfs的uri。 +- `uri`:(必填) 访问hdfs的uri。如果uri路径不存在或文件都是空文件,hdfs tvf将返回空集合。 - `fs.defaultFS`:(必填) - `hadoop.username`: (必填)可以是任意字符串,但不能为空 - `hadoop.security.authentication`:(选填) diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md index 081734985ca490..5a9ffd60404b8c 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md @@ -66,6 +66,7 @@ S3 tvf中的每一个参数都是一个 `"key"="value"` 对。 > 注意:uri目前支持三种schema:http://, https:// 和 s3:// > 1. 如果使用http://或https://, 则会根据 'use_path_style' 参数来决定是否使用'path style'方式访问s3 > 2. 
如果使用s3://, 则都使用 'virtual-hosted style' 方式访问s3, 'use_path_style'参数无效。 +> 3. 如果uri路径不存在或文件都是空文件,s3 tvf将返回空集合 > > 详细使用案例可以参考最下方 Best Practice。 diff --git a/regression-test/data/load_p0/tvf/test_tvf_empty_file.out b/regression-test/data/load_p0/tvf/test_tvf_empty_file.out index 58e48d2b7c3904..59822770e2cd0c 100644 --- a/regression-test/data/load_p0/tvf/test_tvf_empty_file.out +++ b/regression-test/data/load_p0/tvf/test_tvf_empty_file.out @@ -4,3 +4,14 @@ -- !desc -- __dummy_col TEXT Yes false \N NONE +-- !select2 -- +1 doris 18 +2 nereids 20 +3 xxx 22 +4 yyy 21 + +-- !des2 -- +c1 TEXT Yes false \N NONE +c2 TEXT Yes false \N NONE +c3 TEXT Yes false \N NONE + diff --git a/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy b/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy index 0a15d872ff59a0..9877716ae8c679 100644 --- a/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy +++ b/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy @@ -16,29 +16,54 @@ // under the License. suite("test_tvf_empty_file", "p0") { - String ak = getS3AK() - String sk = getS3SK() - String s3_endpoint = getS3Endpoint() - String region = getS3Region() - String bucket = context.config.otherConfigs.get("s3BucketName"); + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String region = getS3Region() + String bucket = context.config.otherConfigs.get("s3BucketName"); - String path = "regression/datalake" + String path = "regression/datalake" - order_qt_select """ SELECT * FROM S3 ( - "uri" = "http://${s3_endpoint}/${bucket}/${path}/empty_file_test.csv", - "ACCESS_KEY"= "${ak}", - "SECRET_KEY" = "${sk}", - "format" = "csv", - "region" = "${region}" - ); - """ + // ${path}/empty_file_test.csv is an empty file + // so it should return empty sets. 
+ order_qt_select """ SELECT * FROM S3 ( + "uri" = "http://${bucket}.${s3_endpoint}/${path}/empty_file_test.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ - order_qt_desc """ desc function S3 ( - "uri" = "http://${s3_endpoint}/${bucket}/${path}/empty_file_test.csv", - "ACCESS_KEY"= "${ak}", - "SECRET_KEY" = "${sk}", - "format" = "csv", - "region" = "${region}" - ); - """ -} \ No newline at end of file + order_qt_desc """ desc function S3 ( + "uri" = "http://${bucket}.${s3_endpoint}/${path}/empty_file_test.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + // ${path}/empty_file_test*.csv matches 3 files: + // empty_file_test.csv, empty_file_test_1.csv, empty_file_test_2.csv + // empty_file_test.csv is an empty file, but + // empty_file_test_1.csv and empty_file_test_2.csv have data + // so it should return data of empty_file_test_1.csv and empty_file_test_2.cs + order_qt_select2 """ SELECT * FROM S3 ( + "uri" = "http://${bucket}.${s3_endpoint}/${path}/empty_file_test*.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ) order by c1; + """ + + order_qt_des2 """ desc function S3 ( + "uri" = "http://${bucket}.${s3_endpoint}/${path}/empty_file_test*.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ +} From 94d168be60533c3e0daf0b33ca67cddcd1da64f6 Mon Sep 17 00:00:00 2001 From: BePPPower Date: Fri, 13 Oct 2023 14:16:07 +0800 Subject: [PATCH 5/7] add hdfs test --- .../tvf/test_hdfs_tvf_error_uri.out | 6 +++ .../tvf/test_hdfs_tvf_error_uri.groovy | 43 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 regression-test/data/external_table_p0/tvf/test_hdfs_tvf_error_uri.out create mode 100644 regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_error_uri.groovy diff --git 
a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_error_uri.out b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_error_uri.out new file mode 100644 index 00000000000000..115f42f2a0e23f --- /dev/null +++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_error_uri.out @@ -0,0 +1,6 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select1 -- + +-- !desc1 -- +__dummy_col TEXT Yes false \N NONE + diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_error_uri.groovy b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_error_uri.groovy new file mode 100644 index 00000000000000..3f663c25e730a1 --- /dev/null +++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_error_uri.groovy @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hdfs_tvf_error_uri","external,hive,tvf,external_docker") { + String hdfs_port = context.config.otherConfigs.get("hdfs_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + // It's okay to use random `hdfsUser`, but can not be empty. 
+ def hdfsUserName = "doris" + def format = "csv" + def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}" + def uri = "" + + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + // test csv format + uri = "${defaultFS}" + "/user/doris/preinstalled_data/csv_format_test/no_exist_file.csv" + format = "csv" + order_qt_select1 """ select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "${format}"); """ + + order_qt_desc1 """ desc function HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "${format}"); """ + } +} \ No newline at end of file From abb9bbc68d29ee1fe757ec3afab583c438949326 Mon Sep 17 00:00:00 2001 From: BePPPower Date: Fri, 13 Oct 2023 15:15:55 +0800 Subject: [PATCH 6/7] add docs --- docs/zh-CN/docs/lakehouse/file.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/zh-CN/docs/lakehouse/file.md b/docs/zh-CN/docs/lakehouse/file.md index 4a8996a10f13fb..6c2bc28a653906 100644 --- a/docs/zh-CN/docs/lakehouse/file.md +++ b/docs/zh-CN/docs/lakehouse/file.md @@ -193,3 +193,9 @@ FROM s3( "use_path_style" = "true"); ``` +### 注意事项 + +1. 如果 `S3 / hdfs` tvf指定的uri匹配不到文件,或者匹配到的所有文件都是空文件,那么 `S3 / hdfs` tvf将会返回空结果集。在这种情况下使用`DESC FUNCTION`查看这个文件的Schema,会得到一列虚假的列`__dummy_col`,可忽略这一列。 + +2. 如果指定tvf的format为csv,所读文件不为空文件但文件第一行为空,则会提示错误`The first line is empty, can not parse column numbers`, 这因为无法通过该文件的第一行解析出schema。 + From 52f04d1008ca5b2f92e3c9c870a10bad64c39170 Mon Sep 17 00:00:00 2001 From: BePPPower Date: Fri, 13 Oct 2023 18:38:59 +0800 Subject: [PATCH 7/7] add en docs --- docs/en/docs/lakehouse/file.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/en/docs/lakehouse/file.md b/docs/en/docs/lakehouse/file.md index d7f462049608eb..cc78338ee423fc 100644 --- a/docs/en/docs/lakehouse/file.md +++ b/docs/en/docs/lakehouse/file.md @@ -193,3 +193,9 @@ FROM s3( "use_path_style" = "true"); ``` + +### Note + +1. 
If the URI specified by the `S3 / HDFS` TVF is not matched with any file, or all the matched files are empty files, then the `S3 / HDFS` TVF will return an empty result set. In this case, using the `DESC FUNCTION` to view the schema of this file, you will get a dummy column `__dummy_col`, which can be ignored. + +2. If the format of the TVF is specified to `CSV`, and the read file is not an empty file but the first line of this file is empty, then it will prompt the error `The first line is empty, can not parse column names`. This is because the schema cannot be parsed from the first line of the file. \ No newline at end of file