From bdc69822b3a1f01985078b44cd0db94db812771b Mon Sep 17 00:00:00 2001 From: Tiewei Fang <43782773+BePPPower@users.noreply.github.com> Date: Tue, 17 Oct 2023 09:52:53 +0800 Subject: [PATCH] [fix](Tvf) return empty set when tvf queries an empty file or an error uri (#25280) return errors when tvf queries an empty file or an error uri: 1. get parsed schema failed, empty csv file 2. Can not get first file, please check uri. we just return empty set when tvf queries an empty file or an error uri. ```sql mysql> select * from s3( "uri" = "https://error_uri/exp_1.csv", "s3.access_key"= "xx", "s3.secret_key" = "yy", "format" = "csv") limit 10; Empty set (1.29 sec) ``` --- .../ExternalFileTableValuedFunction.java | 56 ++++++++------- .../tvf/test_hdfs_tvf_error_uri.out | 6 ++ .../data/load_p0/tvf/test_tvf_empty_file.out | 17 +++++ .../data/load_p0/tvf/test_tvf_error_url.out | 11 +++ .../tvf/test_hdfs_tvf_error_uri.groovy | 43 ++++++++++++ .../load_p0/tvf/test_tvf_empty_file.groovy | 69 +++++++++++++++++++ .../load_p0/tvf/test_tvf_error_url.groovy | 61 ++++++++++++++++ 7 files changed, 240 insertions(+), 23 deletions(-) create mode 100644 regression-test/data/external_table_p0/tvf/test_hdfs_tvf_error_uri.out create mode 100644 regression-test/data/load_p0/tvf/test_tvf_empty_file.out create mode 100644 regression-test/data/load_p0/tvf/test_tvf_error_url.out create mode 100644 regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_error_uri.groovy create mode 100644 regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy create mode 100644 regression-test/suites/load_p0/tvf/test_tvf_error_url.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index 1e89fd41a5d399..eabe82804c3107 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -37,7 +37,6 @@ import org.apache.doris.common.util.BrokerUtil; import org.apache.doris.common.util.FileFormatConstants; import org.apache.doris.common.util.FileFormatUtils; -import org.apache.doris.common.util.NetUtils; import org.apache.doris.common.util.Util; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.planner.ScanNode; @@ -316,23 +315,27 @@ public List getTableColumns() throws AnalysisException { TNetworkAddress address = new TNetworkAddress(be.getHost(), be.getBrpcPort()); try { PFetchTableSchemaRequest request = getFetchTableStructureRequest(); - Future future = BackendServiceProxy.getInstance() - .fetchTableStructureAsync(address, request); - - InternalService.PFetchTableSchemaResult result = future.get(); - TStatusCode code = TStatusCode.findByValue(result.getStatus().getStatusCode()); - String errMsg; - if (code != TStatusCode.OK) { - if (!result.getStatus().getErrorMsgsList().isEmpty()) { - errMsg = result.getStatus().getErrorMsgsList().get(0); - } else { - errMsg = "fetchTableStructureAsync failed. backend address: " - + NetUtils - .getHostPortInAccessibleFormat(address.getHostname(), address.getPort()); + InternalService.PFetchTableSchemaResult result = null; + + // `request == null` means we don't need to get schemas from BE, + // and we fill a dummy col for this table. + if (request != null) { + Future future = BackendServiceProxy.getInstance() + .fetchTableStructureAsync(address, request); + + result = future.get(); + TStatusCode code = TStatusCode.findByValue(result.getStatus().getStatusCode()); + String errMsg; + if (code != TStatusCode.OK) { + if (!result.getStatus().getErrorMsgsList().isEmpty()) { + errMsg = result.getStatus().getErrorMsgsList().get(0); + } else { + errMsg = "fetchTableStructureAsync failed. backend address: " + + address.getHostname() + ":" + address.getPort(); + } + throw new AnalysisException(errMsg); } - throw new AnalysisException(errMsg); } - fillColumns(result); } catch (RpcException e) { throw new AnalysisException("fetchTableStructureResult rpc exception", e); @@ -393,10 +396,12 @@ private Pair getColumnType(List typeNodes, int start) return Pair.of(type, parsedNodes); } - private void fillColumns(InternalService.PFetchTableSchemaResult result) - throws AnalysisException { - if (result.getColumnNums() == 0) { - throw new AnalysisException("The amount of column is 0"); + private void fillColumns(InternalService.PFetchTableSchemaResult result) { + // `result == null` means we don't need to get schemas from BE, + // and we fill a dummy col for this table. + if (result == null) { + columns.add(new Column("__dummy_col", ScalarType.createStringType(), true)); + return; } // add fetched file columns for (int idx = 0; idx < result.getColumnNums(); ++idx) { @@ -412,7 +417,7 @@ private void fillColumns(InternalService.PFetchTableSchemaResult result) } } - private PFetchTableSchemaRequest getFetchTableStructureRequest() throws AnalysisException, TException { + private PFetchTableSchemaRequest getFetchTableStructureRequest() throws TException { // set TFileScanRangeParams TFileScanRangeParams fileScanRangeParams = new TFileScanRangeParams(); fileScanRangeParams.setFormatType(fileFormatType); @@ -429,14 +434,19 @@ private PFetchTableSchemaRequest getFetchTableStructureRequest() throws Analysis // get first file, used to parse table schema TBrokerFileStatus firstFile = null; for (TBrokerFileStatus fileStatus : fileStatuses) { - if (fileStatus.isIsDir()) { + if (fileStatus.isIsDir() || fileStatus.size == 0) { continue; } firstFile = fileStatus; break; } + + // `firstFile == null` means: + // 1. No matching file path exists + // 2. All matched files have a size of 0 + // For these two situations, we don't need to get schema from BE if (firstFile == null) { - throw new AnalysisException("Can not get first file, please check uri."); + return null; } // set TFileRangeDesc diff --git a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_error_uri.out b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_error_uri.out new file mode 100644 index 00000000000000..115f42f2a0e23f --- /dev/null +++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_error_uri.out @@ -0,0 +1,6 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select1 -- + +-- !desc1 -- +__dummy_col TEXT Yes false \N NONE + diff --git a/regression-test/data/load_p0/tvf/test_tvf_empty_file.out b/regression-test/data/load_p0/tvf/test_tvf_empty_file.out new file mode 100644 index 00000000000000..59822770e2cd0c --- /dev/null +++ b/regression-test/data/load_p0/tvf/test_tvf_empty_file.out @@ -0,0 +1,17 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- + +-- !desc -- +__dummy_col TEXT Yes false \N NONE + +-- !select2 -- +1 doris 18 +2 nereids 20 +3 xxx 22 +4 yyy 21 + +-- !des2 -- +c1 TEXT Yes false \N NONE +c2 TEXT Yes false \N NONE +c3 TEXT Yes false \N NONE + diff --git a/regression-test/data/load_p0/tvf/test_tvf_error_url.out b/regression-test/data/load_p0/tvf/test_tvf_error_url.out new file mode 100644 index 00000000000000..468a50ff85d073 --- /dev/null +++ b/regression-test/data/load_p0/tvf/test_tvf_error_url.out @@ -0,0 +1,11 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- + +-- !desc -- +__dummy_col TEXT Yes false \N NONE + +-- !select2 -- + +-- !desc2 -- +__dummy_col TEXT Yes false \N NONE + diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_error_uri.groovy b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_error_uri.groovy new file mode 100644 index 00000000000000..3f663c25e730a1 --- /dev/null +++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_error_uri.groovy @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hdfs_tvf_error_uri","external,hive,tvf,external_docker") { + String hdfs_port = context.config.otherConfigs.get("hdfs_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + // It's okay to use random `hdfsUser`, but can not be empty. + def hdfsUserName = "doris" + def format = "csv" + def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}" + def uri = "" + + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + // test csv format + uri = "${defaultFS}" + "/user/doris/preinstalled_data/csv_format_test/no_exist_file.csv" + format = "csv" + order_qt_select1 """ select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "${format}"); """ + + order_qt_desc1 """ desc function HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "${format}"); """ + } +} \ No newline at end of file diff --git a/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy b/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy new file mode 100644 index 00000000000000..9877716ae8c679 --- /dev/null +++ b/regression-test/suites/load_p0/tvf/test_tvf_empty_file.groovy @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_tvf_empty_file", "p0") { + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String region = getS3Region() + String bucket = context.config.otherConfigs.get("s3BucketName"); + + String path = "regression/datalake" + + // ${path}/empty_file_test.csv is an empty file + // so it should return empty sets. + order_qt_select """ SELECT * FROM S3 ( + "uri" = "http://${bucket}.${s3_endpoint}/${path}/empty_file_test.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + order_qt_desc """ desc function S3 ( + "uri" = "http://${bucket}.${s3_endpoint}/${path}/empty_file_test.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + // ${path}/empty_file_test*.csv matches 3 files: + // empty_file_test.csv, empty_file_test_1.csv, empty_file_test_2.csv + // empty_file_test.csv is an empty file, but + // empty_file_test_1.csv and empty_file_test_2.csv have data + // so it should return data of empty_file_test_1.csv and empty_file_test_2.cs + order_qt_select2 """ SELECT * FROM S3 ( + "uri" = "http://${bucket}.${s3_endpoint}/${path}/empty_file_test*.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ) order by c1; + """ + + order_qt_des2 """ desc function S3 ( + "uri" = "http://${bucket}.${s3_endpoint}/${path}/empty_file_test*.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ +} diff --git a/regression-test/suites/load_p0/tvf/test_tvf_error_url.groovy b/regression-test/suites/load_p0/tvf/test_tvf_error_url.groovy new file mode 100644 index 00000000000000..d1dcff4d5300b4 --- /dev/null +++ b/regression-test/suites/load_p0/tvf/test_tvf_error_url.groovy @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_tvf_error_url", "p0") { + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String region = getS3Region() + String bucket = context.config.otherConfigs.get("s3BucketName"); + + String path = "select_tvf/no_exists_file_test" + order_qt_select """ SELECT * FROM S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/no_exist_file1.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + order_qt_desc """ desc function S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/no_exist_file1.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + order_qt_select2 """ SELECT * FROM S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/*.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ + + order_qt_desc2 """ desc function S3 ( + "uri" = "http://${s3_endpoint}/${bucket}/${path}/*.csv", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "region" = "${region}" + ); + """ +}