From 9b70da6902bf79c87d2116b43928fc37602f5eaf Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 25 Mar 2025 18:30:35 +0800 Subject: [PATCH] [fix](multi-catalog) Fix bug: "Can not create a Path from an empty string" (#49382) Problem Summary: In HiveMetaStoreCache, the function FileInputFormat.setInputPaths is used to set input paths. However, this function splits paths using commas, which is not the expected behavior. As a result, when partition values contain commas, it leads to incorrect path parsing and potential errors. ```java public static void setInputPaths(JobConf conf, String commaSeparatedPaths) { setInputPaths(conf, StringUtils.stringToPath( getPathStrings(commaSeparatedPaths))); } ``` To prevent FileInputFormat.setInputPaths from splitting paths by commas, we use another overloaded version of the method. Instead of passing a comma-separated string, we explicitly pass a Path object, ensuring that partition values containing commas are handled correctly. ```java public static void setInputPaths(JobConf conf, Path... inputPaths) { Path path = new Path(conf.getWorkingDirectory(), inputPaths[0]); StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString())); for(int i = 1; i < inputPaths.length;i++) { str.append(StringUtils.COMMA_STR); path = new Path(conf.getWorkingDirectory(), inputPaths[i]); str.append(StringUtils.escapeString(path.toString())); } conf.set(org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input. 
FileInputFormat.INPUT_DIR, str.toString()); } ``` --- .../create_preinstalled_scripts/run74.hql | 53 ++++++++++++++++++ .../partition_col=,/000000_0 | Bin 0 -> 408 bytes .../partition_col=a, b, c/000000_0 | Bin 0 -> 408 bytes .../partition_col=a, b/000000_0 | Bin 0 -> 408 bytes .../datasource/hive/HiveMetaStoreCache.java | 3 +- .../hive/test_hive_partitions.out | 10 ++++ .../hive/test_hive_partitions.groovy | 6 +- 7 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run74.hql create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=,/000000_0 create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=a, b, c/000000_0 create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=a, b/000000_0 diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run74.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run74.hql new file mode 100644 index 00000000000000..31e98f370d5009 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run74.hql @@ -0,0 +1,53 @@ +create database if not exists partition_tables; +use partition_tables; + +CREATE TABLE decimal_partition_table ( + id INT, + name STRING, + value FLOAT +) +PARTITIONED BY (partition_col DECIMAL(10, 2)) +STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/partition_tables/decimal_partition_table'; + +CREATE TABLE int_partition_table ( + id INT, + name STRING, + value FLOAT +) +PARTITIONED BY (partition_col INT) +STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/partition_tables/int_partition_table'; + +CREATE TABLE string_partition_table ( + id 
INT, + name STRING, + value FLOAT +) +PARTITIONED BY (partition_col STRING) +STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/partition_tables/string_partition_table'; + +CREATE TABLE date_partition_table ( + id INT, + name STRING, + value FLOAT +) +PARTITIONED BY (partition_col DATE) +STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/partition_tables/date_partition_table'; + +CREATE TABLE string_partition_table_with_comma ( + id INT, + name STRING, + value FLOAT +) +PARTITIONED BY (partition_col STRING) +STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/partition_tables/string_partition_table_with_comma'; + +msck repair table decimal_partition_table; +msck repair table int_partition_table; +msck repair table string_partition_table; +msck repair table date_partition_table; +msck repair table string_partition_table_with_comma; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=,/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=,/000000_0 new file mode 100644 index 0000000000000000000000000000000000000000..a93ce013162c2057aa7974cd8afce3a6c638a218 GIT binary patch literal 408 zcmZXR!AiqG5QcZxSQhNTLuW}Khg=#eXfaJBEd<1yLJq}4kCjb!mj;@usmW9LBz+cr zACFGjDy2Q_@Xzq=Km5DXhvxtY^q_YQ3`GaHYg?2Mva13WVR+7HJ48Z&I6$eisXT(@ zsp0q6=e=#U^?+?#U{3mYu_zk-vd$N}l+Yo-`pjm9BG5%)tz@bDxSC^g-pFOGiC~WF zBtCcX`W+`7r^NxsVkTxjPfil}a~HaMbZfYUQ<3`f1OxnK@Hf;2zUOwk{#`c|ZI5z@ zA1W9bJiW>4z1AN$W#u)xS{JV?FBp!8fj7wNVyV1HW-?Mz$60o(BCWFA#FJ4dbr6OV UGa4Hen%Ly#id^C=Pw`KF0rNvglmGw# literal 0 HcmV?d00001 diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=a, b, c/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=a, b, 
c/000000_0 new file mode 100644 index 0000000000000000000000000000000000000000..4e6e043ccf509af68041c13b0b0eb838916e6eb1 GIT binary patch literal 408 zcmZXR!Arw16vmURj)FUQ*b6Q6(96mM9d@a7gC6vz(8EyJagrt}E38vH+dpc5p+{f0 zDa9S~_}=4}@9`yz$CnTY3}JW!3`HMo&#`)hkYnYj2*cB(?GOnLyn|BZP}PAbLh9U5 zh}Q?lYHNpWTVQVbbh$1X^R`oKQ%dL)U?b+RLK7IEu(q-^BV27`bJ566Z3t(s=O%&e z;q`l)^xYN*9P=e#26l3mAXs@YIH3o_1Dx_SSS1+XH$$+e9@r=Dxa-e#Pto>Y4*pF! zBje`}S$)>#>$a@?##FoFed~vl*(CJGSzT3d literal 0 HcmV?d00001 diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=a, b/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/partition_tables/string_partition_table_with_comma/partition_col=a, b/000000_0 new file mode 100644 index 0000000000000000000000000000000000000000..63b7f592e7bac3e8c9b845b03026ec2c351e570e GIT binary patch literal 408 zcmZXR!AiqG6h&vo7zWJ3Mc7TGjZ&|;cMTF63o3Rx5vT~=n22@Nz=Q3aw00ILTTmwU40(WhL5kd}Apdy_;quoS8fasxA*;Mr)ijX=D z{Q9)+ZKIn#wzI&T^zmX9B*2EuW@S#mLSbxVrH8nhqd9Noy3s^1$8{2) zyLkPM6VvH%z++&79mUvxJ;XN^ zjEtV%Wc}XgkDIFYT3v6;*Nqp9r{lmIsis`#UL=)_a;ambZu3azDl6jIB$PS`!&xzz S7I|32MOIvqOMJjN{>cyJQAHyF literal 0 HcmV?d00001 diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java index 48a625c35a7599..751919e85f4abb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java @@ -404,7 +404,8 @@ private FileCacheValue loadFiles(FileCacheKey key) { } catch (Exception e) { LOG.warn("unknown scheme in path: " + finalLocation, e); } - FileInputFormat.setInputPaths(jobConf, finalLocation.get()); + // NOTICE: the setInputPaths has 2 overloads, the 2nd arg should be Path not String + FileInputFormat.setInputPaths(jobConf, finalLocation.getPath()); try 
{ FileCacheValue result = getFileCache(finalLocation.get(), key.inputFormat, jobConf, key.getPartitionValues(), key.bindBrokerName); diff --git a/regression-test/data/external_table_p0/hive/test_hive_partitions.out b/regression-test/data/external_table_p0/hive/test_hive_partitions.out index 904eb6eda301ca..ea0c8f1518c5e0 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_partitions.out +++ b/regression-test/data/external_table_p0/hive/test_hive_partitions.out @@ -120,6 +120,11 @@ nation=us/city=washington -- !q21 -- +-- !string_partition_table_with_comma -- +1 a 1.1 , +2 b 2.2 a, b +3 c 3.3 a, b, c + -- !q01 -- 33 1.11xyz 34 1.11XYZ @@ -241,3 +246,8 @@ nation=us/city=washington -- !q21 -- +-- !string_partition_table_with_comma -- +1 a 1.1 , +2 b 2.2 a, b +3 c 3.3 a, b, c + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_partitions.groovy b/regression-test/suites/external_table_p0/hive/test_hive_partitions.groovy index 0e41adc31278c5..cc3425106a59a0 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_partitions.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_partitions.groovy @@ -91,6 +91,10 @@ suite("test_hive_partitions", "p0,external,hive,external_docker,external_docker_ q01() + qt_string_partition_table_with_comma """ + select * from partition_tables.string_partition_table_with_comma order by id; + """ + sql """set num_partitions_in_batch_mode=1""" explain { sql ("select * from partition_table") @@ -99,8 +103,6 @@ suite("test_hive_partitions", "p0,external,hive,external_docker,external_docker_ contains "(approximate)inputSplitNum=60" } sql """unset variable num_partitions_in_batch_mode""" - - // sql """drop catalog if exists ${catalog_name}""" } finally { } }