From 8fe971a835986a8a83acfeb756c5fc57a029f81c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E5=B0=8F=E5=88=9A?= Date: Fri, 12 Jul 2024 13:19:10 +0800 Subject: [PATCH 1/3] [fix](hive) Make doris read hive text table parameters and behavior consistent with hive (#37638) ## Proposed changes When Hive reads a text table, it first tries to parse "field.delim" as a Byte. If parsing fails, it takes the first character as the Byte. If "field.delim" is not set, it applies the same method to "serialization.format". This change makes Doris resolve the text-table delimiters in the same way. The relevant Hive code is: ```java separatorCandidates.add(LazyUtils.getByte(tableProperties.getProperty(serdeConstants.FIELD_DELIM, tableProperties.getProperty(serdeConstants.SERIALIZATION_FORMAT)), DefaultSeparators[0])); ... public static byte getByte(String altValue, byte defaultVal) { if (altValue != null && altValue.length() > 0) { try { return Byte.parseByte(altValue); } catch (NumberFormatException e) { return (byte) altValue.charAt(0); } } return defaultVal; } ``` --- .../serde_prop/some_serde_table.hql | 57 +++++++++++++++++++ .../hive/HiveMetaStoreClientHelper.java | 17 ++++++ .../datasource/hive/source/HiveScanNode.java | 44 +++++++------- .../hive/test_hive_serde_prop.out | 36 +++++++++++- .../hive/test_hive_serde_prop.groovy | 6 +- 5 files changed, 137 insertions(+), 23 deletions(-) diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index fa6ad791118c1e..13e7cb86e0390f 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -30,5 +30,62 @@ TBLPROPERTIES ( 'field.delim'='|' ); +CREATE TABLE `serde_test3`( + `id` int, + `name` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + 'serialization.format'='g') 
+STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.TextInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'; + + +CREATE TABLE `serde_test4`( + `id` int, + `name` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + 'field.delim' = 'gg', + "line.delim" = "hh") +STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.TextInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'; + +CREATE TABLE `serde_test5`( + `id` int, + `name` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + 'field.delim' = '16', + "line.delim" = "21") +STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.TextInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'; + +CREATE TABLE `serde_test6`( + `id` int, + `name` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + 'field.delim' = '\16', + "line.delim" = "\21") +STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.TextInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'; + + insert into serde_test1 values(1, "abc"),(2, "def"); insert into serde_test2 values(1, "abc"),(2, "def"); +insert into serde_test3 values(1, "abc"),(2, "def"); +insert into serde_test4 values(1, "abc"),(2, "def"); +insert into serde_test5 values(1, "abc"),(2, "def"); +insert into serde_test6 values(1, "abc"),(2, "def"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index 22bf13755a2e11..795cbef18c9a74 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -873,4 +873,21 @@ public static String 
firstPresentOrDefault(String defaultValue, Optional } return defaultValue; } + + /** + * Return the byte value of the number string. + * + * @param altValue + * The string containing a number. + */ + public static String getByte(String altValue) { + if (altValue != null && altValue.length() > 0) { + try { + return Character.toString((char) (Byte.parseByte(altValue) + 256) % 256); + } catch (NumberFormatException e) { + return altValue.substring(0, 1); + } + } + return null; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index 0214ecc464238a..abb8cc8dda3c13 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -86,6 +86,7 @@ public class HiveScanNode extends FileQueryScanNode { public static final String DEFAULT_LINE_DELIMITER = "\n"; public static final String PROP_SEPARATOR_CHAR = "separatorChar"; public static final String PROP_QUOTE_CHAR = "quoteChar"; + public static final String PROP_SERIALIZATION_FORMAT = "serialization.format"; public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim"; public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim"; @@ -447,29 +448,32 @@ protected TFileAttributes getFileAttributes() throws UserException { TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); // 1. 
set column separator - Optional fieldDelim = - HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_FIELD_DELIMITER); - Optional columnSeparator = - HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_SEPARATOR_CHAR); - textParams.setColumnSeparator(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator)); + Optional fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), + PROP_FIELD_DELIMITER); + Optional serFormat = HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), + PROP_SERIALIZATION_FORMAT); + Optional columnSeparator = HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), + PROP_SEPARATOR_CHAR); + textParams.setColumnSeparator(HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator, serFormat))); // 2. set line delimiter - Optional lineDelim = - HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_LINE_DELIMITER); - textParams.setLineDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_LINE_DELIMITER, lineDelim)); + Optional lineDelim = HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), + PROP_LINE_DELIMITER); + textParams.setLineDelimiter(HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_LINE_DELIMITER, lineDelim))); // 3. 
set mapkv delimiter - Optional mapkvDelim = - HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_MAP_KV_DELIMITER); - textParams.setMapkvDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_MAP_KV_DELIMITER, mapkvDelim)); + Optional mapkvDelim = HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), + PROP_MAP_KV_DELIMITER); + textParams.setMapkvDelimiter(HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_MAP_KV_DELIMITER, mapkvDelim))); // 4. set collection delimiter - Optional collectionDelimHive2 = - HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_COLLECTION_DELIMITER_HIVE2); - Optional collectionDelimHive3 = - HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_COLLECTION_DELIMITER_HIVE3); - textParams.setCollectionDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3)); + Optional collectionDelimHive2 = HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), + PROP_COLLECTION_DELIMITER_HIVE2); + Optional collectionDelimHive3 = HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), + PROP_COLLECTION_DELIMITER_HIVE3); + textParams.setCollectionDelimiter( + HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3))); // 5. 
set quote char Map serdeParams = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters(); if (serdeParams.containsKey(PROP_QUOTE_CHAR)) { diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index b00eebec49d711..38918c3fc6f771 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -7,7 +7,23 @@ b 2.2 1 abc 2 def --- !2 -- +-- !3 -- +1 abc +2 def + +-- !4 -- +1 abc +2 def + +-- !5 -- +1 abc +2 def + +-- !6 -- +1 abc +2 def + +-- !7 -- 1 abc 2 def @@ -19,7 +35,23 @@ b 2.2 1 abc 2 def --- !2 -- +-- !3 -- +1 abc +2 def + +-- !4 -- +1 abc +2 def + +-- !5 -- +1 abc +2 def + +-- !6 -- +1 abc +2 def + +-- !7 -- 1 abc 2 def diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index 3ae6b21bbba4f6..0da2eb3160ac83 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -40,7 +40,11 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte qt_2 """select * from ${catalog_name}.regression.serde_test1 order by id;""" - qt_2 """select * from ${catalog_name}.regression.serde_test2 order by id;""" + qt_3 """select * from ${catalog_name}.regression.serde_test2 order by id;""" + qt_4 """select * from ${catalog_name}.regression.serde_test3 order by id;""" + qt_5 """select * from ${catalog_name}.regression.serde_test4 order by id;""" + qt_6 """select * from ${catalog_name}.regression.serde_test5 order by id;""" + qt_7 """select * from ${catalog_name}.regression.serde_test6 order by id;""" } } From c95e050e052e0161ac25ed2f17e7ec16300e925a Mon Sep 17 00:00:00 2001 From: suxiaogang223 Date: Mon, 15 Jul 2024 19:52:00 +0800 Subject: 
[PATCH 2/3] Refactor character conversion to use Java 8's String.valueOf() --- .../apache/doris/datasource/hive/HiveMetaStoreClientHelper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index 795cbef18c9a74..11570298b49bc9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -883,7 +883,7 @@ public static String firstPresentOrDefault(String defaultValue, Optional public static String getByte(String altValue) { if (altValue != null && altValue.length() > 0) { try { - return Character.toString((char) (Byte.parseByte(altValue) + 256) % 256); + return String.valueOf((char) (Byte.parseByte(altValue) + 256) % 256); } catch (NumberFormatException e) { return altValue.substring(0, 1); } From 0d4d15c38462d716a9762f77647498bc87844126 Mon Sep 17 00:00:00 2001 From: suxiaogang223 Date: Tue, 16 Jul 2024 19:54:02 +0800 Subject: [PATCH 3/3] fix --- .../apache/doris/datasource/hive/HiveMetaStoreClientHelper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index 11570298b49bc9..c086172f1f9c9e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -883,7 +883,7 @@ public static String firstPresentOrDefault(String defaultValue, Optional public static String getByte(String altValue) { if (altValue != null && altValue.length() > 0) { try { - return String.valueOf((char) (Byte.parseByte(altValue) + 256) 
% 256); + return String.valueOf((char) ((Byte.parseByte(altValue) + 256) % 256)); } catch (NumberFormatException e) { return altValue.substring(0, 1); }