From 86459dc6c5801b790776bdea34e5add4476673ed Mon Sep 17 00:00:00 2001 From: Tiewei Fang Date: Mon, 7 Apr 2025 15:12:34 +0800 Subject: [PATCH] [Fix](Serde) Support hive compatible output format (#49036) Problem Summary: The output format of complex data types are different between Hive and Doris, such as array, map and struct. When user migrate from Hive to Doris, they expect the same format so that they don't need to modify their business code. This PR mainly changes: Add a new option to session variable `serde_dialect`: If set to hive, the output format returned to MySQL client of some datatypes will be changed: Array Doris: ["abc", "def", "", null, 1] Hive: ["abc","def","",null,true] Map Doris: {"k1":null, "k2":"v3"} Hive: {"k1":null,"k2":"v3"} Struct Doris: {"s_id":100, "s_name":"abc , "", "s_address":null} Hive: {"s_id":100,"s_name":"abc ,"","s_address":null} Related #37039 --- .../serde/data_type_array_serde.cpp | 5 +- .../data_types/serde/data_type_map_serde.cpp | 7 +- .../serde/data_type_number_serde.cpp | 9 +- be/src/vec/data_types/serde/data_type_serde.h | 20 ++++ .../serde/data_type_struct_serde.cpp | 5 +- be/src/vec/sink/vmysql_result_writer.cpp | 16 +++ .../apache/doris/nereids/NereidsPlanner.java | 1 + .../org/apache/doris/qe/SessionVariable.java | 10 +- gensrc/thrift/PaloInternalService.thrift | 3 +- .../serde/test_serde_dialect_hive.out | 7 ++ .../serde/test_serde_dialect_hive.groovy | 107 ++++++++++++++++++ 11 files changed, 182 insertions(+), 8 deletions(-) create mode 100644 regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out create mode 100644 regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy diff --git a/be/src/vec/data_types/serde/data_type_array_serde.cpp b/be/src/vec/data_types/serde/data_type_array_serde.cpp index 872dd84d8c7355..e5fc7461e45648 100644 --- a/be/src/vec/data_types/serde/data_type_array_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_array_serde.cpp @@ -336,7 +336,8 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, const auto end_arr_element = offsets[row_idx_of_col_arr]; for (int j = begin_arr_element; j < end_arr_element; ++j) { if (j != begin_arr_element) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -345,6 +346,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_nested_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -358,6 +360,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR( nested_serde->write_column_to_mysql(data, result, j, false, options)); } + --options.level; } } if (0 != result.push_string("]", 1)) { diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp b/be/src/vec/data_types/serde/data_type_map_serde.cpp index 2140885942d1d9..bf018ce3a80fcc 100644 --- a/be/src/vec/data_types/serde/data_type_map_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp @@ -418,7 +418,8 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, auto& offsets = map_column.get_offsets(); for (auto j = offsets[col_index - 1]; j < offsets[col_index]; ++j) { if (j != offsets[col_index - 1]) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -427,6 +428,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_key_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -440,6 +442,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(key_serde->write_column_to_mysql(nested_keys_column, result, j, false, options)); } + --options.level; } if (0 != result.push_string(&options.map_key_delim, 1)) { return Status::InternalError("pack mysql buffer failed."); @@ -449,6 +452,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_val_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -462,6 +466,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(value_serde->write_column_to_mysql(nested_values_column, result, j, false, options)); } + --options.level; } } if (0 != result.push_string("}", 1)) { diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp b/be/src/vec/data_types/serde/data_type_number_serde.cpp index 972668e65fdd71..e81343c9edead7 100644 --- a/be/src/vec/data_types/serde/data_type_number_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp @@ -271,8 +271,15 @@ Status DataTypeNumberSerDe::_write_column_to_mysql(const IColumn& column, int buf_ret = 0; auto& data = assert_cast(column).get_data(); const auto col_index = index_check_const(row_idx, col_const); - if constexpr (std::is_same_v || std::is_same_v) { + if constexpr (std::is_same_v) { buf_ret = result.push_tinyint(data[col_index]); + } else if constexpr (std::is_same_v) { + if (options.level > 0 && !options.is_bool_value_num) { + std::string bool_value = data[col_index] ? "true" : "false"; + result.push_string(bool_value.c_str(), bool_value.size()); + } else { + buf_ret = result.push_tinyint(data[col_index]); + } } else if constexpr (std::is_same_v || std::is_same_v) { buf_ret = result.push_smallint(data[col_index]); } else if constexpr (std::is_same_v || std::is_same_v) { diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 105b1bbaedd9f3..c46b3f311a2128 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -181,6 +181,26 @@ class DataTypeSerDe { const char* nested_string_wrapper; int wrapper_len; + /** + * mysql_collection_delim is used to separate elements in collection, such as array, map, struct + * It is used to write to mysql. + */ + std::string mysql_collection_delim = ", "; + + /** + * is_bool_value_num is used to display bool value in collection, such as array, map, struct + * eg, if set to true, the array will be: + * [1] + * if set to false, the array will be: + * [true] + */ + bool is_bool_value_num = true; + + /** + * Indicate the nested level of column. It is used to control some behavior of serde + */ + mutable int level = 0; + [[nodiscard]] char get_collection_delimiter( int hive_text_complex_type_delimiter_level) const { CHECK(0 <= hive_text_complex_type_delimiter_level && diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp b/be/src/vec/data_types/serde/data_type_struct_serde.cpp index aefea80f0c753b..0b1bb0254825f1 100644 --- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp @@ -352,7 +352,8 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, bool begin = true; for (size_t j = 0; j < elem_serdes_ptrs.size(); ++j) { if (!begin) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -376,6 +377,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (remove_nullable(col.get_column_ptr(j))->is_column_string() && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { @@ -390,6 +392,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(elem_serdes_ptrs[j]->write_column_to_mysql( col.get_column(j), result, col_index, false, options)); } + --options.level; } begin = false; } diff --git a/be/src/vec/sink/vmysql_result_writer.cpp b/be/src/vec/sink/vmysql_result_writer.cpp index 932ee955590765..0f69fe9f5b9b72 100644 --- a/be/src/vec/sink/vmysql_result_writer.cpp +++ b/be/src/vec/sink/vmysql_result_writer.cpp @@ -123,6 +123,8 @@ Status VMysqlResultWriter::_set_options( _options.map_key_delim = ':'; _options.null_format = "null"; _options.null_len = 4; + _options.mysql_collection_delim = ", "; + _options.is_bool_value_num = true; break; case TSerdeDialect::PRESTO: // eg: @@ -133,6 +135,20 @@ Status VMysqlResultWriter::_set_options( _options.map_key_delim = '='; _options.null_format = "NULL"; _options.null_len = 4; + _options.mysql_collection_delim = ", "; + _options.is_bool_value_num = true; + break; + case TSerdeDialect::HIVE: + // eg: + // array: ["abc","def","",null] + // map: {"k1":null,"k2":"v3"} + _options.nested_string_wrapper = "\""; + _options.wrapper_len = 1; + _options.map_key_delim = ':'; + _options.null_format = "null"; + _options.null_len = 4; + _options.mysql_collection_delim = ","; + _options.is_bool_value_num = false; break; default: return Status::InternalError("unknown serde dialect: {}", serde_dialect); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java index 885a32c70b22fc..3a987a9443a5c2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java @@ -785,6 +785,7 @@ private void setFormatOptions() { statementContext.setFormatOptions(FormatOptions.getForPresto()); break; case "doris": + case "hive": statementContext.setFormatOptions(FormatOptions.getDefault()); break; default: diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 326f75aba3d480..53d244c5a00095 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -4449,9 +4449,11 @@ public void checkSerdeDialect(String serdeDialect) { throw new UnsupportedOperationException("serdeDialect value is empty"); } - if (!serdeDialect.equalsIgnoreCase("doris") && !serdeDialect.equalsIgnoreCase("presto") - && !serdeDialect.equalsIgnoreCase("trino")) { - LOG.warn("serdeDialect value is invalid, the invalid value is {}", serdeDialect); + if (!serdeDialect.equalsIgnoreCase("doris") + && !serdeDialect.equalsIgnoreCase("presto") + && !serdeDialect.equalsIgnoreCase("trino") + && !serdeDialect.equalsIgnoreCase("hive")) { + LOG.warn("serde dialect value is invalid, the invalid value is {}", serdeDialect); throw new UnsupportedOperationException( "sqlDialect value is invalid, the invalid value is " + serdeDialect); } @@ -4623,6 +4625,8 @@ public TSerdeDialect getSerdeDialect() { case "presto": case "trino": return TSerdeDialect.PRESTO; + case "hive": + return TSerdeDialect.HIVE; default: throw new IllegalArgumentException("Unknown serde dialect: " + serdeDialect); } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index fb0859eb50fa84..22dcba20fe5780 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -83,7 +83,8 @@ struct TResourceLimit { enum TSerdeDialect { DORIS = 0, - PRESTO = 1 + PRESTO = 1, + HIVE = 2 } // Query options that correspond to PaloService.PaloQueryOptions, diff --git a/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out new file mode 100644 index 00000000000000..3ea1043cdf6f9c --- /dev/null +++ b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql01 -- +1 2 3 4 5 1.1 2.0 123456.123456789 2024-06-30 2024-06-30T10:10:11 2024-06-30T10:10:11.123456 59.50.185.152 ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff this is a string with , and " abc ef 123ndedwdw true [1,2,3,4,5] [1, 2, 3, null, 5] [1.1, 2.1, 3.1, null, 5] [1.10000, 2.10000, 3.00000, null, 5.12345] ["abc", "de, f"", null, ""] [{"k1":"v1", "k2":null, "k3":"", "k4":"a , "a"}, {"k1":"v1", "k2":null, "k3 , "abc":"", "k4":"a , "a"}] [["abc", "de, f"", null, ""], [], null] \N \N {"k1":"v1", "k2":null, "k3":"", "k4":"a , "a"} {"k1":[["abc", "de, f"", null, ""], [], null], "k2":null} {10:{"k1":[["abc", "de, f"", null, ""], [], null]}, 11:null} \N {"s_id":100, "s_name":"abc , "", "s_address":null} {"s_id":null, "s_name":["abc", "de, f"", null, ""], "s_address":""} ["2024-06-01", null, "2024-06-03"] ["2024-06-01 10:10:10.000", null, "2024-06-03 01:11:23.123"] [1, 1, 0, 0, 1, 0, 0] {"s_id":100, "s_name":"abc , "", "s_gender":1} {"k1":0, "k2":1, "k3":1, "k4":0} + +-- !sql01 -- +1 2 3 4 5 1.1 2.0 123456.123456789 2024-06-30 2024-06-30T10:10:11 2024-06-30T10:10:11.123456 59.50.185.152 ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff this is a string with , and " abc ef 123ndedwdw true [1,2,3,4,5] [1,2,3,null,5] [1.1,2.1,3.1,null,5] [1.10000,2.10000,3.00000,null,5.12345] ["abc","de, f"",null,""] [{"k1":"v1","k2":null,"k3":"","k4":"a , "a"},{"k1":"v1","k2":null,"k3 , "abc":"","k4":"a , "a"}] [["abc","de, f"",null,""],[],null] \N \N {"k1":"v1","k2":null,"k3":"","k4":"a , "a"} {"k1":[["abc","de, f"",null,""],[],null],"k2":null} {10:{"k1":[["abc","de, f"",null,""],[],null]},11:null} \N {"s_id":100,"s_name":"abc , "","s_address":null} {"s_id":null,"s_name":["abc","de, f"",null,""],"s_address":""} ["2024-06-01",null,"2024-06-03"] ["2024-06-01 10:10:10.000",null,"2024-06-03 01:11:23.123"] [true,true,false,false,true,false,false] {"s_id":100,"s_name":"abc , "","s_gender":true} {"k1":false,"k2":true,"k3":true,"k4":false} + diff --git a/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy new file mode 100644 index 00000000000000..b8e3037d770f7d --- /dev/null +++ b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_serde_dialect_hive", "p0") { + + sql """create database if not exists test_serde_dialect_hive;""" + sql """use test_serde_dialect_hive;""" + sql """drop table if exists test_serde_dialect_hive_tbl""" + sql """ + create table if not exists test_serde_dialect_hive_tbl ( + c1 tinyint, + c2 smallint, + c3 int, + c4 bigint, + c5 largeint, + c6 float, + c7 double, + c8 decimal(27, 9), + c9 date, + c10 datetime, + c11 datetime(6), + c12 ipv4, + c13 ipv6, + c14 string, + c15 char(6), + c16 varchar(1024), + c17 boolean, + c18 json, + c19 array, + c20 array, + c21 array, + c22 array, + c23 array>, + c24 array>, + c25 array>, + c26 array, s_name:array, s_address:map>>, + c27 map, + c28 map>>, + c29 map>>>, + c30 map, s_name:array, s_address:map>>>, + c31 struct, + c32 struct, s_address:string>, + c33 array, + c34 array, + c35 array, + c36 struct, + c37 map + ) + distributed by random buckets 1 + properties("replication_num" = "1"); + """ + + sql """ + insert into test_serde_dialect_hive_tbl + (c1, c2,c3, c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c27,c28,c29,c31,c32,c33,c34,c35,c36,c37) + values( + 1,2,3,4,5,1.1,2.0000,123456.123456789,"2024-06-30", "2024-06-30 10:10:11", "2024-06-30 10:10:11.123456", + '59.50.185.152', + 'ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff', + 'this is a string with , and "', + 'abc ef', + ' 123ndedwdw', + true, + '[1, 2, 3, 4, 5]', + [1,2,3,null,5], + [1.1,2.1,3.1,null,5.00], + [1.1,2.1,3.00000,null,5.12345], + ['abc', 'de, f"', null, ''], + [{'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'}, {'k1': 'v1', 'k2': null, 'k3 , "abc':'', 'k4':'a , "a'}], + [['abc', 'de, f"', null, ''],[],null], + {'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'}, + {'k1': [['abc', 'de, f"', null, ''],[],null], 'k2': null}, + {10: {'k1': [['abc', 'de, f"', null, ''],[],null]}, 11: null}, + named_struct('s_id', 100, 's_name', 'abc , "', 's_address', null), + named_struct('s_id', null, 's_name', ['abc', 'de, f"', null, ''], 's_address', ''), + ['2024-06-01',null,'2024-06-03'], + ['2024-06-01 10:10:10',null,'2024-06-03 01:11:23.123'], + [true, true, false, false, true, false, false], + named_struct('s_id', 100, 's_name', 'abc , "', 's_gender', true), + {'k1': false, 'k2': true, 'k3':true, 'k4': false} + ); + """ + + sql """set serde_dialect="doris";""" + qt_sql01 """select * from test_serde_dialect_hive_tbl""" + sql """set serde_dialect="hive";""" + qt_sql01 """select * from test_serde_dialect_hive_tbl""" + + test { + sql """set serde_dialect="invalid"""" + exception "sqlDialect value is invalid" + } +}