From f05e95ae338d1eb1da7c1e2205632ec8f5f4c365 Mon Sep 17 00:00:00 2001
From: Socrates
Date: Tue, 22 Oct 2024 19:42:29 +0800
Subject: [PATCH] [enhance](hive) Add regression-test cases for hive text ddl
 and hive text insert and fix reading null string bug (#42200)

## Proposed changes

Add regression-test cases for the following hive text table properties:

| **Property** | **Description** | **Example Value** | **Supported in Doris** |
|------------------------------|----------------------------------------------------------------------|-------------------|------------------------|
| `field.delim` | Defines the delimiter between columns in each row. | `\1` | Yes |
| `collection.delim` | Defines the delimiter between items in an array (collection type). | `\2` | Yes |
| `mapkey.delim` | Defines the delimiter between keys and values in a map. | `\3` | Yes |
| `serialization.null.format` | Defines how `NULL` values are represented in the text file. | `\\N` | Yes |
| `escape.delim` | Specifies the escape character used for escaping special characters. | `\\` | Yes |
| `line.delim` | Defines the delimiter between rows (lines) in the file. | `\n` | Yes |

### Explanation:

- **`field.delim`**: Specifies how columns in a row are separated. The example value `\1` (Ctrl-A) is a non-printable character and is Hive's default field delimiter.
- **`collection.delim`**: Defines how elements of an array or collection are separated. Here, `\2` is used as the separator.
- **`mapkey.delim`**: Defines how keys and values in a map are separated. For instance, `\3` separates each map key from its value.
- **`serialization.null.format`**: Specifies the string that represents `NULL` values in the data. `\\N` is the common representation.
- **`escape.delim`**: Defines the escape character used to escape special characters in the text file, such as the delimiters themselves. Here, `\\` is used as the escape character.
- **`line.delim`**: Specifies the delimiter between rows. Typically, `\n` (newline) is used as the line delimiter.

**Note**: Unlike the other delimiters, `line.delim` is not escaped. If the table content contains the same character as the line delimiter, queries may return wrong results. The other delimiters (`field.delim`, `collection.delim`, `mapkey.delim`) are escaped, so they do not cause this problem.
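To make the delimiter semantics concrete, here is a minimal, self-contained C++ sketch. It is not part of this patch: the `split` helper, the constants, and the sample row are all invented for the example. It shows how one row of a Hive text table is assembled from these delimiters and how the configured null marker is matched:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Split `s` on a single-byte delimiter (illustrative helper, not Doris's).
static std::vector<std::string> split(const std::string& s, char delim) {
    std::vector<std::string> parts;
    size_t start = 0;
    for (size_t i = 0; i <= s.size(); ++i) {
        if (i == s.size() || s[i] == delim) {
            parts.push_back(s.substr(start, i - start));
            start = i + 1;
        }
    }
    return parts;
}

int main() {
    // Hive's default text-format delimiters: \1 (Ctrl-A) between fields,
    // \2 between collection items, \3 between map keys and values.
    const char kFieldDelim = '\x01';
    const char kCollectionDelim = '\x02';
    // serialization.null.format: here the two bytes '\' and 'N'.
    const std::string kNullFormat = "\\N";

    // One row: id \1 name \1 tags \1 last-field, where the last field is NULL.
    const std::string row = std::string("1") + kFieldDelim + "Alice" + kFieldDelim +
                            "tag1" + kCollectionDelim + "tag2" + kFieldDelim + "\\N";

    for (const std::string& field : split(row, kFieldDelim)) {
        // Compare against the configured null marker, not a hardcoded "\N".
        if (field == kNullFormat) {
            std::cout << "NULL" << std::endl;
        } else {
            std::cout << field << std::endl;
        }
    }
    return 0;
}
```

Parsing the sample row prints `1`, `Alice`, the raw `tag1\2tag2` collection, and `NULL` for the last field; a real reader would additionally honor `escape.delim` when a field value contains a delimiter byte.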
---
 be/src/vec/exec/format/csv/csv_reader.cpp      |   2 +-
 .../create_preinstalled_scripts/run63.hql      |  18 +-
 .../hive/ddl/test_hive_ddl_text_format.out     |  57 ++++++
 .../hive/ddl/test_hive_ddl_text_format.groovy  | 177 +++++++++++++-----
 4 files changed, 200 insertions(+), 54 deletions(-)
 create mode 100644 regression-test/data/external_table_p0/hive/ddl/test_hive_ddl_text_format.out

diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp
index 0583b74d73572a..bf0e543d650142 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -622,7 +622,7 @@ template <bool from_json>
 Status CsvReader::deserialize_nullable_string(IColumn& column, Slice& slice) {
     auto& null_column = assert_cast<ColumnNullable&>(column);
     if (!(from_json && _options.converted_from_string && slice.trim_double_quotes())) {
-        if (slice.size == 2 && slice[0] == '\\' && slice[1] == 'N') {
+        if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) {
             null_column.insert_data(nullptr, 0);
             return Status::OK();
         }
diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run63.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run63.hql
index aebd75229599b9..c287595278f6c4 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run63.hql
+++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run63.hql
@@ -560,7 +560,14 @@ CREATE TABLE `all_types_text`(
   `t_array_string_all_nulls` array<string>,
   `dt` int)
 stored as textfile
-TBLPROPERTIES("line.delim"="\n", "field.delim"="\1");
+TBLPROPERTIES(
+  'field.delim'='\t',
+  'line.delim'='\n',
+  'collection.delim'=',',
+  'mapkey.delim'=':',
+  'escape.delim'='|',
+  'serialization.null.format'='null'
+);
 
 CREATE TABLE all_types_par_text(
   `boolean_col` boolean,
@@ -628,4 +635,11 @@ CREATE TABLE all_types_par_text(
 PARTITIONED BY (
   `dt` int)
 stored as textfile
-TBLPROPERTIES("line.delim"="\n", "field.delim"="\1");
+TBLPROPERTIES(
+  'field.delim'='\t',
+  'line.delim'='\n',
+  'collection.delim'=',',
+  'mapkey.delim'=':',
+  'escape.delim'='|',
+  'serialization.null.format'='null'
+);
diff --git a/regression-test/data/external_table_p0/hive/ddl/test_hive_ddl_text_format.out b/regression-test/data/external_table_p0/hive/ddl/test_hive_ddl_text_format.out
new file mode 100644
index 00000000000000..faf343ce09b718
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/ddl/test_hive_ddl_text_format.out
@@ -0,0 +1,57 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !default_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_default_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+
+-- !standard_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_standard_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+
+-- !different_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_different_properties --
+1	Alice	["tag1,tag2"]	{"key1":"value1,key2:value2\\u00042"}
+
+-- !default_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_default_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+
+-- !standard_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_standard_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+
+-- !different_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_different_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2\\u00042"}
+
diff --git a/regression-test/suites/external_table_p0/hive/ddl/test_hive_ddl_text_format.groovy b/regression-test/suites/external_table_p0/hive/ddl/test_hive_ddl_text_format.groovy
index aaa5b198e69c85..730db1247cdd07 100644
--- a/regression-test/suites/external_table_p0/hive/ddl/test_hive_ddl_text_format.groovy
+++ b/regression-test/suites/external_table_p0/hive/ddl/test_hive_ddl_text_format.groovy
@@ -17,62 +17,137 @@
 suite("test_hive_ddl_text_format", "p0,external,hive,external_docker,external_docker_hive") {
     String enabled = context.config.otherConfigs.get("enableHiveTest")
-    if (enabled != null && enabled.equalsIgnoreCase("true")) {
-        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
-        String hms_port = context.config.otherConfigs.get("hive3HmsPort")
-        String hdfs_port = context.config.otherConfigs.get("hive3HdfsPort")
-        String catalog_name = "test_hive_ddl_text_format"
-        String table_name = "table_with_pars";
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2", "hive3"]) {
+        setHivePrefix(hivePrefix)
+        try {
+            String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+            String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+            String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
+            String catalog_name = "test_hive_ddl_text_format"
+            String table_name = "table_with_pars";
 
-        sql """drop catalog if exists ${catalog_name};"""
+            sql """drop catalog if exists ${catalog_name};"""
 
-        sql """
-            create catalog if not exists ${catalog_name} properties (
-                'type'='hms',
-                'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}',
-                'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}',
-                'use_meta_cache' = 'true'
+            sql """
+                create catalog if not exists ${catalog_name} properties (
+                    'type'='hms',
+                    'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}',
+                    'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}',
+                    'use_meta_cache' = 'true'
+                );
+            """
+            logger.info("catalog " + catalog_name + " created")
+            sql """switch ${catalog_name};"""
+            logger.info("switched to catalog " + catalog_name)
+            sql """use `default`;"""
+
+            sql """ drop table if exists text_table_default_properties """
+            sql """
+                create table text_table_default_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'file_format'='text'
             );
-        """
-        logger.info("catalog " + catalog_name + " created")
-        sql """switch ${catalog_name};"""
-        logger.info("switched to catalog " + catalog_name)
-        sql """use `default`;"""
+            """
+            sql """
+                INSERT INTO text_table_default_properties VALUES
+                (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')),
+                (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')),
+                (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD'));
+            """
+            order_qt_default_properties """ select * from text_table_default_properties """
+
+            order_qt_hive_docker_default_properties """ select * from text_table_default_properties """
 
-        sql """ drop table if exists tb_text """
-        sql """
-            create table tb_text (
-                id int,
-                `name` string
-            ) PROPERTIES (
-                'compression'='gzip',
-                'file_format'='text',
-                'field.delim'='\t',
-                'line.delim'='\n',
-                'collection.delim'=';',
-                'mapkey.delim'=':',
-                'serialization.null.format'='\\N'
-            );
-        """
+            sql """ drop table if exists text_table_standard_properties """
+            // Escape characters need to be considered in groovy scripts
+            sql """
+                create table text_table_standard_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'compression'='plain',
+                    'file_format'='text',
+                    'field.delim'='\\1',
+                    'line.delim'='\\n',
+                    'collection.delim'='\\2',
+                    'mapkey.delim'='\\3',
+                    'escape.delim'='\\\\',
+                    'serialization.null.format'='\\\\N'
+                );
+            """
+            sql """
+                INSERT INTO text_table_standard_properties VALUES
+                (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')),
+                (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')),
+                (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD'));
+            """
+            order_qt_standard_properties """ select * from text_table_standard_properties """
+            order_qt_hive_docker_standard_properties """ select * from text_table_standard_properties order by id; """
+
+            sql """ drop table if exists text_table_different_properties """
+            sql """
+                create table text_table_different_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'compression'='gzip',
+                    'file_format'='text',
+                    'field.delim'='A',
+                    'line.delim'='\\4',
+                    'collection.delim'=',',
+                    'mapkey.delim'=':',
+                    'escape.delim'='|',
+                    'serialization.null.format'='null'
+                );
+            """
sql """ + INSERT INTO text_table_different_properties VALUES + (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')), + (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')), + (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD')); + """ + order_qt_different_properties """ select * from text_table_different_properties """ + order_qt_hive_docker_different_properties """ select * from text_table_different_properties order by id; """ - String serde = "'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'" - String input_format = "'org.apache.hadoop.mapred.TextInputFormat'" - String output_format = "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'" - String doris_fileformat = "'doris.file_format'='text'" - String filed_delim = "'field.delim'" - String line_delim = "'line.delim'" - String mapkey_delim = "'mapkey.delim'" + String serde = "'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'" + String input_format = "'org.apache.hadoop.mapred.TextInputFormat'" + String output_format = "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'" + String doris_fileformat = "'doris.file_format'='text'" + String filed_delim = "'field.delim'" + String line_delim = "'line.delim'" + String mapkey_delim = "'mapkey.delim'" + String collection_delim = "'collection.delim'" + String escape_delim = "'escape.delim'" + String serialization_null_format = "'serialization.null.format'" - def create_tbl_res = sql """ show create table tb_text """ - String res = create_tbl_res.toString() - logger.info("${res}") - assertTrue(res.containsIgnoreCase("${serde}")) - assertTrue(res.containsIgnoreCase("${input_format}")) - assertTrue(res.containsIgnoreCase("${output_format}")) - assertTrue(res.containsIgnoreCase("${doris_fileformat}")) - assertTrue(res.containsIgnoreCase("${filed_delim}")) - assertTrue(res.containsIgnoreCase("${filed_delim}")) - assertTrue(res.containsIgnoreCase("${line_delim}")) - assertTrue(res.containsIgnoreCase("${mapkey_delim}")) + def create_tbl_res = sql """ show create table text_table_standard_properties """ + String res = create_tbl_res.toString() + logger.info("${res}") + assertTrue(res.containsIgnoreCase("${serde}")) + assertTrue(res.containsIgnoreCase("${input_format}")) + assertTrue(res.containsIgnoreCase("${output_format}")) + assertTrue(res.containsIgnoreCase("${doris_fileformat}")) + assertTrue(res.containsIgnoreCase("${filed_delim}")) + assertTrue(res.containsIgnoreCase("${filed_delim}")) + assertTrue(res.containsIgnoreCase("${line_delim}")) + assertTrue(res.containsIgnoreCase("${mapkey_delim}")) + assertTrue(res.containsIgnoreCase("${collection_delim}")) + assertTrue(res.containsIgnoreCase("${escape_delim}")) + assertTrue(res.containsIgnoreCase("${serialization_null_format}")) + } finally { + } } }