From f05e95ae338d1eb1da7c1e2205632ec8f5f4c365 Mon Sep 17 00:00:00 2001
From: Socrates
Date: Tue, 22 Oct 2024 19:42:29 +0800
Subject: [PATCH] [enhance](hive) Add regression-test cases for hive text ddl
 and hive text insert and fix reading null string bug (#42200)

## Proposed changes

Add regression-test cases for the following hive text table properties:

| **Property** | **Description** | **Example Value** | **Supported in Doris** |
|------------------------------|----------------------------------------------------------------------|-------------------|------------------------|
| `field.delim` | Defines the delimiter between columns in each row. | `\1` | Yes |
| `collection.delim` | Defines the delimiter between items in an array (collection type). | `\2` | Yes |
| `mapkey.delim` | Defines the delimiter between keys and values in a map. | `\3` | Yes |
| `serialization.null.format` | Defines how `NULL` values are represented in the text file. | `\\N` | Yes |
| `escape.delim` | Specifies the escape character used for escaping special characters. | `\\` | Yes |
| `line.delim` | Defines the delimiter between rows (lines) in the file. | `\n` | Yes |

### Explanation:

- **`field.delim`**: Specifies how columns in a row are separated. The example value `\1` (Ctrl-A) is a non-printable character and is Hive's default field delimiter.
- **`collection.delim`**: Defines how elements of an array or collection are separated. Here, `\2` is used as the separator.
- **`mapkey.delim`**: Defines how keys and values in a map are separated. For instance, `\3` separates each map key from its value.
- **`serialization.null.format`**: Specifies the string that represents `NULL` values in the data. `\\N` is the common representation.
- **`escape.delim`**: Defines the escape character used to escape special characters in the text file, such as the delimiters themselves. Here, `\\` is used as the escape character.
- **`line.delim`**: Specifies the delimiter between rows. Typically, `\n` (newline) is used as the line delimiter.

**Note**: Unlike the other delimiters, `line.delim` is not escaped. If the table content contains the same character as the line delimiter, queries may return wrong results. The other delimiters (`field.delim`, `collection.delim`, `mapkey.delim`) are escaped, so they do not cause this problem.
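To make the delimiter semantics concrete, here is a minimal, self-contained C++ sketch. It is not part of this patch: the `split` helper, the constants, and the sample row are all invented for the example. It shows how one row of a Hive text table is assembled from these delimiters and how the configured null marker is matched:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Split `s` on a single-byte delimiter (illustrative helper, not Doris's).
static std::vector<std::string> split(const std::string& s, char delim) {
    std::vector<std::string> parts;
    size_t start = 0;
    for (size_t i = 0; i <= s.size(); ++i) {
        if (i == s.size() || s[i] == delim) {
            parts.push_back(s.substr(start, i - start));
            start = i + 1;
        }
    }
    return parts;
}

int main() {
    // Hive's default text-format delimiters: \1 (Ctrl-A) between fields,
    // \2 between collection items, \3 between map keys and values.
    const char kFieldDelim = '\x01';
    const char kCollectionDelim = '\x02';
    // serialization.null.format: here the two bytes '\' and 'N'.
    const std::string kNullFormat = "\\N";

    // One row: id \1 name \1 tags \1 last-field, where the last field is NULL.
    const std::string row = std::string("1") + kFieldDelim + "Alice" + kFieldDelim +
                            "tag1" + kCollectionDelim + "tag2" + kFieldDelim + "\\N";

    for (const std::string& field : split(row, kFieldDelim)) {
        // Compare against the configured null marker, not a hardcoded "\N".
        if (field == kNullFormat) {
            std::cout << "NULL" << std::endl;
        } else {
            std::cout << field << std::endl;
        }
    }
    return 0;
}
```

Parsing the sample row prints `1`, `Alice`, the raw `tag1\2tag2` collection, and `NULL` for the last field; a real reader would additionally honor `escape.delim` when a field value contains a delimiter byte.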
---
 be/src/vec/exec/format/csv/csv_reader.cpp      |   2 +-
 .../create_preinstalled_scripts/run63.hql      |  18 +-
 .../hive/ddl/test_hive_ddl_text_format.out     |  57 ++++++
 .../hive/ddl/test_hive_ddl_text_format.groovy  | 177 +++++++++++++-----
 4 files changed, 200 insertions(+), 54 deletions(-)
 create mode 100644 regression-test/data/external_table_p0/hive/ddl/test_hive_ddl_text_format.out

diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp
index 0583b74d73572a..bf0e543d650142 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -622,7 +622,7 @@ template <bool from_json>
 Status CsvReader::deserialize_nullable_string(IColumn& column, Slice& slice) {
     auto& null_column = assert_cast<ColumnNullable&>(column);
     if (!(from_json && _options.converted_from_string && slice.trim_double_quotes())) {
-        if (slice.size == 2 && slice[0] == '\\' && slice[1] == 'N') {
+        if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) {
             null_column.insert_data(nullptr, 0);
             return Status::OK();
         }
diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run63.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run63.hql
index aebd75229599b9..c287595278f6c4 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run63.hql
+++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run63.hql
@@ -560,7 +560,14 @@ CREATE TABLE `all_types_text`(
   `t_array_string_all_nulls` array<string>,
   `dt` int)
 stored as textfile
-TBLPROPERTIES("line.delim"="\n", "field.delim"="\1");
+TBLPROPERTIES(
+  'field.delim'='\t',
+  'line.delim'='\n',
+  'collection.delim'=',',
+  'mapkey.delim'=':',
+  'escape.delim'='|',
+  'serialization.null.format'='null'
+);
 
 CREATE TABLE all_types_par_text(
   `boolean_col` boolean,
@@ -628,4 +635,11 @@ CREATE TABLE all_types_par_text(
 PARTITIONED BY (
   `dt` int)
 stored as textfile
-TBLPROPERTIES("line.delim"="\n", "field.delim"="\1");
+TBLPROPERTIES(
+  'field.delim'='\t',
+  'line.delim'='\n',
+  'collection.delim'=',',
+  'mapkey.delim'=':',
+  'escape.delim'='|',
+  'serialization.null.format'='null'
+);
diff --git a/regression-test/data/external_table_p0/hive/ddl/test_hive_ddl_text_format.out b/regression-test/data/external_table_p0/hive/ddl/test_hive_ddl_text_format.out
new file mode 100644
index 00000000000000..faf343ce09b718
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/ddl/test_hive_ddl_text_format.out
@@ -0,0 +1,57 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !default_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_default_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+
+-- !standard_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_standard_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+
+-- !different_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_different_properties --
+1	Alice	["tag1,tag2"]	{"key1":"value1,key2:value2\\u00042"}
+
+-- !default_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_default_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+
+-- !standard_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_standard_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+
+-- !different_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+
+-- !hive_docker_different_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2\\u00042"}
+
diff --git a/regression-test/suites/external_table_p0/hive/ddl/test_hive_ddl_text_format.groovy b/regression-test/suites/external_table_p0/hive/ddl/test_hive_ddl_text_format.groovy
index aaa5b198e69c85..730db1247cdd07 100644
--- a/regression-test/suites/external_table_p0/hive/ddl/test_hive_ddl_text_format.groovy
+++ b/regression-test/suites/external_table_p0/hive/ddl/test_hive_ddl_text_format.groovy
@@ -17,62 +17,137 @@
 suite("test_hive_ddl_text_format", "p0,external,hive,external_docker,external_docker_hive") {
     String enabled = context.config.otherConfigs.get("enableHiveTest")
-    if (enabled != null && enabled.equalsIgnoreCase("true")) {
-        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
-        String hms_port = context.config.otherConfigs.get("hive3HmsPort")
-        String hdfs_port = context.config.otherConfigs.get("hive3HdfsPort")
-        String catalog_name = "test_hive_ddl_text_format"
-        String table_name = "table_with_pars";
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2", "hive3"]) {
+        setHivePrefix(hivePrefix)
+        try {
+            String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+            String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+            String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
+            String catalog_name = "test_hive_ddl_text_format"
+            String table_name = "table_with_pars";
 
-        sql """drop catalog if exists ${catalog_name};"""
+            sql """drop catalog if exists ${catalog_name};"""
 
-        sql """
-            create catalog if not exists ${catalog_name} properties (
-                'type'='hms',
-                'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}',
-                'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}',
-                'use_meta_cache' = 'true'
+            sql """
+                create catalog if not exists ${catalog_name} properties (
+                    'type'='hms',
+                    'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}',
+                    'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}',
+                    'use_meta_cache' = 'true'
+                );
+            """
+            logger.info("catalog " + catalog_name + " created")
+            sql """switch ${catalog_name};"""
+            logger.info("switched to catalog " + catalog_name)
+            sql """use `default`;"""
+
+            sql """ drop table if exists text_table_default_properties """
+            sql """
+                create table text_table_default_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'file_format'='text'
             );
-        """
-        logger.info("catalog " + catalog_name + " created")
-        sql """switch ${catalog_name};"""
-        logger.info("switched to catalog " + catalog_name)
-        sql """use `default`;"""
+            """
+            sql """
+                INSERT INTO text_table_default_properties VALUES
+                (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')),
+                (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')),
+                (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD'));
+            """
+            order_qt_default_properties """ select * from text_table_default_properties """
+
+            order_qt_hive_docker_default_properties """ select * from text_table_default_properties """
 
-        sql """ drop table if exists tb_text """
-        sql """
-            create table tb_text (
-                id int,
-                `name` string
-            ) PROPERTIES (
-                'compression'='gzip',
-                'file_format'='text',
-                'field.delim'='\t',
-                'line.delim'='\n',
-                'collection.delim'=';',
-                'mapkey.delim'=':',
-                'serialization.null.format'='\\N'
-            );
-        """
+            sql """ drop table if exists text_table_standard_properties """
+            // Escape characters need to be considered in groovy scripts
+            sql """
+                create table text_table_standard_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'compression'='plain',
+                    'file_format'='text',
+                    'field.delim'='\\1',
+                    'line.delim'='\\n',
+                    'collection.delim'='\\2',
+                    'mapkey.delim'='\\3',
+                    'escape.delim'='\\\\',
+                    'serialization.null.format'='\\\\N'
+                );
+            """
+            sql """
+                INSERT INTO text_table_standard_properties VALUES
+                (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')),
+                (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')),
+                (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD'));
+            """
+            order_qt_standard_properties """ select * from text_table_standard_properties """
+            order_qt_hive_docker_standard_properties """ select * from text_table_standard_properties order by id; """
+
+            sql """ drop table if exists text_table_different_properties """
+            sql """
+                create table text_table_different_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'compression'='gzip',
+                    'file_format'='text',
+                    'field.delim'='A',
+                    'line.delim'='\\4',
+                    'collection.delim'=',',
+                    'mapkey.delim'=':',
+                    'escape.delim'='|',
+                    'serialization.null.format'='null'
+                );
+            """
sql """ + INSERT INTO text_table_different_properties VALUES + (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')), + (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')), + (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD')); + """ + order_qt_different_properties """ select * from text_table_different_properties """ + order_qt_hive_docker_different_properties """ select * from text_table_different_properties order by id; """ - String serde = "'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'" - String input_format = "'org.apache.hadoop.mapred.TextInputFormat'" - String output_format = "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'" - String doris_fileformat = "'doris.file_format'='text'" - String filed_delim = "'field.delim'" - String line_delim = "'line.delim'" - String mapkey_delim = "'mapkey.delim'" + String serde = "'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'" + String input_format = "'org.apache.hadoop.mapred.TextInputFormat'" + String output_format = "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'" + String doris_fileformat = "'doris.file_format'='text'" + String filed_delim = "'field.delim'" + String line_delim = "'line.delim'" + String mapkey_delim = "'mapkey.delim'" + String collection_delim = "'collection.delim'" + String escape_delim = "'escape.delim'" + String serialization_null_format = "'serialization.null.format'" - def create_tbl_res = sql """ show create table tb_text """ - String res = create_tbl_res.toString() - logger.info("${res}") - assertTrue(res.containsIgnoreCase("${serde}")) - assertTrue(res.containsIgnoreCase("${input_format}")) - assertTrue(res.containsIgnoreCase("${output_format}")) - assertTrue(res.containsIgnoreCase("${doris_fileformat}")) - assertTrue(res.containsIgnoreCase("${filed_delim}")) - assertTrue(res.containsIgnoreCase("${filed_delim}")) - assertTrue(res.containsIgnoreCase("${line_delim}")) - assertTrue(res.containsIgnoreCase("${mapkey_delim}")) + def create_tbl_res = sql """ show create table text_table_standard_properties """ + String res = create_tbl_res.toString() + logger.info("${res}") + assertTrue(res.containsIgnoreCase("${serde}")) + assertTrue(res.containsIgnoreCase("${input_format}")) + assertTrue(res.containsIgnoreCase("${output_format}")) + assertTrue(res.containsIgnoreCase("${doris_fileformat}")) + assertTrue(res.containsIgnoreCase("${filed_delim}")) + assertTrue(res.containsIgnoreCase("${filed_delim}")) + assertTrue(res.containsIgnoreCase("${line_delim}")) + assertTrue(res.containsIgnoreCase("${mapkey_delim}")) + assertTrue(res.containsIgnoreCase("${collection_delim}")) + assertTrue(res.containsIgnoreCase("${escape_delim}")) + assertTrue(res.containsIgnoreCase("${serialization_null_format}")) + } finally { + } } }