From c4e27384643154f884d1dbfd87dd4209a21f6687 Mon Sep 17 00:00:00 2001 From: amorynan Date: Tue, 2 Jul 2024 20:49:17 +0800 Subject: [PATCH 1/5] support json string format --- .../data_types/serde/data_type_string_serde.h | 43 ++++++++-- .../data/jsonb_p0/test_jsonb_cast.csv | 4 + .../data/jsonb_p0/test_jsonb_cast.out | 27 +++++++ .../suites/jsonb_p0/test_jsonb_cast.groovy | 79 +++++++++++++++++++ 4 files changed, 148 insertions(+), 5 deletions(-) create mode 100644 regression-test/data/jsonb_p0/test_jsonb_cast.csv create mode 100644 regression-test/data/jsonb_p0/test_jsonb_cast.out create mode 100644 regression-test/suites/jsonb_p0/test_jsonb_cast.groovy diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index b74b585708623f..60182e8c86e35e 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -73,15 +73,48 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { auto result = check_column_const_set_readability(column, row_num); ColumnPtr ptr = result.first; row_num = result.second; + const auto& value = assert_cast(*ptr).get_data_at(row_num); if (_nesting_level > 1) { + // _nested_level > 1 means string is in a complex type, we add double quotes, and escape + // which should make deal with some special characters in json str bw.write('"'); - } - - const auto& value = assert_cast(*ptr).get_data_at(row_num); - bw.write(value.data, value.size); - if (_nesting_level > 1) { + if constexpr (std::is_same_v) { + // we should make deal with some special characters in json str + StringRef str_ref = value; + for (char it : str_ref) { + switch (it) { + case '\b': + bw.write("\\b", 2); + break; + case '\f': + bw.write("\\f", 2); + break; + case '\n': + bw.write("\\n", 2); + break; + case '\r': + bw.write("\\r", 2); + break; + case '\t': + bw.write("\\t", 2); + break; + case '\\': + bw.write("\\\\", 2); + break; + case '"': + bw.write("\\\"", 2); + break; + default: + bw.write(it); + } + } + } else { + bw.write(value.data, value.size); + } bw.write('"'); + } else { + bw.write(value.data, value.size); } return Status::OK(); } diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.csv b/regression-test/data/jsonb_p0/test_jsonb_cast.csv new file mode 100644 index 00000000000000..08b694ddea822f --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.csv @@ -0,0 +1,4 @@ +1 \N +2 ['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', '{"x" : 1}'] +3 ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar'] +4 ['\/some\/cool\/url', '/some/cool/url', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e'] \ No newline at end of file diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.out b/regression-test/data/jsonb_p0/test_jsonb_cast.out new file mode 100644 index 00000000000000..0b572943601d90 --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out @@ -0,0 +1,27 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_1 -- +1 \N +2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"] +3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"] +4 ["\\/some\\/cool\\/url", "/some/cool/url", "a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] + +-- !select_2 -- +1 \N +2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"] +3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"] +4 ["\\/some\\/cool\\/url", "/some/cool/url", "a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] +27 ["{"k1":"v1", "k2": 200}"] +28 ["{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}"] +29 [" \n\r", " \n\r"] +30 ["f\r\n", "f\r\n""] + +-- !select_json -- +1 \N +2 ["{\\\\'x\\\\' : \\\\'{\\"y\\" : 1}\\\\', \\\\'t\\\\' : \\\\'{\\"y\\" : 2}\\\\'}","{\\"x\\" : 1}"] +3 ["foo\\\\'bar', 'foo\\"bar', 'foo\\\\\\\\'bar', 'foo\\\\'\\\\'bar"] +4 ["\\\\some\\\\cool\\\\url","somecoolurl","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"] +27 ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"] +28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\": 300},\\"a\\":\\"niu\\"}"] +29 ["\\f\\n\\r","\\f\\n\\r"] +30 ["f\\b\\r\\n","f\\b\\r\\n\\""] + diff --git a/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy new file mode 100644 index 00000000000000..4d1b2aa7181923 --- /dev/null +++ b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.codehaus.groovy.runtime.IOGroovyMethods + +suite("test_jsonb_cast", "p0") { + + // define a sql table with array which has some Escape Character and should also to cast to json + def testTable = "tbl_test_array_text_cast_jsonb" + def dataFile = "test_jsonb_cast.csv" + + sql """ set experimental_enable_nereids_planner = true """ + sql """ set enable_fallback_to_original_planner = true """ + + sql "DROP TABLE IF EXISTS ${testTable}" + + sql """ + CREATE TABLE IF NOT EXISTS ${testTable} ( + id INT, + a ARRAY, + ) + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + // load the jsonb data from csv file + streamLoad { + table testTable + + file dataFile // import csv file + time 10000 // limit inflight 10s + set 'strict_mode', 'true' + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals(4, json.NumberTotalRows) + assertEquals(4, json.NumberLoadedRows) + assertTrue(json.LoadBytes > 0) + } + } + + sql """ sync; """ + + // check result + qt_select_1 "SELECT * FROM ${testTable} ORDER BY id" + + + // insert into valid json rows + sql """INSERT INTO ${testTable} VALUES(27, ['{"k1":"v1", "k2": 200}'])""" + sql """INSERT INTO ${testTable} VALUES(28, ['{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}'])""" + sql """INSERT INTO ${testTable} VALUES(29, ['\f\n\r', "\f\n\r"])""" + sql """INSERT INTO ${testTable} VALUES(30, ["\\f\\b\\r\\n", '\\f\\b\\r\\n"'])""" + + // check result + qt_select_2 "SELECT * FROM ${testTable} ORDER BY id" + // check cast as json + qt_select_json "SELECT id, cast(a as JSON) FROM ${testTable} ORDER BY id" +} \ No newline at end of file From 53b0aa1a4937f0401c71c7cb812e72e1378d9b0c Mon Sep 17 00:00:00 2001 From: amorynan Date: Wed, 3 Jul 2024 17:29:41 +0800 Subject: [PATCH 2/5] update with option param for escape char --- .../vec/data_types/serde/data_type_string_serde.h | 13 ++++++++----- be/src/vec/functions/function_cast.h | 1 + regression-test/data/jsonb_p0/test_jsonb_cast.out | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index 60182e8c86e35e..ae988a45775eec 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -76,11 +76,11 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { const auto& value = assert_cast(*ptr).get_data_at(row_num); if (_nesting_level > 1) { - // _nested_level > 1 means string is in a complex type, we add double quotes, and escape - // which should make deal with some special characters in json str bw.write('"'); - if constexpr (std::is_same_v) { - // we should make deal with some special characters in json str + } + if constexpr (std::is_same_v) { + if (options.escape_char != 0) { + // we should make deal with some special characters in json str if we have escape_char StringRef str_ref = value; for (char it : str_ref) { switch (it) { @@ -112,10 +112,13 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { } else { bw.write(value.data, value.size); } - bw.write('"'); } else { bw.write(value.data, value.size); } + if (_nesting_level > 1) { + bw.write('"'); + } + return Status::OK(); } diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index 1f02381e6b1221..eee6b9cbfae74a 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -785,6 +785,7 @@ struct ConvertImplGenericToJsonb { auto tmp_col = ColumnString::create(); vectorized::DataTypeSerDe::FormatOptions options; + options.escape_char = '\\'; for (size_t i = 0; i < input_rows_count; i++) { // convert to string tmp_col->clear(); diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.out b/regression-test/data/jsonb_p0/test_jsonb_cast.out index 0b572943601d90..2ab4174c746d6a 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_cast.out +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out @@ -19,7 +19,7 @@ 1 \N 2 ["{\\\\'x\\\\' : \\\\'{\\"y\\" : 1}\\\\', \\\\'t\\\\' : \\\\'{\\"y\\" : 2}\\\\'}","{\\"x\\" : 1}"] 3 ["foo\\\\'bar', 'foo\\"bar', 'foo\\\\\\\\'bar', 'foo\\\\'\\\\'bar"] -4 ["\\\\some\\\\cool\\\\url","somecoolurl","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"] +4 ["\\\\/some\\\\/cool\\\\/url","/some/cool/url","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"] 27 ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"] 28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\": 300},\\"a\\":\\"niu\\"}"] 29 ["\\f\\n\\r","\\f\\n\\r"] From c35c1c3e09458ab31c2ab9b4ee86c33420f74287 Mon Sep 17 00:00:00 2001 From: amorynan Date: Thu, 4 Jul 2024 09:57:26 +0800 Subject: [PATCH 3/5] fix comment --- .../data_types/serde/data_type_string_serde.h | 58 ++++++++++--------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index ae988a45775eec..503e56d855d3b4 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -82,33 +82,7 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { if (options.escape_char != 0) { // we should make deal with some special characters in json str if we have escape_char StringRef str_ref = value; - for (char it : str_ref) { - switch (it) { - case '\b': - bw.write("\\b", 2); - break; - case '\f': - bw.write("\\f", 2); - break; - case '\n': - bw.write("\\n", 2); - break; - case '\r': - bw.write("\\r", 2); - break; - case '\t': - bw.write("\\t", 2); - break; - case '\\': - bw.write("\\\\", 2); - break; - case '"': - bw.write("\\\"", 2); - break; - default: - bw.write(it); - } - } + write_with_escaped_char_to_json(str_ref, bw); } else { bw.write(value.data, value.size); } @@ -122,6 +96,36 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { return Status::OK(); } + inline void write_with_escaped_char_to_json(StringRef value, BufferWritable& bw) { + for (char it : value) { + switch (it) { + case '\b': + bw.write("\\b", 2); + break; + case '\f': + bw.write("\\f", 2); + break; + case '\n': + bw.write("\\n", 2); + break; + case '\r': + bw.write("\\r", 2); + break; + case '\t': + bw.write("\\t", 2); + break; + case '\\': + bw.write("\\\\", 2); + break; + case '"': + bw.write("\\\"", 2); + break; + default: + bw.write(it); + } + } + } + Status serialize_column_to_json(const IColumn& column, int start_idx, int end_idx, BufferWritable& bw, FormatOptions& options) const override { SERIALIZE_COLUMN_TO_JSON(); From e6e50b0095f8edd1c5ba91232ae6b65e19153e41 Mon Sep 17 00:00:00 2001 From: amorynan Date: Thu, 4 Jul 2024 10:12:48 +0800 Subject: [PATCH 4/5] fix compile --- be/src/vec/data_types/serde/data_type_string_serde.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index 503e56d855d3b4..24f99a12e67231 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -96,7 +96,7 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { return Status::OK(); } - inline void write_with_escaped_char_to_json(StringRef value, BufferWritable& bw) { + inline void write_with_escaped_char_to_json(StringRef value, BufferWritable& bw) const { for (char it : value) { switch (it) { case '\b': From 72f29232739ba72e3c990710b8cee50496456d74 Mon Sep 17 00:00:00 2001 From: amorynan Date: Fri, 2 Aug 2024 09:46:53 +0800 Subject: [PATCH 5/5] fix out file --- .../data/jsonb_p0/test_jsonb_cast.out | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.out b/regression-test/data/jsonb_p0/test_jsonb_cast.out index 2ab4174c746d6a..09278359b08078 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_cast.out +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out @@ -1,15 +1,15 @@ -- This file is automatically generated. You should know what you did if you want to edit this -- !select_1 -- 1 \N -2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"] -3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"] -4 ["\\/some\\/cool\\/url", "/some/cool/url", "a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] +2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "{"x" : 1}"] +3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] +4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] -- !select_2 -- 1 \N -2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"] -3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"] -4 ["\\/some\\/cool\\/url", "/some/cool/url", "a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] +2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "{"x" : 1}"] +3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] +4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] 27 ["{"k1":"v1", "k2": 200}"] 28 ["{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}"] 29 [" \n\r", " \n\r"] @@ -17,9 +17,9 @@ -- !select_json -- 1 \N -2 ["{\\\\'x\\\\' : \\\\'{\\"y\\" : 1}\\\\', \\\\'t\\\\' : \\\\'{\\"y\\" : 2}\\\\'}","{\\"x\\" : 1}"] -3 ["foo\\\\'bar', 'foo\\"bar', 'foo\\\\\\\\'bar', 'foo\\\\'\\\\'bar"] -4 ["\\\\/some\\\\/cool\\\\/url","/some/cool/url","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"] +2 ["{'x' : '{\\"y\\" : 1}', 't' : '{\\"y\\" : 2}'}","{\\"x\\" : 1}"] +3 ["foo'bar', 'foo\\"bar', 'foo\\\\'bar', 'foo''bar"] +4 ["/some/cool/url","/some/cool/url","a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] 27 ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"] 28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\": 300},\\"a\\":\\"niu\\"}"] 29 ["\\f\\n\\r","\\f\\n\\r"]