From 7f7acab2b97d8556e194a856bec7ec93699ffc05 Mon Sep 17 00:00:00 2001 From: amorynan Date: Wed, 3 Jul 2024 23:28:56 +0800 Subject: [PATCH 1/2] fix string deserialize with unescaped char --- be/src/vec/functions/function_cast.h | 1 + .../data/jsonb_p0/test_jsonb_unescaped.csv | 5 ++ .../test_jsonb_with_unescaped_string.out | 8 +++ .../test_jsonb_with_unescaped_string.groovy | 67 +++++++++++++++++++ 4 files changed, 81 insertions(+) create mode 100644 regression-test/data/jsonb_p0/test_jsonb_unescaped.csv create mode 100644 regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out create mode 100644 regression-test/suites/jsonb_p0/test_jsonb_with_unescaped_string.groovy diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index 17250e10fd77d0..f8d1670c9c8125 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -576,6 +576,7 @@ struct ConvertImplGenericFromString { const bool is_complex = is_complex_type(data_type_to); DataTypeSerDe::FormatOptions format_options; format_options.converted_from_string = true; + format_options.escape_char = '\\'; for (size_t i = 0; i < size; ++i) { const auto& val = col_from_string->get_data_at(i); diff --git a/regression-test/data/jsonb_p0/test_jsonb_unescaped.csv b/regression-test/data/jsonb_p0/test_jsonb_unescaped.csv new file mode 100644 index 00000000000000..e4f859e7511b1b --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_unescaped.csv @@ -0,0 +1,5 @@ +1 \N +2 ['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', '{"x" : 1}'] +3 ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar'] +4 ['\/some\/cool\/url', '/some/cool/url', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e'] +5 ["\"双引号\"", "反斜\\线"] \ No newline at end of file diff --git a/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out b/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out new file mode 100644 index 00000000000000..59a34069882023 --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out @@ -0,0 +1,8 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- +1 \N +2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "{"x" : 1}"] +3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] +4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] +5 [""双引号"", "反斜\\线"] + diff --git a/regression-test/suites/jsonb_p0/test_jsonb_with_unescaped_string.groovy b/regression-test/suites/jsonb_p0/test_jsonb_with_unescaped_string.groovy new file mode 100644 index 00000000000000..59cc79915411c7 --- /dev/null +++ b/regression-test/suites/jsonb_p0/test_jsonb_with_unescaped_string.groovy @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.codehaus.groovy.runtime.IOGroovyMethods + +suite("test_jsonb_with_unescaped_string", "p0") { + + // define a sql table with array which has some Escape Character and should also to cast to json + def testTable = "tbl_unescaped_jsonb" + def dataFile = "test_jsonb_unescaped.csv" + + sql """ set experimental_enable_nereids_planner = true """ + sql """ set enable_fallback_to_original_planner = true """ + + sql "DROP TABLE IF EXISTS ${testTable}" + + sql """ + CREATE TABLE IF NOT EXISTS ${testTable} ( + id INT, + a ARRAY, + ) + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + // load the jsonb data from csv file + streamLoad { + table testTable + + file dataFile // import csv file + time 10000 // limit inflight 10s + set 'strict_mode', 'true' + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals(5, json.NumberTotalRows) + assertEquals(5, json.NumberLoadedRows) + assertTrue(json.LoadBytes > 0) + } + } + + sql """ sync; """ + + // check result + qt_select "SELECT * FROM ${testTable} ORDER BY id" +} \ No newline at end of file From 21cad0711756cbe1979fb1ccf693d92222ce054f Mon Sep 17 00:00:00 2001 From: amorynan Date: Wed, 3 Jul 2024 23:48:38 +0800 Subject: [PATCH 2/2] add json format test --- .../data/jsonb_p0/test_jsonb_unescaped.json | 5 +++ .../test_jsonb_with_unescaped_string.out | 9 ++++- .../test_jsonb_with_unescaped_string.groovy | 34 ++++++++++++++++++- 3 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 regression-test/data/jsonb_p0/test_jsonb_unescaped.json diff --git a/regression-test/data/jsonb_p0/test_jsonb_unescaped.json b/regression-test/data/jsonb_p0/test_jsonb_unescaped.json new file mode 100644 index 00000000000000..de718c8efdea5a --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_unescaped.json @@ -0,0 +1,5 @@ +{"id":1,"a":null} +{"id":2,"a":['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', \'{"x" : 1}']} +{"id":3,"a":['foo\'bar', 'foo\"bar', 'foo\\\'bar', 'foo\'\'bar']} +{"id":4,"a":['\/some\/cool\/url', '/some/cool/url', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e']} +{"id":5,"a":["\"双引号\"", "反斜\\线"]} \ No newline at end of file diff --git a/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out b/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out index 59a34069882023..99fb23ef9eed17 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out +++ b/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out @@ -1,8 +1,15 @@ -- This file is automatically generated. You should know what you did if you want to edit this --- !select -- +-- !select_csv -- 1 \N 2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "{"x" : 1}"] 3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] 4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] 5 [""双引号"", "反斜\\线"] +-- !select_json -- +1 \N +2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "'{"x" : 1}'"] +3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] +4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] +5 [""双引号"", "反斜\\线"] + diff --git a/regression-test/suites/jsonb_p0/test_jsonb_with_unescaped_string.groovy b/regression-test/suites/jsonb_p0/test_jsonb_with_unescaped_string.groovy index 59cc79915411c7..b728c46cb20c93 100644 --- a/regression-test/suites/jsonb_p0/test_jsonb_with_unescaped_string.groovy +++ b/regression-test/suites/jsonb_p0/test_jsonb_with_unescaped_string.groovy @@ -22,6 +22,7 @@ suite("test_jsonb_with_unescaped_string", "p0") { // define a sql table with array which has some Escape Character and should also to cast to json def testTable = "tbl_unescaped_jsonb" def dataFile = "test_jsonb_unescaped.csv" + def dataFileJson = "test_jsonb_unescaped.json" sql """ set experimental_enable_nereids_planner = true """ sql """ set enable_fallback_to_original_planner = true """ @@ -63,5 +64,36 @@ suite("test_jsonb_with_unescaped_string", "p0") { sql """ sync; """ // check result - qt_select "SELECT * FROM ${testTable} ORDER BY id" + qt_select_csv "SELECT * FROM ${testTable} ORDER BY id" + + sql "truncate table ${testTable}" + // load the jsonb data from json file + streamLoad { + table testTable + + file dataFileJson // import json file + time 10000 // limit inflight 10s + set 'format', 'json' // import format + set 'read_json_by_line', 'true' // read json by line + set 'strict_mode', 'true' + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals(5, json.NumberTotalRows) + assertEquals(5, json.NumberLoadedRows) + assertTrue(json.LoadBytes > 0) + } + } + + + sql """ sync; """ + + // check result + qt_select_json "SELECT * FROM ${testTable} ORDER BY id" } \ No newline at end of file