diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt index c5efb6afadcc..36b84ca5cd8f 100644 --- a/programs/library-bridge/CMakeLists.txt +++ b/programs/library-bridge/CMakeLists.txt @@ -19,6 +19,7 @@ target_link_libraries(clickhouse-library-bridge PRIVATE daemon dbms bridge + clickhouse_functions_extractkeyvaluepairs ) set_target_properties(clickhouse-library-bridge PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..) diff --git a/programs/odbc-bridge/CMakeLists.txt b/programs/odbc-bridge/CMakeLists.txt index f0c622af587a..0d105dafc92d 100644 --- a/programs/odbc-bridge/CMakeLists.txt +++ b/programs/odbc-bridge/CMakeLists.txt @@ -22,6 +22,7 @@ target_link_libraries(clickhouse-odbc-bridge PRIVATE dbms bridge clickhouse_parsers + clickhouse_functions_extractkeyvaluepairs ch_contrib::nanodbc ch_contrib::unixodbc ) diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 582e10b1c372..6ee10afc8b07 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -289,7 +289,7 @@ const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const } -const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const +const ColumnWithTypeAndName * Block::findByName(const std::string_view & name, bool case_insensitive) const { if (case_insensitive) { @@ -309,6 +309,11 @@ const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool c return &data[it->second]; } +const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const +{ + return findByName(std::string_view{name}, case_insensitive); +} + std::optional Block::findSubcolumnByName(const std::string & name) const { auto [name_in_storage, subcolumn_name] = Nested::splitName(name); diff --git a/src/Core/Block.h b/src/Core/Block.h index ae907c5ff624..363a0d9e682c 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -7,6 +7,7 @@ #include #include +#include class SipHash; @@ -30,7 +31,7 @@ class Block { private: using Container = ColumnsWithTypeAndName; - using IndexByName = std::unordered_map; + using IndexByName = std::unordered_map; Container data; IndexByName index_by_name; @@ -70,6 +71,14 @@ class Block const_cast(this)->findByName(name, case_insensitive)); } + ColumnWithTypeAndName* findByName(const std::string_view & name, bool case_insensitive = false) + { + return const_cast( + const_cast(this)->findByName(name, case_insensitive)); + } + + const ColumnWithTypeAndName * findByName(const std::string_view & name, bool case_insensitive) const; + const ColumnWithTypeAndName * findByName(const std::string & name, bool case_insensitive = false) const; std::optional findSubcolumnByName(const std::string & name) const; std::optional findColumnOrSubcolumnByName(const std::string & name) const; diff --git a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp index 7197b5f75d52..fd11ca929a56 100644 --- a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp @@ -10,7 +10,6 @@ #include -#include #include #include @@ -29,11 +28,6 @@ class ExtractKeyValuePairs : public IFunction { auto builder = KeyValuePairExtractorBuilder(); - if constexpr (WITH_ESCAPING) - { - builder.withEscaping(); - } - if (parsed_arguments.key_value_delimiter) { builder.withKeyValueDelimiter(parsed_arguments.key_value_delimiter.value()); @@ -56,10 +50,17 @@ class ExtractKeyValuePairs : public IFunction builder.withMaxNumberOfPairs(context->getSettingsRef()[Setting::extract_key_value_pairs_max_pairs_per_row]); } - return builder.build(); + if constexpr (WITH_ESCAPING) + { + return builder.buildWithEscaping(); + } + else + { + return builder.buildWithoutEscaping(); + } } - ColumnPtr extract(ColumnPtr data_column, std::shared_ptr extractor, size_t input_rows_count) const + ColumnPtr extract(ColumnPtr data_column, auto & extractor, size_t input_rows_count) const { auto offsets = ColumnUInt64::create(); @@ -72,7 +73,7 @@ class ExtractKeyValuePairs : public IFunction { auto row = data_column->getDataAt(i).toView(); - auto pairs_count = extractor->extract(row, keys, values); + auto pairs_count = extractor.extract(row, keys, values); offset += pairs_count; diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index 3895cf3e77db..d49375b03071 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -5,7 +5,8 @@ #include #include -#include +#include +#include namespace DB { @@ -16,37 +17,36 @@ namespace ErrorCodes extern const int LIMIT_EXCEEDED; } +namespace extractKV +{ /* * Handle state transitions and a few states like `FLUSH_PAIR` and `END`. * */ template -class CHKeyValuePairExtractor : public KeyValuePairExtractor +class KeyValuePairExtractor { using State = typename DB::extractKV::StateHandler::State; using NextState = DB::extractKV::StateHandler::NextState; public: - explicit CHKeyValuePairExtractor(StateHandler state_handler_, uint64_t max_number_of_pairs_) - : state_handler(std::move(state_handler_)), max_number_of_pairs(max_number_of_pairs_) - {} + using PairWriter = typename StateHandler::PairWriter; - uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override + KeyValuePairExtractor(const Configuration & configuration_, uint64_t max_number_of_pairs_) + : state_handler(StateHandler(configuration_)) + , max_number_of_pairs(max_number_of_pairs_) { - return extract(std::string_view {data}, keys, values); } - uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override +protected: + uint64_t extractImpl(std::string_view data, typename StateHandler::PairWriter & pair_writer) { auto state = State::WAITING_KEY; - auto key = typename StateHandler::StringWriter(*keys); - auto value = typename StateHandler::StringWriter(*values); - uint64_t row_offset = 0; while (state != State::END) { - auto next_state = processState(data, state, key, value, row_offset); + auto next_state = processState(data, state, pair_writer, row_offset); if (next_state.position_in_string > data.size() && next_state.state != State::END) { @@ -61,14 +61,13 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor } // below reset discards invalid keys and values - reset(key, value); + reset(pair_writer); return row_offset; } private: - - NextState processState(std::string_view file, State state, auto & key, auto & value, uint64_t & row_offset) + NextState processState(std::string_view file, State state, auto & pair_writer, uint64_t & row_offset) { switch (state) { @@ -78,11 +77,11 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor } case State::READING_KEY: { - return state_handler.readKey(file, key); + return state_handler.readKey(file, pair_writer); } case State::READING_QUOTED_KEY: { - return state_handler.readQuotedKey(file, key); + return state_handler.readQuotedKey(file, pair_writer); } case State::READING_KV_DELIMITER: { @@ -94,15 +93,15 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor } case State::READING_VALUE: { - return state_handler.readValue(file, value); + return state_handler.readValue(file, pair_writer); } case State::READING_QUOTED_VALUE: { - return state_handler.readQuotedValue(file, value); + return state_handler.readQuotedValue(file, pair_writer); } case State::FLUSH_PAIR: { - return flushPair(file, key, value, row_offset); + return flushPair(file, pair_writer, row_offset); } case State::END: { @@ -111,8 +110,7 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor } } - NextState flushPair(const std::string_view & file, auto & key, - auto & value, uint64_t & row_offset) + NextState flushPair(const std::string_view & file, auto & pair_writer, uint64_t & row_offset) { row_offset++; @@ -121,16 +119,16 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); } - key.commit(); - value.commit(); + pair_writer.commitKey(); + pair_writer.commitValue(); return {0, file.empty() ? State::END : State::WAITING_KEY}; } - void reset(auto & key, auto & value) + void reset(auto & pair_writer) { - key.reset(); - value.reset(); + pair_writer.resetKey(); + pair_writer.resetValue(); } StateHandler state_handler; @@ -138,3 +136,44 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor }; } + +struct KeyValuePairExtractorNoEscaping : extractKV::KeyValuePairExtractor +{ + using StateHandler = extractKV::NoEscapingStateHandler; + explicit KeyValuePairExtractorNoEscaping(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_) + : KeyValuePairExtractor(configuration_, max_number_of_pairs_) {} + + uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) + { + auto pair_writer = typename StateHandler::PairWriter(*keys, *values); + return extractImpl(data, pair_writer); + } +}; + +struct KeyValuePairExtractorInlineEscaping : extractKV::KeyValuePairExtractor +{ + using StateHandler = extractKV::InlineEscapingStateHandler; + explicit KeyValuePairExtractorInlineEscaping(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_) + : KeyValuePairExtractor(configuration_, max_number_of_pairs_) {} + + uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) + { + auto pair_writer = typename StateHandler::PairWriter(*keys, *values); + return extractImpl(data, pair_writer); + } +}; + +struct KeyValuePairExtractorReferenceMap : extractKV::KeyValuePairExtractor +{ + using StateHandler = extractKV::ReferencesMapStateHandler; + explicit KeyValuePairExtractorReferenceMap(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_) + : KeyValuePairExtractor(configuration_, max_number_of_pairs_) {} + + uint64_t extract(std::string_view data, absl::flat_hash_map & map) + { + auto pair_writer = typename StateHandler::PairWriter(map); + return extractImpl(data, pair_writer); + } +}; + +} diff --git a/src/Functions/keyvaluepair/impl/DuplicateKeyFoundException.h b/src/Functions/keyvaluepair/impl/DuplicateKeyFoundException.h new file mode 100644 index 000000000000..b7d1cc5fb4a8 --- /dev/null +++ b/src/Functions/keyvaluepair/impl/DuplicateKeyFoundException.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace DB +{ + +namespace extractKV +{ + +struct DuplicateKeyFoundException : Exception +{ + explicit DuplicateKeyFoundException(std::string_view key_) : key(key_) {} + + std::string_view key; +}; + +} + +} diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h deleted file mode 100644 index 5fd77ce9a994..000000000000 --- a/src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include - -#include -#include - -namespace DB -{ - -struct KeyValuePairExtractor -{ - virtual ~KeyValuePairExtractor() = default; - - virtual uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) = 0; - - virtual uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) = 0; -}; - -} diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp index 7f2a6449ab0c..6e61efc4e15a 100644 --- a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp @@ -1,7 +1,5 @@ #include -#include -#include #include namespace DB @@ -25,52 +23,10 @@ KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withQuotingCharacte return *this; } -KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withEscaping() -{ - with_escaping = true; - return *this; -} - KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withMaxNumberOfPairs(uint64_t max_number_of_pairs_) { max_number_of_pairs = max_number_of_pairs_; return *this; } -std::shared_ptr KeyValuePairExtractorBuilder::build() const -{ - if (with_escaping) - { - return buildWithEscaping(); - } - - return buildWithoutEscaping(); -} - -namespace -{ -using namespace extractKV; - -template -auto makeStateHandler(const T && handler, uint64_t max_number_of_pairs) -{ - return std::make_shared>(handler, max_number_of_pairs); -} - -} - -std::shared_ptr KeyValuePairExtractorBuilder::buildWithoutEscaping() const -{ - auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_delimiter, quoting_character, item_delimiters); - - return makeStateHandler(NoEscapingStateHandler(configuration), max_number_of_pairs); -} - -std::shared_ptr KeyValuePairExtractorBuilder::buildWithEscaping() const -{ - auto configuration = ConfigurationFactory::createWithEscaping(key_value_delimiter, quoting_character, item_delimiters); - - return makeStateHandler(InlineEscapingStateHandler(configuration), max_number_of_pairs); -} - } diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h index 0c673f12ccf9..c22fe975d0b2 100644 --- a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h @@ -1,7 +1,9 @@ #pragma once -#include #include +#include +#include +#include namespace DB { @@ -18,22 +20,34 @@ class KeyValuePairExtractorBuilder KeyValuePairExtractorBuilder & withQuotingCharacter(char quoting_character_); - KeyValuePairExtractorBuilder & withEscaping(); - KeyValuePairExtractorBuilder & withMaxNumberOfPairs(uint64_t max_number_of_pairs_); - std::shared_ptr build() const; + auto buildWithoutEscaping() const + { + auto configuration = extractKV::ConfigurationFactory::createWithoutEscaping(key_value_delimiter, quoting_character, item_delimiters); + + return KeyValuePairExtractorNoEscaping(configuration, max_number_of_pairs); + } + + auto buildWithEscaping() const + { + auto configuration = extractKV::ConfigurationFactory::createWithEscaping(key_value_delimiter, quoting_character, item_delimiters); + + return KeyValuePairExtractorInlineEscaping(configuration, max_number_of_pairs); + } + + auto buildWithReferenceMap() const + { + auto configuration = extractKV::ConfigurationFactory::createWithoutEscaping(key_value_delimiter, quoting_character, item_delimiters); + + return KeyValuePairExtractorReferenceMap(configuration, max_number_of_pairs); + } private: - bool with_escaping = false; char key_value_delimiter = ':'; char quoting_character = '"'; std::vector item_delimiters = {' ', ',', ';'}; uint64_t max_number_of_pairs = std::numeric_limits::max(); - - std::shared_ptr buildWithEscaping() const; - - std::shared_ptr buildWithoutEscaping() const; }; } diff --git a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h index 521dd09c18ae..a7169f21a8fa 100644 --- a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h +++ b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -12,6 +13,8 @@ #include #include #include +#include + namespace DB { @@ -70,9 +73,9 @@ class StateHandlerImpl : public StateHandler * Find first delimiter of interest (`read_needles`). Valid symbols are either `key_value_delimiter` and `escape_character` if escaping * support is on. If it finds a pair delimiter, it discards the key. * */ - [[nodiscard]] NextState readKey(std::string_view file, auto & key) const + [[nodiscard]] NextState readKey(std::string_view file, auto & pair_writer) const { - key.reset(); + pair_writer.resetKey(); size_t pos = 0; @@ -85,7 +88,7 @@ class StateHandlerImpl : public StateHandler { if constexpr (WITH_ESCAPING) { - auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, key); + auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, pair_writer); next_pos = character_position + escape_sequence_length; if (!parsed_successfully) @@ -96,7 +99,7 @@ class StateHandlerImpl : public StateHandler } else if (isKeyValueDelimiter(*p)) { - key.append(file.data() + pos, file.data() + character_position); + pair_writer.appendKey(file.data() + pos, file.data() + character_position); return {next_pos, State::WAITING_VALUE}; } @@ -118,9 +121,9 @@ class StateHandlerImpl : public StateHandler /* * Search for closing quoting character and process escape sequences along the way (if escaping support is turned on). * */ - [[nodiscard]] NextState readQuotedKey(std::string_view file, auto & key) const + [[nodiscard]] NextState readQuotedKey(std::string_view file, auto & pair_writer) const { - key.reset(); + pair_writer.resetKey(); size_t pos = 0; @@ -133,7 +136,7 @@ class StateHandlerImpl : public StateHandler { if constexpr (WITH_ESCAPING) { - auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, key); + auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, pair_writer); next_pos = character_position + escape_sequence_length; if (!parsed_successfully) @@ -144,9 +147,9 @@ class StateHandlerImpl : public StateHandler } else if (isQuotingCharacter(*p)) { - key.append(file.data() + pos, file.data() + character_position); + pair_writer.appendKey(file.data() + pos, file.data() + character_position); - if (key.isEmpty()) + if (pair_writer.isKeyEmpty()) { return {next_pos, State::WAITING_KEY}; } @@ -211,9 +214,9 @@ class StateHandlerImpl : public StateHandler * Finds next delimiter of interest (`read_needles`). Valid symbols are either `pair_delimiter` and `escape_character` if escaping * support is on. If it finds a `key_value_delimiter`, it discards the value. * */ - [[nodiscard]] NextState readValue(std::string_view file, auto & value) const + [[nodiscard]] NextState readValue(std::string_view file, auto & pair_writer) const { - value.reset(); + pair_writer.resetValue(); size_t pos = 0; @@ -226,7 +229,7 @@ class StateHandlerImpl : public StateHandler { if constexpr (WITH_ESCAPING) { - auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, value); + auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, pair_writer); next_pos = character_position + escape_sequence_length; if (!parsed_successfully) @@ -238,7 +241,7 @@ class StateHandlerImpl : public StateHandler } else if (isPairDelimiter(*p)) { - value.append(file.data() + pos, file.data() + character_position); + pair_writer.appendValue(file.data() + pos, file.data() + character_position); return {next_pos, State::FLUSH_PAIR}; } @@ -247,18 +250,18 @@ class StateHandlerImpl : public StateHandler } // Reached end of input, consume rest of the file as value and make sure KV pair is produced. - value.append(file.data() + pos, file.data() + file.size()); + pair_writer.appendValue(file.data() + pos, file.data() + file.size()); return {file.size(), State::FLUSH_PAIR}; } /* * Search for closing quoting character and process escape sequences along the way (if escaping support is turned on). * */ - [[nodiscard]] NextState readQuotedValue(std::string_view file, auto & value) const + [[nodiscard]] NextState readQuotedValue(std::string_view file, auto & pair_writer) const { size_t pos = 0; - value.reset(); + pair_writer.resetValue(); while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) { @@ -269,7 +272,7 @@ class StateHandlerImpl : public StateHandler { if constexpr (WITH_ESCAPING) { - auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, value); + auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, pair_writer); next_pos = character_position + escape_sequence_length; if (!parsed_successfully) @@ -280,7 +283,7 @@ class StateHandlerImpl : public StateHandler } else if (isQuotingCharacter(*p)) { - value.append(file.data() + pos, file.data() + character_position); + pair_writer.appendValue(file.data() + pos, file.data() + character_position); return {next_pos, State::FLUSH_PAIR}; } @@ -303,16 +306,32 @@ class StateHandlerImpl : public StateHandler * Helper method to copy bytes until `character_pos` and process possible escape sequence. Returns a pair containing a boolean * that indicates success and a std::size_t that contains the number of bytes read/ consumed. * */ + template std::pair consumeWithEscapeSequence(std::string_view file, size_t start_pos, size_t character_pos, auto & output) const { std::string escaped_sequence; DB::ReadBufferFromMemory buf(file.data() + character_pos, file.size() - character_pos); - output.append(file.data() + start_pos, file.data() + character_pos); + if constexpr (isKey) + { + output.appendKey(file.data() + start_pos, file.data() + character_pos); + } + else + { + output.appendValue(file.data() + start_pos, file.data() + character_pos); + } if (DB::parseComplexEscapeSequence(escaped_sequence, buf)) { - output.append(escaped_sequence); + if constexpr (isKey) + { + output.appendKey(escaped_sequence); + } + else + { + output.appendValue(escaped_sequence); + } + return {true, buf.getPosition()}; } @@ -345,58 +364,98 @@ class StateHandlerImpl : public StateHandler struct NoEscapingStateHandler : public StateHandlerImpl { /* - * View based StringWriter, no temporary copies are used. + * View based PairWriter, no temporary copies are used. * */ - class StringWriter + class PairWriter { - ColumnString & col; + ColumnString & key_col; + ColumnString & value_col; - std::string_view element; + std::string_view key; + std::string_view value; public: - explicit StringWriter(ColumnString & col_) - : col(col_) + PairWriter(ColumnString & key_col_, ColumnString & value_col_) + : key_col(key_col_), value_col(value_col_) {} - ~StringWriter() + ~PairWriter() { // Make sure that ColumnString invariants are not broken. - if (!isEmpty()) + if (!isKeyEmpty()) + { + resetKey(); + } + + if (!isValueEmpty()) { - reset(); + resetValue(); } } - void append(std::string_view new_data) + void appendKey(std::string_view new_data) { - element = new_data; + key = new_data; } template - void append(const T * begin, const T * end) + void appendKey(const T * begin, const T * end) { - append({begin, end}); + appendKey({begin, end}); } - void reset() + void appendValue(std::string_view new_data) { - element = {}; + value = new_data; } - bool isEmpty() const + template + void appendValue(const T * begin, const T * end) { - return element.empty(); + appendValue({begin, end}); } - void commit() + void resetKey() { - col.insertData(element.data(), element.size()); - reset(); + key = {}; } - std::string_view uncommittedChunk() const + void resetValue() { - return element; + value = {}; + } + + bool isKeyEmpty() const + { + return key.empty(); + } + + bool isValueEmpty() const + { + return value.empty(); + } + + void commitKey() + { + key_col.insertData(key.data(), key.size()); + resetKey(); + } + + void commitValue() + { + value_col.insertData(value.data(), value.size()); + resetValue(); + } + + + std::string_view uncommittedKeyChunk() const + { + return key; + } + + std::string_view uncommittedValueChunk() const + { + return value; } }; @@ -407,58 +466,102 @@ struct NoEscapingStateHandler : public StateHandlerImpl struct InlineEscapingStateHandler : public StateHandlerImpl { - class StringWriter + class PairWriter { - ColumnString & col; - ColumnString::Chars & chars; - UInt64 prev_commit_pos; + ColumnString & key_col; + ColumnString::Chars & key_chars; + UInt64 key_prev_commit_pos; + + ColumnString & value_col; + ColumnString::Chars & value_chars; + UInt64 value_prev_commit_pos; public: - explicit StringWriter(ColumnString & col_) - : col(col_), - chars(col.getChars()), - prev_commit_pos(chars.size()) + PairWriter(ColumnString & key_col_, ColumnString & value_col_) + : key_col(key_col_), + key_chars(key_col.getChars()), + key_prev_commit_pos(key_chars.size()), + value_col(value_col_), + value_chars(value_col.getChars()), + value_prev_commit_pos(value_chars.size()) {} - ~StringWriter() + ~PairWriter() { // Make sure that ColumnString invariants are not broken. - if (!isEmpty()) + if (!isKeyEmpty()) + { + resetKey(); + } + + if (!isValueEmpty()) { - reset(); + resetValue(); } } - void append(std::string_view new_data) + void appendKey(std::string_view new_data) { - chars.insert(new_data.begin(), new_data.end()); + key_chars.insert(new_data.begin(), new_data.end()); } template - void append(const T * begin, const T * end) + void appendKey(const T * begin, const T * end) + { + key_chars.insert(begin, end); + } + + void appendValue(std::string_view new_data) + { + value_chars.insert(new_data.begin(), new_data.end()); + } + + template + void appendValue(const T * begin, const T * end) + { + value_chars.insert(begin, end); + } + + void resetKey() { - chars.insert(begin, end); + key_chars.resize_assume_reserved(key_prev_commit_pos); } - void reset() + void resetValue() { - chars.resize_assume_reserved(prev_commit_pos); + value_chars.resize_assume_reserved(value_prev_commit_pos); } - bool isEmpty() const + bool isKeyEmpty() const { - return chars.size() == prev_commit_pos; + return key_chars.size() == key_prev_commit_pos; } - void commit() + bool isValueEmpty() const { - col.insertData(nullptr, 0); - prev_commit_pos = chars.size(); + return value_chars.size() == value_prev_commit_pos; } - std::string_view uncommittedChunk() const + void commitKey() { - return std::string_view(chars.raw_data() + prev_commit_pos, chars.raw_data() + chars.size()); + key_col.insertData(nullptr, 0); + key_prev_commit_pos = key_chars.size(); + } + + void commitValue() + { + value_col.insertData(nullptr, 0); + value_prev_commit_pos = value_chars.size(); + } + + std::string_view uncommittedKeyChunk() const + { + return std::string_view(key_chars.raw_data() + key_prev_commit_pos, key_chars.raw_data() + key_chars.size()); + } + + std::string_view uncommittedValueChunk() const + { + return std::string_view(value_chars.raw_data() + value_prev_commit_pos, value_chars.raw_data() + value_chars.size()); } }; @@ -467,6 +570,114 @@ struct InlineEscapingStateHandler : public StateHandlerImpl : StateHandlerImpl(std::forward(args)...) {} }; +struct ReferencesMapStateHandler : public StateHandlerImpl +{ + /* + * View based PairWriter, no copies at all + * */ + class PairWriter + { + absl::flat_hash_map & map; + + std::string_view key; + std::string_view value; + + public: + explicit PairWriter(absl::flat_hash_map & map_) + : map(map_) + {} + + ~PairWriter() + { + // Make sure that ColumnString invariants are not broken. + if (!isKeyEmpty()) + { + resetKey(); + } + + if (!isValueEmpty()) + { + resetValue(); + } + } + + void appendKey(std::string_view new_data) + { + key = new_data; + } + + template + void appendKey(const T * begin, const T * end) + { + appendKey({begin, end}); + } + + void appendValue(std::string_view new_data) + { + value = new_data; + } + + template + void appendValue(const T * begin, const T * end) + { + appendValue({begin, end}); + } + + void resetKey() + { + key = {}; + } + + void resetValue() + { + value = {}; + } + + bool isKeyEmpty() const + { + return key.empty(); + } + + bool isValueEmpty() const + { + return value.empty(); + } + + void commitKey() + { + // don't do anything + } + + void commitValue() + { + if (map.contains(key) && value != map[key]) + { + throw DuplicateKeyFoundException(key); + } + + map[key] = value; + + resetValue(); + resetKey(); + } + + std::string_view uncommittedKeyChunk() const + { + return key; + } + + std::string_view uncommittedValueChunk() const + { + return value; + } + }; + + template + explicit ReferencesMapStateHandler(Args && ... args) + : StateHandlerImpl(std::forward(args)...) {} +}; + + } } diff --git a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp index 3dd914eb5a0f..decb60c7c4d4 100644 --- a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp @@ -1,5 +1,4 @@ #include -#include #include @@ -19,12 +18,12 @@ TEST(extractKVPairEscapingKeyValuePairExtractor, EscapeSequences) { using namespace std::literals; - auto extractor = KeyValuePairExtractorBuilder().withEscaping().build(); + auto extractor = KeyValuePairExtractorBuilder().buildWithEscaping(); auto keys = ColumnString::create(); auto values = ColumnString::create(); - auto pairs_count = extractor->extract(R"(key1:a\xFF key2:a\n\t\r)"sv, keys, values); + auto pairs_count = extractor.extract(R"(key1:a\xFF key2:a\n\t\r)"sv, keys, values); ASSERT_EQ(pairs_count, 2u); ASSERT_EQ(keys->size(), pairs_count); diff --git a/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp index 88dc287be166..1a31f78e8f5e 100644 --- a/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp @@ -47,6 +47,7 @@ struct KeyValuePairExtractorTestParam KeyValuePairExtractorBuilder builder; std::string input; std::vector> expected; + bool use_escaping = false; }; struct extractKVPairKeyValuePairExtractorTest : public ::testing::TestWithParam @@ -54,16 +55,29 @@ struct extractKVPairKeyValuePairExtractorTest : public ::testing::TestWithParam< TEST_P(extractKVPairKeyValuePairExtractorTest, Match) { - const auto & [builder, input, expected] = GetParam(); + const auto & [builder, input, expected, use_escaping] = GetParam(); SCOPED_TRACE(input); - auto kv_parser = builder.build(); - SCOPED_TRACE(typeid(kv_parser).name()); - auto keys = ColumnString::create(); auto values = ColumnString::create(); - auto pairs_found = kv_parser->extract(input, keys, values); + std::size_t pairs_found = 0; + + if (use_escaping) + { + auto kv_parser = builder.buildWithEscaping(); + SCOPED_TRACE(typeid(kv_parser).name()); + + pairs_found = kv_parser.extract(input, keys, values); + } + else + { + auto kv_parser = builder.buildWithoutEscaping(); + SCOPED_TRACE(typeid(kv_parser).name()); + + pairs_found = kv_parser.extract(input, keys, values); + } + ASSERT_EQ(expected.size(), pairs_found); size_t i = 0; @@ -102,9 +116,10 @@ INSTANTIATE_TEST_SUITE_P(Simple, extractKVPairKeyValuePairExtractorTest, }, { // same as case 1, but with another handler - KeyValuePairExtractorBuilder().withQuotingCharacter('\'').withEscaping(), + KeyValuePairExtractorBuilder().withQuotingCharacter('\''), R"in(name:'neymar';'age':31;team:psg;nationality:brazil,last_key:last_value)in", - neymar_expected + neymar_expected, + true } } ) @@ -116,12 +131,13 @@ INSTANTIATE_TEST_SUITE_P(InvalidEscapeSeqInValue, extractKVPairKeyValuePairExtra { { // Special case when invalid seq is the last symbol - KeyValuePairExtractorBuilder().withEscaping(), + KeyValuePairExtractorBuilder(), R"in(valid_key:valid_value key:invalid_val\)in", ExpectedValues{ {"valid_key", "valid_value"}, {"key", "invalid_val"} - } + }, + true }, // Not handling escape sequences == do not care of broken one, `invalid_val\` must be present { diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp index c8fe5874281a..5552807759af 100644 --- a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp @@ -27,8 +27,9 @@ void test_read(const auto & handler, std::string_view input, std::string_view ex std::size_t expected_pos, State expected_state) { auto str = ColumnString::create(); + auto val = ColumnString::create(); NextState next_state; - InlineEscapingStateHandler::StringWriter element(*str); + InlineEscapingStateHandler::PairWriter element(*str, *val); if constexpr (quoted) { @@ -41,7 +42,7 @@ void test_read(const auto & handler, std::string_view input, std::string_view ex ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); - ASSERT_EQ(element.uncommittedChunk(), expected_element); + ASSERT_EQ(element.uncommittedKeyChunk(), expected_element); } void test_read(const auto & handler, std::string_view input, std::string_view expected_element, diff --git a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp index c4a3feed63e0..1671ce36139f 100644 --- a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp @@ -27,8 +27,9 @@ void test_read(const auto & handler, std::string_view input, std::string_view ex { NextState next_state; - auto col = ColumnString::create(); - NoEscapingStateHandler::StringWriter element(*col); + auto key = ColumnString::create(); + auto val = ColumnString::create(); + NoEscapingStateHandler::PairWriter element(*key, *val); if constexpr (quoted) { @@ -41,7 +42,7 @@ void test_read(const auto & handler, std::string_view input, std::string_view ex ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); - ASSERT_EQ(element.uncommittedChunk(), expected_element); + ASSERT_EQ(element.uncommittedKeyChunk(), expected_element); } void test_read(const auto & handler, std::string_view input, std::string_view expected_element, diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index dd03f200d81c..e3642235640c 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -2,9 +2,12 @@ #include #include +#include "Formats/NumpyDataTypes.h" #include #include +#include +#include #include #include @@ -49,10 +52,12 @@ #include #include #include +#include #include #include #include #include +#include namespace DB @@ -144,25 +149,59 @@ NameSet getVirtualNamesForFileLikeStorage() return getCommonVirtualsForFileLikeStorage().getNameSet(); } -static std::unordered_map parseHivePartitioningKeysAndValues(const String & path) +static auto makeExtractor() { - std::string pattern = "([^/]+)=([^/]+)/"; + return KeyValuePairExtractorBuilder().withItemDelimiters({'/'}).withKeyValueDelimiter('=').buildWithReferenceMap(); +} + +HivePartitioningKeysAndValues parseHivePartitioningKeysAndValuesRegex(const String & path) +{ + const static RE2 pattern_re("([^/]+)=([^/]*)/"); re2::StringPiece input_piece(path); - std::unordered_map key_values; - std::string key; - std::string value; - std::unordered_map used_keys; - while (RE2::FindAndConsume(&input_piece, pattern, &key, &value)) + HivePartitioningKeysAndValues result; + std::string_view key; + std::string_view value; + + while (RE2::FindAndConsume(&input_piece, pattern_re, &key, &value)) { - auto it = used_keys.find(key); - if (it != used_keys.end() && it->second != value) + auto it = result.find(key); + if (it != result.end() && it->second != value) throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {} with different values, only unique keys are allowed", path, key); - used_keys.insert({key, value}); auto col_name = key; - key_values[col_name] = value; + result[col_name] = value; + } + return result; +} + +HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const String & path) +{ + static auto extractor = makeExtractor(); + + HivePartitioningKeysAndValues key_values; + + // cutting the filename to prevent malformed filenames that contain key-value-pairs from being extracted + // not sure if we actually need to do that, but just in case. Plus, the previous regex impl took care of it + const auto last_slash_pos = path.find_last_of('/'); + + if (last_slash_pos == std::string::npos) + { + // nothing to extract, there is no path, just a filename + return key_values; + } + + std::string_view path_without_filename(path.data(), last_slash_pos); + + try + { + extractor.extract(path_without_filename, key_values); } + catch (const extractKV::DuplicateKeyFoundException & ex) + { + throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {} with different values, only unique keys are allowed", path, ex.key); + } + return key_values; } @@ -195,17 +234,22 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto if (context->getSettingsRef()[Setting::use_hive_partitioning]) { - auto map = parseHivePartitioningKeysAndValues(path); + const auto map = parseHivePartitioningKeysAndValues(path); auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context); - for (auto & item : map) + + for (const auto & item : map) { - auto type = tryInferDataTypeByEscapingRule(item.second, format_settings, FormatSettings::EscapingRule::Raw); + const std::string key(item.first); + const std::string value(item.second); + + auto type = tryInferDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Raw); + if (type == nullptr) type = std::make_shared(); if (type->canBeInsideLowCardinality()) - add_virtual({item.first, std::make_shared(type)}, true); + add_virtual({key, std::make_shared(type)}, true); else - add_virtual({item.first, type}, true); + add_virtual({key, type}, true); } } @@ -231,7 +275,7 @@ static void addPathAndFileToVirtualColumns(Block & block, const String & path, s if (use_hive_partitioning) { - auto keys_and_values = parseHivePartitioningKeysAndValues(path); + const auto keys_and_values = parseHivePartitioningKeysAndValues(path); for (const auto & [key, value] : keys_and_values) { if (const auto * column = block.findByName(key)) @@ -285,7 +329,7 @@ void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, VirtualsForFileLikeStorage virtual_values, ContextPtr context) { - std::unordered_map hive_map; + HivePartitioningKeysAndValues hive_map; if (context->getSettingsRef()[Setting::use_hive_partitioning]) hive_map = parseHivePartitioningKeysAndValues(virtual_values.path); diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index 7764cbc04d18..e7a2ad827f01 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -5,8 +5,7 @@ #include #include #include - -#include +#include namespace DB @@ -106,6 +105,12 @@ struct VirtualsForFileLikeStorage void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, VirtualsForFileLikeStorage virtual_values, ContextPtr context); + +using HivePartitioningKeysAndValues = absl::flat_hash_map; + +HivePartitioningKeysAndValues parseHivePartitioningKeysAndValuesRegex(const String & path); +HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const String & path); + } } diff --git a/src/Storages/tests/gtest_virtual_column_utils.cpp b/src/Storages/tests/gtest_virtual_column_utils.cpp new file mode 100644 index 000000000000..23a8cc2de9bf --- /dev/null +++ b/src/Storages/tests/gtest_virtual_column_utils.cpp @@ -0,0 +1,53 @@ +#include +#include +#include +#include + +using namespace DB; + +static std::vector test_paths = { + "/some/folder/key1=val1/key2=val2/file1.txt", + "/data/keyA=valA/keyB=valB/keyC=valC/file2.txt", + "/another/dir/x=1/y=2/z=3/file3.txt", + "/tiny/path/a=b/file4.txt", + "/yet/another/path/k1=v1/k2=v2/k3=v3/k4=v4/k5=v5/" +}; + +TEST(VirtualColumnUtils, BenchmarkRegexParser) +{ + static constexpr int iterations = 1000000; + + auto start_extractkv = std::chrono::steady_clock::now(); + + for (int i = 0; i < iterations; ++i) + { + // Pick from 5 different paths + const auto & path = test_paths[i % 5]; + auto result = VirtualColumnUtils::parseHivePartitioningKeysAndValues(path); + ASSERT_TRUE(!result.empty()); + } + + auto end_extractkv = std::chrono::steady_clock::now(); + auto duration_ms_extractkv = std::chrono::duration_cast(end_extractkv - start_extractkv).count(); + + std::cout << "[BenchmarkExtractkvParser] " + << iterations << " iterations across 5 paths took " + << duration_ms_extractkv << " ms\n"; + + auto start = std::chrono::steady_clock::now(); + + for (int i = 0; i < iterations; ++i) + { + // Pick from 5 different paths + const auto & path = test_paths[i % 5]; + auto result = VirtualColumnUtils::parseHivePartitioningKeysAndValuesRegex(path); + ASSERT_TRUE(!result.empty()); + } + + auto end = std::chrono::steady_clock::now(); + auto duration_ms = std::chrono::duration_cast(end - start).count(); + + std::cout << "[BenchmarkRegexParser] " + << iterations << " iterations across 5 paths took " + << duration_ms << " ms\n"; +}