Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
994f44a
use_extract_key_value_pairs_for_hive
arthurpassos Apr 11, 2025
3a58e5d
add missing argument
arthurpassos Apr 11, 2025
c8a643f
backport new commits
arthurpassos Apr 12, 2025
15771fe
add cmake dependency
arthurpassos Apr 12, 2025
581f75f
yet another cmake dependency
arthurpassos Apr 12, 2025
3a09066
hmm
arthurpassos Apr 12, 2025
067ec63
rmv test
arthurpassos Apr 14, 2025
30c9dd6
further improve performance
arthurpassos Apr 14, 2025
108eb7f
well well
arthurpassos Apr 14, 2025
72214cc
slightly better
arthurpassos Apr 14, 2025
505de63
Revert "slightly better"
arthurpassos Apr 14, 2025
32147ee
Revert "well well"
arthurpassos Apr 14, 2025
2ae3d2b
matching constructors
arthurpassos Apr 14, 2025
585a353
cleanup chkeyvaluepair extractor
arthurpassos Apr 14, 2025
b5c3382
remove some uneeded abstractions
arthurpassos Apr 14, 2025
6fa2965
tmp
arthurpassos Apr 15, 2025
24ed2e5
looking good already
arthurpassos Apr 15, 2025
0cebde2
rmv one test until I have time to make it compile
arthurpassos Apr 15, 2025
f21cbee
rename reference state handler
arthurpassos Apr 15, 2025
b5210c4
fix ut build
arthurpassos Apr 15, 2025
e0cf083
add benchmark test
arthurpassos Apr 15, 2025
956978c
add tests back
arthurpassos Apr 15, 2025
d01ba29
fix
arthurpassos Apr 15, 2025
519fb42
try catch
arthurpassos Apr 15, 2025
dbb197b
add tests back
arthurpassos Apr 15, 2025
63a8620
remove unuesed badarguments
arthurpassos Apr 15, 2025
90c821f
Merge branch 'antalya' into use_extract_key_value_pairs_for_hive
arthurpassos Apr 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions programs/library-bridge/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ target_link_libraries(clickhouse-library-bridge PRIVATE
daemon
dbms
bridge
clickhouse_functions_extractkeyvaluepairs
)

set_target_properties(clickhouse-library-bridge PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..)
Expand Down
1 change: 1 addition & 0 deletions programs/odbc-bridge/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ target_link_libraries(clickhouse-odbc-bridge PRIVATE
dbms
bridge
clickhouse_parsers
clickhouse_functions_extractkeyvaluepairs
ch_contrib::nanodbc
ch_contrib::unixodbc
)
Expand Down
7 changes: 6 additions & 1 deletion src/Core/Block.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const
}


const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const
const ColumnWithTypeAndName * Block::findByName(const std::string_view & name, bool case_insensitive) const
{
if (case_insensitive)
{
Expand All @@ -309,6 +309,11 @@ const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool c
return &data[it->second];
}

/// Convenience overload for callers that hold a std::string: wraps the name in a
/// non-owning std::string_view and forwards to the std::string_view overload.
const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const
{
    const std::string_view name_view{name};
    return findByName(name_view, case_insensitive);
}

std::optional<ColumnWithTypeAndName> Block::findSubcolumnByName(const std::string & name) const
{
auto [name_in_storage, subcolumn_name] = Nested::splitName(name);
Expand Down
11 changes: 10 additions & 1 deletion src/Core/Block.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <initializer_list>
#include <vector>
#include <Common/StringHashForHeterogeneousLookup.h>


class SipHash;
Expand All @@ -30,7 +31,7 @@ class Block
{
private:
using Container = ColumnsWithTypeAndName;
using IndexByName = std::unordered_map<String, size_t>;
using IndexByName = std::unordered_map<String, size_t, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;

Container data;
IndexByName index_by_name;
Expand Down Expand Up @@ -70,6 +71,14 @@ class Block
const_cast<const Block *>(this)->findByName(name, case_insensitive));
}

/// Mutable counterpart of the const std::string_view lookup: runs the const
/// overload on this block and strips constness from the pointer it returns.
ColumnWithTypeAndName * findByName(const std::string_view & name, bool case_insensitive = false)
{
    const Block & self = *this;
    const ColumnWithTypeAndName * found = self.findByName(name, case_insensitive);
    return const_cast<ColumnWithTypeAndName *>(found);
}

const ColumnWithTypeAndName * findByName(const std::string_view & name, bool case_insensitive) const;

const ColumnWithTypeAndName * findByName(const std::string & name, bool case_insensitive = false) const;
std::optional<ColumnWithTypeAndName> findSubcolumnByName(const std::string & name) const;
std::optional<ColumnWithTypeAndName> findColumnOrSubcolumnByName(const std::string & name) const;
Expand Down
19 changes: 10 additions & 9 deletions src/Functions/keyvaluepair/extractKeyValuePairs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

#include <Interpreters/Context.h>

#include <Functions/keyvaluepair/impl/KeyValuePairExtractor.h>
#include <Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h>
#include <Functions/keyvaluepair/ArgumentExtractor.h>

Expand All @@ -29,11 +28,6 @@ class ExtractKeyValuePairs : public IFunction
{
auto builder = KeyValuePairExtractorBuilder();

if constexpr (WITH_ESCAPING)
{
builder.withEscaping();
}

if (parsed_arguments.key_value_delimiter)
{
builder.withKeyValueDelimiter(parsed_arguments.key_value_delimiter.value());
Expand All @@ -56,10 +50,17 @@ class ExtractKeyValuePairs : public IFunction
builder.withMaxNumberOfPairs(context->getSettingsRef()[Setting::extract_key_value_pairs_max_pairs_per_row]);
}

return builder.build();
if constexpr (WITH_ESCAPING)
{
return builder.buildWithEscaping();
}
else
{
return builder.buildWithoutEscaping();
}
}

ColumnPtr extract(ColumnPtr data_column, std::shared_ptr<KeyValuePairExtractor> extractor, size_t input_rows_count) const
ColumnPtr extract(ColumnPtr data_column, auto & extractor, size_t input_rows_count) const
{
auto offsets = ColumnUInt64::create();

Expand All @@ -72,7 +73,7 @@ class ExtractKeyValuePairs : public IFunction
{
auto row = data_column->getDataAt(i).toView();

auto pairs_count = extractor->extract(row, keys, values);
auto pairs_count = extractor.extract(row, keys, values);

offset += pairs_count;

Expand Down
93 changes: 66 additions & 27 deletions src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
#include <Columns/ColumnsNumber.h>

#include <Functions/keyvaluepair/impl/StateHandler.h>
#include <Functions/keyvaluepair/impl/KeyValuePairExtractor.h>
#include <Functions/keyvaluepair/impl/StateHandlerImpl.h>
#include <absl/container/flat_hash_map.h>

namespace DB
{
Expand All @@ -16,37 +17,36 @@ namespace ErrorCodes
extern const int LIMIT_EXCEEDED;
}

namespace extractKV
{
/*
* Handle state transitions and a few states like `FLUSH_PAIR` and `END`.
* */
template <typename StateHandler>
class CHKeyValuePairExtractor : public KeyValuePairExtractor
class KeyValuePairExtractor
{
using State = typename DB::extractKV::StateHandler::State;
using NextState = DB::extractKV::StateHandler::NextState;

public:
explicit CHKeyValuePairExtractor(StateHandler state_handler_, uint64_t max_number_of_pairs_)
: state_handler(std::move(state_handler_)), max_number_of_pairs(max_number_of_pairs_)
{}
using PairWriter = typename StateHandler::PairWriter;

uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override
KeyValuePairExtractor(const Configuration & configuration_, uint64_t max_number_of_pairs_)
: state_handler(StateHandler(configuration_))
, max_number_of_pairs(max_number_of_pairs_)
{
return extract(std::string_view {data}, keys, values);
}

uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override
protected:
uint64_t extractImpl(std::string_view data, typename StateHandler::PairWriter & pair_writer)
{
auto state = State::WAITING_KEY;

auto key = typename StateHandler::StringWriter(*keys);
auto value = typename StateHandler::StringWriter(*values);

uint64_t row_offset = 0;

while (state != State::END)
{
auto next_state = processState(data, state, key, value, row_offset);
auto next_state = processState(data, state, pair_writer, row_offset);

if (next_state.position_in_string > data.size() && next_state.state != State::END)
{
Expand All @@ -61,14 +61,13 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
}

// below reset discards invalid keys and values
reset(key, value);
reset(pair_writer);

return row_offset;
}

private:

NextState processState(std::string_view file, State state, auto & key, auto & value, uint64_t & row_offset)
NextState processState(std::string_view file, State state, auto & pair_writer, uint64_t & row_offset)
{
switch (state)
{
Expand All @@ -78,11 +77,11 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
}
case State::READING_KEY:
{
return state_handler.readKey(file, key);
return state_handler.readKey(file, pair_writer);
}
case State::READING_QUOTED_KEY:
{
return state_handler.readQuotedKey(file, key);
return state_handler.readQuotedKey(file, pair_writer);
}
case State::READING_KV_DELIMITER:
{
Expand All @@ -94,15 +93,15 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
}
case State::READING_VALUE:
{
return state_handler.readValue(file, value);
return state_handler.readValue(file, pair_writer);
}
case State::READING_QUOTED_VALUE:
{
return state_handler.readQuotedValue(file, value);
return state_handler.readQuotedValue(file, pair_writer);
}
case State::FLUSH_PAIR:
{
return flushPair(file, key, value, row_offset);
return flushPair(file, pair_writer, row_offset);
}
case State::END:
{
Expand All @@ -111,8 +110,7 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
}
}

NextState flushPair(const std::string_view & file, auto & key,
auto & value, uint64_t & row_offset)
NextState flushPair(const std::string_view & file, auto & pair_writer, uint64_t & row_offset)
{
row_offset++;

Expand All @@ -121,20 +119,61 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs);
}

key.commit();
value.commit();
pair_writer.commitKey();
pair_writer.commitValue();

return {0, file.empty() ? State::END : State::WAITING_KEY};
}

void reset(auto & key, auto & value)
void reset(auto & pair_writer)
{
key.reset();
value.reset();
pair_writer.resetKey();
pair_writer.resetValue();
}

StateHandler state_handler;
uint64_t max_number_of_pairs;
};

}

/// Extractor bound to extractKV::NoEscapingStateHandler.
/// Every extract() call builds a fresh PairWriter over the two output string columns
/// and hands the parsing over to the base-class extractImpl().
struct KeyValuePairExtractorNoEscaping : extractKV::KeyValuePairExtractor<extractKV::NoEscapingStateHandler>
{
    using StateHandler = extractKV::NoEscapingStateHandler;

    explicit KeyValuePairExtractorNoEscaping(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_)
        : KeyValuePairExtractor(configuration_, max_number_of_pairs_)
    {
    }

    /// Parses `data`, writing through a PairWriter constructed from `*keys` and `*values`.
    /// Returns the value produced by extractImpl() for this input.
    uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values)
    {
        StateHandler::PairWriter writer(*keys, *values);
        return extractImpl(data, writer);
    }
};

/// Extractor bound to extractKV::InlineEscapingStateHandler.
/// Every extract() call builds a fresh PairWriter over the two output string columns
/// and hands the parsing over to the base-class extractImpl().
struct KeyValuePairExtractorInlineEscaping : extractKV::KeyValuePairExtractor<extractKV::InlineEscapingStateHandler>
{
    using StateHandler = extractKV::InlineEscapingStateHandler;

    explicit KeyValuePairExtractorInlineEscaping(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_)
        : KeyValuePairExtractor(configuration_, max_number_of_pairs_)
    {
    }

    /// Parses `data`, writing through a PairWriter constructed from `*keys` and `*values`.
    /// Returns the value produced by extractImpl() for this input.
    uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values)
    {
        StateHandler::PairWriter writer(*keys, *values);
        return extractImpl(data, writer);
    }
};

/// Extractor bound to extractKV::ReferencesMapStateHandler.
/// Instead of string columns, results go through a PairWriter constructed from a
/// caller-supplied map of std::string_view to std::string_view.
struct KeyValuePairExtractorReferenceMap : extractKV::KeyValuePairExtractor<extractKV::ReferencesMapStateHandler>
{
    using StateHandler = extractKV::ReferencesMapStateHandler;

    explicit KeyValuePairExtractorReferenceMap(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_)
        : KeyValuePairExtractor(configuration_, max_number_of_pairs_)
    {
    }

    /// Parses `data`, writing through a PairWriter constructed from `map`.
    /// Returns the value produced by extractImpl() for this input.
    /// NOTE(review): the map stores non-owning views, presumably pointing into `data` -
    /// confirm the required lifetime against the PairWriter implementation.
    uint64_t extract(std::string_view data, absl::flat_hash_map<std::string_view, std::string_view> & map)
    {
        StateHandler::PairWriter writer(map);
        return extractImpl(data, writer);
    }
};

}
20 changes: 20 additions & 0 deletions src/Functions/keyvaluepair/impl/DuplicateKeyFoundException.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#pragma once

#include <Common/Exception.h>

namespace DB
{

namespace extractKV
{

/// Exception that carries a key alongside the base Exception state; the name suggests
/// it is raised when the same key is seen more than once.
/// NOTE(review): `key` is a non-owning view, so whatever it refers to presumably has to
/// outlive every handler that reads it - confirm at the throw sites.
struct DuplicateKeyFoundException : Exception
{
    std::string_view key;

    explicit DuplicateKeyFoundException(std::string_view key_)
        : key{key_}
    {
    }
};

}

}
20 changes: 0 additions & 20 deletions src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h

This file was deleted.

Loading
Loading