Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c5da3c4
draft / poc
arthurpassos Nov 1, 2024
f36e6a8
add a test
arthurpassos Nov 1, 2024
4658728
merge minmax and bf eval
arthurpassos Nov 5, 2024
58db88b
extern logical_error
arthurpassos Nov 6, 2024
c6ec587
update test
arthurpassos Nov 6, 2024
12b498d
update test
arthurpassos Nov 6, 2024
fb77c44
explicit in constructor
arthurpassos Nov 6, 2024
f8ba549
update tests
arthurpassos Dec 3, 2024
11f50b2
update comment
arthurpassos Dec 3, 2024
8a38a71
address some comments
arthurpassos Dec 3, 2024
bd46b20
address some comments
arthurpassos Dec 3, 2024
a66ca3e
small fix
arthurpassos Dec 3, 2024
c74a725
refactor
arthurpassos Dec 4, 2024
ace36ba
remove commented out code
arthurpassos Dec 4, 2024
f731ec6
define err code
arthurpassos Dec 4, 2024
e6af915
check element.monotonic_functions_chain.empty()
arthurpassos Dec 4, 2024
4178f8b
avoid multiple virtual f calls
arthurpassos Dec 4, 2024
8e024fb
style changes
arthurpassos Dec 5, 2024
0b25c7c
hehe
arthurpassos Dec 5, 2024
2e0faf0
workaround darwin uint64 issue
arthurpassos Dec 5, 2024
06a6e8e
final adjustments
arthurpassos Dec 6, 2024
4343a01
Update ArrowColumnToCHColumn.cpp
arthurpassos Dec 6, 2024
f89950b
forgot to include this file
arthurpassos Dec 6, 2024
7acadb4
perhaps this will wokr
arthurpassos Dec 6, 2024
370333b
...
arthurpassos Dec 7, 2024
659a28f
lol
arthurpassos Dec 11, 2024
dcafca5
add missing columndescriptor check
arthurpassos Dec 12, 2024
03aa08a
conflicts
arthurpassos Mar 11, 2025
1a7dddd
changed tab to 4 spaces
Enmk Mar 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
525 changes: 0 additions & 525 deletions src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp

This file was deleted.

73 changes: 0 additions & 73 deletions src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h

This file was deleted.

191 changes: 191 additions & 0 deletions src/Processors/Formats/Impl/Parquet/parquetBloomFilterHash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#include <Processors/Formats/Impl/Parquet/parquetBloomFilterHash.h>

#if USE_PARQUET

#include <parquet/metadata.h>
#include <parquet/xxhasher.h>

namespace DB
{

/*
 * Returns true when both type annotations of a Parquet column are compatible with string data.
 *
 * Logical type: accepted when it is absent (null pointer), "none", String, BSON or JSON.
 * Converted type: accepted when it is NONE, UTF8, BSON or JSON.
 * Any other annotation makes the function return false.
 * */
bool isParquetStringTypeSupportedForBloomFilters(
    const std::shared_ptr<const parquet::LogicalType> & logical_type,
    parquet::ConvertedType::type converted_type)
{
    if (logical_type && !logical_type->is_none())
    {
        const bool is_string_like = logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON();

        if (!is_string_like)
            return false;
    }

    switch (converted_type)
    {
        case parquet::ConvertedType::NONE:
        case parquet::ConvertedType::JSON:
        case parquet::ConvertedType::UTF8:
        case parquet::ConvertedType::BSON:
            return true;
        default:
            return false;
    }
}

/*
 * Returns true when both type annotations of a Parquet column are compatible with integer data.
 *
 * Logical type: accepted when it is absent (null pointer), "none" or an integer type.
 * Converted type: accepted when it is NONE or one of the signed / unsigned 8, 16, 32 and 64 bit integer types.
 * Any other annotation makes the function return false.
 * */
bool isParquetIntegerTypeSupportedForBloomFilters(const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
{
    const bool has_logical_annotation = logical_type && !logical_type->is_none();

    if (has_logical_annotation && !logical_type->is_int())
        return false;

    switch (converted_type)
    {
        case parquet::ConvertedType::NONE:
        case parquet::ConvertedType::INT_8:
        case parquet::ConvertedType::INT_16:
        case parquet::ConvertedType::INT_32:
        case parquet::ConvertedType::INT_64:
        case parquet::ConvertedType::UINT_8:
        case parquet::ConvertedType::UINT_16:
        case parquet::ConvertedType::UINT_32:
        case parquet::ConvertedType::UINT_64:
            return true;
        default:
            return false;
    }
}

template <typename T>
uint64_t hashSpecialFLBATypes(const Field & field)
{
const T & value = field.safeGet<T>();

parquet::FLBA flba(reinterpret_cast<const uint8_t*>(&value));

parquet::XxHasher hasher;

return hasher.Hash(&flba, sizeof(T));
};

std::optional<uint64_t> tryHashStringWithoutCompatibilityCheck(const Field & field)
{
const auto field_type = field.getType();

if (field_type != Field::Types::Which::String)
{
return std::nullopt;
}

parquet::XxHasher hasher;
parquet::ByteArray ba { field.safeGet<std::string>() };

return hasher.Hash(&ba);
}

/*
 * Hashes a String field, provided the logical / converted type of the Parquet column is string compatible.
 * Returns std::nullopt when the column annotations are not compatible or the field is not a String.
 * */
std::optional<uint64_t> tryHashString(
    const Field & field,
    const std::shared_ptr<const parquet::LogicalType> & logical_type,
    parquet::ConvertedType::type converted_type)
{
    const bool column_is_string_compatible = isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type);

    if (column_is_string_compatible)
        return tryHashStringWithoutCompatibilityCheck(field);

    return std::nullopt;
}

/*
 * Hashes a field that is compared against a FIXED_LEN_BYTE_ARRAY Parquet column.
 *
 * The column annotations have to be string compatible, otherwise std::nullopt is returned.
 * An IPv6 field is hashed over its sizeof(IPv6) raw bytes when the column length matches that size;
 * every other case is delegated to the plain string hashing, which yields std::nullopt for non String fields.
 * */
std::optional<uint64_t> tryHashFLBA(
    const Field & field,
    const std::shared_ptr<const parquet::LogicalType> & logical_type,
    parquet::ConvertedType::type converted_type,
    std::size_t parquet_column_length)
{
    if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type))
        return std::nullopt;

    const bool field_is_ipv6 = field.getType() == Field::Types::Which::IPv6;
    const bool column_fits_ipv6 = parquet_column_length == sizeof(IPv6);

    if (field_is_ipv6 && column_fits_ipv6)
        return hashSpecialFLBATypes<IPv6>(field);

    return tryHashStringWithoutCompatibilityCheck(field);
}

/*
 * Hashes an integer-like field after converting it to the physical type of the Parquet column.
 *
 * ParquetPhysicalType is the C++ type the value is cast to before hashing (instantiated with int32_t and int64_t in this file).
 * Returns std::nullopt when the column annotations are not integer compatible or when the field
 * holds something other than Int64, UInt64 or IPv4.
 * */
template <typename ParquetPhysicalType>
std::optional<uint64_t> tryHashInt(const Field & field, const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
{
    const bool column_is_integer_compatible = isParquetIntegerTypeSupportedForBloomFilters(logical_type, converted_type);

    if (!column_is_integer_compatible)
        return std::nullopt;

    const auto which = field.getType();

    parquet::XxHasher hasher;

    if (which == Field::Types::Which::Int64)
        return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<int64_t>()));

    if (which == Field::Types::Which::UInt64)
        return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<uint64_t>()));

    if (which == Field::Types::IPv4)
    {
        /*
         * IPv4 could in theory be accepted for 64 bits columns as well. That would only go wrong if the writer
         * had hashed it through the byte array api using a zero-ed buffer with a 32 bits value copied into it.
         *
         * To stay on the safe side, IPv4 is only hashed when the physical type is 32 bits wide.
         * */
        if constexpr (std::is_same_v<int32_t, ParquetPhysicalType>)
            return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<IPv4>()));
    }

    return std::nullopt;
}

/*
 * Dispatches the hashing of `field` on the physical type of the Parquet column.
 *
 * INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY are handled, any other physical type yields std::nullopt.
 * std::nullopt is also returned when the selected hashing routine rejects the field or the column annotations.
 * The descriptor is dereferenced without a null check.
 * */
std::optional<uint64_t> parquetTryHashField(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor)
{
    const auto column_physical_type = parquet_column_descriptor->physical_type();
    const auto & column_logical_type = parquet_column_descriptor->logical_type();
    const auto column_converted_type = parquet_column_descriptor->converted_type();

    switch (column_physical_type)
    {
        case parquet::Type::type::INT32:
            return tryHashInt<int32_t>(field, column_logical_type, column_converted_type);

        case parquet::Type::type::INT64:
            return tryHashInt<int64_t>(field, column_logical_type, column_converted_type);

        case parquet::Type::type::BYTE_ARRAY:
            return tryHashString(field, column_logical_type, column_converted_type);

        case parquet::Type::type::FIXED_LEN_BYTE_ARRAY:
        {
            const auto column_length = parquet_column_descriptor->type_length();
            return tryHashFLBA(field, column_logical_type, column_converted_type, column_length);
        }

        default:
            return std::nullopt;
    }
}

/*
 * Hashes every element of `data_column` against the given Parquet column descriptor.
 *
 * Returns the hashes in row order, or std::nullopt as soon as one element can't be hashed.
 * An empty column produces an empty (but engaged) vector.
 * Neither pointer is checked for null.
 * */
std::optional<std::vector<uint64_t>> parquetTryHashColumn(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor)
{
    /// The size is loop invariant: read it once instead of on every iteration.
    const size_t rows = data_column->size();

    std::vector<uint64_t> hashes;
    /// The final size is known up front, avoid repeated reallocations while appending.
    hashes.reserve(rows);

    for (size_t i = 0u; i < rows; ++i)
    {
        Field f;
        data_column->get(i, f);

        auto hashed_value = parquetTryHashField(f, parquet_column_descriptor);

        if (!hashed_value)
        {
            return std::nullopt;
        }

        hashes.push_back(*hashed_value);
    }

    return hashes;
}

}

#endif
25 changes: 25 additions & 0 deletions src/Processors/Formats/Impl/Parquet/parquetBloomFilterHash.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include <config.h>

#if USE_PARQUET

#include <Processors/Formats/Impl/ArrowFieldIndexUtil.h>

namespace DB
{

/*
 * Try to hash a ClickHouse field according to the physical type of the given Parquet column
 * (INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY are handled).
 *
 * Returns std::nullopt in case it can't be done: unhandled physical type, logical / converted type
 * that is not compatible with the physical type handling, or a field type that is not supported for the column.
 *
 * parquet_column_descriptor is dereferenced without a null check, so it must not be null.
 * */
std::optional<uint64_t> parquetTryHashField(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor);


/*
 * Try to hash all elements in a ClickHouse column, one hash per row and in row order, using parquetTryHashField.
 *
 * Will return std::nullopt in case one of them can't be hashed; an empty column yields an empty vector.
 *
 * data_column is dereferenced without a null check, so it must not be null.
 * */
std::optional<std::vector<uint64_t>> parquetTryHashColumn(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor);

}

#endif
Loading
Loading