From c87cef9939bb88b4f6e66217e8b9de1ef3609102 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Tue, 16 Dec 2025 17:31:55 +0100 Subject: [PATCH 1/7] Merge pull request #79012 from ClickHouse/pqcs Settings to write and verify parquet checksums --- .../BuzzHouse/Generator/ServerSettings.cpp | 1 + src/Core/FormatFactorySettings.h | 6 +++ src/Core/SettingsChangesHistory.cpp | 4 +- src/Formats/FormatFactory.cpp | 2 + src/Formats/FormatSettings.h | 2 + src/Processors/Formats/Impl/Parquet/Write.cpp | 15 +++++- src/Processors/Formats/Impl/Parquet/Write.h | 1 + .../Formats/Impl/ParquetBlockInputFormat.cpp | 1 + .../Formats/Impl/ParquetBlockOutputFormat.cpp | 1 + .../03408_parquet_checksums.reference | 5 ++ .../0_stateless/03408_parquet_checksums.sh | 46 +++++++++++++++++++ 11 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/03408_parquet_checksums.reference create mode 100755 tests/queries/0_stateless/03408_parquet_checksums.sh diff --git a/src/Client/BuzzHouse/Generator/ServerSettings.cpp b/src/Client/BuzzHouse/Generator/ServerSettings.cpp index 048e3f315182..2609bc7d27a4 100644 --- a/src/Client/BuzzHouse/Generator/ServerSettings.cpp +++ b/src/Client/BuzzHouse/Generator/ServerSettings.cpp @@ -559,6 +559,7 @@ std::unordered_map serverSettings = { {"input_format_parquet_case_insensitive_column_matching", trueOrFalseSettingNoOracle}, {"input_format_parquet_enable_json_parsing", trueOrFalseSettingNoOracle}, {"input_format_parquet_enable_row_group_prefetch", trueOrFalseSettingNoOracle}, + {"input_format_parquet_verify_checksums", trueOrFalseSettingNoOracle}, {"input_format_parquet_filter_push_down", trueOrFalseSetting}, {"input_format_parquet_preserve_order", trueOrFalseSettingNoOracle}, {"input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference", trueOrFalseSettingNoOracle}, diff --git a/src/Core/FormatFactorySettings.h b/src/Core/FormatFactorySettings.h index e3e69899cee2..5b512ee4d981 100644 --- 
a/src/Core/FormatFactorySettings.h +++ b/src/Core/FormatFactorySettings.h @@ -196,6 +196,9 @@ Skip pages using min/max values from column index. )", 0) \ DECLARE(Bool, input_format_parquet_use_offset_index, true, R"( Minor tweak to how pages are read from parquet file when no page filtering is used. +)", 0) \ + DECLARE(Bool, input_format_parquet_verify_checksums, true, R"( +Verify page checksums when reading parquet files. )", 0) \ DECLARE(Bool, input_format_allow_seeks, true, R"( Allow seeks while reading in ORC/Parquet/Arrow input formats. @@ -1127,6 +1130,9 @@ If dictionary size grows bigger than this many bytes, switch to encoding without )", 0) \ DECLARE(Bool, output_format_parquet_enum_as_byte_array, true, R"( Write enum using parquet physical type: BYTE_ARRAY and logical type: ENUM +)", 0) \ + DECLARE(Bool, output_format_parquet_write_checksums, true, R"( +Put crc32 checksums in parquet page headers. )", 0) \ DECLARE(String, output_format_avro_codec, "", R"( Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'. diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 3e3f5e3f7608..4bb2ed91321d 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -55,7 +55,9 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"export_merge_tree_part_file_already_exists_policy", "skip", "skip", "New setting."}, {"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, {"hybrid_table_auto_cast_columns", true, true, "New setting to automatically cast Hybrid table columns when segments disagree on types. 
Default enabled."}, - {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."} + {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."}, + {"input_format_parquet_verify_checksums", true, true, "New setting."}, + {"output_format_parquet_write_checksums", false, true, "New setting."}, }); addSettingsChanges(settings_changes_history, "25.8", { diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 4b955510a608..05a702db01ae 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -210,6 +210,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.parquet.output_date_as_uint16 = settings[Setting::output_format_parquet_date_as_uint16]; format_settings.parquet.max_dictionary_size = settings[Setting::output_format_parquet_max_dictionary_size]; format_settings.parquet.output_enum_as_byte_array = settings[Setting::output_format_parquet_enum_as_byte_array]; + format_settings.parquet.write_checksums = settings[Setting::output_format_parquet_write_checksums]; format_settings.parquet.max_block_size = settings[Setting::input_format_parquet_max_block_size]; format_settings.parquet.prefer_block_bytes = settings[Setting::input_format_parquet_prefer_block_bytes]; format_settings.parquet.output_compression_method = settings[Setting::output_format_parquet_compression_method]; @@ -225,6 +226,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.parquet.bloom_filter_flush_threshold_bytes = settings[Setting::output_format_parquet_bloom_filter_flush_threshold_bytes]; format_settings.parquet.local_read_min_bytes_for_seek = settings[Setting::input_format_parquet_local_file_min_bytes_for_seek]; format_settings.parquet.enable_row_group_prefetch = settings[Setting::input_format_parquet_enable_row_group_prefetch]; + format_settings.parquet.verify_checksums 
= settings[Setting::input_format_parquet_verify_checksums]; format_settings.parquet.allow_geoparquet_parser = settings[Setting::input_format_parquet_allow_geoparquet_parser]; format_settings.parquet.write_geometadata = settings[Setting::output_format_parquet_geometadata]; format_settings.pretty.charset = settings[Setting::output_format_pretty_grid_charset].toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index fdbe4858b7bf..ba551d64a0f4 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -292,6 +292,7 @@ struct FormatSettings bool enable_json_parsing = true; bool preserve_order = false; bool enable_row_group_prefetch = true; + bool verify_checksums = true; std::unordered_set skip_row_groups = {}; UInt64 max_block_size = DEFAULT_BLOCK_SIZE; size_t prefer_block_bytes = DEFAULT_BLOCK_SIZE * 256; @@ -312,6 +313,7 @@ struct FormatSettings bool output_compliant_nested_types = true; bool write_page_index = false; bool write_bloom_filter = false; + bool write_checksums = true; ParquetVersion output_version = ParquetVersion::V2_LATEST; ParquetCompression output_compression_method = ParquetCompression::SNAPPY; uint64_t output_compression_level; diff --git a/src/Processors/Formats/Impl/Parquet/Write.cpp b/src/Processors/Formats/Impl/Parquet/Write.cpp index 2102f50f879d..3ccef28aa667 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.cpp +++ b/src/Processors/Formats/Impl/Parquet/Write.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -815,8 +816,12 @@ void writeColumnImpl( d.__set_encoding(use_dictionary ? 
parq::Encoding::RLE_DICTIONARY : encoding); d.__set_definition_level_encoding(parq::Encoding::RLE); d.__set_repetition_level_encoding(parq::Encoding::RLE); - /// We could also put checksum in `header.crc`, but apparently no one uses it: - /// https://issues.apache.org/jira/browse/PARQUET-594 + + if (options.write_checksums) + { + uint32_t crc = arrow::internal::crc32(0, compressed.data(), compressed.size()); + header.__set_crc(crc); + } parq::Statistics page_stats = page_statistics.get(options); bool has_null_count = s.max_def == 1 && s.max_rep == 0; @@ -878,6 +883,12 @@ void writeColumnImpl( header.dictionary_page_header.__set_num_values(dict_encoder->num_entries()); header.dictionary_page_header.__set_encoding(parq::Encoding::PLAIN); + if (options.write_checksums) + { + uint32_t crc = arrow::internal::crc32(0, compressed.data(), compressed.size()); + header.__set_crc(crc); + } + writePage(header, compressed, s, /*add_to_offset_index*/ false, /*first_row_index*/ 0, out); for (auto & p : dict_encoded_pages) diff --git a/src/Processors/Formats/Impl/Parquet/Write.h b/src/Processors/Formats/Impl/Parquet/Write.h index bf027227e6d1..1571cada52d6 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.h +++ b/src/Processors/Formats/Impl/Parquet/Write.h @@ -46,6 +46,7 @@ struct WriteOptions bool write_page_statistics = true; bool write_page_index = true; bool write_bloom_filter = true; + bool write_checksums = true; size_t max_statistics_size = 4096; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index e8a4c8fdce91..342ab6cd8ef9 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -906,6 +906,7 @@ void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_bat parquet::ReaderProperties reader_properties(ArrowMemoryPool::instance()); arrow_properties.set_use_threads(false); 
arrow_properties.set_batch_size(row_group_batch.adaptive_chunk_size); + reader_properties.set_page_checksum_verification(format_settings.parquet.verify_checksums); // When reading a row group, arrow will: // 1. Look at `metadata` to get all byte ranges it'll need to read from the file (typically one diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index fc94cedfc00d..3da015c2a46a 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -111,6 +111,7 @@ ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, SharedHea options.write_batch_size = format_settings.parquet.write_batch_size; options.write_page_index = format_settings.parquet.write_page_index; options.write_bloom_filter = format_settings.parquet.write_bloom_filter; + options.write_checksums = format_settings.parquet.write_checksums; options.bloom_filter_bits_per_value = format_settings.parquet.bloom_filter_bits_per_value; options.bloom_filter_flush_threshold_bytes = format_settings.parquet.bloom_filter_flush_threshold_bytes; options.write_geometadata = format_settings.parquet.write_geometadata; diff --git a/tests/queries/0_stateless/03408_parquet_checksums.reference b/tests/queries/0_stateless/03408_parquet_checksums.reference new file mode 100644 index 000000000000..8b5e17823f09 --- /dev/null +++ b/tests/queries/0_stateless/03408_parquet_checksums.reference @@ -0,0 +1,5 @@ +1234567890123456 +CRC checksum verification failed +no checksum error, as expected +1234567890123456 +no checksum error, as expected diff --git a/tests/queries/0_stateless/03408_parquet_checksums.sh b/tests/queries/0_stateless/03408_parquet_checksums.sh new file mode 100755 index 000000000000..d527e606252f --- /dev/null +++ b/tests/queries/0_stateless/03408_parquet_checksums.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname 
"${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +F="${CLICKHOUSE_TMP}/03408_parquet_checksums.parquet" + +corrupt_file() { + # Get column chunk offset and length. + r=`${CLICKHOUSE_LOCAL} -q "select row_groups[1].file_offset, row_groups[1].total_compressed_size from file('$F', ParquetMetadata)"` + + # Split the two tab-separated tokens. + IFS=$'\t' read -r offset length <<< "$r" + + # Overwrite the last byte of column chunk (outside page header) with garbage. + printf '\x42' | dd of="$F" bs=1 seek=$(( offset + length - 1 )) count=1 conv=notrunc status=none +} + +# Write file with checksums. +${CLICKHOUSE_LOCAL} -q " + insert into function file('$F') select 1234567890123456 as x settings engine_file_truncate_on_insert=1, output_format_parquet_compression_method='none', output_format_parquet_write_checksums=1; + select * from file('$F');" + +corrupt_file + +${CLICKHOUSE_LOCAL} -q " + select * from file('$F') settings input_format_parquet_verify_checksums=1 +" 2>&1 | grep -o 'CRC checksum verification failed' || echo 'got no checksum error, unexpected' + +${CLICKHOUSE_LOCAL} -q " + select * from file('$F') settings input_format_parquet_verify_checksums=0 +" 2>&1 | grep -o 'CRC checksum verification failed' || echo 'no checksum error, as expected' + + +# Write file without checksums. 
+${CLICKHOUSE_LOCAL} -q " + insert into function file('$F') select 1234567890123456 as x settings engine_file_truncate_on_insert=1, output_format_parquet_compression_method='none', output_format_parquet_write_checksums=0; + select * from file('$F');" + +corrupt_file + +${CLICKHOUSE_LOCAL} -q " + select * from file('$F') settings input_format_parquet_verify_checksums=1 +" 2>&1 | grep -o 'CRC checksum verification failed' || echo 'no checksum error, as expected' From c3c334f8ba7cdfb2362d0b6a7bf2fe5429491ab9 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 27 Oct 2025 14:36:18 +0000 Subject: [PATCH 2/7] Merge pull request #87735 from ClickHouse/pqf2 A few more parquet fixes --- .../Formats/Impl/Parquet/Reader.cpp | 33 +- src/Processors/Formats/Impl/Parquet/Reader.h | 3 +- src/Processors/Formats/Impl/Parquet/Write.cpp | 7 +- .../Formats/Impl/ParquetBlockInputFormat.cpp | 5 +- tests/clickhouse-test | 6 +- .../00900_long_parquet_load.reference | 922 ------------------ .../0_stateless/00900_long_parquet_load.sh | 70 -- .../00900_long_parquet_load_2.reference | 841 ++++++++++++++++ .../0_stateless/00900_long_parquet_load_2.sh | 52 + .../alltypes_dictionary.parquet.columns | 1 - .../alltypes_list.parquet.columns | 1 - .../alltypes_plain.parquet.columns | 1 - .../alltypes_plain.snappy.parquet.columns | 1 - .../data_parquet/array_float.parquet | Bin .../data_parquet/array_float.parquet.columns | 1 - .../data_parquet/array_int.parquet.columns | 1 - .../data_parquet/array_string.parquet | Bin .../data_parquet/array_string.parquet.columns | 1 - .../data_parquet/binary.parquet.columns | 1 - .../byte_array_decimal.parquet.columns | 1 - ...nsensitive_column_matching.parquet.columns | 1 - .../datapage_v2.snappy.parquet.columns | 1 - .../datatype-date32.parquet.columns | 1 - .../dict-page-offset-zero.parquet.columns | 1 - .../fixed_array_int.parquet.columns | 1 - ...ixed_array_nested_list_int.parquet.columns | 1 - .../fixed_array_str.parquet.columns | 1 - 
.../fixed_length_decimal.parquet.columns | 1 - .../fixed_length_decimal_1.parquet.columns | 1 - ...ixed_length_decimal_legacy.parquet.columns | 1 - .../hadoop_lz4_compressed.parquet.columns | 1 - .../int32_decimal.parquet.columns | 1 - .../int64_decimal.parquet.columns | 1 - .../0_stateless/data_parquet/iris.parquet | Bin 0 -> 2448 bytes .../data_parquet/list_columns.parquet.columns | 1 - .../nation.dict-malformed.parquet.columns | 1 - .../data_parquet/nested_lists.parquet.columns | 1 - .../nested_lists.snappy.parquet.columns | 1 - .../nested_maps.snappy.parquet.columns | 1 - .../non_hadoop_lz4_compressed.parquet.columns | 1 - .../nonnullable.impala.parquet.columns | 1 - .../nullable.impala.parquet.columns | 1 - .../nullable_list.parquet.columns | 1 - .../data_parquet/nulls.snappy.parquet.columns | 1 - .../data_parquet/single_nan.parquet.columns | 1 - .../data_parquet/userdata1.parquet.columns | 1 - .../data_parquet/userdata2.parquet.columns | 1 - .../data_parquet/userdata3.parquet.columns | 1 - .../data_parquet/userdata4.parquet.columns | 1 - .../data_parquet/userdata5.parquet.columns | 1 - .../v0.7.1.all-named-index.parquet.columns | 1 - ...1.column-metadata-handling.parquet.columns | 1 - .../data_parquet/v0.7.1.parquet.columns | 1 - .../v0.7.1.some-named-index.parquet.columns | 1 - .../00900_parquet_create_table_columns.py | 125 --- 55 files changed, 937 insertions(+), 1169 deletions(-) delete mode 100644 tests/queries/0_stateless/00900_long_parquet_load.reference delete mode 100755 tests/queries/0_stateless/00900_long_parquet_load.sh create mode 100644 tests/queries/0_stateless/00900_long_parquet_load_2.reference create mode 100755 tests/queries/0_stateless/00900_long_parquet_load_2.sh delete mode 100644 tests/queries/0_stateless/data_parquet/alltypes_dictionary.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/alltypes_list.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/alltypes_plain.parquet.columns delete 
mode 100644 tests/queries/0_stateless/data_parquet/alltypes_plain.snappy.parquet.columns mode change 100755 => 100644 tests/queries/0_stateless/data_parquet/array_float.parquet delete mode 100644 tests/queries/0_stateless/data_parquet/array_float.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/array_int.parquet.columns mode change 100755 => 100644 tests/queries/0_stateless/data_parquet/array_string.parquet delete mode 100644 tests/queries/0_stateless/data_parquet/array_string.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/binary.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/byte_array_decimal.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/datatype-date32.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/dict-page-offset-zero.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/fixed_array_int.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/fixed_array_nested_list_int.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/fixed_array_str.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/fixed_length_decimal_1.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/fixed_length_decimal_legacy.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/int32_decimal.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/int64_decimal.parquet.columns create mode 100644 
tests/queries/0_stateless/data_parquet/iris.parquet delete mode 100644 tests/queries/0_stateless/data_parquet/list_columns.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/nation.dict-malformed.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/nested_lists.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/non_hadoop_lz4_compressed.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/nonnullable.impala.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/nullable.impala.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/nullable_list.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/nulls.snappy.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/single_nan.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/userdata1.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/userdata2.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/userdata3.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/userdata4.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/userdata5.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/v0.7.1.all-named-index.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/v0.7.1.column-metadata-handling.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/v0.7.1.parquet.columns delete mode 100644 tests/queries/0_stateless/data_parquet/v0.7.1.some-named-index.parquet.columns delete mode 100755 tests/queries/0_stateless/helpers/00900_parquet_create_table_columns.py diff --git 
a/src/Processors/Formats/Impl/Parquet/Reader.cpp b/src/Processors/Formats/Impl/Parquet/Reader.cpp index fdd0a433d789..6bd568869669 100644 --- a/src/Processors/Formats/Impl/Parquet/Reader.cpp +++ b/src/Processors/Formats/Impl/Parquet/Reader.cpp @@ -211,6 +211,33 @@ parq::FileMetaData Reader::readFileMetaData(Prefetcher & prefetcher) } } + /// Consider two quirks: + /// (1) Some versions of spark didn't write dictionary_page_offset even when dictionary page is + /// present. Instead, data_page_offset points to the dictionary page. + /// (2) Old DuckDB versions (<= 0.10.2) wrote incorrect data_page_offset when dictionary is + /// present. + /// We work around (1) in initializePage by allowing dictionary page in place of data page. + /// We work around (2) here by converting it to case (1): + /// data_page_offset = dictionary_page_offset + /// dictionary_page_offset.reset() + /// Note: newer versions of DuckDB include version number in the `created_by` string, so this + /// `if` only applies to relatively old versions. Newer versions don't have this bug. + if (file_metadata.created_by == "DuckDB") + { + for (auto & rg : file_metadata.row_groups) + { + for (auto & col : rg.columns) + { + if (!col.__isset.offset_index_offset && col.meta_data.__isset.dictionary_page_offset) + { + col.meta_data.data_page_offset = col.meta_data.dictionary_page_offset; + col.meta_data.__isset.dictionary_page_offset = false; + col.meta_data.dictionary_page_offset = 0; + } + } + } + } + return file_metadata; } @@ -1511,9 +1538,9 @@ bool Reader::initializePage(const char * & data_ptr, const char * data_end, size if (column.dictionary.isInitialized()) throw Exception(ErrorCodes::INCORRECT_DATA, "Column chunk has multiple dictionary pages or inaccurate data_page_offset"); - /// If we got here, this is a weird parquet file that has a dictionary page but no - /// dictionary_page_offset in ColumnMetaData. 
Not sure whether this is allowed, but spark - /// can output such files, so we have to support it. + /// There's a dictionary page, but there was no dictionary_page_offset in ColumnMetaData. + /// This is probably not allowed, but we have to support it because some writers wrote such + /// files, see comment in readFileMetaData. decodeDictionaryPageImpl(header, page.data, column, column_info); return false; } diff --git a/src/Processors/Formats/Impl/Parquet/Reader.h b/src/Processors/Formats/Impl/Parquet/Reader.h index 899d4371b0c4..eb7cf2931f6f 100644 --- a/src/Processors/Formats/Impl/Parquet/Reader.h +++ b/src/Processors/Formats/Impl/Parquet/Reader.h @@ -25,7 +25,8 @@ namespace DB::Parquet // TODO [parquet]: // * either multistage PREWHERE or make query optimizer selectively move parts of the condition to prewhere instead of the whole condition -// * test on files from https://github.com/apache/parquet-testing +// * test on files from https://github.com/apache/parquet-testing and https://www.timestored.com/data/sample/parquet +// * look at issues in 00900_long_parquet_load.sh // * check fields for false sharing, add cacheline padding as needed // * make sure userspace page cache read buffer supports readBigAt // * support newer parquet versions: https://github.com/apache/parquet-format/blob/master/CHANGES.md diff --git a/src/Processors/Formats/Impl/Parquet/Write.cpp b/src/Processors/Formats/Impl/Parquet/Write.cpp index 3ccef28aa667..0177940c1118 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.cpp +++ b/src/Processors/Formats/Impl/Parquet/Write.cpp @@ -1322,7 +1322,12 @@ void writeFileFooter(FileWriteState & file, meta.num_rows += rg.row_group.num_rows; meta.row_groups.push_back(std::move(rg.row_group)); } - meta.__set_created_by(std::string(VERSION_NAME) + " " + VERSION_DESCRIBE); + + /// parquet.thrift sayeth: + /// > This should be in the format + /// > <Application> version <App Version> (build <App Build Hash>). + /// > e.g. 
impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) + meta.__set_created_by(fmt::format("ClickHouse version {}.{}.{} (build {})", VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, VERSION_GITHASH)); if (options.write_page_statistics || options.write_column_chunk_statistics) { diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 342ab6cd8ef9..1b1ce9731962 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -899,7 +899,7 @@ void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const Settings & settin void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_batch_idx) { - const bool row_group_prefetch = io_pool != nullptr; + bool row_group_prefetch = io_pool != nullptr; auto & row_group_batch = row_group_batches[row_group_batch_idx]; parquet::ArrowReaderProperties arrow_properties; @@ -952,7 +952,10 @@ void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_bat // other, failing an assert. So we disable pre-buffering in this case. // That version is >10 years old, so this is not very important. 
if (metadata->writer_version().VersionLt(parquet::ApplicationVersion::PARQUET_816_FIXED_VERSION())) + { arrow_properties.set_pre_buffer(false); + row_group_prefetch = false; + } if (format_settings.parquet.use_native_reader) { diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 077abaa0cdb6..8cd467a79e85 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1832,7 +1832,6 @@ class TestCase: self.stdout_file, ], stdout=PIPE, - universal_newlines=True, ) as diff_proc: if self.show_whitespaces_in_diff: with Popen( @@ -1840,11 +1839,10 @@ class TestCase: stdin=diff_proc.stdout, stdout=PIPE, ) as sed_proc: - diff = sed_proc.communicate()[0].decode( - "utf-8", errors="ignore" - ) + diff = sed_proc.communicate()[0] else: diff = diff_proc.communicate()[0] + diff = diff.decode("utf-8", errors="ignore") if diff.startswith("Binary files "): diff += "Content of stdout:\n===================\n" diff --git a/tests/queries/0_stateless/00900_long_parquet_load.reference b/tests/queries/0_stateless/00900_long_parquet_load.reference deleted file mode 100644 index 2316afeb2e70..000000000000 --- a/tests/queries/0_stateless/00900_long_parquet_load.reference +++ /dev/null @@ -1,922 +0,0 @@ -=== Try load data from alltypes_dictionary.parquet -0 1 0 0 0 0 0 0 01/01/09 0 1230768000 -1 0 1 1 1 10 1.1 10.1 01/01/09 1 1230768060 -=== Try load data from alltypes_list.parquet -[] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] -[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['1999-12-31 23:00:00','2000-12-31 23:00:00','2001-12-31 23:00:00'] [0.2,10,4] [4,10000.1,10000.1] [1000000000,90,101001.01] -[1,-2,3] [1,2,3] [100,-200,300] [100,200,300] 
[10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] ['2000-01-01','2001-01-01','2002-01-01'] ['1999-12-31 23:00:00','2000-12-31 23:00:00','2001-12-31 23:00:00'] [0.2,10,4] [4,10000.1,10000.1] [1000000000,90,101001.01] -=== Try load data from alltypes_plain.parquet -0 1 0 0 0 0 0 0 01/01/09 0 1230768000 -1 0 1 1 1 10 1.1 10.1 01/01/09 1 1230768060 -2 1 0 0 0 0 0 0 02/01/09 0 1233446400 -3 0 1 1 1 10 1.1 10.1 02/01/09 1 1233446460 -4 1 0 0 0 0 0 0 03/01/09 0 1235865600 -5 0 1 1 1 10 1.1 10.1 03/01/09 1 1235865660 -6 1 0 0 0 0 0 0 04/01/09 0 1238544000 -7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060 -=== Try load data from alltypes_plain.snappy.parquet -6 1 0 0 0 0 0 0 04/01/09 0 1238544000 -7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060 -=== Try load data from array_float.parquet -idx1 [] -idx10 [10.2,8.2] -idx2 [10.2,8.2,7.2] -idx3 [10.2,8.2] -idx4 [10.2] -idx5 [10.2,8.2] -idx6 [10.2] -idx7 [10.2,8.2] -idx8 [10.2,8.2] -idx9 [10.2] -=== Try load data from array_int.parquet -idx1 [100,101,102] -idx10 [100,101,102] -idx2 [100,101] -idx3 [100,101,102,101] -idx4 [100] -idx5 [100,101] -idx6 [100,101] -idx7 [100,101] -idx8 [100,101] -idx9 [100,101,102] -=== Try load data from array_string.parquet -idx1 ['This','is','a','test'] -idx10 ['This','is','a','test'] -idx2 ['cigarette','smoke'] -idx3 ['the','grocery','clerks'] -idx4 [] -idx5 ['wants','to','get','out'] -idx6 ['me','up?'] -idx7 ['then','I','put','him','back'] -idx8 ['make','a','man'] -idx9 ['Which','Heaven','to','gaudy','day','denies'] -=== Try load data from binary.parquet -\0 - - - - - - - -\b -\t -\n - -=== Try load data from byte_array_decimal.parquet -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -=== Try load data from case_insensitive_column_matching.parquet 
-123 1 -456 2 -=== Try load data from datapage_v2.snappy.parquet -abc 1 2 1 [1,2,3] -abc 2 3 1 [] -abc 3 4 1 [] -abc 5 2 1 [1,2] -\N 4 5 0 [1,2,3] -=== Try load data from datatype-date32.parquet -1925-01-01 -1949-10-01 -2021-10-01 -2282-12-31 -=== Try load data from dict-page-offset-zero.parquet -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -1552 -=== Try load data from fixed_array_int.parquet -idx1 [73,33,88,34,94,96,11,90,20,17] -idx10 [10,15,52,43,22,50,42,87,19,91] -idx2 [10,18,74,39,2,55,13,41,6,42] -idx3 [48,27,74,82,70,46,18,78,63,73] -idx4 [87,56,32,98,25,18,66,21,20,2] -idx5 [67,84,56,84,50,21,93,90,37,42] -idx6 [53,14,44,96,40,71,26,74,27,25] -idx7 [46,1,21,11,83,74,11,63,28,49] -idx8 [55,30,50,19,12,95,5,83,71,34] -idx9 [90,53,83,54,35,87,73,74,98,50] -=== Try load data from fixed_array_nested_list_int.parquet -idx1 [[34,11,56,20,90,20,17,55,54,99],[22,18,15,77,68,51,30,76,9,40],[67,24,2,27,72,53,99,57,67,96],[95,77,60,47,68,56,91,28,90,38],[77,8,75,89,84,81,60,82,71,26],[41,89,60,15,43,81,31,68,71,65],[84,97,22,9,26,59,97,10,16,60],[13,82,16,47,32,74,34,78,90,15],[35,44,50,18,82,71,20,68,89,41],[60,59,7,65,18,23,24,37,48,45]] -idx10 [[10,72,13,85,89,15,51,73,64,49],[78,28,54,57,10,1,3,35,23,15],[97,93,76,87,86,21,30,9,58,21],[27,23,35,71,4,68,90,14,93,87],[59,98,66,94,8,90,16,20,33,10],[73,84,79,37,75,50,64,74,79,31],[20,64,79,51,20,41,7,69,5,22],[37,60,1,99,45,43,56,26,15,83],[7,96,6,58,7,59,57,6,22,92],[97,11,71,55,32,79,88,20,58,31]] -idx2 [[72,32,17,44,69,37,95,64,47,70],[15,4,28,31,80,63,40,93,17,87],[28,46,56,21,65,20,5,76,18,61],[66,44,5,66,71,73,26,70,47,22],[9,9,62,21,74,75,32,71,2,61],[66,81,43,46,10,78,66,31,34,17],[30,5,10,15,56,52,36,3,88,16],[45,84,39,12,39,13,43,66,67,16],[27,18,68,80,87,19,98,39,90,54],[48,96,24,67,66,91,3,10,7,66]] -idx3 
[[54,93,69,82,45,54,35,59,20,39],[80,3,15,33,45,22,45,11,28,94],[68,26,89,38,49,28,29,59,93,57],[11,62,53,12,16,83,2,55,65,6],[37,49,56,93,33,77,22,53,20,22],[4,75,2,5,34,5,90,67,2,79],[22,2,80,14,13,44,33,11,31,24],[11,54,75,33,60,20,10,51,56,33],[72,5,84,78,7,95,21,2,88,75],[69,88,52,85,70,96,51,69,48,20]] -idx4 [[65,58,31,62,38,29,32,71,87,49],[66,89,59,83,29,32,59,49,68,34],[81,54,14,69,39,84,95,37,15,47],[15,51,15,38,90,61,76,51,11,24],[64,93,27,26,29,15,15,4,95,36],[60,61,57,66,61,25,21,97,34,36],[7,34,30,4,8,29,68,48,58,64],[77,19,92,77,81,21,52,1,86,92],[16,91,1,40,57,44,76,10,78,10],[88,13,70,61,2,76,78,69,90,24]] -idx5 [[52,98,63,41,2,6,56,80,72,15],[11,64,89,57,65,92,63,39,76,51],[47,25,33,28,58,22,83,19,54,63],[61,8,20,24,83,44,33,32,65,90],[51,75,72,55,54,5,43,90,64,30],[86,87,48,65,84,67,82,94,2,9],[1,75,83,79,64,63,52,51,33,72],[12,84,92,27,66,32,40,48,34,55],[59,91,19,77,70,56,10,82,40,60],[79,83,14,82,42,22,72,54,12,9]] -idx6 [[71,60,6,31,40,66,78,6,23,14],[62,88,38,68,37,37,63,18,57,3],[45,93,1,49,99,33,10,91,6,17],[13,40,57,85,48,14,74,89,43,49],[18,17,39,75,52,30,48,4,13,55],[3,34,73,71,75,58,6,73,73,31],[28,74,26,3,2,49,50,60,27,79],[35,28,12,10,1,21,61,70,65,37],[30,27,51,85,89,84,73,48,4,71],[1,86,23,68,82,9,6,95,14,25]] -idx7 [[58,24,79,16,32,36,36,91,18,22],[85,29,36,16,75,79,71,70,6,39],[47,15,82,30,55,14,49,47,38,13],[28,54,95,82,25,16,44,82,33,11],[58,16,87,96,65,3,10,68,87,15],[94,84,65,50,21,78,8,78,89,72],[39,41,67,23,21,83,43,94,31,15],[67,58,73,87,58,71,52,10,30,90],[69,65,72,89,51,8,39,80,49,79],[32,36,76,11,88,87,75,55,33,74]] -idx8 [[8,26,45,3,25,83,88,77,98,38],[52,76,79,94,6,74,73,31,93,53],[89,24,62,83,35,24,60,24,41,14],[52,13,13,32,87,77,19,20,52,47],[50,46,66,30,26,85,91,50,98,83],[87,44,5,11,25,52,9,55,15,37],[18,24,36,72,84,10,13,59,16,65],[93,4,19,13,75,64,73,29,81,25],[61,65,1,45,75,88,19,4,73,32],[23,20,27,55,13,34,97,80,8,19]] -idx9 
[[32,6,44,30,66,48,93,67,62,16],[49,67,73,32,86,78,28,88,25,60],[70,10,95,16,10,49,12,47,22,62],[95,86,63,43,4,96,3,34,60,53],[60,75,96,66,96,82,62,31,78,72],[17,44,13,7,45,54,27,84,10,68],[43,49,86,10,57,99,32,72,73,41],[57,79,15,62,79,87,67,69,75,59],[45,18,2,19,45,96,8,86,71,97],[20,20,11,15,82,29,16,12,87,87]] -=== Try load data from fixed_array_str.parquet -idx1 ['str10','str11','str12','str13','str14','str15','str16','str17','str18','str19'] -idx10 ['str100','str101','str102','str103','str104','str105','str106','str107','str108','str109'] -idx2 ['str20','str21','str22','str23','str24','str25','str26','str27','str28','str29'] -idx3 ['str30','str31','str32','str33','str34','str35','str36','str37','str38','str39'] -idx4 ['str40','str41','str42','str43','str44','str45','str46','str47','str48','str49'] -idx5 ['str50','str51','str52','str53','str54','str55','str56','str57','str58','str59'] -idx6 ['str60','str61','str62','str63','str64','str65','str66','str67','str68','str69'] -idx7 ['str70','str71','str72','str73','str74','str75','str76','str77','str78','str79'] -idx8 ['str80','str81','str82','str83','str84','str85','str86','str87','str88','str89'] -idx9 ['str90','str91','str92','str93','str94','str95','str96','str97','str98','str99'] -=== Try load data from fixed_length_decimal.parquet -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -=== Try load data from fixed_length_decimal_1.parquet -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -=== Try load data from fixed_length_decimal_legacy.parquet -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -=== Try load data from hadoop_lz4_compressed.parquet -1593604800 abc 42 -1593604800 def 7.7 -1593604801 abc 42.125 -1593604801 def 7.7 -=== Try load data from int32_decimal.parquet -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -=== Try load data from int64_decimal.parquet 
-1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -=== Try load data from list_columns.parquet -[1,2,3] ['abc','efg','hij'] -[4] ['efg',NULL,'hij','xyz'] -[NULL,1] [] -=== Try load data from nation.dict-malformed.parquet -0 ALGERIA 0 haggle. carefully final deposits detect slyly agai -1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon -2 BRAZIL 1 y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special -3 CANADA 1 eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold -4 EGYPT 4 y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d -5 ETHIOPIA 0 ven packages wake quickly. regu -6 FRANCE 3 refully final requests. regular, ironi -7 GERMANY 3 l platelets. regular accounts x-ray: unusual, regular acco -8 INDIA 2 ss excuses cajole slyly across the packages. deposits print aroun -9 INDONESIA 2 slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull -10 IRAN 4 efully alongside of the slyly final dependencies. -11 IRAQ 4 nic deposits boost atop the quickly final requests? quickly regula -12 JAPAN 2 ously. final, express gifts cajole a -13 JORDAN 4 ic deposits are blithely about the carefully regular pa -14 KENYA 0 pending excuses haggle furiously deposits. pending, express pinto beans wake fluffily past t -15 MOROCCO 0 rns. blithely bold courts among the closely regular packages use furiously bold platelets? -16 MOZAMBIQUE 0 s. ironic, unusual asymptotes wake blithely r -17 PERU 1 platelets. blithely pending dependencies use fluffily across the even pinto beans. carefully silent accoun -18 CHINA 2 c dependencies. furiously express notornis sleep slyly regular accounts. ideas sleep. depos -19 ROMANIA 3 ular asymptotes are about the furious multipliers. 
express dependencies nag above the ironically ironic account -20 SAUDI ARABIA 4 ts. silent requests haggle. closely express packages sleep across the blithely -21 VIETNAM 2 hely enticingly express accounts. even, final -22 RUSSIA 3 requests against the platelets use never according to the quickly regular pint -23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull -24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be -=== Try load data from nested_lists.parquet -[[[1,2,3],[1,2,3]],[[1,2,3]],[[],[1,2,3]]] [[['Some string','Some string'],[]],[['Some string']],[[]]] [[NULL,1,2],[NULL],[1,2],[]] [['Some string',NULL,'Some string'],[NULL],[]] -=== Try load data from nested_lists.snappy.parquet -[[['a','b'],['c']],[[],['d']]] 1 -[[['a','b'],['c','d']],[[],['e']]] 1 -[[['a','b'],['c','d'],['e']],[[],['f']]] 1 -=== Try load data from nested_maps.snappy.parquet -{'a':{1:1,2:0}} 1 1 -{'b':{1:1}} 1 1 -{'c':{}} 1 1 -{'d':{}} 1 1 -{'e':{1:1}} 1 1 -{'f':{3:1,4:0,5:1}} 1 1 -=== Try load data from non_hadoop_lz4_compressed.parquet -1593604800 abc 42 -1593604800 def 7.7 -1593604801 abc 42.125 -1593604801 def 7.7 -=== Try load data from nonnullable.impala.parquet -8 [-1] [[-1,-2],[]] {'k1':-1} [{},{'k1':1},{},{}] (-1,[-1],([[(-1,'nonnullable')]]),{}) -=== Try load data from nullable.impala.parquet -1 [1,2,3] [[1,2],[3,4]] {'k1':1,'k2':100} [{'k1':1}] (1,[1],([[(10,'aaa'),(-10,'bbb')],[(11,'c')]]),{'foo':(([1.1]))}) -2 [NULL,1,2,NULL,3,NULL] [[NULL,1,2,NULL],[3,NULL,4],[],[]] {'k1':2,'k2':NULL} [{'k3':NULL,'k1':1},{},{}] (NULL,[NULL],([[(NULL,NULL),(10,'aaa'),(NULL,NULL),(-10,'bbb'),(NULL,NULL)],[(11,'c'),(NULL,NULL)],[],[]]),{'g1':(([2.2,NULL])),'g2':(([])),'g3':(([])),'g4':(([])),'g5':(([]))}) -3 [] [[]] {} [{},{}] (NULL,[],([]),{}) -4 [] [] {} [] (NULL,[],([]),{}) -5 [] [] {} [] (NULL,[],([]),{'foo':(([2.2,3.3]))}) -6 [] [] {} [] (NULL,[],([]),{}) -7 [] [[],[5,6]] 
{'k1':NULL,'k3':NULL} [] (7,[2,3,NULL],([[],[(NULL,NULL)],[]]),{}) -=== Try load data from nullable_list.parquet -[] [] [] -[1,NULL,2] [NULL,'Some string',NULL] [0,NULL,42.42] -[NULL] [NULL] [NULL] -=== Try load data from nulls.snappy.parquet -(NULL) -(NULL) -(NULL) -(NULL) -(NULL) -(NULL) -(NULL) -(NULL) -=== Try load data from single_nan.parquet -\N -=== Try load data from userdata1.parquet -1454457660 721 Shirley Williams swilliamsk0@sciencedirect.com 132.137.10.218 5610801309305920 Indonesia 8/13/1978 \N Help Desk Technician -1454457663 785 Daniel Spencer dspencerls@cargocollective.com Male 241.143.186.140 China 12/3/1997 194214.08 Internal Auditor -1454457674 880 Lillian Murray lmurrayof@guardian.co.uk Female 222.252.22.1 201713786459078 Norway 4/16/1981 282503.77 Business Systems Development Analyst -1454457684 852 Carol Patterson cpattersonnn@ycombinator.com Female 244.190.113.241 0604512080706322395 Liberia 5/8/1984 263412.02 Assistant Professor -1454457705 244 Sarah Freeman sfreeman6r@wikimedia.org Female 219.8.22.27 30520943172503 United States 3/25/1958 25806.31 Budget/Accounting Analyst II ⁰⁴⁵ -1454457740 633 Maria Fowler mfowlerhk@chronoengine.com Female 246.85.249.122 3584144503415501 China 11/25/1998 276712.79 Staff Scientist ␣ -1454457782 925 Chris Murphy cmurphypo@nature.com 89.217.243.136 5602220700741429 Russia \N -1454457790 788 Nicholas Butler nbutlerlv@thetimes.co.uk Male 77.38.58.165 3575506969751259 Brazil 2/10/1981 192076.79 Data Coordiator -1454457853 301 Jerry Welch jwelch8c@paginegialle.it Male 141.166.33.218 5602252929753349 Latvia 3/14/1973 28731.89 Software Engineer I -1454457952 53 Ralph Price rprice1g@tmall.com Male 152.6.235.33 4844227560658222 China 8/26/1986 168208.4 Teacher -1454458004 607 Johnny Owens jowensgu@blogspot.com Male 181.25.18.91 5602239825516409 Indonesia 2/14/1960 169429.76 Health Coach III -1454458010 375 Bruce Gonzales bgonzalesae@studiopress.com Male 19.195.169.187 Sweden 7/4/1993 118244.57 Human Resources 
Manager "<>?:""{}|_+" -1454458170 744 Heather Richardson hrichardsonkn@twitter.com Female 129.15.137.135 Ukraine 12/26/1980 164117.18 GIS Technical Architect -1454458178 635 Willie Dixon wdixonhm@diigo.com Male 27.245.227.220 Japan 8/29/1992 265321.18 Senior Cost Accountant -1454458242 11 Susan Perkins sperkinsa@patch.com Female 180.85.0.62 3573823609854134 Russia 210001.95 -1454458282 175 Samuel Edwards sedwards4u@businessweek.com Male 60.248.106.175 676249211413011686 Russia 10/15/1986 75886.69 Senior Sales Associate -1454458493 78 Mildred Torres mtorres25@alibaba.com Female 38.102.60.15 6399156779396437 Russia 9/24/1960 166987.55 Paralegal -1454458506 40 Jack Flores jflores13@yolasite.com Male 162.215.65.11 3577342788590928 Argentina 1/28/1958 81685.1 Financial Advisor -1454458536 749 Larry Fields lfieldsks@theguardian.com Male 46.57.123.222 3531208154739438 Yemen 139177.38 Œ„´‰ˇÁ¨ˆØ∏”’ -1454458564 521 Roy Palmer rpalmereg@nsw.gov.au Male 255.242.77.68 3589146577885209 Nepal 8/28/1964 262816.87 Software Test Engineer IV -1454458607 314 James Harvey jharvey8p@npr.org Male 96.88.41.248 3589416270039051 China 211553.57 -1454458706 995 Jose Mccoy jmccoyrm@elpais.com Male 117.37.215.98 560222933605513180 Norway 7/30/1987 275898.37 Graphic Designer -1454458727 835 Sean Castillo scastillon6@altervista.org 211.77.61.195 Portugal 6/15/1979 \N Quality Control Specialist -1454458739 821 Juan Foster jfosterms@reference.com Male 219.231.170.245 5108759901583907 Portugal 2/16/1969 120076.81 Quality Engineer 1E02 -1454458751 670 Irene Hughes ihughesil@topsy.com Female 154.194.86.224 3536739760978536 Netherlands 6/17/1973 274295.42 Structural Analysis Engineer -1454458801 149 Gregory Edwards gedwards44@icq.com Male 5.204.156.34 3548268624172124 Portugal 2/5/1977 236421.33 Librarian -1454458805 683 Joshua Ramirez jramireziy@liveinternet.ru Male 164.224.133.177 3574998106893089 France 10/24/1987 17658.63 Senior Developer -1454458862 226 James Austin jaustin69@istockphoto.com Male 
228.107.68.143 4913037818454290 Russia 25084.49 -1454458909 659 Doris Welch dwelchia@about.com Female 195.125.217.107 3537263234825586 Indonesia 3/31/1995 183928.71 Quality Engineer -1454458914 479 Joseph Gordon jgordonda@trellian.com Male 140.193.192.82 3533495991170988 Indonesia 6/30/1960 262448.45 Health Coach II -1454458932 615 Marie Matthews mmatthewsh2@smugmug.com 8.217.73.21 589312447234085155 Indonesia 8/10/1973 \N Chief Design Engineer -1454458946 379 Martha Simmons msimmonsai@tripadvisor.com Female 8.141.39.185 Russia 9/18/1978 92766.32 Staff Scientist -1454458967 730 Anne Perez aperezk9@freewebs.com Female 208.87.2.91 China 8/18/1966 47293.4 Nuclear Power Engineer ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙 -1454458979 426 Lois Green lgreenbt@1688.com 39.174.95.97 5100146457712544 Bulgaria 2/22/1955 \N Health Coach III -1454459038 810 Mark Kelley mkelleymh@blog.com 210.153.220.197 3543227090716355 Poland 5/31/1969 \N Programmer Analyst I -1454459045 475 Richard Howell rhowelld6@springer.com Male 176.182.155.97 Central African Republic 138775.31 ‪‪test‪ -1454459058 523 Phillip Butler pbutlerei@storify.com Male 184.124.14.67 China 12/18/1957 106832.85 Paralegal -1454459092 437 Virginia Robinson vrobinsonc4@opensource.org Female 148.213.54.195 3567035727522042 China 6/27/1995 24623.44 Senior Sales Associate -1454459132 722 Robin Spencer rspencerk1@github.com Female 83.129.98.63 3580163142176138 Poland 1/18/1987 171963.73 Budget/Accounting Analyst I -1454459226 291 Julia Medina jmedina82@cbc.ca Female 43.27.110.171 30163835573619 Russia 8/12/1991 109927.88 Software Engineer II -1454459288 800 Sarah Andrews sandrewsm7@kickstarter.com Female 238.132.217.166 5018303367167648843 China 4/19/1970 42010.56 Computer Systems Analyst IV -1454459290 162 Steve Spencer sspencer4h@deliciousdays.com Male 109.138.4.34 China 6/2/1964 79184.71 Teacher () { _; } >_[$($())] { touch /tmp/blns.shellshock2.fail; } -1454459301 322 Frances Fisher ffisher8x@businessinsider.com Female 
55.187.133.82 30168292124913 Poland 11/4/1997 140594.79 Geologist IV 社會科學院語學研究所 -1454459320 370 Roger Gilbert rgilberta9@businesswire.com Male 46.96.123.235 Finland 1/20/1999 16506.02 Analog Circuit Design manager -1454459328 929 Susan Jordan sjordanps@ucla.edu Female 108.42.4.149 589358467890938815 Philippines 5/31/1995 44739.92 Account Coordinator -1454459330 215 Philip Fox pfox5y@vimeo.com Male 65.223.141.140 Israel 9/5/1991 218538.31 Graphic Designer -1454459356 265 Judith Simpson jsimpson7c@taobao.com 105.52.110.107 6378542962124121 Indonesia 12/12/1983 \N Project Manager """\'""\'""\'\'\'""" -1454459359 708 Judy Young jyoungjn@dailymail.co.uk Female 21.109.231.236 3554148278137055 Tunisia 1/2/1958 212070.86 Chief Design Engineer 田中さんにあげて下さい -1454459394 795 Clarence Edwards cedwardsm2@ed.gov 111.156.147.232 3533231926493017 Poland 12/23/1981 \N General Manager -1454459439 589 Gerald Porter gportergc@pcworld.com Male 97.189.77.0 Philippines 7/2/1979 278447.61 Professor -1454459459 33 Christina Mason cmasonw@nydailynews.com Female 74.214.22.120 Greece 7/21/1986 242593.85 Senior Sales Associate -1454459497 524 Brenda Willis bwillisej@sun.com Female 45.122.116.217 6380803357074248 Poland 108844.98 -1454459499 591 Rose Garrett rgarrettge@mit.edu Female 116.228.6.108 30147178065069 Philippines 10/5/1988 244134.1 Accountant III -1454459516 653 Lane Male 192.59.226.245 3528384158258405 China 12/26/1997 127912.54 Geologist I -1454459556 779 Richard Hunt rhuntlm@ovh.net Male 162.73.16.141 5203349476569897 China 6/24/1969 13375.17 Environmental Tech -1454459562 681 Betty Hamilton bhamiltoniw@facebook.com Female 193.209.0.183 Morocco 5/5/1965 210804.85 Human Resources Assistant II -1454459577 173 Amy Garza agarza4s@woothemes.com Female 75.187.251.37 China 82283.83 -1454459605 57 Willie Palmer wpalmer1k@t-online.de Male 164.107.46.161 4026614769857244 China 8/23/1986 184978.64 Environmental Specialist -1454459605 888 Marie Torres mtorreson@tamu.edu Female 190.148.84.34 
5610170119678060511 Bosnia and Herzegovina 261087.2 -1454459709 293 Amy Cook acook84@prlog.org 186.92.46.224 Ukraine 7/23/1976 \N Human Resources Assistant III -1454459719 920 Johnny Brown jbrownpj@constantcontact.com Male 25.161.139.20 Sweden 4/17/1998 149870.24 Speech Pathologist -1454459729 137 Phillip Vasquez pvasquez3s@canalblog.com Male 195.121.180.8 5602221706127365 Ethiopia 7/28/1992 274927.74 Internal Auditor -1454459747 876 Samuel Hughes shughesob@dion.ne.jp Male 29.127.239.106 3535476909940686 Indonesia 220585.61 Œ„´‰ˇÁ¨ˆØ∏”’ -1454459781 4 Denise Riley driley3@gmpg.org Female 140.35.109.83 3576031598965625 China 4/8/1997 90263.05 Senior Cost Accountant -1454459806 195 Joe Hayes jhayes5e@opensource.org Male 96.48.27.170 343842871636339 Indonesia 239690.34 -1454459806 525 Elizabeth Porter eporterek@china.com.cn Female 249.248.212.114 Indonesia 7/7/1993 33270.67 Recruiter -1454459905 958 Louis Griffin lgriffinql@umn.edu 184.242.195.194 3571277617780793 China 10/31/1988 \N Assistant Media Planner -1454459969 655 Johnny Reed jreedi6@chicagotribune.com Male 169.161.103.111 4844445630272291 Russia 5/23/1979 68913.72 Quality Engineer -1454459981 614 Marie Ramirez mramirezh1@wikia.com Female 143.213.146.199 633390820329851783 China 7/17/1988 131783.55 Dental Hygienist -1454460012 200 Russell Ward rward5j@surveymonkey.com Male 73.156.128.8 Sweden 173849.81 -1454460033 454 Ashley Crawford acrawfordcl@weather.com Female 61.81.102.117 3563365997409370 Vietnam 264109.73 -1454460230 685 Joan Jackson jjacksonj0@paypal.com Female 153.5.15.100 Yemen 8/16/1992 54385.21 Structural Analysis Engineer -1454460236 222 Sara Price sprice65@usatoday.com Female 46.58.242.198 Canada 2/11/1959 49611.44 Sales Representative -1454460241 16 Bruce Willis bwillisf@bluehost.com Male 239.182.219.189 3573030625927601 Brazil 239100.65 -1454460496 906 Amanda Clark aclarkp5@facebook.com Female 190.75.162.144 56022268731524616 Norway 7/19/1982 39551.7 General Manager -1454460516 68 Rachel Price 
rprice1v@census.gov Female 89.52.192.105 Indonesia 5/6/1982 234502.16 Payment Adjustment Coordinator -1454460605 879 Diane Flores dfloresoe@wiley.com Female 88.102.252.118 201739112087937 Philippines 12/2/1969 250449.32 Sales Associate -1454460715 676 Michael Jackson mjacksonir@scribd.com Male 130.159.201.48 201788384710734 China 7/8/1957 170234.61 Database Administrator III -1454460728 550 Cheryl Evans cevansf9@yolasite.com Female 244.155.129.93 Japan 7/24/1955 12380.49 Budget/Accounting Analyst II -1454460813 761 Kathleen Cook kcookl4@geocities.jp Female 154.7.81.231 Bulgaria 5/12/1996 107594.9 Analyst Programmer -1454460817 599 Sean Garcia sgarciagm@blogger.com Male 94.211.15.55 3557998741604165 Serbia 8/24/1963 131270.12 Structural Engineer 0/0 -1454460934 939 Keith Hernandez khernandezq2@amazon.com Male 153.51.249.140 3550284883492520 Belarus 10/12/1977 56167.67 Environmental Tech -1454460945 763 Amanda Miller amillerl6@dagondesign.com Female 15.140.92.92 Philippines 11/24/1979 118824.39 Structural Engineer -1454460961 664 Kathleen Torres ktorresif@vistaprint.com Female 11.165.183.246 Nicaragua 4/6/1960 257366 Environmental Specialist -1454461065 17 Emily Andrews eandrewsg@cornell.edu Female 29.231.180.172 30271790537626 Russia 4/13/1990 116800.65 Food Chemist -1454461083 569 Heather Johnson hjohnsonfs@skype.com Female 3.121.91.120 3552946432961233 Argentina 11/24/1966 197315 Cost Accountant -1454461104 768 Gregory James gjameslb@businessweek.com Male 80.18.249.93 30041579214659 Sweden 78310.93 -1454461128 584 Lois Ross lrossg7@irs.gov Female 176.213.236.60 Brazil 6/23/1989 95013.72 Database Administrator IV 999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999 -1454461201 856 Mildred Harper mharpernr@samsung.com Female 153.214.193.120 6763961170182948344 Finland 37573.27 -1454461259 383 Beverly Carter bcarteram@wordpress.com Female 4.251.6.51 3535631087457545 Indonesia 11/15/1982 272520.3 Compensation Analyst 
-1454461332 909 Samuel Henry shenryp8@163.com Male 204.10.183.241 6771639706876926 Philippines 4/3/1998 164954.8 Compensation Analyst -1454461498 678 Wanda Ford wfordit@sitemeter.com Female 63.28.195.79 Poland 28276.84 -1454461562 801 Annie Bradley abradleym8@jimdo.com Female 166.216.149.179 Poland 2/17/1970 267475.37 Quality Control Specialist -1454461671 643 Thomas Hunter thunterhu@pinterest.com Male 91.145.126.98 3574840401671309 China 3/3/1962 201611.79 Programmer II -1454461690 428 Dennis Marshall dmarshallbv@bloglines.com 51.104.218.177 3544646067494556 Pakistan \N -1454461771 3 Evelyn Morgan emorgan2@altervista.org Female 7.161.136.94 6767119071901597 Russia 2/1/1960 144972.51 Structural Engineer -1454461843 832 Anthony Duncan aduncann3@merriam-webster.com Male 54.202.218.90 3561384853362062 China 10/5/1982 239812.39 Human Resources Manager -1454461880 648 Eric Bryant ebryanthz@tripod.com Male 11.228.180.159 Sweden 3/21/1981 46534.77 Budget/Accounting Analyst I ١٢٣ -1454462013 943 Arthur Nelson anelsonq6@sun.com Male 201.79.146.145 5602257963938888 Ukraine 185554.08 -1454462053 994 Carol Williams cwilliamsrl@army.mil Female 53.242.60.20 France 1/5/1988 120933.54 Recruiter -=== Try load data from userdata2.parquet -1454457626 638 Richard Perkins rperkinshp@princeton.edu Male 206.117.180.117 China 4/11/2000 123221.64 Tax Accountant -1454457675 77 Doris Elliott delliott24@shinystat.com Female 36.27.140.126 Portugal 9/23/1987 98288.74 Design Engineer -1454457741 472 Sara Collins scollinsd3@yellowbook.com Female 238.228.239.222 5002357683259593 Philippines 1/6/1966 220244.65 Internal Auditor -1E02 -1454457764 681 Samuel Foster sfosteriw@github.io Male 101.228.90.125 676725448783712104 Brazil 6/27/1982 275514.12 Office Assistant II -1454457800 216 Robin Reed rreed5z@guardian.co.uk 191.104.133.70 Portugal 3/15/1978 \N Desktop Support Technician test⁠test‫ -1454457912 321 Joe Collins jcollins8w@google.com.hk Male 135.236.105.189 3573647966682865 Dominican Republic 
106582.46 -1454457928 837 Jonathan Romero jromeron8@hp.com Male 129.49.88.101 30180713638645 Brazil 2/27/1957 238966.77 Speech Pathologist -1454457982 95 Teresa Ruiz truiz2m@diigo.com Female 22.118.240.24 337941028849437 Brazil 7/25/1994 243603.67 Cost Accountant -1454458012 218 Samuel Reed sreed61@sohu.com Male 131.124.128.124 3540638382406385 Brazil 257041.54 -1454458014 128 Harold Jenkins hjenkins3j@hostgator.com 204.144.188.106 374283629923426 Dominican Republic \N -1454458024 26 Sandra Coleman scolemanp@blogger.com Female 230.159.39.252 3555708337891155 China 8/7/1971 113688.11 VP Sales -1454458038 609 Joyce Palmer jpalmergw@mashable.com Female 164.56.14.55 6371540406366768 China 201121.46 -1454458083 879 Kevin Meyer kmeyeroe@squarespace.com Male 233.187.65.16 France 98010.89 -1454458190 705 Beverly Gonzales bgonzalesjk@wufoo.com Female 38.31.68.95 4405331360959318 Philippines 9/21/1957 42738.65 Director of Sales -1454458307 237 Richard Grant rgrant6k@etsy.com Male 241.252.232.2 6304639002149768801 Poland 2/23/1991 71635.33 Paralegal -1454458377 986 Melissa George mgeorgerd@apple.com Female 143.50.124.180 5602226915795555 Czech Republic 12/6/1962 63403.41 Internal Auditor -1454458390 181 Scott Marshall smarshall50@geocities.jp Male 137.234.29.113 3571996025746621 Philippines 4/23/1978 206952.7 Staff Scientist ␣ -1454458452 61 Sean Greene sgreene1o@goo.gl Male 71.195.178.59 5602246313163081 China 2/20/1991 70656.63 Sales Representative -1454458464 327 Janice Matthews jmatthews92@guardian.co.uk Female 71.195.173.202 6304527633260205 Russia 7/29/2000 157292.61 Physical Therapy Assistant -1454458470 657 Kathy Boyd kboydi8@skyrock.com 36.183.199.94 6389206450992194 China 4/24/1982 \N General Manager 🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧 -1454458494 390 Chris Mason cmasonat@purevolume.com Male 21.36.118.254 China 4/28/1983 168120.17 Sales Representative -1454458497 365 Albert Mills amillsa4@t.co Male 181.108.162.242 China 8/25/1962 180913.71 Recruiter -1454458508 999 Marie Medina 
mmedinarq@thetimes.co.uk Female 223.83.175.211 Kazakhstan 3/25/1969 53564.76 Speech Pathologist -1454458512 185 Brandon Williamson bwilliamson54@vimeo.com Male 4.249.36.104 4913822210519505 Russia 277603.75 -1454458529 5 Howard Miller hmiller4@fema.gov Male 103.193.150.230 3583473261055014 France 11/26/1998 50210.02 Senior Editor -1454458591 978 Jean Jacobs jjacobsr5@springer.com Female 143.77.255.89 6377468383747335 Guatemala 11/13/1977 218108.02 Accounting Assistant III -1454458647 788 Dennis Price dpricelv@google.co.jp Male 50.213.201.120 3588056573581168 Albania 10/29/1962 218338.58 Environmental Specialist -1454458655 450 Rose Mccoy rmccoych@livejournal.com Female 91.93.75.71 Dominican Republic 1/2/1972 192818.85 Executive Secretary \N -1454458657 213 Norma Garrett ngarrett5w@technorati.com Female 65.49.237.93 Albania 80916.71 -1454458725 110 Theresa Gardner tgardner31@photobucket.com Female 232.118.202.192 Ukraine 1/6/1982 243844.4 Health Coach II -1454458764 346 Thomas Richards trichards9l@ifeng.com Male 0.111.159.70 5610777337517834253 Thailand 2/19/1981 221644.31 Analog Circuit Design manager -1454458768 430 Linda Harvey lharveybx@google.ca Female 138.19.27.11 Indonesia 8/19/1961 200606 Teacher -1/2 -1454458782 287 Martin Ferguson mferguson7y@eventbrite.com Male 67.188.95.86 Portugal 7/2/1981 262746.89 Cost Accountant -1454458853 926 Joan Graham jgrahampp@icio.us Female 209.238.1.225 3557860962551501 China 3/1/1972 197284.8 Chief Design Engineer ‪‪test‪ -1454458888 533 Sarah Jordan sjordanes@europa.eu Female 120.197.115.153 5002357582121340 Indonesia 9/10/1963 146649.24 Programmer Analyst IV -1454458948 2 Walter Collins wcollins1@bloglovin.com Male 247.28.26.93 3587726269478025 China \N -1454459077 720 Theresa Hayes thayesjz@dion.ne.jp Female 43.78.228.159 Russia 231701.16 -1454459120 214 Margaret Hughes mhughes5x@biglobe.ne.jp Female 36.234.5.134 3546342491809456 Azerbaijan 127862.72 ˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 
\'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ \'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥ -1454459148 737 Joseph Gray jgraykg@bbb.org Male 60.23.118.26 3540391233313117 United States 159699.28 -1454459184 419 Larry Black lblackbm@github.com Male 61.181.102.70 5108758999951786 Canada 4/12/1997 263463.01 Staff Accountant I -1454459341 559 Raymond Gray rgrayfi@mapy.cz Male 104.112.4.152 201619406564124 Brazil 4/29/1955 132421.37 VP Quality Control 和製漢語 -1454459366 71 Jacqueline Wallace jwallace1y@dagondesign.com Female 203.83.140.84 3578315582149538 Turkmenistan 4/15/1997 89436.49 Cost Accountant -1454459447 315 Earl Rivera erivera8q@weebly.com Male 249.22.156.255 6333306262684398 Macedonia 33051.81 """" -1454459464 298 Johnny Kelly jkelly89@dailymail.co.uk Male 56.120.150.167 4614973744018 Malaysia 10/20/1965 254369.91 Automation Specialist III -1454459540 329 Mary Diaz mdiaz94@macromedia.com Female 60.49.220.52 5108751463671162 Mongolia 9/12/1997 112279.71 Project Manager -1454459624 842 Brenda Jones bjonesnd@mysql.com Female 200.142.153.124 Colombia 10/1/1963 250051.84 Safety Technician III -1454459634 775 Lillian Ryan lryanli@t-online.de Female 152.216.220.164 3541599165648107 Iran 8/19/1967 138178.35 VP Marketing -1454459634 998 Stephanie Sims ssimsrp@newyorker.com Female 135.66.68.181 3548125808139842 Poland 112275.78 -1454459658 659 Julie Anderson jandersonia@shareasale.com Female 21.61.224.82 343450744553044 Netherlands 12/27/1976 68225.51 Compensation Analyst -1454459679 634 Harry Olson holsonhl@skyrock.com Male 57.82.212.119 5002351465267817 Chile 4/3/1956 173608.69 Assistant Professor -1454459732 892 Thompson Female 9.228.212.189 Czech Republic 10/3/1964 184732.94 Budget/Accounting Analyst IV -1454459817 495 Steve Ramos sramosdq@go.com Male 209.215.139.231 5602239349519376 France 194636.12 -1454459838 271 Nicole Wright nwright7i@businessinsider.com Female 213.168.29.131 3551761943539373 Chile 2/22/1967 34243.03 Budget/Accounting Analyst III -1454459839 424 Kimberly Coleman 
kcolemanbr@bizjournals.com Female 83.237.12.153 5641829981259605 Iran 280387.11 -1454459870 701 Bobby Chavez bchavezjg@tinypic.com Male 71.18.120.35 3575292555485293 China 5/20/1965 13910.56 Product Engineer åß∂ƒ©˙∆˚¬…æ -1454459921 954 Willie Thomas wthomasqh@earthlink.net Male 173.219.113.26 3560763628353111 Mexico 5/31/1990 201325.44 Programmer Analyst I -1454459944 694 Theresa Graham tgrahamj9@amazon.com Female 176.19.106.64 3539554098566813 China 4/8/1983 155735.87 Administrative Assistant III -1454459984 693 Jonathan Graham jgrahamj8@berkeley.edu Male 239.139.123.46 3581752291204508 Sweden 9/12/1961 16159.02 Statistician III -1454460022 127 Anna Moreno amoreno3i@cafepress.com Female 2.85.251.176 5610875550247635 Guatemala 12/30/1983 156757.41 Research Nurse -1454460158 232 Susan Burns sburns6f@cbsnews.com Female 2.93.31.196 5602245359290816 China 10/25/1992 58832.39 Research Assistant IV -1454460185 711 Alice Robertson arobertsonjq@sakura.ne.jp Female 182.147.6.194 Thailand 8/9/1955 54046.02 Legal Assistant -1454460227 661 Phyllis Brown pbrownic@macromedia.com Female 115.89.196.124 Brazil 7/31/1990 245014.11 Librarian -1454460293 146 Christina Gibson cgibson41@over-blog.com Female 226.138.197.167 China 3/14/1987 201589 Accountant II -1454460311 259 Donna Marshall dmarshall76@jimdo.com Female 249.36.126.149 6709877241918640 Indonesia 4/15/1986 281443.65 Structural Engineer 123 -1454460317 899 Harold Robinson hrobinsonoy@privacy.gov.au Male 94.237.36.16 5602247816220394 Philippines 10/3/1955 181832.97 Civil Engineer 0/0 -1454460450 391 Stone Female 205.229.198.173 Portugal 10/13/1968 173807.29 Web Developer I -1454460563 814 Kelly Riley krileyml@4shared.com 166.51.39.101 3529610026130015 China 6/24/1987 \N Data Coordiator -1454460586 350 Ruth Green rgreen9p@vk.com Female 170.37.204.80 3567581372052553 Poland 10/30/1990 76094.37 Community Outreach Specialist -1454460599 284 Joyce Bryant jbryant7v@stumbleupon.com Female 125.142.215.135 3551722227261571 Czech 
Republic 26866.76 """" -1454460658 129 Paula Oliver poliver3k@barnesandnoble.com Female 108.49.104.111 3551237510305944 China 149572.54 -1454460668 85 John Hudson jhudson2c@rediff.com Male 75.191.191.171 3538638405093479 Georgia 6/22/1994 82621.71 Tax Accountant -1454460753 578 Clarence Gonzales cgonzalesg1@fc2.com 13.29.242.81 30237628216824 Norway \N -1454460790 754 Rose Brooks rbrookskx@chron.com Female 99.103.60.118 201422963957371 China 4/8/1994 201004.89 Legal Assistant 1/2 -1454460792 118 Charles Gonzalez cgonzalez39@google.com.au Male 52.126.168.127 Nigeria 8/26/1958 108318.24 Internal Auditor -1454460806 479 Henry Scott hscottda@cornell.edu Male 53.161.182.142 5602240199354518 Indonesia 6/22/1992 32141.19 Assistant Professor -1454460806 791 Anthony Butler abutlerly@springer.com Male 84.141.89.156 Czech Republic 8/21/1969 282078.29 Health Coach IV -1454460833 1000 Alice Peterson apetersonrr@parallels.com Female 244.89.94.58 5602227843485236 Nigeria 239858.7 -1454460836 246 Billy Spencer bspencer6t@mozilla.com Male 1.121.193.207 5127963978663124 Malta 275300.87 -1454460842 236 Susan Wilson swilson6j@mapy.cz Female 253.105.50.250 4913609318117229 Cameroon 5/10/2000 135956.76 Director of Sales -1454460867 161 Janice Armstrong jarmstrong4g@sciencedirect.com Female 76.231.89.120 6759331684315962 Philippines 7/14/1996 64638.14 Project Manager -1454460947 250 Larson Male 250.66.116.249 6709520051264027651 Indonesia 9/30/1975 121560.88 Staff Accountant I -1454460979 951 Arthur Long alongqe@devhub.com Male 92.244.136.245 4175006438208322 China 3/4/1959 74667.22 Pharmacist -1454461020 339 Doris Bennett dbennett9e@de.vu Female 98.5.171.133 4041599256556998 Nicaragua 85802.06 $1.00 -1454461049 725 Patrick Rodriguez prodriguezk4@blogs.com Male 233.167.251.29 3543135453573752 Poland 8/10/1956 129023.91 Web Designer IV -1454461082 359 Ruby Fox rfox9y@chron.com Female 39.224.24.103 3566813987246457 Moldova 199091.31 -1454461084 488 Mark Weaver mweaverdj@dot.gov Male 
36.130.233.58 3568615406520315 China 225258.27 -1454461184 802 Joyce Lopez jlopezm9@ocn.ne.jp Female 232.61.24.78 Ecuador 258343.17 -1454461219 258 Paul Gordon pgordon75@gravatar.com Male 160.61.49.169 3567008825292446 Czech Republic 2/25/2000 258680.6 Structural Analysis Engineer -1454461293 230 Victor Campbell vcampbell6d@stumbleupon.com Male 212.43.106.70 China 9/19/1993 42985.78 Analog Circuit Design manager -1454461314 421 Timothy Gomez tgomezbo@examiner.com Male 33.5.250.113 373343849259778 Czech Republic 215485.48 -1454461350 944 Kelly Hanson khansonq7@phpbb.com 250.78.86.48 United States 1/2/1969 \N Account Executive -1454461510 985 Rachel Holmes rholmesrc@hubpages.com Female 182.16.233.193 3578965006812598 Nigeria 4/1/1980 273229.15 Assistant Professor -1454461537 400 Arthur Smith asmithb3@accuweather.com Male 107.97.38.111 5602233710304252 China 1/30/1985 114652.62 Mechanical Systems Engineer -1454461604 993 Christina Hayes chayesrk@xing.com Female 199.58.20.93 North Korea 10/30/1967 121659.5 Librarian -1454461701 67 Tina Reid treid1u@163.com Female 116.38.145.226 Germany 4/25/1967 228301.51 Financial Analyst -1454461723 708 Carlos Mason cmasonjn@state.tx.us Male 171.189.25.159 5402971302511824 Thailand 4/8/1965 163810.9 Business Systems Development Analyst -1454461756 816 Sara Sanders ssandersmn@cornell.edu Female 54.250.225.134 Netherlands 7/26/1998 261953.95 Quality Engineer -1454461763 299 Diane Watkins dwatkins8a@netvibes.com 141.246.209.93 Yemen \N -1454461897 976 Paula Ross prossr3@tumblr.com 39.229.193.40 3535447138661799 Jordan 8/19/1990 \N Budget/Accounting Analyst IV -1454461902 56 Thomas Freeman tfreeman1j@java.com Male 161.123.216.250 3536920916224146 Colombia 8/4/1973 239571.27 Senior Developer -=== Try load data from userdata3.parquet -1454457607 457 Clarence Hunt chuntco@drupal.org Male 89.135.47.216 Zambia 9/27/1977 97179.31 Staff Accountant III 1E02 -1454457613 723 Arthur Jones ajonesk2@theguardian.com Male 31.151.216.146 France 2/6/1986 
12068.96 Teacher -1454457706 234 Doris Grant dgrant6h@nasa.gov Female 195.132.180.36 5602256096038525 Colombia 7/14/1969 283813.79 Senior Cost Accountant -1454457712 14 Eric Parker eparkerd@usa.gov Male 25.73.91.135 5602249431899032 Russia 8/12/1986 102832.54 Tax Accountant -1454457781 846 Sharon Porter sporternh@yelp.com Female 206.179.138.50 6706029727013149 Colombia 7/3/1966 175902.84 Project Manager -1454457884 637 Frank Hudson fhudsonho@walmart.com Male 52.37.91.110 4405081678166102 China 2/7/1997 126102.31 Senior Developer -1454457968 134 Teresa Gray tgray3p@ox.ac.uk Female 60.117.57.222 China 9/18/1994 159276.6 Assistant Media Planner -1454458022 549 Aaron Reid areidf8@topsy.com Male 117.148.230.113 Russia 3/25/1983 211580.8 Product Engineer -1454458079 156 Ann Morris amorris4b@newyorker.com Female 14.165.90.97 3553147941910493 Indonesia 6/4/1956 158396.75 Engineer I -1454458121 794 Joshua Flores jfloresm1@sphinn.com Male 84.212.10.197 3587575297567030 China 2/9/1989 267751.84 Developer III -1454458182 604 Steve Castillo scastillogr@ezinearticles.com Male 159.158.95.181 3545937730645529 China 6/8/1993 86028 Programmer III -1454458233 64 Rose Fernandez rfernandez1r@usgs.gov Female 199.141.221.229 3564435193511524 Brazil 5/5/1972 196329.18 Senior Cost Accountant -1454458334 79 Robin Price rprice26@jugem.jp Female 235.141.108.176 5610389618618837 Russia 1/7/1977 120293.75 Biostatistician IV -1454458337 86 West Female 247.72.186.254 3541609903446548 Indonesia 12/11/1984 132544.98 Physical Therapy Assistant -1454458375 939 Craig Jones cjonesq2@de.vu Male 154.208.206.255 Indonesia 1/29/1989 266312.01 Safety Technician II -1454458415 805 George Meyer gmeyermc@google.nl Male 146.59.222.51 Syria 5/28/1973 242409.4 Analog Circuit Design manager -1454458434 914 Earl Martinez emartinezpd@squidoo.com Male 150.29.51.94 677135530260451546 Philippines 10/25/1970 257708.77 Software Engineer II 1E+02 -1454458516 5 Jacqueline Ellis jellis4@amazon.com Female 158.137.238.6 
Russia 7/12/1959 286038.78 Marketing Assistant -1454458597 371 Heather Fisher hfisheraa@printfriendly.com Female 190.23.234.91 6304245587473860 Portugal 4/24/1955 101118.28 Associate Professor -1454458619 680 Mildred Dean mdeaniv@alibaba.com Female 173.255.221.184 3576992005749797 Armenia 4/3/1979 78889.63 Desktop Support Technician "__ロ( -1454458806 695 Ashley Olson aolsonja@noaa.gov Female 233.175.155.3 376319939588935 Indonesia 5/8/1979 256795.8 Systems Administrator III -1454458825 946 Beverly Henderson bhendersonq9@amazon.com Female 96.37.213.162 3554635936579520 Russia 8/4/1979 65339.1 VP Marketing -1454458897 881 Collins Male 100.212.189.244 3531552235272517 South Korea 7/5/1981 72539.92 VP Sales -1454458915 332 Raymond Ward rward97@drupal.org Male 89.82.25.71 3538744508795034 South Africa 5/4/1994 163739.08 Data Coordiator -1454458981 216 Judy Gutierrez jgutierrez5z@ftc.gov Female 120.107.239.171 China 11/13/1965 36744.51 Statistician I 🐵 🙈 🙉 🙊 -1454458994 539 Donald Holmes dholmesey@examiner.com Male 24.129.145.78 3532611982139532 Czech Republic 11/7/1988 256744.28 Administrative Assistant I -1454459168 721 Christopher Hunt chuntk0@blogtalkradio.com Male 69.240.85.94 201463274401428 Indonesia 6/8/1968 32269.1 Data Coordiator -1454459172 733 Bonnie Hawkins bhawkinskc@vinaora.com Female 150.107.139.217 5010121004388204 China 8/28/1971 133958.72 Information Systems Manager -1454459204 768 Victor Nichols vnicholslb@blogs.com Male 231.113.119.58 3587933684998468 France 13777.53 -1454459243 803 Donald Wood dwoodma@parallels.com Male 212.8.149.51 67610717455795070 Mexico 6/22/1971 20752.43 Chief Design Engineer Œ„´‰ˇÁ¨ˆØ∏”’ -1454459252 752 Mark Gomez mgomezkv@hud.gov Male 116.39.31.225 337941154145279 Indonesia 1/12/1965 232731.06 Professor -1454459281 282 Jason Kelly jkelly7t@themeforest.net Male 129.110.129.46 3532753335256769 Botswana 122812.35 -1454459307 681 Carlos Fields cfieldsiw@trellian.com Male 253.69.168.229 3573119954905542 Japan 121346.35 -1454459430 
892 Gloria Fowler gfowleror@apache.org Female 31.26.133.176 5602245069101311 Jamaica 5/31/1962 172923.11 Desktop Support Technician -1E+02 -1454459462 728 Jacqueline Porter jporterk7@example.com Female 183.189.204.28 3558636209028613 China 2/18/1966 60948.17 VP Marketing -1454459482 847 Brenda Hall bhallni@craigslist.org Female 239.232.28.195 Sweden 12/5/1962 14658.92 Senior Quality Engineer -1454459511 512 Phyllis Rice pricee7@t-online.de Female 141.247.60.33 4041591621552 China 3/9/1992 74670.8 Web Developer I -1454459535 331 Patrick White pwhite96@sina.com.cn Male 145.132.114.239 3534146356970178 Ukraine 1/19/1994 96246.01 Executive Secretary -1454459549 611 Elizabeth Day edaygy@archive.org Female 244.129.35.183 4903539550370988748 China 6/28/1974 217382.97 Paralegal 𠜎𠜱𠝹𠱓𠱸𠲖𠳏 -1454459623 424 Lillian Vasquez lvasquezbr@about.me Female 15.233.130.74 6706936038940735306 Netherlands 6/28/2000 256419.66 Account Representative I -1454459691 579 Irene Day idayg2@theglobeandmail.com Female 124.253.55.20 3564632724049897 Argentina 9/3/1974 58715.23 Teacher -1454459729 362 Melissa Stephens mstephensa1@comsenz.com Female 105.158.98.174 3534057744078246 Philippines 1/22/1974 210781.96 Cost Accountant ᠎ -1454459735 103 Justin Grant jgrant2u@lycos.com Male 251.111.132.81 3542141314461899 China 1/7/2001 140911.2 Project Manager -1454459793 662 Jesse Gonzales jgonzalesid@google.fr Male 215.192.238.90 3550826252709387 Peru 7/22/1978 260505.75 Environmental Specialist -1454459819 866 Andrea Carpenter acarpentero1@taobao.com Female 246.154.31.121 Japan 3/6/1984 248740.81 Senior Quality Engineer -1454459841 923 Marilyn Long mlongpm@walmart.com Female 215.6.99.179 5602241011840536 Cameroon 10/28/1964 110571.54 Social Worker -1454459858 560 Judy Wright jwrightfj@blogs.com 7.139.209.42 560222806370845260 Colombia 3/6/1961 \N Software Test Engineer IV -1454459862 244 Diane Hawkins dhawkins6r@hatena.ne.jp Female 90.247.138.242 4026763155071942 China 5/10/1968 171218.47 Help Desk Operator 
-1454459921 639 Gloria Fields gfieldshq@mlb.com Female 76.62.183.159 6334660493144630501 Peru 5/7/1996 210991.41 Accounting Assistant II -1454459945 193 Catherine Rivera crivera5c@liveinternet.ru Female 197.164.37.102 4903900636714991 China 10/17/1984 240545.5 Cost Accountant -1454459958 186 Larry Coleman lcoleman55@imdb.com Male 139.205.254.237 3549906950974212 Germany 12/19/1958 182376.29 Compensation Analyst -1454459959 195 Andrew Henderson ahenderson5e@ftc.gov Male 44.116.118.204 United States 5/27/1977 108242.9 Accountant I -1454460044 743 Mildred Clark mclarkkm@issuu.com Female 179.135.234.32 3589587359210761 Philippines 268426 -1E+02 -1454460050 189 Samuel Fox sfox58@bing.com Male 220.161.213.119 3535192418612498 Argentina 9/2/1991 56084.78 Marketing Assistant -1454460053 209 Anne Flores aflores5s@marketwatch.com Female 8.136.212.14 Canada 6/17/1964 195673.07 Occupational Therapist -1454460230 956 John Baker jbakerqj@exblog.jp Male 96.167.232.236 Spain 9/29/1992 177531.95 Sales Representative -1454460278 683 Paula Johnston pjohnstoniy@marketwatch.com Female 246.57.43.147 560221588257454843 Mongolia 10/20/1978 227145.54 Administrative Officer -1454460325 341 Samuel Jordan sjordan9g@jimdo.com Male 183.29.32.119 3535569167756420 China 3/29/1975 130541.17 Safety Technician IV -1454460330 654 Michael Sims msimsi5@discuz.net Male 169.136.209.75 Bulgaria 6/14/1982 277854.98 Recruiting Manager -1454460342 814 Deborah Hudson dhudsonml@parallels.com 186.205.3.210 Ukraine 11/3/2000 \N Marketing Manager -1454460373 813 Mildred Harris mharrismk@vistaprint.com Female 250.65.167.151 3577530968521354 Greece 238399.8 -1454460382 624 Wayne Henry whenryhb@dedecms.com Male 173.2.93.236 China 147631.62 -1454460446 1000 Wanda Brooks wbrooksrr@yellowpages.com Female 241.43.62.149 3539260761630759 Japan 158607.84 -1454460471 685 Joe Rivera jriveraj0@ebay.com Male 101.130.15.106 4903855508114581 Thailand 74067.89 -1454460482 330 Robin Campbell rcampbell95@stanford.edu Female 
144.152.165.130 4662544509352 Sierra Leone 4/9/1969 64481.72 Quality Engineer -1454460569 67 Eric Armstrong earmstrong1u@arizona.edu Male 128.202.252.112 4041590574307 Indonesia 5/30/1973 75347.18 Web Designer II -1454460587 37 Frank Stevens fstevens10@samsung.com Male 61.182.84.178 Philippines 3/19/1958 47326.14 VP Product Management -1454460615 42 Carlos Armstrong carmstrong15@technorati.com Male 85.22.216.153 3532000356234436 Indonesia 23446.58 -1454460668 556 Lisa Turner lturnerff@ustream.tv Female 192.4.71.81 3579076936527626 China 127717.62 -1454460696 958 Howard Gomez hgomezql@people.com.cn 226.78.136.12 6706662408386172373 Philippines \N test⁠test‫ -1454460697 959 Kimberly Alvarez kalvarezqm@gizmodo.com Female 244.177.51.246 30135810163038 Philippines 8/5/1976 211292 Design Engineer -1454460701 612 Dorothy Hanson dhansongz@i2i.jp Female 165.73.75.69 Azerbaijan 9/5/1971 246728.41 Information Systems Manager -1454460759 126 Amy Roberts aroberts3h@dyndns.org Female 166.99.225.202 Costa Rica 273960.79 𠜎𠜱𠝹𠱓𠱸𠲖𠳏 -1454460768 822 Jane Tucker jtuckermt@arizona.edu Female 43.88.112.223 Sweden 55680.59 -1454460788 41 Joyce Butler jbutler14@csmonitor.com Female 88.243.175.236 Indonesia 135825.27 -1454460812 496 Jesse Cole jcoledr@sogou.com Male 106.227.88.115 50184107778776571 Peru 6/2/1965 205296.96 Actuary -1454460898 516 Wayne Carter wcartereb@g.co Male 151.122.136.210 3547971451281253 Portugal 1/22/1992 122139.24 Cost Accountant -1454460912 571 Joan Chavez jchavezfu@com.com Female 17.161.255.139 Poland 10/16/1972 277679.98 Safety Technician I -1454460930 166 Pamela Perkins pperkins4l@wsj.com Female 237.225.95.141 378608444146629 China 141169.54 -1454460959 128 Wayne Kim wkim3j@cdc.gov 196.5.87.192 5007668319479461 Malaysia 1/27/1979 \N Internal Auditor -1454460980 465 Julie Phillips jphillipscw@ning.com Female 186.219.160.248 5602251286921119 Spain 6/10/1976 120755.68 Marketing Manager /dev/null; touch /tmp/blns.fail ; echo -1454460991 144 Martha Martin 
mmartin3z@sakura.ne.jp Female 220.126.107.146 201779098970730 New Zealand 5/23/1985 88724.94 Administrative Officer -1454461001 874 Laura Wells lwellso9@mit.edu Female 135.67.140.204 5482317399663099 Sweden 12/4/1993 262303.96 Environmental Tech -1454461065 833 Lois Lee lleen4@zdnet.com Female 31.87.204.102 5602245033844400 Bulgaria 113425.72 -1454461292 575 Jessica Watkins jwatkinsfy@marketwatch.com Female 165.50.211.193 201566979007298 Macedonia 7/12/1989 253506.67 Food Chemist -1454461361 184 Clarence Moore cmoore53@bloglines.com Male 212.30.218.42 Indonesia 6/16/1974 283539.78 Internal Auditor -1454461642 406 Frances Ray frayb9@theguardian.com Female 24.12.13.133 3555958533555779 Colombia 9/19/2000 282052.82 Staff Accountant III -1454461847 446 Helen Ward hwardcd@indiegogo.com Female 249.175.182.167 3550054667502541 Colombia 2/15/1959 115934.54 Graphic Designer -1454461863 101 Irene Adams iadams2s@biblegateway.com Female 135.79.211.166 Palestinian Territory 7/29/1994 73723.8 Help Desk Technician 00˙Ɩ$- -1454461907 99 Ruth Howell rhowell2q@cornell.edu Female 190.170.191.14 China 5/2/1969 286113.38 Senior Quality Engineer -1454461978 340 Gloria Wilson gwilson9f@soup.io Female 116.58.188.151 3539542269827494 Croatia 206401.2 -1454462106 132 Amanda Porter aporter3n@cloudflare.com Female 64.254.17.111 Brazil 7/26/1964 41956.4 Nurse -1454462425 102 Ralph Walker rwalker2t@sitemeter.com 101.111.216.188 Peru 4/15/1959 \N VP Accounting -1454462469 188 Christine Rodriguez crodriguez57@sciencedaily.com 240.122.189.81 6397046163164230 China 12/13/1998 \N Sales Representative -1454462692 106 Cynthia Vasquez cvasquez2x@washingtonpost.com Female 70.52.238.194 Kazakhstan 175907.62 1E+02 -1454462763 121 Heather Davis hdavis3c@hhs.gov Female 154.156.181.140 Poland 71140.46 -1454462944 704 Patrick Torres ptorresjj@ask.com Male 122.10.211.188 5602254083107544 Russia 10/28/1995 119841.99 Environmental Tech -1454463056 718 Tammy Simpson tsimpsonjx@imdb.com Female 28.114.238.250 
5602250512089980 Russia 4/30/1987 240161.08 Human Resources Manager -1/2 -1454463110 548 Ryan 48.44.183.147 Russia 12/7/1999 \N Recruiting Manager -1454463111 206 Jeremy Boyd jboyd5p@sciencedirect.com Male 190.221.209.41 Mexico 8/17/1963 169562.93 Legal Assistant $1.00 -=== Try load data from userdata4.parquet -1454544135 174 Arthur Bishop abishop4t@deliciousdays.com Male 23.143.216.45 3543731590226021 Portugal 74352.02 -1454544166 397 Adam Harrison aharrisonb0@symantec.com Male 24.23.251.104 30250631299455 United States 10/14/1976 220537.78 Systems Administrator IV -1454544275 676 Julia Turner jturnerir@tripadvisor.com Female 246.75.105.64 3573355428855000 Philippines 9/23/1975 43244.37 Engineer I -1454544290 694 Carol Griffin cgriffinj9@zimbio.com 4.106.189.110 Philippines 5/5/1958 \N Quality Engineer -1454544350 790 Michael Mitchell mmitchelllx@blog.com Male 142.112.74.125 China 74089.46 -1454544355 372 Brandon Hicks bhicksab@unicef.org Male 14.1.141.83 564182403737341280 China 10/4/1985 62678.54 Sales Representative -1454544427 582 Annie Spencer aspencerg5@gizmodo.com Female 193.135.127.103 Philippines 7/29/1965 32342.28 Cost Accountant -1454544628 802 Lois Gibson lgibsonm9@mayoclinic.com Female 226.250.177.108 5610916546870112 Thailand 5/16/1955 149273.02 Occupational Therapist -1454544647 382 Paul Sanders psandersal@photobucket.com Male 216.84.37.205 6385564398040268 Sweden 6/9/1980 240223.98 Mechanical Systems Engineer 1 -1454544648 364 Jason Fox jfoxa3@unesco.org Male 184.48.48.126 Japan 8/9/1976 84483.3 Mechanical Systems Engineer -1454544719 716 Diana Little dlittlejv@shop-pro.jp Female 168.15.235.95 Argentina 267712.23 -1454544765 766 Lisa Harper lharperl9@boston.com Female 26.253.184.166 4903454632131201206 China 9/30/1986 177862.14 Analog Circuit Design manager -1454544797 471 Linda Arnold larnoldd2@yellowbook.com Female 25.72.220.19 3573669257084239 Indonesia 2/6/1983 249094.03 GIS Technical Architect " -1454544833 508 Andrea Alvarez 
aalvareze3@amazon.co.uk Female 94.93.141.212 Indonesia 165484.69   -1454544883 991 Mary Willis mwillisri@i2i.jp Female 188.83.241.84 Russia 9/4/1992 133498.3 Payment Adjustment Coordinator -1454544907 137 Harry Thomas hthomas3s@edublogs.org Male 203.181.156.216 3586074069338235 Poland 6/6/1979 159098.74 Chemical Engineer -1454545008 824 Jack Hudson jhudsonmv@hp.com Male 195.27.62.30 Ukraine 9/19/1970 163426.27 Community Outreach Specialist -1454545044 173 Ruth Welch rwelch4s@spotify.com Female 7.253.134.135 3543426983427878 Japan 8/6/1964 203330.7 Paralegal -1454545053 225 Judy Greene jgreene68@discovery.com 246.203.234.47 589310636256482728 Dominica \N -1454545135 948 Janet Lawson jlawsonqb@indiatimes.com Female 90.48.142.31 4026186827051821 Philippines 197991.65 -1454545185 757 James Pierce jpiercel0@meetup.com Male 14.116.62.43 5018717793434778 Greece 12/25/1989 17173.34 Assistant Manager -1454545221 995 Philip Mcdonald pmcdonaldrm@tripadvisor.com Male 224.59.55.103 5108753554344402 France 4/22/1955 59331.14 Recruiting Manager -1454545227 6 William Williamson wwilliamson5@trellian.com Male 44.86.73.201 201849487683564 Indonesia 12/6/1993 95352.25 Librarian 1E+02 -1454545361 770 Gregory Henderson ghendersonld@issuu.com Male 233.65.87.175 Philippines 79047.27 -1454545379 713 Ruth Barnes rbarnesjs@google.it Female 29.37.239.173 56108753791531632 Sweden 8/23/1965 268965.5 Occupational Therapist -1454545666 430 Stephen Knight sknightbx@so-net.ne.jp Male 233.213.210.160 China 7/7/1969 183842.12 Quality Control Specialist -1454545825 470 Carl Freeman cfreemand1@de.vu Male 40.13.20.8 5002357075956137 Armenia 1/6/1984 140264.63 Accountant III -1454545841 736 Ashley Black ablackkf@freewebs.com Female 130.87.75.86 30046346841197 China 5/8/1991 263407.66 Senior Developer -1454545876 36 Earl Mccoy emccoyz@bigcartel.com Male 161.179.122.154 5038877150819047588 Japan 10/12/1976 114766.43 Software Test Engineer IV 0.00 -1454545911 981 Martin Hudson mhudsonr8@senate.gov Male 
103.7.125.212 3580063273741488 Azerbaijan 55371.91 -1454545934 728 Brandon Oliver boliverk7@tuttocitta.it Male 190.202.45.71 3561315827587251 Norway 10/31/1960 157819.05 Structural Engineer -1454545941 224 Julia Lane jlane67@networksolutions.com Female 126.98.58.100 3566544839563357 Brazil 9/24/1975 77279.09 Business Systems Development Analyst -1454545957 112 Mildred Martinez mmartinez33@wufoo.com 206.47.25.150 Brazil \N -1454546022 147 John Henry jhenry42@google.nl Male 175.38.124.31 3534881822199867 China 7/7/1959 180821.73 Engineer I "<>?:""{}|_+" -1454546057 132 Rose Evans revans3n@hubpages.com Female 18.134.14.151 6767390430172490489 United States 2/11/1977 109352.69 Automation Specialist II -1454546075 162 Nancy Sanchez nsanchez4h@yahoo.com Female 180.250.167.88 Malawi 5/12/1956 280050.1 Health Coach III -1454546122 913 Lisa Oliver loliverpc@nydailynews.com Female 153.239.15.222 201665522335840 Sweden 1/28/1957 180645.76 Marketing Assistant () { 0; }; touch /tmp/blns.shellshock1.fail; -1454546163 55 Nancy Stephens nstephens1i@godaddy.com Female 211.0.225.116 Mongolia 20805.69 -1454546263 498 Lillian Lynch llynchdt@posterous.com Female 13.168.64.88 Brazil 6/18/1982 203558.13 Accountant I -1454546287 934 Mark Dunn mdunnpx@booking.com Male 77.125.49.164 Indonesia 7/2/1990 120101.43 Financial Advisor -1454546293 748 Carol Perry cperrykr@cmu.edu Female 113.54.30.174 675928304974727871 Colombia 122048.92 -1454546294 830 Catherine Rice cricen1@hexun.com Female 134.65.177.193 Portugal 100751.27 -1454546294 924 Jimmy Nelson jnelsonpn@rediff.com Male 244.130.194.232 Norway 259092.5 -1454546377 431 Pamela Ruiz pruizby@java.com 42.71.124.95 Pakistan 9/15/1976 \N Software Engineer I -1454546405 158 Melissa Alexander malexander4d@google.pl Female 186.71.215.96 Greece 5/7/1972 180150.8 VP Marketing -1454546406 94 Debra Sims dsims2l@meetup.com Female 150.198.93.159 5602215295621929 Brazil 12/21/1984 276704.96 Office Assistant IV -1454546500 909 Cynthia Smith 
csmithp8@house.gov Female 166.21.108.146 374622628177056 China 9/30/1974 252566.03 Physical Therapy Assistant -1454546508 599 John Lewis jlewisgm@youtube.com Male 90.227.58.221 Sweden 5/16/1970 58222.46 Software Engineer II -1454546576 617 Jonathan Hall jhallh4@upenn.edu Male 12.13.126.157 491109978928388311 China 5/16/1986 50824.51 GIS Technical Architect -1454546653 227 Helen Green hgreen6a@vimeo.com Female 156.198.175.255 5048379124161648 Uganda 10/20/2000 163189.36 Computer Systems Analyst III -1454546690 349 David Washington dwashington9o@un.org Male 131.53.93.63 3578517361666653 Greece 10/16/1998 34742.07 Staff Accountant IV -1454546703 474 Betty Cook bcookd5@admin.ch Female 23.9.243.170 China 5/16/1962 151829.78 Budget/Accounting Analyst I -1454546726 767 Philip Burton pburtonla@zimbio.com Male 138.134.59.28 3528288812489043 Russia 5/27/1983 241065.94 Software Engineer III -1454546741 904 Debra Wilson dwilsonp3@desdev.cn Female 254.162.119.226 630461807132739339 Poland 4/20/1969 107766.71 Financial Analyst -1454546820 480 Todd Wagner twagnerdb@reuters.com 25.149.209.61 3560449524302754 Tunisia 8/31/1983 \N Research Associate -1454546835 62 Donna Gonzalez dgonzalez1p@instagram.com Female 81.57.136.186 China 3/3/1975 181562.45 Junior Executive -1454546857 666 Anthony Sullivan asullivanih@boston.com Male 119.85.206.152 561007482254370160 Portugal 5/20/1970 164827.57 Systems Administrator IV -1454546901 366 Julie Garrett jgarretta5@wsj.com Female 40.18.147.38 China 225753.62 -1454546930 808 Russell Freeman rfreemanmf@comcast.net Male 244.181.177.133 30295400628590 Greece 173731.67 -1454546970 739 Nicholas Sanders nsanderski@scientificamerican.com Male 13.8.6.64 347899819407351 Portugal 6/3/1991 130727.91 Research Associate -1454547011 198 Timothy Ford tford5h@vk.com Male 3.35.147.123 5602236379905962 Morocco 4/27/1998 55901.49 Paralegal -1454547029 644 Jean Cole jcolehv@mac.com Female 5.188.221.124 Comoros 7/24/1985 215195.83 Civil Engineer -1454547030 916 
Andrew Campbell acampbellpf@nymag.com Male 172.206.158.110 Guatemala 8/12/1962 33394.2 Financial Analyst -1454547032 264 Charles James cjames7b@wordpress.org Male 40.115.241.175 6761364619849686314 Canada 9/21/1958 227083.18 Professor -1454547070 727 Louise Castillo lcastillok6@cmu.edu Female 54.15.177.72 3586380225985649 France 3/22/1978 17830.21 Nurse -1454547124 168 Christopher Hughes chughes4n@businessinsider.com Male 23.110.32.151 6304281728252855 Serbia 12/9/1975 220573.8 Design Engineer 999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999 -1454547192 232 Angela Evans aevans6f@a8.net Female 115.244.254.13 6333718316396730 China 1/7/1968 265380.99 VP Quality Control -1454547201 464 Marie Harris mharriscv@dot.gov Female 26.45.137.53 Tajikistan 9/24/1961 203845.4 Analog Circuit Design manager -1454547203 427 Rebecca Thompson rthompsonbu@wikipedia.org Female 110.47.151.2 Indonesia 4/29/1992 216830.25 Assistant Manager -1454547212 575 Arthur Reyes areyesfy@ca.gov Male 161.254.47.140 Poland 3/9/1962 214072.68 Health Coach I -1454547223 564 Rebecca Ford rfordfn@stanford.edu Female 210.231.201.84 Indonesia 9/3/1969 204041.63 Office Assistant II ÅÍÎÏ˝ÓÔÒÚÆ☃ -1454547242 997 William Patterson wpattersonro@omniture.com Male 149.242.140.255 3528460022712031 Colombia 3/1/2000 108955.05 Executive Secretary ✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿 -1454547281 463 Gerald Knight gknightcu@independent.co.uk Male 34.192.129.107 China 4/27/1975 84585.78 Civil Engineer ⁰⁴⁵ -1454547356 276 Albert Gordon agordon7n@examiner.com Male 88.159.237.102 3534524682255003 Sweden 8/25/1996 265299.22 Assistant Media Planner -1454547362 317 Clarence Simpson csimpson8s@comsenz.com Male 104.53.119.249 3586887721906879 Venezuela 3/7/1977 35314.18 Professor -1454547369 217 Anthony Jacobs ajacobs60@ycombinator.com Male 59.162.173.59 374283051163301 Ivory Coast 7/11/1988 103409 Safety Technician II (。◕ ∀ ◕。) -1454547401 230 Jimmy Bailey jbailey6d@odnoklassniki.ru Male 
22.173.156.124 3576503167968271 China 197603.47 $1.00 -1454547508 960 Craig Shaw cshawqn@wordpress.org Male 88.203.243.165 5602229798654196 Tanzania 8/5/1999 119584.32 Senior Sales Associate -1454547541 585 Bonnie Snyder bsnyderg8@ftc.gov Female 170.100.220.94 3564602303009802 Japan 6/5/1998 89020.39 Desktop Support Technician -1454547577 871 Gloria Howard ghowardo6@harvard.edu Female 173.45.99.88 Egypt 8/27/1972 140945.69 Human Resources Assistant I -1454547609 878 Kathryn Snyder ksnyderod@e-recht24.de Female 235.195.131.110 6761199763991532 Indonesia 3/29/1973 168235 GIS Technical Architect -1454547697 473 Joseph Coleman jcolemand4@ucoz.ru Male 1.40.64.123 4508104337648496 Argentina 6/14/1975 167526.19 Librarian /dev/null; touch /tmp/blns.fail ; echo -1454547707 206 Shirley Ruiz sruiz5p@dagondesign.com Female 159.102.238.195 201955789975119 Bosnia and Herzegovina 10/8/1963 197240.2 General Manager -1454548013 705 Alan Sims asimsjk@ed.gov Male 180.200.150.10 3531118427209962 Israel 12/8/1982 269504.53 Biostatistician III -1454548122 108 Craig Knight cknight2z@ucsd.edu Male 139.37.241.169 3556934424099549 Greece 2/21/1955 247303.71 Senior Financial Analyst Ω≈ç√∫˜µ≤≥÷ -1454548170 611 Steve Ford sfordgy@hubpages.com Male 190.25.153.64 56022386492755060 China 6/7/1979 39645.72 Health Coach IV ̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕ -1454548319 40 Joan Price jprice13@mtv.com 233.4.158.135 3584182571037112 Portugal \N -1454548438 618 Jeremy Roberts jrobertsh5@go.com Male 89.14.246.154 Russia 7/31/1989 273400 Research Assistant II -1454548507 314 Dixon Male 93.252.91.51 670677121929947139 Ireland 209533.24 -1454548522 522 Eric Kelley ekelleyeh@pcworld.com Male 131.75.70.227 Syria 7/22/1990 163141.3 General Manager "__ロ( -1454548725 133 Lillian Collins lcollins3o@csmonitor.com 80.80.47.76 4175009027155995 Czech Republic \N -1454549109 306 Mark Boyd 
mboyd8h@cocolog-nifty.com Male 158.13.1.119 3562815747212335 Brazil 2/15/1967 66134.2 Social Worker -1454549131 371 Carl Knight cknightaa@unc.edu Male 64.176.41.31 Macedonia 6/4/1973 116193.06 Environmental Specialist -1454549158 346 Kathryn Butler kbutler9l@washingtonpost.com 32.220.87.246 374288729624402 China 11/24/1972 \N Staff Accountant II -1454549169 59 John Rogers jrogers1m@miitbeian.gov.cn Male 91.131.170.178 3578552255653202 Croatia 9/25/1971 164207.53 Administrative Assistant III -1454549202 304 Billy Howard bhoward8f@geocities.com Male 101.47.248.109 3561004867229459 Ireland 2/23/1963 147308.45 Software Test Engineer II -1454549230 702 Patricia Oliver poliverjh@cmu.edu 18.206.245.40 Ireland \N ÅÍÎÏ˝ÓÔÒÚÆ☃ -1454549233 179 Christine Duncan cduncan4y@furl.net 49.36.119.18 30544573199206 China 8/15/2000 \N Mechanical Systems Engineer -1454549243 72 Walter Hill whill1z@fda.gov Male 169.189.26.193 Philippines 4/25/1989 170789.26 Executive Secretary -1454549360 862 Joseph Patterson jpattersonnx@google.it Male 79.70.102.172 3548682692624495 Argentina 87931.98 -=== Try load data from userdata5.parquet -1454544139 20 Debra White dwhitej@umn.edu Female 142.140.184.111 Indonesia 47859.54 -1454544152 53 Ralph Simmons rsimmons1g@google.cn Male 180.159.250.232 3554040768947822 Pakistan 111413.03 -1454544187 564 Christine Willis cwillisfn@pagesperso-orange.fr Female 166.102.221.213 3534808021291708 Russia 8/3/1991 112850.81 Desktop Support Technician -1454544201 40 Christine Alexander calexander13@aboutads.info Female 163.32.3.92 50183677518131890 China 1/14/1981 213713.99 Sales Associate -1454544213 992 Anna Dean adeanrj@netvibes.com Female 113.127.227.85 3586135192218451 Vietnam 5/29/1962 286181.88 Automation Specialist II -1454544238 601 Aaron Kim akimgo@mayoclinic.com Male 182.52.179.175 3587685548758112 Kazakhstan 11/6/1963 156217.14 Accounting Assistant I -1454544284 903 John Harris jharrisp2@goo.ne.jp Male 65.10.215.144 3565387100757980 China 6/7/1970 153671.44 
Analog Circuit Design manager -1454544326 325 Billy Meyer bmeyer90@nature.com Male 163.186.10.162 3538589516492193 Colombia 7/20/1983 84716.67 Assistant Professor -1454544328 746 Christine Howell chowellkp@php.net Female 71.95.250.29 5100170292026399 China 1/11/1964 30533.25 Account Executive -1454544347 353 Alan Collins acollins9s@cpanel.net Male 16.99.94.145 3536005999242155 Guatemala 6/1/1980 38434.4 Software Test Engineer II ・( ̄∀ ̄)・:*: -1454544495 879 Marie Vasquez mvasquezoe@is.gd Female 101.194.66.108 3563730358790256 China 9/21/1958 12182.09 Nurse -1454544507 912 Evelyn Fisher efisherpb@soup.io Female 221.207.200.158 201473318880354 China 5/17/1998 208654.68 Geological Engineer -1454544523 923 Jessica George jgeorgepm@so-net.ne.jp Female 119.65.145.55 Russia 6/22/1965 73210.79 Nurse -1454544624 96 Brandon Owens bowens2n@si.edu Male 5.39.151.46 4591258400528650 France 3/13/1998 74028.68 Software Engineer III -1454544685 617 Judith Bishop jbishoph4@weibo.com Female 50.167.35.101 3536263290947101 Taiwan 147732.13 (。◕ ∀ ◕。) -1454544809 89 Carolyn Gutierrez cgutierrez2g@smh.com.au Female 109.77.234.103 Madagascar 2/13/1999 139612.73 Nurse -1454544817 929 Harold Tucker htuckerps@stanford.edu Male 243.182.109.135 374622077056546 China 161472.14 -1454544819 590 Irene Larson ilarsongd@addthis.com Female 67.196.118.250 Syria 8/11/1969 222598.25 Business Systems Development Analyst  -1454544888 577 Frances Day fdayg0@ox.ac.uk Female 54.131.119.123 3534463936023182 Portugal 11/15/1969 206386.03 Environmental Specialist -1454544926 908 Bruce Banks bbanksp7@ifeng.com Male 3.58.102.49 560224852697998794 Indonesia 1/18/1983 146835.33 Professor -1454544936 951 Carolyn Lewis clewisqe@blogger.com Female 154.230.220.164 5469666950681032 Uruguay 11/25/1955 119686.8 Help Desk Technician -1454545198 56 Jane Murray jmurray1j@apache.org Female 174.82.82.71 5100149053428994 China 7/15/1973 57832.83 Software Consultant -1454545225 439 Keith Cook kcookc6@usa.gov Male 22.162.180.159 
Poland 146503.61 -1454545268 358 Todd Meyer tmeyer9x@huffingtonpost.com Male 183.45.201.202 5593314243312813 China 7/31/1987 115187.5 Paralegal -1454545307 204 Lillian Long llong5n@skype.com Female 146.238.55.254 5641820612278798844 Czech Republic 6/18/1999 150598.38 Human Resources Assistant IV -1454545319 409 Doris Bishop dbishopbc@spotify.com Female 199.116.182.20 3575820879808061 Canada 11/29/1964 169913.1 Geological Engineer -1454545330 559 Eric West ewestfi@mapquest.com Male 229.67.66.9 3584340222063867 Italy 8/31/1998 59102.31 General Manager 1E2 -1454545330 702 Dennis Kelly dkellyjh@cargocollective.com Male 159.10.27.86 3586421938986530 China 3/22/1982 260296.17 Desktop Support Technician -1454545334 371 Gerald Russell grussellaa@last.fm 174.119.43.205 3545489024436298 Bahrain 12/21/2000 \N Senior Cost Accountant -1454545338 369 Judy Perez jpereza8@gmpg.org Female 109.68.19.234 5249772984361935 Philippines 7/9/1989 257973.8 Sales Associate -1454545351 178 Melissa Thomas mthomas4x@mysql.com Female 192.210.201.207 5562824139318432 Equatorial Guinea 8/17/1965 267092.73 Junior Executive -1454545414 831 Arthur Hill ahilln2@usnews.com Male 231.181.126.173 5602223371820245193 Colombia 3/25/1993 247436.07 Mechanical Systems Engineer -1454545426 170 Anne Oliver aoliver4p@jimdo.com Female 205.100.30.244 3530095445603833 Indonesia 3/31/1970 232499.96 Software Test Engineer III -1454545483 48 Frances Willis fwillis1b@linkedin.com 102.186.57.75 4175001067968122 Philippines 8/3/1998 \N VP Marketing -1454545502 478 Joshua Harrison jharrisond9@noaa.gov Male 231.249.108.195 30492555718355 Japan 11/24/1971 143815.22 Clinical Specialist -1454545547 98 Timothy Boyd tboyd2p@imdb.com Male 211.20.45.168 5602253132446507 Peru 7/8/1976 127883.56 Data Coordiator -1454545556 148 Powell Female 77.50.112.73 5303311226469439 China 175168.8 -1454545565 998 Louis Lee lleerp@thetimes.co.uk Male 8.88.141.81 Russia 11/20/1982 13134.47 Office Assistant IV -1454545585 116 Lisa James 
ljames37@walmart.com 149.162.35.129 Sweden 3/19/1986 \N Graphic Designer -1454545601 269 Carlos Flores cflores7g@samsung.com Male 121.205.206.52 France 89368.56 -1454545680 197 Eugene Shaw eshaw5g@topsy.com Male 75.2.214.89 5602236558365152 France 11/25/1983 204106.08 Associate Professor -1454545733 536 Charles Welch cwelchev@paginegialle.it Male 135.156.127.116 3540766046216294 Bulgaria 11/26/1980 280230.13 Accountant II -1454545747 800 Sharon Crawford scrawfordm7@google.cn Female 185.219.127.5 5141634704661813 Pakistan 12/1/1980 14880.86 Clinical Specialist -1454545784 370 Martin Webb mwebba9@shutterfly.com Male 241.183.200.48 Portugal 5/28/1981 134676.08 Database Administrator III -1454545905 425 Wanda Olson wolsonbs@pen.io Female 136.216.93.167 3579427292475142 Slovenia 195983.76 -1454545917 158 Nelson Female 158.42.83.104 Nigeria 56092.93 -1454545926 144 Ruth Ryan rryan3z@reference.com Female 157.117.150.254 3580511168862041 Indonesia 9/9/1972 56717.9 Account Coordinator 999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999 -1454545928 432 Aaron Sims asimsbz@squidoo.com Male 176.74.122.74 3553550116250639 China 12/20/1992 245201.62 Recruiting Manager -1454545988 167 Frank Cunningham fcunningham4m@github.com Male 150.174.230.186 5602249442759621 France 4/17/1969 254828.23 Nuclear Power Engineer -1454546017 736 Doris Reyes dreyeskf@trellian.com Female 50.37.101.111 Russia 3/23/1967 48543.29 Electrical Engineer ١٢٣ -1454546051 696 Walter Baker wbakerjb@webmd.com Male 33.81.54.207 Poland 12/4/1985 257839.28 Occupational Therapist -1454546096 418 Nicole Weaver nweaverbl@yellowbook.com Female 178.127.204.49 6333547435590930225 Brazil 91251 -1454546121 521 Jesse Mccoy jmccoyeg@illinois.edu Male 77.2.76.98 5602212301270239 Indonesia 265697.47 -1454546176 324 Randy Perkins rperkins8z@spotify.com Male 90.152.116.122 4903530859961340 Canada 9/29/1982 59754.4 Programmer IV -1454546214 720 Daniel Roberts drobertsjz@blog.com Male 
200.191.212.146 4917780904858553 Argentina 3/31/1965 151397.44 Analog Circuit Design manager -1454546253 279 Ernest Palmer epalmer7q@zdnet.com Male 24.129.157.239 5384992294623031 China 158317.63 -1454546269 937 Julia Hawkins jhawkinsq0@businesswire.com Female 41.247.95.119 Japan 52113.66 -1454546342 307 Phillip Mason pmason8i@hubpages.com Male 231.103.199.111 5602233897712483 China 277619.14 ␢ -1454546380 547 Benjamin Garcia bgarciaf6@spotify.com Male 151.228.6.14 3555896626891000 Macedonia 240109.95 -1454546423 616 Frances Hamilton fhamiltonh3@tamu.edu Female 188.88.34.240 Peru 3/19/1989 69117.34 Assistant Professor -1454546426 753 Raymond Harper rharperkw@facebook.com Male 148.46.64.54 5002351763645136 China 1/27/1980 191542.74 VP Accounting -1454546437 972 Bonnie Morrison bmorrisonqz@simplemachines.org 13.205.160.142 6763571935984496 Georgia 3/8/1973 \N Tax Accountant ../../../../../../../../../../../etc/passwd%00 -1454546468 185 Lisa Castillo lcastillo54@ebay.com Female 96.65.226.75 5100133275364427 Iran 4/8/1989 19003.55 Database Administrator I -1454546507 980 Marilyn Castillo mcastillor7@wikipedia.org Female 225.8.34.64 3560325383537120 Thailand 166569.16 -1454546551 293 Barbara Diaz bdiaz84@usnews.com Female 176.106.164.136 30109403344362 Egypt 11/25/1984 41388.68 Quality Control Specialist -1454546607 172 Christina Payne cpayne4r@umich.edu 208.172.251.134 3567551256592404 Hungary 5/9/1977 \N Quality Control Specialist -1454546678 454 Amy Phillips aphillipscl@blog.com Female 156.231.253.161 Russia 11/21/1997 136062.09 Environmental Tech ␣ -1454546732 792 Christine Howard chowardlz@prweb.com Female 69.22.66.149 Kosovo 3/10/1998 90266.03 Civil Engineer -1454546852 671 Juan Scott jscottim@theatlantic.com Male 170.84.164.52 3530364751135776 Indonesia 12/29/1979 127445.95 Assistant Professor -1454546865 878 Robin Matthews rmatthewsod@alexa.com Female 168.96.0.234 5108756854169874 China 11/17/1975 155909.78 Staff Accountant I -1454546874 578 Lisa Foster 
lfosterg1@va.gov Female 116.239.143.83 30550897409197 Canada 12/25/1980 282301.9 Product Engineer -1454546885 514 Clarence Gardner cgardnere9@addthis.com Male 241.164.83.193 3567799117668968 Mexico 2/8/1983 69661.64 Business Systems Development Analyst -1454546937 32 Roy Simmons rsimmonsv@telegraph.co.uk Male 21.20.158.183 5602244835346375 Mongolia 6/27/1994 13987.6 Senior Editor "<>?:""{}|_+" -1454546996 140 Christina Hanson chanson3v@seattletimes.com Female 154.87.3.146 3589004738797807 Peru 12/6/1994 157444.39 Budget/Accounting Analyst I -1454547050 714 Sean Shaw sshawjt@stumbleupon.com Male 190.171.138.84 4041370678096900 Portugal 11/13/1987 280420.03 Director of Sales -1454547183 440 David Dixon ddixonc7@google.es Male 102.192.92.231 3571723971536297 China 197005 ゚・✿ヾ╲(。◕‿◕。)╱✿・゚ -1454547190 109 Janice Edwards jedwards30@huffingtonpost.com Female 156.5.183.66 Czech Republic 9/3/1977 166805.79 Account Coordinator -1454547193 807 Helen Roberts hrobertsme@marketwatch.com Female 242.160.113.180 201415538184406 Armenia 9/30/1968 131695.03 Help Desk Technician -1454547199 6 Irene Wells iwells5@fema.gov Female 85.5.67.113 Iran 74337.42 -1454547206 629 Donna Crawford dcrawfordhg@google.fr Female 139.87.72.237 3548002968267145 Philippines 9/10/1974 120949.74 Senior Quality Engineer -1454547314 239 Terry Anderson tanderson6m@joomla.org Male 126.193.158.217 Slovenia 6/2/1988 241130.56 Senior Sales Associate -1454547413 874 Roger Armstrong rarmstrongo9@shop-pro.jp Male 176.127.63.161 Sweden 1/4/1969 195125.77 Environmental Tech -1454547442 85 Scott Washington swashington2c@bloomberg.com Male 79.185.72.100 6395647151650882 Brazil 2/17/1957 240505.52 Professor -1454547470 265 Ronald Simmons rsimmons7c@php.net Male 231.21.126.12 Colombia 5/12/1959 28563.27 Staff Accountant III -1454547497 574 Laura Lawson llawsonfx@disqus.com Female 227.157.239.115 5108755030972003 Mongolia 6/17/1987 192790.7 Sales Representative ../../../../../../../../../../../etc/hosts -1454547546 582 
Medina Male 230.187.35.16 China 87740.62 -1454547580 868 Todd Simmons tsimmonso3@amazon.co.uk Male 232.231.42.85 Peru 1/28/1977 70099.6 Sales Associate NULL -1454547632 421 Sara Murray smurraybo@instagram.com Female 83.32.41.79 Mongolia 3/2/1972 21859.35 Research Associate -1454547659 83 Tammy Walker twalker2a@craigslist.org Female 115.94.89.2 4508955158259501 China 1/1/1972 241046.96 Community Outreach Specialist -1454547745 476 Norma Palmer npalmerd7@etsy.com Female 24.81.30.107 6759877990739668322 China 2/22/1974 273005.88 Executive Secretary -1454547823 333 Ruth Ryan rryan98@gov.uk Female 165.226.217.32 6771454237379758 Philippines 4/25/1993 246324.26 Staff Accountant I -1454547897 523 Raymond Green rgreenei@sciencedaily.com Male 129.154.223.20 5020525177159002 Brazil 7/25/1966 217735.34 Sales Associate -1454547914 626 Steven Cooper scooperhd@home.pl Male 226.75.17.73 30583351914956 United States 4/22/2000 174475.39 Web Developer II -1454547938 51 Terry Mitchell tmitchell1e@soundcloud.com Male 64.34.240.165 Peru 101626.65 -1454547979 282 Lisa Romero lromero7t@pinterest.com Female 54.113.22.9 Portugal 224233.61 -1454548111 899 Raymond Payne rpayneoy@purevolume.com Male 170.237.246.144 201978019687940 Philippines 1/21/1993 126392.14 Staff Accountant I -1454548272 966 Kevin Martin kmartinqt@hostgator.com Male 87.47.66.144 3550408592420163 Sweden 10/24/1965 213135.46 Senior Sales Associate -1454548342 846 Keith Taylor ktaylornh@about.me Male 90.199.26.239 4175007392203366 South Africa 2/4/1990 64012.82 Associate Professor -1454548358 164 Lawrence Johnston ljohnston4j@businessweek.com Male 150.125.123.49 China 6/14/1993 243318.68 Design Engineer -1454548489 550 Brandon Owens bowensf9@wired.com Male 220.236.132.34 Vietnam 271248.99 -=== Try load data from v0.7.1.all-named-index.parquet -0.21 59.8 61 326 3.89 3.84 2.31 Premium E SI1 -0.22 65.1 61 337 3.87 3.78 2.49 Fair E VS2 -0.23 56.9 65 327 4.05 4.07 2.31 Good E VS1 -0.23 59.4 61 338 4 4.05 2.39 Very Good H VS1 
-0.23 61.5 55 326 3.95 3.98 2.43 Ideal E SI2 -0.24 62.3 57 336 3.95 3.98 2.47 Very Good I VVS1 -0.24 62.8 57 336 3.94 3.96 2.48 Very Good J VVS2 -0.26 61.9 55 337 4.07 4.11 2.53 Very Good H SI1 -0.29 62.4 58 334 4.2 4.23 2.63 Premium I VS2 -0.31 63.3 58 335 4.34 4.35 2.75 Good J SI2 -=== Try load data from v0.7.1.column-metadata-handling.parquet -1 0.1 2017-01-01 02:00:00 a 2017-01-01 02:00:00 -2 0.2 2017-01-02 02:00:00 b 2017-01-02 02:00:00 -3 0.3 2017-01-03 02:00:00 c 2017-01-03 02:00:00 -=== Try load data from v0.7.1.parquet -0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 1 -0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 8 -0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 2 -0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 0 -0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 9 -0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 6 -0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 5 -0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 7 -0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 3 -0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 4 -=== Try load data from v0.7.1.some-named-index.parquet -0.21 59.8 61 326 3.89 3.84 2.31 Premium E SI1 -0.22 65.1 61 337 3.87 3.78 2.49 Fair E VS2 -0.23 56.9 65 327 4.05 4.07 2.31 Good E VS1 -0.23 59.4 61 338 4 4.05 2.39 Very Good H VS1 -0.23 61.5 55 326 3.95 3.98 2.43 Ideal E SI2 -0.24 62.3 57 336 3.95 3.98 2.47 Very Good I VVS1 -0.24 62.8 57 336 3.94 3.96 2.48 Very Good J VVS2 -0.26 61.9 55 337 4.07 4.11 2.53 Very Good H SI1 -0.29 62.4 58 334 4.2 4.23 2.63 Premium I VS2 -0.31 63.3 58 335 4.34 4.35 2.75 Good J SI2 diff --git a/tests/queries/0_stateless/00900_long_parquet_load.sh b/tests/queries/0_stateless/00900_long_parquet_load.sh deleted file mode 100755 index 0a7f10fe16dd..000000000000 --- a/tests/queries/0_stateless/00900_long_parquet_load.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash -# Tags: long, no-fasttest, no-debug, no-asan, no-msan, no-tsan - -# -# Load all possible .parquet files found in submodules. -# TODO: Add more files. 
-# - -# Also 5 sample files from -# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet -# ... -# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata5.parquet - - -# set -x - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY") -[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../.. -[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../.. - -DATA_DIR=$CUR_DIR/data_parquet - -[ -n "$ROOT_DIR" ] && [ -z "$PARQUET_READER" ] && PARQUET_READER="$ROOT_DIR"/contrib/arrow/cpp/build/release/parquet-reader - -# To update: -# cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/ - -# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue. -# There is failure due parsing nested arrays or nested maps with NULLs: -# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) - -# Strange behaviour for repeated_no_annotation.parquet around __buitin_expect, so this file was disabled: -# debug: -# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) -# release: -# Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin - -for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_ALL=C sort); do - JSON=$DATA_DIR/$NAME.json - COLUMNS_FILE=$DATA_DIR/$NAME.columns - - { [ -z "$PARQUET_READER" ] || [ ! -s "$PARQUET_READER" ]; } && [ ! 
-s "$COLUMNS_FILE" ] && continue - - echo "=== Try load data from $NAME" - - # If you want change or add .parquet file - rm data_parquet/*.json data_parquet/*.columns - [ -n "$PARQUET_READER" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$PARQUET_READER" --json "$DATA_DIR"/"$NAME" > "$JSON" - [ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/helpers/00900_parquet_create_table_columns.py "$JSON" > "$COLUMNS_FILE" - - # Debug only: - # [ -n "$PARQUET_READER" ] && $PARQUET_READER $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump - - # COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.py $JSON` 2>&1 || continue - COLUMNS=$(cat "$COLUMNS_FILE") || continue - - ${CLICKHOUSE_CLIENT} -n --query=" - DROP TABLE IF EXISTS parquet_load; - CREATE TABLE parquet_load ($COLUMNS) ENGINE = Memory;" - - # Some files contain unsupported data structures, exception is ok. - ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_load FORMAT Parquet" < "$DATA_DIR"/"$NAME" 2>&1 | sed 's/Exception/Ex---tion/' - - ${CLICKHOUSE_CLIENT} -n --query=" - SELECT * FROM parquet_load ORDER BY tuple(*) LIMIT 100; - DROP TABLE parquet_load;" -done diff --git a/tests/queries/0_stateless/00900_long_parquet_load_2.reference b/tests/queries/0_stateless/00900_long_parquet_load_2.reference new file mode 100644 index 000000000000..955134ffd23c --- /dev/null +++ b/tests/queries/0_stateless/00900_long_parquet_load_2.reference @@ -0,0 +1,841 @@ +=== Try load data from 02588_data.parquet +36993 cta 224.0.90.10 1670947008433897216 22204819 EDGX cqs_pillar quote \N \N \N 82.57 1 R 83.36 2 R 1 \N +139613 cta 224.0.90.10 1670964613107206144 65892770 NASDAQ cqs_pillar quote \N \N \N 82.78 1 R 82.83 1 R 1 \N +105492 cta 224.0.90.10 1670960913931416320 56544849 BZX cqs_pillar quote \N \N \N 82.85 1 R 83.16 1 R 1 \N +36152 cta 224.0.90.10 1670946823125135616 21622585 EDGX cqs_pillar quote \N \N \N 82.75 1 R 83.47 2 R 1 \N +69158 cta 224.0.90.10 1670953640446284544 40764014 IEX cqs_pillar quote \N \N \N 80.88 1 R 82.53 2 R 1 \N +27530 
cta 224.0.90.10 1670945419685089024 16077821 NYSE cqs_pillar quote \N \N \N 83.51 1 R 83.71 1 R 1 \N +149156 cta 224.0.90.10 1670965092304685312 67996527 EDGA cqs_pillar quote \N \N \N 82.53 1 R 82.64 1 R 1 \N +110613 cta 224.0.90.10 1670961649615504128 58147008 EDGA cqs_pillar quote \N \N \N 82.35 1 R 82.59 1 R 1 \N +140891 cta 224.0.90.10 1670964672789551360 66162030 BZX cqs_pillar quote \N \N \N 82.52 2 R 82.84 1 R 1 \N +76346 cta 224.0.90.10 1670955037463045376 44631551 CSX cqs_pillar quote \N \N \N 82.88 1 R 82.99 1 R 1 \N +101143 cta 224.0.89.10 1670959978724152576 2456274 BZX cts_pillar trade 82.86 10 \N \N \N \N \N \N 1 \N +101742 cta 224.0.90.10 1670960130004183808 55040364 OTC cqs_pillar quote \N \N \N 78.8 0 87.09 0 1 \N +72936 cta 224.0.90.10 1670954283346099712 42646692 NYSE cqs_pillar quote \N \N \N 82.73 2 R 82.88 1 R 1 \N +31762 cta 224.0.90.10 1670945915357103360 18166318 BYX cqs_pillar quote \N \N \N 83.04 1 R 83.61 1 R 1 \N +9462 cta 224.0.90.10 1670943070913110784 6523476 EDGX cqs_pillar quote \N \N \N 85.83 2 R 86.79 2 R 1 \N +110924 cta 224.0.90.10 1670961662057144576 58201805 NYSE cqs_pillar quote \N \N \N 82.25 1 R 82.35 7 R 1 \N +87134 cta 224.0.90.10 1670957393338548736 49239199 CSX cqs_pillar quote \N \N \N 82.66 1 R 82.8 1 R 1 \N +118463 cta 224.0.90.10 1670962682025828096 60652181 AMEX cqs_pillar quote \N \N \N 82.18 4 R 82.49 3 R 1 \N +107704 cta 224.0.90.10 1670961253161367040 57218136 NYSE cqs_pillar quote \N \N \N 82.59 1 R 82.73 1 R 1 \N +117169 cta 224.0.90.10 1670962550347156736 60310620 EDGX cqs_pillar quote \N \N \N 81.76 1 R 82.15 1 R 1 \N + +\N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N 153071 \N +=== Try load data from 02718_data.parquet +36993 993 Hello993 \N 1 \N +36152 152 Hello152 2 1 119401713636491372 +69158 158 Hello158 8 1 1991930319586584116 +27530 530 Hello530 0 1 259638476471573956 +76346 346 Hello346 6 1 2834077831228187571 +72936 936 Hello936 6 1 3888695898929829382 +31762 762 Hello762 2 1 
11110522506606775460 +9462 462 Hello462 2 1 6275131136942537266 +87134 134 Hello134 4 1 1219316186974625286 +74898 898 Hello898 8 1 5066180111327518250 +50958 958 Hello958 8 1 4077715651062815935 +15467 467 Hello467 \N 1 \N +22775 775 Hello775 \N 1 \N +67503 503 Hello503 \N 1 \N +65837 837 Hello837 \N 1 \N +29002 2 Hello2 2 1 6000600018545075778 +40596 596 Hello596 6 1 1428087471198311791 +37204 204 Hello204 4 1 6955606590448669529 +75722 722 Hello722 2 1 8736384626514688031 +39807 807 Hello807 \N 1 \N + +\N 0 \N 100000 16777408911109712567 +=== Try load data from 02725_data.parquet +0 0 1 5222652260262540557 +1 1 1 2963573356302499406 +2 2 1 787535183767392522 + +\N 0 3 8973760800332432485 +=== Try load data from 02960_polygon_bound_bug.parquet +0 [[[(-157.9582991377384,21.28868463439519),(-157.9582991377384,21.35254833902807),(-157.89014212298457,21.35254833902807),(-157.89014212298457,21.28868463439519),(-157.9582991377384,21.28868463439519)]]] 1 2965019508272517993 +1 [[[(174.76274050943928,-41.35902838296618),(174.76274050943928,-41.29535963198179),(174.8472037311771,-41.29535963198179),(174.8472037311771,-41.35902838296618),(174.76274050943928,-41.35902838296618)]]] 1 12759744261968863701 + +\N [] 2 15724763770241381694 +=== Try load data from 03445_geoparquet_null_linestring.parquet +0 [(0,0),(1,1)] linestring1 1 11952067038708389267 +1 [] linestring2 1 6193756679726488702 + +\N [] \N 2 18145823718434877969 +=== Try load data from 03445_geoparquet_null_point.parquet +0 (1,1) Point1 1 13965490122443935483 +1 (0,0) Point2 1 17718043395310203769 + +\N (0,0) \N 2 13236789444044587636 +=== Try load data from 03445_geoparquet_null_polygon.parquet +0 [[(0,0),(2,0),(2,2),(0,2),(0,0)]] Polygon1 1 8425904500992146115 +1 [] Polygon2 1 16721826817827418522 + +\N [] \N 2 6700987245110013021 +=== Try load data from 03445_geoparquet_wkb.parquet +0 (0,0) [(0,0),(5,5)] [[(1,1),(2,1),(2,2),(1,2),(1,1)]] [[(0,0),(5,5)],[(1,1),(2,4),(5,5)]] 
[[[(1,1),(2,1),(2,2),(1,2),(1,1)]],[[(3,3),(4,3),(4,4),(3,4),(3,3)]]] 1 609312927898875547 +1 (1,1) [(1,1),(2,4),(5,5)] [[(3,3),(4,3),(4,4),(3,4),(3,3)]] [[(0,0),(5,5)],[(1,1),(2,4),(5,5)]] [[[(1,1),(2,1),(2,2),(1,2),(1,1)]],[[(3,3),(4,3),(4,4),(3,4),(3,3)]]] 1 18202783414119897595 + +\N (0,0) [] [] [] [] 2 365352268309221526 +=== Try load data from 03445_geoparquet_wkt.parquet +0 1 (30,10) [(30,10),(10,30),(40,40)] [[(35,10),(45,45),(15,40),(10,20),(35,10)],[(20,30),(35,35),(30,20),(20,30)]] 1 2151260364810557833 +1 2 (40,20) [(40,20),(20,40),(50,50)] [[(35,10),(45,45),(15,40),(10,20),(35,10)]] 1 17365314224586431973 + +\N \N (0,0) [] [] 2 1069830515687438190 +=== Try load data from 68131.parquet +0 [1,2] 1 5845551876324623491 + +\N [] 1 5845551876324623491 +=== Try load data from alltypes_dictionary.parquet +0 0 true 0 0 0 0 0 0 01/01/09 0 2009-01-01 00:00:00.000000000 1 2660886053874804793 +1 1 false 1 1 1 10 1.1 10.1 01/01/09 1 2009-01-01 00:01:00.000000000 1 8346836977294016814 + +\N \N \N \N \N \N \N \N \N \N \N \N 2 11007723031168821607 +=== Try load data from alltypes_list.parquet +0 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] 1 16503550532047480981 +1 [1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] [10957,11323,11688] [946674000,978296400,1009832400] [0.2,10,4] [4,10000.1,10000.1] [1000000000,90,101001.01] 1 15124318519444847550 +2 [1,-2,3] [1,2,3] [100,-200,300] [100,200,300] [10000000,-20000000,30000000] [10000000,2000000,3000000] [100000000000000,-200000000000,3000000000000] [100000000000000,20000000000000,3000000000000] ['Some string','Some string','Some string'] ['0000','1111','2222'] [42.42,424.2,0.4242] [424242.424242,4242042420.242424,42] [10957,11323,11688] 
[946674000,978296400,1009832400] [0.2,10,4] [4,10000.1,10000.1] [1000000000,90,101001.01] 1 3202145420792745352 + +\N [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] 3 16383270398575522267 +=== Try load data from alltypes_plain.parquet +0 4 true 0 0 0 0 0 0 03/01/09 0 2009-03-01 00:00:00.000000000 1 8465924141596626920 +4 2 true 0 0 0 0 0 0 02/01/09 0 2009-02-01 00:00:00.000000000 1 15267283463947283488 +3 7 false 1 1 1 10 1.1 10.1 04/01/09 1 2009-04-01 00:01:00.000000000 1 8084360163536046539 +1 5 false 1 1 1 10 1.1 10.1 03/01/09 1 2009-03-01 00:01:00.000000000 1 15868711115761948062 +6 0 true 0 0 0 0 0 0 01/01/09 0 2009-01-01 00:00:00.000000000 1 18177502423429277302 +7 1 false 1 1 1 10 1.1 10.1 01/01/09 1 2009-01-01 00:01:00.000000000 1 5076808624665456456 +5 3 false 1 1 1 10 1.1 10.1 02/01/09 1 2009-02-01 00:01:00.000000000 1 15096584967546526452 +2 6 true 0 0 0 0 0 0 04/01/09 0 2009-04-01 00:00:00.000000000 1 4477898818565688651 + +\N \N \N \N \N \N \N \N \N \N \N \N 8 16728097424210647406 +=== Try load data from alltypes_plain.snappy.parquet +0 6 true 0 0 0 0 0 0 04/01/09 0 2009-04-01 00:00:00.000000000 1 4638961571363325565 +1 7 false 1 1 1 10 1.1 10.1 04/01/09 1 2009-04-01 00:01:00.000000000 1 10118128356831336697 + +\N \N \N \N \N \N \N \N \N \N \N \N 2 14757089928194662262 +=== Try load data from array_float.parquet +9 idx10 [10.2,8.2] 1 6500220947925738428 +0 idx1 [] 1 12173579943307849648 +4 idx5 [10.2,8.2] 1 12410529623371858105 +3 idx4 [10.2] 1 10296189669222279578 +1 idx2 [10.2,8.2,7.2] 1 1784870960298998108 +8 idx9 [10.2] 1 13922970413406815839 +6 idx7 [10.2,8.2] 1 4112649194148461660 +7 idx8 [10.2,8.2] 1 5240844288156881892 +5 idx6 [10.2] 1 3798195961072290386 +2 idx3 [10.2,8.2] 1 1513321551386387090 + +\N [] 10 16413140331168905886 +=== Try load data from array_int.parquet +9 idx10 [100,101,102] 1 13206811767060123460 +0 idx1 [100,101,102] 1 1468807849511680661 +4 idx5 [100,101] 1 5865660290831628184 +3 idx4 [100] 1 17591298091891296814 +1 idx2 
[100,101] 1 7229523771491093755 +8 idx9 [100,101,102] 1 13974023397309305358 +6 idx7 [100,101] 1 7948251195409708470 +7 idx8 [100,101] 1 2180201009889543106 +5 idx6 [100,101] 1 4988466500581683121 +2 idx3 [100,101,102,101] 1 13792751652532249689 + +\N [] 10 14458819231670106154 +=== Try load data from array_string.parquet +9 idx10 ['This','is','a','test'] 1 8058991263431016296 +0 idx1 ['This','is','a','test'] 1 4316848877505089401 +4 idx5 ['wants','to','get','out'] 1 15659577912664352409 +3 idx4 [] 1 15229575922646986842 +1 idx2 ['cigarette','smoke'] 1 2611073676543300327 +8 idx9 ['Which','Heaven','to','gaudy','day','denies'] 1 16225894291786653140 +6 idx7 ['then','I','put','him','back'] 1 13493172719522718923 +7 idx8 ['make','a','man'] 1 15077948755334701219 +5 idx6 ['me','up?'] 1 8103829450440236616 +2 idx3 ['the','grocery','clerks'] 1 14008392062650720804 + +\N \N [] 10 2104840490268466281 +=== Try load data from binary.parquet +9 \t 1 13249380382722804046 +0 \0 1 13370574831107216623 +11 1 9579149237072969620 +4  1 13244713168078657249 +10 \n 1 1444330909894844834 +3  1 13076407284415682324 +1  1 6974130533670503360 +8 \b 1 439548861610966662 +6  1 16660782491328850550 +7  1 18276792018525247702 +5  1 14590984139361154376 +2  1 15175603013226953226 + +\N \N 12 6955188355048989260 +=== Try load data from byte_array_decimal.parquet +20 21 1 14249832127278749837 +18 19 1 8782556698832744361 +9 10 1 1928879545261732492 +0 1 1 3679262401748128310 +15 16 1 10115147958821052082 +11 12 1 6139620041083242203 +22 23 1 15516880273489139801 +23 24 1 16401735415014746636 +4 5 1 17051611891056893642 +14 15 1 14806847963466793184 +10 11 1 5493709197750930073 +12 13 1 6787141652406725654 +19 20 1 15665309129618128622 +3 4 1 12537652640196058813 +1 2 1 16863671454656808012 +8 9 1 8782944891374641799 +17 18 1 7522801893698749378 +6 7 1 11491618852443552134 +13 14 1 16678362120261280996 +7 8 1 7441211880741634496 + +\N \N 24 1955102309460433818 +=== Try load data from 
case_insensitive_column_matching.parquet +0 123 1 1 24960983393539569 +1 456 2 1 10860017704905464526 + +\N \N \N 2 10884978688299004095 +=== Try load data from datapage_v2.snappy.parquet +0 abc 1 2 true [1,2,3] 1 16078188846132661856 +4 abc 5 2 true [1,2] 1 6454514002962808519 +3 \N 4 5 false [1,2,3] 1 \N +1 abc 2 3 true [] 1 243797108142459904 +2 abc 3 4 true [] 1 16370920853893187024 + +\N \N 0 0 false [] 5 2253932663712014071 +=== Try load data from datatype-date32.parquet +0 1925-01-01 1 95355579932678910 +3 2282-12-31 1 1199432099726642882 +1 1949-10-01 1 11226486318317443606 +2 2021-10-01 1 14120462944390486409 + +\N \N 4 8194992868657700191 +=== Try load data from delta_length_byte_array_encoding.parquet +0 SWEEP SWETT 00459 \N ('20221206100111','+0100') ('20221206100111','+0100') ('20221206100111','+0100') 3 11 T \N 1 \N + +\N \N \N \N \N (NULL,NULL) (NULL,NULL) (NULL,NULL) \N \N \N \N 1 \N +=== Try load data from dict-page-offset-zero.parquet +30 1552 1 8422143286990545479 +24 1552 1 10300175852787330865 +34 1552 1 11618054463714841641 +32 1552 1 14967043328803827083 +20 1552 1 616429680791283190 +28 1552 1 15309599966318093138 +18 1552 1 9219274452570041950 +9 1552 1 11166776644758002338 +0 1552 1 1110107121126640579 +15 1552 1 2043878238511573955 +36 1552 1 7370540478052318116 +27 1552 1 1968222612518557249 +33 1552 1 10663086133314837259 +11 1552 1 14050429072743601145 +22 1552 1 12748313231675201650 +23 1552 1 10990797888557012014 +4 1552 1 2906092155432956845 +29 1552 1 15224319568498500590 +14 1552 1 1468386472401779279 +10 1552 1 14846008954777106711 + +\N \N 39 8745471155267852610 +=== Try load data from example_half_float.parquet +0 1.5 1 15974859270407168234 +4 0 1 9247474980253145447 +3 -1 1 8983465089972530386 +1 2.5 1 15230195726587115037 +2 3.140625 1 16022017580069504368 + +\N \N 5 10117780426160808624 +=== Try load data from fixed_array_int.parquet +9 idx10 [10,15,52,43,22,50,42,87,19,91] 1 16021513271056229789 +0 idx1 
[73,33,88,34,94,96,11,90,20,17] 1 4629975054797286940 +4 idx5 [67,84,56,84,50,21,93,90,37,42] 1 17290372351898611615 +3 idx4 [87,56,32,98,25,18,66,21,20,2] 1 12000235215545123457 +1 idx2 [10,18,74,39,2,55,13,41,6,42] 1 5967127131809967594 +8 idx9 [90,53,83,54,35,87,73,74,98,50] 1 16114439062892910195 +6 idx7 [46,1,21,11,83,74,11,63,28,49] 1 18320969301847648369 +7 idx8 [55,30,50,19,12,95,5,83,71,34] 1 12253618457445158216 +5 idx6 [53,14,44,96,40,71,26,74,27,25] 1 14205911744472535748 +2 idx3 [48,27,74,82,70,46,18,78,63,73] 1 8582050234588949331 + +\N \N [] 10 14705747384097111558 +=== Try load data from fixed_array_nested_list_int.parquet +9 idx10 [[10,72,13,85,89,15,51,73,64,49],[78,28,54,57,10,1,3,35,23,15],[97,93,76,87,86,21,30,9,58,21],[27,23,35,71,4,68,90,14,93,87],[59,98,66,94,8,90,16,20,33,10],[73,84,79,37,75,50,64,74,79,31],[20,64,79,51,20,41,7,69,5,22],[37,60,1,99,45,43,56,26,15,83],[7,96,6,58,7,59,57,6,22,92],[97,11,71,55,32,79,88,20,58,31]] 1 17970853696231218182 +0 idx1 [[34,11,56,20,90,20,17,55,54,99],[22,18,15,77,68,51,30,76,9,40],[67,24,2,27,72,53,99,57,67,96],[95,77,60,47,68,56,91,28,90,38],[77,8,75,89,84,81,60,82,71,26],[41,89,60,15,43,81,31,68,71,65],[84,97,22,9,26,59,97,10,16,60],[13,82,16,47,32,74,34,78,90,15],[35,44,50,18,82,71,20,68,89,41],[60,59,7,65,18,23,24,37,48,45]] 1 2178808435128197877 +4 idx5 [[52,98,63,41,2,6,56,80,72,15],[11,64,89,57,65,92,63,39,76,51],[47,25,33,28,58,22,83,19,54,63],[61,8,20,24,83,44,33,32,65,90],[51,75,72,55,54,5,43,90,64,30],[86,87,48,65,84,67,82,94,2,9],[1,75,83,79,64,63,52,51,33,72],[12,84,92,27,66,32,40,48,34,55],[59,91,19,77,70,56,10,82,40,60],[79,83,14,82,42,22,72,54,12,9]] 1 10102655348078839145 +3 idx4 
[[65,58,31,62,38,29,32,71,87,49],[66,89,59,83,29,32,59,49,68,34],[81,54,14,69,39,84,95,37,15,47],[15,51,15,38,90,61,76,51,11,24],[64,93,27,26,29,15,15,4,95,36],[60,61,57,66,61,25,21,97,34,36],[7,34,30,4,8,29,68,48,58,64],[77,19,92,77,81,21,52,1,86,92],[16,91,1,40,57,44,76,10,78,10],[88,13,70,61,2,76,78,69,90,24]] 1 9171162259712295701 +1 idx2 [[72,32,17,44,69,37,95,64,47,70],[15,4,28,31,80,63,40,93,17,87],[28,46,56,21,65,20,5,76,18,61],[66,44,5,66,71,73,26,70,47,22],[9,9,62,21,74,75,32,71,2,61],[66,81,43,46,10,78,66,31,34,17],[30,5,10,15,56,52,36,3,88,16],[45,84,39,12,39,13,43,66,67,16],[27,18,68,80,87,19,98,39,90,54],[48,96,24,67,66,91,3,10,7,66]] 1 14152623021723637344 +8 idx9 [[32,6,44,30,66,48,93,67,62,16],[49,67,73,32,86,78,28,88,25,60],[70,10,95,16,10,49,12,47,22,62],[95,86,63,43,4,96,3,34,60,53],[60,75,96,66,96,82,62,31,78,72],[17,44,13,7,45,54,27,84,10,68],[43,49,86,10,57,99,32,72,73,41],[57,79,15,62,79,87,67,69,75,59],[45,18,2,19,45,96,8,86,71,97],[20,20,11,15,82,29,16,12,87,87]] 1 14877241838559228840 +6 idx7 [[58,24,79,16,32,36,36,91,18,22],[85,29,36,16,75,79,71,70,6,39],[47,15,82,30,55,14,49,47,38,13],[28,54,95,82,25,16,44,82,33,11],[58,16,87,96,65,3,10,68,87,15],[94,84,65,50,21,78,8,78,89,72],[39,41,67,23,21,83,43,94,31,15],[67,58,73,87,58,71,52,10,30,90],[69,65,72,89,51,8,39,80,49,79],[32,36,76,11,88,87,75,55,33,74]] 1 5485679170602838617 +7 idx8 [[8,26,45,3,25,83,88,77,98,38],[52,76,79,94,6,74,73,31,93,53],[89,24,62,83,35,24,60,24,41,14],[52,13,13,32,87,77,19,20,52,47],[50,46,66,30,26,85,91,50,98,83],[87,44,5,11,25,52,9,55,15,37],[18,24,36,72,84,10,13,59,16,65],[93,4,19,13,75,64,73,29,81,25],[61,65,1,45,75,88,19,4,73,32],[23,20,27,55,13,34,97,80,8,19]] 1 5246976039550502726 +5 idx6 
[[71,60,6,31,40,66,78,6,23,14],[62,88,38,68,37,37,63,18,57,3],[45,93,1,49,99,33,10,91,6,17],[13,40,57,85,48,14,74,89,43,49],[18,17,39,75,52,30,48,4,13,55],[3,34,73,71,75,58,6,73,73,31],[28,74,26,3,2,49,50,60,27,79],[35,28,12,10,1,21,61,70,65,37],[30,27,51,85,89,84,73,48,4,71],[1,86,23,68,82,9,6,95,14,25]] 1 6035658836658698306 +2 idx3 [[54,93,69,82,45,54,35,59,20,39],[80,3,15,33,45,22,45,11,28,94],[68,26,89,38,49,28,29,59,93,57],[11,62,53,12,16,83,2,55,65,6],[37,49,56,93,33,77,22,53,20,22],[4,75,2,5,34,5,90,67,2,79],[22,2,80,14,13,44,33,11,31,24],[11,54,75,33,60,20,10,51,56,33],[72,5,84,78,7,95,21,2,88,75],[69,88,52,85,70,96,51,69,48,20]] 1 8704823891163505117 + +\N \N [] 10 1692762168861203775 +=== Try load data from fixed_array_str.parquet +9 idx10 ['str100','str101','str102','str103','str104','str105','str106','str107','str108','str109'] 1 11290285949581330773 +0 idx1 ['str10','str11','str12','str13','str14','str15','str16','str17','str18','str19'] 1 8071043625235141093 +4 idx5 ['str50','str51','str52','str53','str54','str55','str56','str57','str58','str59'] 1 13678605401710213392 +3 idx4 ['str40','str41','str42','str43','str44','str45','str46','str47','str48','str49'] 1 7840105480765753525 +1 idx2 ['str20','str21','str22','str23','str24','str25','str26','str27','str28','str29'] 1 8007482149540455162 +8 idx9 ['str90','str91','str92','str93','str94','str95','str96','str97','str98','str99'] 1 12166079781402687071 +6 idx7 ['str70','str71','str72','str73','str74','str75','str76','str77','str78','str79'] 1 16266405848887210870 +7 idx8 ['str80','str81','str82','str83','str84','str85','str86','str87','str88','str89'] 1 5655882823175845862 +5 idx6 ['str60','str61','str62','str63','str64','str65','str66','str67','str68','str69'] 1 2220108651520481848 +2 idx3 ['str30','str31','str32','str33','str34','str35','str36','str37','str38','str39'] 1 12690465408294469025 + +\N \N [] 10 5652744751565830541 +=== Try load data from fixed_length_decimal.parquet +20 21 1 
11198275065492030391 +18 19 1 351877431427227425 +9 10 1 10154485708636235935 +0 1 1 8400161416641029105 +15 16 1 5316068084098049881 +11 12 1 492436284488452913 +22 23 1 9703814451674327669 +23 24 1 7776632914114470965 +4 5 1 11637472902120684352 +14 15 1 11714025203378226649 +10 11 1 2832942064114283293 +12 13 1 11263380343111977755 +19 20 1 5134307520953471473 +3 4 1 1592352314383541392 +1 2 1 16790405582299928683 +8 9 1 1533117082386042720 +17 18 1 15842453746690660649 +6 7 1 13407742669739090137 +13 14 1 2885392384039107169 +7 8 1 2892786383714126928 + +\N \N 24 7048233601027308485 +=== Try load data from fixed_length_decimal_1.parquet +20 21 1 11198275065492030391 +18 19 1 351877431427227425 +9 10 1 10154485708636235935 +0 1 1 8400161416641029105 +15 16 1 5316068084098049881 +11 12 1 492436284488452913 +22 23 1 9703814451674327669 +23 24 1 7776632914114470965 +4 5 1 11637472902120684352 +14 15 1 11714025203378226649 +10 11 1 2832942064114283293 +12 13 1 11263380343111977755 +19 20 1 5134307520953471473 +3 4 1 1592352314383541392 +1 2 1 16790405582299928683 +8 9 1 1533117082386042720 +17 18 1 15842453746690660649 +6 7 1 13407742669739090137 +13 14 1 2885392384039107169 +7 8 1 2892786383714126928 + +\N \N 24 7048233601027308485 +=== Try load data from fixed_length_decimal_legacy.parquet +20 21 1 98871632974742800 +18 19 1 12286414994539395392 +9 10 1 4015198646268050837 +0 1 1 5707119067460400703 +15 16 1 13816866841100736535 +11 12 1 9251720416939280334 +22 23 1 5565784470055769873 +23 24 1 5988838306095885468 +4 5 1 14537087045384223761 +14 15 1 4448174729128812515 +10 11 1 1016747994366367371 +12 13 1 17033975816505265696 +19 20 1 7405892054282145799 +3 4 1 8774318004330791847 +1 2 1 12562399927720453883 +8 9 1 6051866097990639565 +17 18 1 12713912483641764543 +6 7 1 7899984304259045775 +13 14 1 15558757953562584889 +7 8 1 12196684160062922979 + +\N \N 24 9181524539748381587 +=== Try load data from hadoop_lz4_compressed.parquet +0 1593604800 abc 42 1 
3806095617483730328 +3 1593604801 def 7.7 1 13482131770667370456 +1 1593604800 def 7.7 1 5861304996888164348 +2 1593604801 abc 42.125 1 9545089655608587545 + +\N 0 \N 4 14247877966938301061 +=== Try load data from int-list-zero-based-chunked-array.parquet +36993 [36993,36994,36995,36996,36997,36998,36999,37000,37001] 1 13704691802482625316 +36152 [36152,36153,36154,36155,36156,36157,36158,36159,36160] 1 17679192445898543326 +69158 [69158,69159,69160,69161,69162] 1 4041534779580111776 +27530 [27530,27531,27532,27533,27534,27535,27536,27537,27538] 1 3713124893993513562 +31762 [31762,31763,31764,31765,31766,31767,31768,31769,31770] 1 3389122341925034518 +9462 [9462,9463,9464,9465,9466,9467,9468] 1 14863022561367924217 +50958 [] 1 11385693486036653406 +15467 [15467,15468] 1 6342543614213518156 +22775 [] 1 12108733369505759980 +67503 [67503,67504,67505,67506,67507,67508,67509,67510,67511] 1 1801937764717624249 +65837 [65837,65838,65839,65840,65841,65842,65843] 1 755465894404509620 +29002 [29002] 1 8456747141270642978 +40596 [40596,40597,40598,40599] 1 3374998285434442417 +37204 [37204,37205,37206,37207,37208,37209] 1 6512084231507538650 +39807 [39807,39808,39809,39810,39811,39812,39813,39814,39815] 1 11144369402569825757 +47225 [47225,47226,47227,47228] 1 3629702947909304977 +10927 [10927,10928,10929,10930,10931,10932,10933] 1 12795261822175527468 +4611 [4611] 1 434672168758103656 +34745 [34745,34746,34747,34748,34749] 1 8563244607249095096 +55007 [] 1 738411964590426114 + +\N [] 70000 6594662959574000636 +=== Try load data from int32_decimal.parquet +20 21 1 14249832127278749837 +18 19 1 8782556698832744361 +9 10 1 1928879545261732492 +0 1 1 3679262401748128310 +15 16 1 10115147958821052082 +11 12 1 6139620041083242203 +22 23 1 15516880273489139801 +23 24 1 16401735415014746636 +4 5 1 17051611891056893642 +14 15 1 14806847963466793184 +10 11 1 5493709197750930073 +12 13 1 6787141652406725654 +19 20 1 15665309129618128622 +3 4 1 12537652640196058813 +1 2 1 
16863671454656808012 +8 9 1 8782944891374641799 +17 18 1 7522801893698749378 +6 7 1 11491618852443552134 +13 14 1 16678362120261280996 +7 8 1 7441211880741634496 + +\N \N 24 1955102309460433818 +=== Try load data from int64_decimal.parquet +20 21 1 98871632974742800 +18 19 1 12286414994539395392 +9 10 1 4015198646268050837 +0 1 1 5707119067460400703 +15 16 1 13816866841100736535 +11 12 1 9251720416939280334 +22 23 1 5565784470055769873 +23 24 1 5988838306095885468 +4 5 1 14537087045384223761 +14 15 1 4448174729128812515 +10 11 1 1016747994366367371 +12 13 1 17033975816505265696 +19 20 1 7405892054282145799 +3 4 1 8774318004330791847 +1 2 1 12562399927720453883 +8 9 1 6051866097990639565 +17 18 1 12713912483641764543 +6 7 1 7899984304259045775 +13 14 1 15558757953562584889 +7 8 1 12196684160062922979 + +\N \N 24 9181524539748381587 +=== Try load data from integers_1_5_no_3_bf_minmax.parquet +0 -1 1 11935486105267972682 +3 5 1 8454527814154760371 +1 2 1 463667963421364848 +2 4 1 17956467173040956166 + +\N 0 4 1916660908465950835 +=== Try load data from ipv6_bloom_filter.gz.parquet +0 zTNx7ꙕ 1 13288000489149796924 +4 w*I\n+ 1 9468751629168416770 +3 }ϔ8<\fA뙠Ѻ 1 17226734039706043359 +1  }B,K32 1 7253289996785367618 +2 ZD/y 1 842579928916594899 + +\N \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 5 11185867936307116338 +=== Try load data from iris.parquet +73 6.1 2.8 4.7 1.2 Versicolor 1 18204632795337117686 +89 5.5 2.5 4 1.3 Versicolor 1 4653652012410455888 +63 6.1 2.9 4.7 1.4 Versicolor 1 13689724364604468360 +57 4.9 2.4 3.3 1 Versicolor 1 2127827660748014277 +104 6.5 3 5.8 2.2 Virginica 1 13296829535387120114 +30 4.8 3.1 1.6 0.2 Setosa 1 9117040247783135681 +88 5.6 3 4.1 1.3 Versicolor 1 7185603935448442167 +116 6.5 3 5.5 1.8 Virginica 1 12427820264215610117 +24 4.8 3.4 1.9 0.2 Setosa 1 12950528231928803659 +34 4.9 3.1 1.5 0.2 Setosa 1 1207689558702002101 +109 7.2 3.6 6.1 2.5 Virginica 1 2684941844963881116 +145 6.7 3 5.2 2.3 Virginica 1 3711702047148575642 +93 5 2.3 3.3 1 
Versicolor 1 17560318037338666120 +43 5 3.5 1.6 0.6 Setosa 1 16730113477114499272 +32 5.2 4.1 1.5 0.1 Setosa 1 2860566811651200307 +20 5.4 3.4 1.7 0.2 Setosa 1 16845040626516367712 +91 6.1 3 4.6 1.4 Versicolor 1 8232897093452463750 +62 6 2.2 4 1 Versicolor 1 16022163764249252264 +61 5.9 3 4.2 1.5 Versicolor 1 12092451575184938334 +124 6.7 3.3 5.7 2.1 Virginica 1 14064025372312110514 + +\N \N \N \N \N \N 150 14770655198702124555 +=== Try load data from list_columns.parquet +0 [1,2,3] ['abc','efg','hij'] 1 5809983990759233283 +1 [NULL,1] [] 1 9854762345037358727 +2 [4] ['efg',NULL,'hij','xyz'] 1 118071435199055165 + +\N [] [] 3 15782817770995647175 +=== Try load data from multi_column_bf.gz.parquet +540 -62 60904 524 4294967391 13439599920079768466 false VIRE {"key":400, "value":"BNHSB"} GMKT MưFI6eĂ 1 9735371321288412055 +923 -58 46518 345 4294967327 18309803705868924873 true LJHD {"key":97, "value":"EIUNN"} WVRQ |wGZ_P 1 12513952248079036776 +988 -89 47241 756 4294967373 15522063381546272260 true CLDK {"key":604, "value":"JSQFX"} USNQ >8O;i, 1 4175486705904763621 +73 116 32428 513 4294967384 10876946806756585544 false SGPI {"key":711, "value":"XMEED"} IOIX d\rOFЂn\b 1 11079779081220916748 +639 -69 45537 315 4294967315 7263280072686713102 false CHFC {"key":48, "value":"MFGFA"} ZLBS ]Ʇ̠Bʹ (LuբW 1 1884003582258998694 +959 -51 56925 86 4294967348 2903746547904555720 true XHJC {"key":355, "value":"MQETE"} QIQA /<3@C x* 1 12551571122189109470 +286 7 32408 89 4294967349 11768163425287102515 true JBWQ {"key":63, "value":"LBRXP"} IHTI 2B=f*_ 1 2293773524378365089 +815 -16 37808 207 4294967396 18058293579070949154 true AEZA {"key":656, "value":"GOJYT"} ZODP eWE> 1 9939955730329575199 +212 -74 50116 784 4294967316 4639886571322169794 true JZKC {"key":844, "value":"WYFIG"} KELP w+ʰO] 1 5464573773970257898 +510 -44 14672 697 4294967323 3203265463862679482 true QPXM {"key":408, "value":"BJCSQ"} NMGV x|A2^qm 1 18363658913862052503 +827 38 52135 612 4294967349 2177159037635926390 
false KWOV {"key":384, "value":"PLUUJ"} VACB >$=L$Q\t 1 1291085690902008617 +705 -96 7506 479 4294967337 1919118169111152342 true EGEU {"key":522, "value":"AOPEL"} TZFW sIޣr^3 1 12776128745828051091 +174 -38 10608 469 4294967369 8139895279544416045 false SUED {"key":348, "value":"KDIJQ"} VKSM v3A, 1 8457376369144183832 +89 79 31144 138 4294967379 3327168023216233174 true MRTX {"key":492, "value":"NGCXD"} GVCG y>LO(xج;& 1 9669048967548569557 +802 92 55937 83 4294967368 13260877643210989136 true FNXX {"key":566, "value":"KVFDZ"} CMMV m:uL{Tzf 1 8531841042433805304 +599 -107 36579 116 4294967311 4635153073869187643 true BURQ {"key":259, "value":"VNMQQ"} TMOJ YznM<~ 1 17862243500288186925 +89 2016-02-03 18:45:51.000000000 90 Jacqueline Carr jcarr2h@freewebs.com Female 197.40.38.49 201939989746686 China 5/31/1961 100733.44 Civil Engineer (。◕ ∀ ◕。) 1 2232484032009753154 +802 2016-02-03 05:23:27.000000000 803 Brian Ray brayma@behance.net Male 160.155.56.47 5602241665830742 Portugal 3/18/1958 143938.05 Speech Pathologist 1 15167281104907264518 +599 2016-02-03 15:51:59.000000000 600 Sarah Nguyen snguyengn@yellowpages.com Female 253.60.3.4 3560343605032408 China 181835.15 1 8724166627869322326 +911 2016-02-03 22:12:59.000000000 912 Amanda Franklin afranklinpb@1688.com Female 175.199.163.26 201793148639880 Colombia 224605.37 1 10158977932363585382 +63 2016-02-03 04:59:53.000000000 64 Dennis Ross dross1r@parallels.com Male 78.25.77.223 Portugal 5/27/1959 280933.71 Biostatistician II 1 8446384695916288357 +162 2016-02-03 10:24:45.000000000 163 Marie Smith msmith4i@quantcast.com Female 203.14.230.232 3531257958056309 Philippines 42270.96 1 14201445153743764259 +958 2016-02-03 18:03:51.000000000 959 Frances Martin fmartinqm@who.int Female 42.157.13.217 3582151311470687 Vietnam 11/4/1958 124538.92 Quality Control Specialist 1 6817154704817639963 + +\N \N \N \N \N \N \N \N \N \N \N \N \N \N 1000 15027035030807841507 +=== Try load data from userdata2.parquet +540 2016-02-03 
22:07:47.000000000 541 Donna Owens dowensf0@etsy.com Female 243.24.71.38 Philippines 12/14/1983 156479.16 Senior Quality Engineer 1 10268099683213748654 +923 2016-02-03 20:46:19.000000000 924 Gerald Powell gpowellpn@so-net.ne.jp Male 12.250.87.136 3581939603817750 Democratic Republic of the Congo 4/15/1996 205302 Desktop Support Technician 田中さんにあげて下さい 1 16056284383264880013 +988 2016-02-03 15:50:57.000000000 989 Aaron Wallace awallacerg@ebay.com Male 190.175.69.165 5641822506407465150 Ukraine 8/11/1963 204058.01 Compensation Analyst 1 6690638294356353033 +73 2016-02-03 05:13:12.000000000 74 Kathryn Torres ktorres21@rakuten.co.jp Female 4.124.222.88 4026779356659103 Portugal 7/31/1956 121285.58 Project Manager 1 17443966047479479990 +639 2016-02-03 16:14:28.000000000 640 Johnny Williams jwilliamshr@topsy.com Male 49.99.17.206 China 2/15/1956 243542.02 Sales Associate 1 6428256511174345077 +959 2016-02-03 23:36:19.000000000 960 Randy Phillips rphillipsqn@mail.ru Male 2.135.69.233 201453749304505 Russia 11/23/1972 152452.33 Physical Therapy Assistant 1 4761216647766398181 +286 2016-02-03 00:19:42.000000000 287 Martin Ferguson mferguson7y@eventbrite.com Male 67.188.95.86 Portugal 7/2/1981 262746.89 Cost Accountant 1 6268611945850019602 +815 2016-02-03 01:09:16.000000000 816 Sara Sanders ssandersmn@cornell.edu Female 54.250.225.134 Netherlands 7/26/1998 261953.95 Quality Engineer 1 2265259250359460068 +212 2016-02-03 00:17:37.000000000 213 Norma Garrett ngarrett5w@technorati.com Female 65.49.237.93 Albania 80916.71 1 11448011014978600303 +510 2016-02-03 02:24:25.000000000 511 Chris Robertson crobertsone6@myspace.com Male 170.164.38.187 3528761574735043 Poland 3/18/1992 252042.53 Biostatistician I ‪‪test‪ 1 17051879510685654573 +827 2016-02-03 10:14:47.000000000 828 Roger Garrett rgarrettmz@twitter.com Male 239.61.51.53 3541088069104649 Philippines 2/2/1962 262160.19 Structural Engineer 1 8036646087262902024 +705 2016-02-03 09:40:35.000000000 706 Nicole Lewis 
nlewisjl@angelfire.com Female 211.40.223.211 Peru 38409.93 田中さんにあげて下さい 1 15936963868833675750 +174 2016-02-03 14:41:49.000000000 175 Diana Larson dlarson4u@goodreads.com Female 219.199.189.122 201506716061117 Philippines 4/11/1978 93166.24 Web Designer I 1 16565753450667403965 +89 2016-02-03 16:11:14.000000000 90 Nicole Reid nreid2h@cisco.com Female 10.75.131.59 5610704755842409780 Philippines 12/15/1985 24922.19 Marketing Assistant 1 14693622815454580058 +802 2016-02-03 14:24:43.000000000 803 Michael Carter mcarterma@dmoz.org Male 210.55.134.180 3571410217580228 Indonesia 113847.16 -1E2 1 3907836235968444221 +599 2016-02-03 06:04:47.000000000 600 Jeremy Roberts jrobertsgn@sciencedaily.com Male 226.251.11.3 5602231165341266 Portugal 5/27/1987 230061.8 Programmer Analyst IV 1 13109642909942519912 +911 2016-02-03 20:40:16.000000000 912 Kathleen Chavez kchavezpb@hexun.com Female 153.82.30.155 4936181486905879 Greece 10/31/1999 217386.03 Engineer IV 1 4072791798945819223 +63 2016-02-03 05:23:22.000000000 64 Roy Hughes rhughes1r@stanford.edu Male 209.120.70.78 3552886646968253 Canada 10/30/1968 191750.33 Mechanical Systems Engineer 1 473622384063714892 +162 2016-02-03 09:34:51.000000000 163 Eric Murray emurray4i@elegantthemes.com Male 160.195.27.97 Russia 11/4/1995 188482.04 Automation Specialist II (ノಥ益ಥ)ノ ┻━┻ 1 3298881357870304748 +958 2016-02-03 02:06:34.000000000 959 Amanda Welch awelchqm@tamu.edu Female 120.101.158.18 China 1/26/1975 231408.83 Budget/Accounting Analyst I 1 15464217041974416631 + +\N \N \N \N \N \N \N \N \N \N \N \N \N \N 1000 5963528300449987764 +=== Try load data from userdata3.parquet +540 2016-02-03 21:02:34.000000000 541 Betty Welch bwelchf0@opensource.org Female 198.236.215.180 3567431388419919 Mexico 7/3/1960 16569.92 VP Marketing 1 2713478761142492000 +923 2016-02-03 11:03:46.000000000 924 Juan Reid jreidpn@cyberchimps.com Male 171.183.44.28 Argentina 7/18/1965 196585.93 VP Sales ᠎ 1 5468821878054872581 +988 2016-02-03 09:02:34.000000000 989 
Carol Stanley cstanleyrg@technorati.com Female 191.209.90.103 Croatia 3/25/1956 233179.25 Product Engineer -1 1 14150682749368749560 +73 2016-02-03 02:05:42.000000000 74 Roger Jacobs rjacobs21@rediff.com Male 51.122.147.153 36548589951538 Benin 7/18/1977 18545.32 Paralegal 1/2 1 13402678875674353025 +639 2016-02-03 22:33:30.000000000 640 Ralph Alexander ralexanderhr@cbslocal.com Male 185.34.97.159 3573288479096101 China 11/21/1962 278447.74 Analyst Programmer 1 5355672025070760067 +959 2016-02-03 07:39:35.000000000 960 Jennifer Green jgreenqn@instagram.com Female 18.250.186.105 3561679391138043 Israel 12/28/1959 185355.5 Human Resources Assistant II ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ 1 5062265660321592197 +286 2016-02-03 02:23:13.000000000 287 Bruce Foster bfoster7y@wordpress.org Male 96.73.83.237 5100143005021627 Netherlands 4/8/1971 205483.59 Graphic Designer 1 5572762546433119772 +815 2016-02-03 11:43:23.000000000 816 Edward Harvey eharveymn@vk.com Male 101.136.168.169 6304208202785626682 China 10/17/1961 142265.07 Programmer Analyst IV 1 4829781136518707641 +212 2016-02-03 11:39:09.000000000 213 Donald Rodriguez drodriguez5w@sun.com Male 225.164.245.48 374283580717023 Poland 9/3/1982 251008.63 Health Coach IV Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮ 1 16921058688961234934 +510 2016-02-03 23:26:27.000000000 511 Joshua Henderson jhendersone6@ibm.com Male 189.194.172.167 3549636412646146 Georgia 224387.52 1 14619781117756705513 +827 2016-02-03 10:25:17.000000000 828 Dennis Hudson dhudsonmz@bluehost.com Male 149.206.203.200 Slovenia 10/19/1973 277837.4 Health Coach I 1 9582925868983761804 +705 2016-02-03 06:13:59.000000000 706 Bonnie Webb bwebbjl@typepad.com Female 156.229.251.59 5108754882959325 Japan 91063.61 1 17189555236089554906 +174 2016-02-03 07:37:27.000000000 175 Frank Fuller ffuller4u@reverbnation.com Male 202.196.250.12 337941772142823 Russia 5/23/1967 70592.13 Professor 1 10679779506911848348 +89 2016-02-03 23:22:18.000000000 90 Edward Garza egarza2h@moonfruit.com Male 43.21.138.236 
New Zealand 3/27/1965 139025.58 Structural Analysis Engineer 1 621718393876357924 +802 2016-02-03 00:27:23.000000000 803 Donald Wood dwoodma@parallels.com Male 212.8.149.51 67610717455795070 Mexico 6/22/1971 20752.43 Chief Design Engineer Œ„´‰ˇÁ¨ˆØ∏”’ 1 16726890607545042626 +599 2016-02-03 18:41:21.000000000 600 Gregory Hunt ghuntgn@economist.com 30.25.113.84 Thailand 8/13/1966 \N Editor åß∂ƒ©˙∆˚¬…æ 1 \N +911 2016-02-03 03:21:08.000000000 912 Anna Gardner agardnerpb@tmall.com Female 124.152.61.231 3588670541278424 Pakistan 11/22/1962 51597.06 Quality Engineer 1 5179387130181465779 +63 2016-02-03 00:10:33.000000000 64 Rose Fernandez rfernandez1r@usgs.gov Female 199.141.221.229 3564435193511524 Brazil 5/5/1972 196329.18 Senior Cost Accountant 1 9074151447130954378 +162 2016-02-03 19:18:03.000000000 163 Beverly Stanley bstanley4i@csmonitor.com Female 168.4.61.47 4903829552442845848 Mexico 204157.29 1 3613874353723744397 +958 2016-02-03 00:51:37.000000000 959 Kimberly Alvarez kalvarezqm@gizmodo.com Female 244.177.51.246 30135810163038 Philippines 8/5/1976 211292 Design Engineer 1 16582665575228982440 + +\N \N \N \N \N \N \N \N \N \N \N \N \N \N 1000 15440837781074178314 +=== Try load data from userdata4.parquet +540 2016-02-04 02:16:52.000000000 541 Richard Cook rcookf0@tmall.com Male 188.91.20.3 56022193398456680 Brazil 8/30/1994 100666.86 Geological Engineer 1 10437379725430447236 +923 2016-02-04 00:38:14.000000000 924 Jimmy Nelson jnelsonpn@rediff.com Male 244.130.194.232 Norway 259092.5 1 353377476397909171 +988 2016-02-04 07:11:31.000000000 989 Pamela Jones pjonesrg@wisc.edu 35.142.52.21 Czech Republic 6/17/1999 \N Chemical Engineer 1 \N +73 2016-02-04 13:29:27.000000000 74 Sandra Lee slee21@hatena.ne.jp Female 196.212.29.124 China 12/25/1976 190399.56 Assistant Media Planner ../../../../../../../../../../../etc/passwd%00 1 10099339221456216644 +639 2016-02-04 06:34:51.000000000 640 Kenneth Kelley kkelleyhr@wix.com Male 231.69.59.210 502031912635378130 China 
5/20/1997 203635.14 Systems Administrator IV 1 9415252749604975816 +959 2016-02-04 00:58:28.000000000 960 Craig Shaw cshawqn@wordpress.org Male 88.203.243.165 5602229798654196 Tanzania 8/5/1999 119584.32 Senior Sales Associate 1 4406609943576080992 +286 2016-02-04 17:22:55.000000000 287 Mary Fuller mfuller7y@4shared.com Female 39.100.185.164 6759049803103370 Philippines 76958.57 1 11262262179434553359 +815 2016-02-04 08:01:05.000000000 816 Carolyn Berry cberrymn@merriam-webster.com 248.80.238.134 630475212324214080 China 7/14/1983 \N Environmental Specialist 1 \N +212 2016-02-04 01:44:35.000000000 213 Stephanie Stephens sstephens5w@noaa.gov Female 46.204.42.128 Poland 5/9/1970 111085.25 VP Product Management 1 11707927600058205217 +510 2016-02-04 05:36:45.000000000 511 Ashley Wallace awallacee6@businesswire.com Female 42.73.144.160 3533785396589455 Afghanistan 253647.58 1 10616068838779795227 +827 2016-02-04 02:56:30.000000000 828 Phyllis Thompson pthompsonmz@dyndns.org Female 194.131.117.5 67639203453515726 Guatemala 1/14/1965 169581.64 Account Representative II 1 2913164633101786028 +705 2016-02-04 02:32:16.000000000 706 Randy Lynch rlynchjl@smh.com.au 150.42.215.175 633412716325659135 Guadeloupe \N 1 \N +174 2016-02-04 04:22:19.000000000 175 Matthew West mwest4u@woothemes.com Male 84.96.179.102 201597130637699 China 162836.18 1 1664772539401809508 +89 2016-02-04 05:58:21.000000000 90 Terry Parker tparker2h@hc360.com Male 189.36.77.133 China 4/2/1987 232623.76 GIS Technical Architect 1 14233491041490997279 +802 2016-02-04 01:42:21.000000000 803 Ann Rose arosema@weather.com Female 100.21.196.218 343079152971743 Panama 8/4/1982 23346.01 Recruiter 1 16223618844547598244 +599 2016-02-04 20:52:10.000000000 600 Matthew Johnson mjohnsongn@zdnet.com Male 90.233.99.234 3576872442196283 Greece 8/2/1956 192843.59 Database Administrator I 1 13239287735152636039 +911 2016-02-04 23:09:08.000000000 912 Carl Reid creidpb@who.int Male 118.17.115.33 3529372387916827 Madagascar 
119188.6 ̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟ 1 18058487470247735990 +63 2016-02-04 19:20:35.000000000 64 Samuel Bishop sbishop1r@npr.org Male 87.38.89.122 3534693555244475 Indonesia 97009.57 1 439069544912696274 +162 2016-02-04 11:25:44.000000000 163 Joe Campbell jcampbell4i@woothemes.com Male 51.128.14.99 3565057982691561 Ireland 5/17/1987 128833.87 Human Resources Assistant I 1 14365997131972604627 +958 2016-02-04 05:52:40.000000000 959 Adam Robinson arobinsonqm@ameblo.jp Male 246.55.43.219 36766354159810 China 3/18/1986 214321.9 Administrative Assistant II 1 5979803470117383329 + +\N \N \N \N \N \N \N \N \N \N \N \N \N \N 1000 17094857519419304960 +=== Try load data from userdata5.parquet +540 2016-02-04 17:40:57.000000000 541 Stephanie Watkins swatkinsf0@state.gov Female 145.133.26.223 3568947026673228 China 1/6/1970 182075.75 Accounting Assistant I 1 12719679438472110629 +923 2016-02-04 11:12:31.000000000 924 Karen Bryant kbryantpn@phpbb.com Female 201.240.171.189 China 2/27/1965 209754.44 Librarian 1 9513760761369238532 +988 2016-02-04 23:53:57.000000000 989 Raymond Reynolds rreynoldsrg@ezinearticles.com Male 131.241.110.218 3548006380185694 Cameroon 9/29/1987 234254.11 Administrative Officer 1 9292996498794941706 +73 2016-02-04 01:54:58.000000000 74 Jesse Dixon jdixon21@bloglines.com Male 156.125.120.208 Syria 277530.58 (╯°□°)╯︵ ┻━┻) 1 7700275595939105069 +639 2016-02-04 02:33:38.000000000 640 Rose Franklin rfranklinhr@eepurl.com 117.58.38.167 Japan 3/2/1971 \N Help Desk Operator "ثم نفس سقطت وبالتحديد، 1 \N +959 2016-02-04 18:54:50.000000000 960 Ralph Mitchell rmitchellqn@fda.gov Male 75.18.221.126 Sweden 1/19/1991 55185.48 Developer III 1 7269044307705662703 +286 2016-02-04 16:22:26.000000000 287 Raymond Perez rperez7y@fc2.com Male 103.116.49.188 6384418686319994 France 133705.64 
-1/2 1 5779306157384476126 +815 2016-02-04 19:16:50.000000000 816 Bonnie Lawrence blawrencemn@amazon.de 134.65.96.123 5002353226125524 Thailand 10/10/1982 \N Tax Accountant 1 \N +212 2016-02-04 08:28:49.000000000 213 Emily Hansen ehansen5w@house.gov Female 218.61.79.46 3539439843757224 China 11/3/1962 212085.53 Technical Writer 1E+02 1 13040741097555162852 +510 2016-02-04 07:35:46.000000000 511 Shirley Graham sgrahame6@clickbank.net Female 233.119.183.15 Philippines 11/19/1995 123469.85 Account Representative I 1 4682654051550734590 +827 2016-02-04 04:16:18.000000000 828 John Wallace jwallacemz@sbwire.com Male 252.210.159.180 Japan 145020.04 1 17121090033668971382 +705 2016-02-04 04:32:57.000000000 706 Nancy Brown nbrownjl@scientificamerican.com 153.67.146.80 China 8/25/1962 \N Nuclear Power Engineer ␡ 1 \N +174 2016-02-04 04:45:00.000000000 175 Robert Spencer rspencer4u@tinypic.com Male 229.31.140.211 China 9/5/1977 134844.31 Business Systems Development Analyst 1 5982920501085697949 +89 2016-02-04 13:14:27.000000000 90 Jose Wallace jwallace2h@about.com Male 250.231.81.57 Philippines 12/17/1983 213500.16 Design Engineer 1 5655515634053624982 +802 2016-02-04 20:42:48.000000000 803 Paula Allen pallenma@boston.com Female 113.118.168.148 4917830849224286 Indonesia 7/22/1974 285735.68 Nurse 1 6738378110910741598 +599 2016-02-04 06:29:32.000000000 600 Wanda Reyes wreyesgn@chicagotribune.com Female 138.59.39.89 30241930229004 China 11/1/1969 41876.49 Design Engineer 1 6802731161147542418 +911 2016-02-04 00:08:27.000000000 912 Evelyn Fisher efisherpb@soup.io Female 221.207.200.158 201473318880354 China 5/17/1998 208654.68 Geological Engineer 1 9524608675424609675 +63 2016-02-04 09:26:01.000000000 64 Dorothy Gray dgray1r@vimeo.com Female 206.99.76.117 3582462082297450 China 10/8/1975 58802.03 Staff Scientist -1.00 1 658839239639963073 +162 2016-02-04 14:50:17.000000000 163 Linda Mason lmason4i@i2i.jp Female 142.61.140.106 3584383784706648 Finland 12/31/1995 174353.91 Sales 
Associate 1 12432536431851155457 +958 2016-02-04 20:53:58.000000000 959 Kevin Ruiz kruizqm@eventbrite.com Male 227.54.214.102 5602259166959980322 Philippines 1/1/1974 193459.78 Speech Pathologist 1 16616073755473516051 + +\N \N \N \N \N \N \N \N \N \N \N \N \N \N 1000 9687591295559685771 +=== Try load data from v0.7.1.all-named-index.parquet +9 0.24 62.8 57 336 3.94 3.96 2.48 Very Good J VVS2 1 3996266068447330517 +0 0.22 65.1 61 337 3.87 3.78 2.49 Fair E VS2 1 3997278206514178950 +4 0.21 59.8 61 326 3.89 3.84 2.31 Premium E SI1 1 8019383158266555191 +3 0.23 61.5 55 326 3.95 3.98 2.43 Ideal E SI2 1 14186841138840780957 +1 0.23 56.9 65 327 4.05 4.07 2.31 Good E VS1 1 15443496918742015947 +8 0.24 62.3 57 336 3.95 3.98 2.47 Very Good I VVS1 1 13900850617142433881 +6 0.26 61.9 55 337 4.07 4.11 2.53 Very Good H SI1 1 14657634484271004934 +7 0.23 59.4 61 338 4 4.05 2.39 Very Good H VS1 1 8407731385089321422 +5 0.29 62.4 58 334 4.2 4.23 2.63 Premium I VS2 1 15768860839508581766 +2 0.31 63.3 58 335 4.34 4.35 2.75 Good J SI2 1 774416328789992944 + +\N \N \N \N \N \N \N \N \N \N \N 10 6919038777064438429 +=== Try load data from v0.7.1.column-metadata-handling.parquet +0 1 0.1 2016-12-31 23:00:00.000000 a 2016-12-31 23:00:00.000000 1 4077140958183322084 +1 2 0.2 2017-01-01 23:00:00.000000 b 2017-01-01 23:00:00.000000 1 7684735238820339858 +2 3 0.3 2017-01-02 23:00:00.000000 c 2017-01-02 23:00:00.000000 1 18375738761744497728 + +\N \N \N \N \N \N 3 11690870885038608054 +=== Try load data from v0.7.1.parquet +9 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 9 1 7435848941623393462 +0 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 0 1 9641248315475845441 +4 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 4 1 1951911154143418891 +3 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 3 1 11507922016923702495 +1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 1 1 6071398217214065536 +8 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 8 1 12958836396990906934 +6 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 
2.47 6 1 196947299945926025 +7 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 7 1 3545027448448316782 +5 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 5 1 18280121696268577926 +2 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 2 1 14500005620171806395 + +\N \N \N \N \N \N \N \N \N \N \N \N 10 12302290812367753423 +=== Try load data from v0.7.1.some-named-index.parquet +9 0.24 62.8 57 336 3.94 3.96 2.48 Very Good J VVS2 1 3996266068447330517 +0 0.22 65.1 61 337 3.87 3.78 2.49 Fair E VS2 1 3997278206514178950 +4 0.21 59.8 61 326 3.89 3.84 2.31 Premium E SI1 1 8019383158266555191 +3 0.23 61.5 55 326 3.95 3.98 2.43 Ideal E SI2 1 14186841138840780957 +1 0.23 56.9 65 327 4.05 4.07 2.31 Good E VS1 1 15443496918742015947 +8 0.24 62.3 57 336 3.95 3.98 2.47 Very Good I VVS1 1 13900850617142433881 +6 0.26 61.9 55 337 4.07 4.11 2.53 Very Good H SI1 1 14657634484271004934 +7 0.23 59.4 61 338 4 4.05 2.39 Very Good H VS1 1 8407731385089321422 +5 0.29 62.4 58 334 4.2 4.23 2.63 Premium I VS2 1 15768860839508581766 +2 0.31 63.3 58 335 4.34 4.35 2.75 Good J SI2 1 774416328789992944 + +\N \N \N \N \N \N \N \N \N \N \N 10 6919038777064438429 diff --git a/tests/queries/0_stateless/00900_long_parquet_load_2.sh b/tests/queries/0_stateless/00900_long_parquet_load_2.sh new file mode 100755 index 000000000000..34b668ca0f13 --- /dev/null +++ b/tests/queries/0_stateless/00900_long_parquet_load_2.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Tags: long, no-fasttest, no-asan, no-msan, no-tsan +#asdqwe no-debug + +# Load various .parquet files from the internet, and files used by other tests. + +# userdata{1..5}.parquet are from: +# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet +# ... +# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata5.parquet + +# set -x + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +DATA_DIR=$CUR_DIR/data_parquet + +# TODO [parquet]: Known issues to investigate: + +# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue. +# There is failure due parsing nested arrays or nested maps with NULLs: +# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) + +# Strange behaviour for repeated_no_annotation.parquet around __buitin_expect, so this file was disabled: +# debug: +# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) +# release: +# Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin + +EXCLUDE=( + # These have extremely long strings and blow up the output. + list_monotonically_increasing_offsets.parquet + string_int_list_inconsistent_offset_multiple_batches.parquet + # Date out of range. + 02716_data.parquet +) + +for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_ALL=C sort | grep -vFf <(printf '%s\n' "${EXCLUDE[@]}")); do + echo "=== Try load data from $NAME" + + # We want to read the file once and get both a hash of the whole data and a sample of a few + # pseudorandomly chosen rows. We use a dummy GROUP BY WITH TOTALS for that. + # (Maybe the unnecessary GROUP BY could be expensive if there are lots of rows, but these test + # files don't have lots of rows or they would be too big to check into git.) 
+ # TODO [parquet]: Delete the input_format_parquet_enable_json_parsing=0 when cityHash64 + # supports JSON: https://github.com/ClickHouse/ClickHouse/issues/87734 + # TODO [parquet]: Delete the session_timezone='UTC' after https://github.com/ClickHouse/ClickHouse/pull/87872 + ${CLICKHOUSE_LOCAL} --query=" + SELECT _row_number, *, count(), sum(cityHash64(_row_number, *)) FROM file('$DATA_DIR/$NAME') GROUP BY all WITH TOTALS ORDER BY cityHash64(_row_number) LIMIT 20 SETTINGS input_format_parquet_enable_json_parsing=0, session_timezone='UTC';" 2>&1 +done diff --git a/tests/queries/0_stateless/data_parquet/alltypes_dictionary.parquet.columns b/tests/queries/0_stateless/data_parquet/alltypes_dictionary.parquet.columns deleted file mode 100644 index cbc891b2ca74..000000000000 --- a/tests/queries/0_stateless/data_parquet/alltypes_dictionary.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), `int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) diff --git a/tests/queries/0_stateless/data_parquet/alltypes_list.parquet.columns b/tests/queries/0_stateless/data_parquet/alltypes_list.parquet.columns deleted file mode 100644 index 3bf762ed7d5f..000000000000 --- a/tests/queries/0_stateless/data_parquet/alltypes_list.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`a1` Array(Int8), `a2` Array(UInt8), `a3` Array(Int16), `a4` Array(UInt16), `a5` Array(Int32), `a6` Array(UInt32), `a7` Array(Int64), `a8` Array(UInt64), `a9` Array(String), `a10` Array(FixedString(4)), `a11` Array(Float32), `a12` Array(Float64), `a13` Array(Date), `a14` Array(Datetime('Asia/Istanbul')), `a15` Array(Decimal(4, 2)), `a16` Array(Decimal(10, 2)), `a17` Array(Decimal(25, 2)) diff --git 
a/tests/queries/0_stateless/data_parquet/alltypes_plain.parquet.columns b/tests/queries/0_stateless/data_parquet/alltypes_plain.parquet.columns deleted file mode 100644 index cbc891b2ca74..000000000000 --- a/tests/queries/0_stateless/data_parquet/alltypes_plain.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), `int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) diff --git a/tests/queries/0_stateless/data_parquet/alltypes_plain.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/alltypes_plain.snappy.parquet.columns deleted file mode 100644 index cbc891b2ca74..000000000000 --- a/tests/queries/0_stateless/data_parquet/alltypes_plain.snappy.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), `int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) diff --git a/tests/queries/0_stateless/data_parquet/array_float.parquet b/tests/queries/0_stateless/data_parquet/array_float.parquet old mode 100755 new mode 100644 diff --git a/tests/queries/0_stateless/data_parquet/array_float.parquet.columns b/tests/queries/0_stateless/data_parquet/array_float.parquet.columns deleted file mode 100644 index 19f950d2b777..000000000000 --- a/tests/queries/0_stateless/data_parquet/array_float.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -idx String, lst Array(Float32) \ No newline at end of file diff --git a/tests/queries/0_stateless/data_parquet/array_int.parquet.columns b/tests/queries/0_stateless/data_parquet/array_int.parquet.columns deleted 
file mode 100644 index 3b9395715d7e..000000000000 --- a/tests/queries/0_stateless/data_parquet/array_int.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`idx` String, `lst` Array(Int32) \ No newline at end of file diff --git a/tests/queries/0_stateless/data_parquet/array_string.parquet b/tests/queries/0_stateless/data_parquet/array_string.parquet old mode 100755 new mode 100644 diff --git a/tests/queries/0_stateless/data_parquet/array_string.parquet.columns b/tests/queries/0_stateless/data_parquet/array_string.parquet.columns deleted file mode 100644 index 67a7b6bc04d5..000000000000 --- a/tests/queries/0_stateless/data_parquet/array_string.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`idx` String, `lst` Array(String) \ No newline at end of file diff --git a/tests/queries/0_stateless/data_parquet/binary.parquet.columns b/tests/queries/0_stateless/data_parquet/binary.parquet.columns deleted file mode 100644 index 8f3b137647fd..000000000000 --- a/tests/queries/0_stateless/data_parquet/binary.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`foo` Nullable(String) diff --git a/tests/queries/0_stateless/data_parquet/byte_array_decimal.parquet.columns b/tests/queries/0_stateless/data_parquet/byte_array_decimal.parquet.columns deleted file mode 100644 index cb2a97de8c45..000000000000 --- a/tests/queries/0_stateless/data_parquet/byte_array_decimal.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`value` Nullable(Decimal(4, 2)) diff --git a/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns b/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns deleted file mode 100644 index e25da8f923d1..000000000000 --- a/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`Id` Nullable(String), `Score` Nullable(Int32) diff --git a/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns 
b/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns deleted file mode 100644 index dc094bef8ede..000000000000 --- a/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32)) diff --git a/tests/queries/0_stateless/data_parquet/datatype-date32.parquet.columns b/tests/queries/0_stateless/data_parquet/datatype-date32.parquet.columns deleted file mode 100644 index 202a8a7087b3..000000000000 --- a/tests/queries/0_stateless/data_parquet/datatype-date32.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`date32` Date32 \ No newline at end of file diff --git a/tests/queries/0_stateless/data_parquet/dict-page-offset-zero.parquet.columns b/tests/queries/0_stateless/data_parquet/dict-page-offset-zero.parquet.columns deleted file mode 100644 index 1ea0876ce954..000000000000 --- a/tests/queries/0_stateless/data_parquet/dict-page-offset-zero.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`l_partkey` Nullable(Int32) diff --git a/tests/queries/0_stateless/data_parquet/fixed_array_int.parquet.columns b/tests/queries/0_stateless/data_parquet/fixed_array_int.parquet.columns deleted file mode 100644 index 2c383e8343f8..000000000000 --- a/tests/queries/0_stateless/data_parquet/fixed_array_int.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`idx` String, `lst` Array(Int64) \ No newline at end of file diff --git a/tests/queries/0_stateless/data_parquet/fixed_array_nested_list_int.parquet.columns b/tests/queries/0_stateless/data_parquet/fixed_array_nested_list_int.parquet.columns deleted file mode 100644 index 3e1c9533a324..000000000000 --- a/tests/queries/0_stateless/data_parquet/fixed_array_nested_list_int.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`idx` String, `lst` Array(Array(Int64)) \ No newline at end of file diff --git a/tests/queries/0_stateless/data_parquet/fixed_array_str.parquet.columns 
b/tests/queries/0_stateless/data_parquet/fixed_array_str.parquet.columns deleted file mode 100644 index 67a7b6bc04d5..000000000000 --- a/tests/queries/0_stateless/data_parquet/fixed_array_str.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`idx` String, `lst` Array(String) \ No newline at end of file diff --git a/tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet.columns b/tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet.columns deleted file mode 100644 index 469105337a64..000000000000 --- a/tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`value` Nullable(Decimal(25, 2)) diff --git a/tests/queries/0_stateless/data_parquet/fixed_length_decimal_1.parquet.columns b/tests/queries/0_stateless/data_parquet/fixed_length_decimal_1.parquet.columns deleted file mode 100644 index 469105337a64..000000000000 --- a/tests/queries/0_stateless/data_parquet/fixed_length_decimal_1.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`value` Nullable(Decimal(25, 2)) diff --git a/tests/queries/0_stateless/data_parquet/fixed_length_decimal_legacy.parquet.columns b/tests/queries/0_stateless/data_parquet/fixed_length_decimal_legacy.parquet.columns deleted file mode 100644 index 5e61877db589..000000000000 --- a/tests/queries/0_stateless/data_parquet/fixed_length_decimal_legacy.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`value` Nullable(Decimal(13, 2)) diff --git a/tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet.columns b/tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet.columns deleted file mode 100644 index 5a0c330c88f8..000000000000 --- a/tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`c0` Nullable(Int64), `c1` Nullable(String), `v11` Nullable(Float64) diff --git a/tests/queries/0_stateless/data_parquet/int32_decimal.parquet.columns b/tests/queries/0_stateless/data_parquet/int32_decimal.parquet.columns 
deleted file mode 100644 index cb2a97de8c45..000000000000 --- a/tests/queries/0_stateless/data_parquet/int32_decimal.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`value` Nullable(Decimal(4, 2)) diff --git a/tests/queries/0_stateless/data_parquet/int64_decimal.parquet.columns b/tests/queries/0_stateless/data_parquet/int64_decimal.parquet.columns deleted file mode 100644 index 3624a5719702..000000000000 --- a/tests/queries/0_stateless/data_parquet/int64_decimal.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`value` Nullable(Decimal(10, 2)) diff --git a/tests/queries/0_stateless/data_parquet/iris.parquet b/tests/queries/0_stateless/data_parquet/iris.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9224dead94cc960679a7f5681efe4efdbe61bedc GIT binary patch literal 2448 zcmZ`*L1-Ii7JgDj^8efVlXZ3pC; zdH?&rdGCGSd%w%Cyb)^BKaJ|c>>u^rG@i3MOY~n^I738t)2X#gDmNZ|@^LDa8;idE zCY7>Aqs;l-+Gs?X6nmtvI7F-?84DdvXHqnFS-_xdIE8ioJV!QiQLCRkvB_aavfO&!Uh*vVA10Zx#hO|yEjP#u}B3KldA+>P!8pB ze@1QN-eI4{RKgxN!ySA-B1iea$m5I8D2J1k+ zxZ=-$qG`7+YoUiI60W+DtN^FMEiNNahxZXV?=2NWL9TAfJ%nTfsumXMIkmNPM9-?d zaFYrMw~G&ZT%+^alO2w*UdGP}YY4x|9ePf;7JIM*f%kBCm9tdG{svuCg4YvQu)dro6!ktqdJRu7@t3~DmYkEWlq*N8PABJy*18MU%M{bw>%nfVJ z1|foXN$Qxsef}JMGYeR*%9zrs?G~j&XMrlT9ag0PkVC_$q|`#zxdrL-@1XLnFkAJ&*s*lhJTH)KGThUTQh+mywVC%a}1Z)a46>%4=MeHww} z5=!d?Dz%^icr5L3D{Pe#Q-BJ7MGnGh*jy-*tdl?}+;z)PYfkmT9-d;izUVB)ypKHM zaKi+(SmiCsBA5(>vQyelaWdPQiRrx!H$lLasm^Ru?uPXRL9!e6coUT?%4nfN1vDKl zam?c5-(eBew+`pP;@aVy=}B>a4#d)tcF45gLv3d+KI@_9gk?IH9zcaQ{frVFXrF7` z$Xv6gA8N0JLRV;Cn}&OgY#*TNzPlqE3m%nxLd(_ZV7^7X#Xj3c@ilINrNhf;`a>O~ zsxjt)te8JBde64(2nZfTV0aY9z5~e=Q7#`6virYLNGzwS7IaXTWe!lq#N9+|ZAcFq zbIkbQ0Ar&ftHu$PH4+N|A#!Q4QUnw@H@i0dn2fe&#iF=)UqW;P(4Xk6QT?*DN{$13#l! 
zk1%mIq5K1k^D+x?^-zO3cqdSbPX|zYmm4mc%t$4gL7yII41|6a#Z(#7t!G&gKKWSr z7DZ^%N5D-%Hh4+sq4Gl?riO9aBbqN0J2FDYdKd(WTa_EQ2MA~nbdqx1mTW`AZj(KE zgggxJmRSI1f{A+w(WUj4j299rbqG>?jFajptib9?2*MI#B&ObEZ z#3l|w`rGR58@HB+@$~oAo43}l-nf3_rZjKsH?OU~yMA>Uv$mkU{=sPv4$iYL9`l)A zp_SB1TBqIXFF`UZ|GN72%3E*Vx>{X*XIT%E9-c@$xm&BXuCLzy_XJV?ZrUxl|Cq$lGwf;{rcZrEr?H;DGnW5! zXi~iFh{36GF>=Io|Kcc5{%=^3^%KO?L7o79I5heB&

~5qr#a3h}fRD^C!gO7aBq z^P$Nv&z?YDIDy=sdlKYn>r;>;r;2of_;q%&tAS|ZBC>!uD8N^J#Q(R#VuH-M6J)z5 zRUW{7mdf8vP5xMzyi+JR@ 76: - raise RuntimeError( - "Column {} has invalid Decimal precision {}".format( - column_name, precision - ) - ) - if precision > 38: - raise RuntimeError( - "Column {} has unsupported Decimal precision {}".format( - column_name, precision - ) - ) - if scale < 0 or scale > precision: - raise RuntimeError( - "Column {} has invalid Decimal scale {} for precision {}".format( - column_name, scale, precision - ) - ) - return "Decimal({}, {})".format(precision, scale) - if converted_type and converted_type != "NONE": - result_type = TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE.get(converted_type) - if result_type: - return result_type - raise RuntimeError( - "Column {} has unknown ConvertedType: {}".format( - column_name, converted_type - ) - ) - if physical_type and physical_type != "NONE": - result_type = TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE.get(physical_type) - if result_type: - return result_type - raise RuntimeError( - "Column {} has unknown PhysicalType: {}".format(column_name, physical_type) - ) - raise RuntimeError( - "Column {} has invalid types: ConvertedType={}, PhysicalType={}".format( - column_name, converted_type, physical_type - ) - ) - - -def dump_columns(obj): - descr_by_column_name = {} - columns_descr = [] - for column in obj["Columns"]: - column_name = get_column_name(column) - column_type = resolve_clickhouse_column_type(column) - result_type = "Nullable({})".format(column_type) - if column_name in descr_by_column_name: - descr = descr_by_column_name[column_name] - descr["types"].append(result_type) - else: - descr = { - "name": column_name, - "types": [result_type], - } - descr_by_column_name[column_name] = descr - columns_descr.append(descr) - - # Make tuples from nested types. CH Server doesn't support such Arrow type but it makes Server Exceptions more relevant. 
- def _format_type(types): - if len(types) == 1: - return types[0] - else: - return "Tuple({})".format(", ".join(types)) - - print( - ", ".join( - map( - lambda descr: "`{}` {}".format( - descr["name"], _format_type(descr["types"]) - ), - columns_descr, - ) - ) - ) - - -def dump_columns_from_file(filename): - dump_columns(json.loads(read_file(filename), strict=False)) - - -if __name__ == "__main__": - filename = sys.argv[1] - dump_columns_from_file(filename) From c1f266b7894fe037394f801543bf53111501d919 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Tue, 16 Dec 2025 17:33:39 +0100 Subject: [PATCH 3/7] Revert "Merge pull request #1171 from Altinity/mkmkme/antalya-25.8-missing-field_id" This reverts commit f5fb292ae0cc37a2f2f4bbdb10b21328ee363eae, reversing changes made to 923825b4b32710ca0e76816b351f85ae2568f149. --- .../Formats/Impl/Parquet/SchemaConverter.cpp | 7 +----- .../ObjectStorage/StorageObjectStorage.cpp | 22 ++----------------- .../integration/test_storage_iceberg/test.py | 3 +-- 3 files changed, 4 insertions(+), 28 deletions(-) diff --git a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp index 714ec0f1c61d..3b1429c5231e 100644 --- a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp +++ b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp @@ -133,12 +133,7 @@ std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElem return element.name; const auto & map = column_mapper->getFieldIdToClickHouseName(); if (!element.__isset.field_id) - { - /// Does iceberg require that parquet files have field ids? - /// Our iceberg writer currently doesn't write them. 
- //throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Missing field_id for column {}", element.name); - return element.name; - } + throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Missing field_id for column {}", element.name); auto it = map.find(element.field_id); if (it == map.end()) throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Parquet file has column {} with field_id {} that is not in datalake metadata", element.name, element.field_id); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 597fea59cd21..740c3b4d1efb 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -203,26 +203,7 @@ StorageObjectStorage::StorageObjectStorage( sample_path); } - /// TODO: Known problems with datalake prewhere: - /// * If the iceberg table went through schema evolution, columns read from file may need to - /// be renamed or typecast before applying prewhere. There's already a mechanism for - /// telling parquet reader to rename columns: ColumnMapper. And parquet reader already - /// automatically does type casts to requested types. But weirdly the iceberg reader uses - /// those mechanism to request the *old* name and type of the column, then has additional - /// code to do the renaming and casting as a separate step outside parquet reader. - /// We should probably change this and delete that additional code? - /// * Delta Lake can have "partition columns", which are columns with constant value specified - /// in the metadata, not present in parquet file. Like hive partitioning, but in metadata - /// files instead of path. Currently these columns are added to the block outside parquet - /// reader. If they appear in prewhere expression, parquet reader gets a "no column in block" - /// error. Unlike hive partitioning, we can't (?) 
just return these columns from - /// supportedPrewhereColumns() because at the time of the call the delta lake metadata hasn't - /// been read yet. So we should probably pass these columns to the parquet reader instead of - /// adding them outside. - /// * There's a bug in StorageObjectStorageSource::createReader: it makes a copy of - /// FormatFilterInfo, but for some reason unsets prewhere_info and row_level_filter_info. - /// There's probably no reason for this, and it should just copy those fields like the others. - supports_prewhere = !configuration->isDataLakeConfiguration() && FormatFactory::instance().checkIfFormatSupportsPrewhere(configuration->getFormat(), context, format_settings); + supports_prewhere = FormatFactory::instance().checkIfFormatSupportsPrewhere(configuration->getFormat(), context, format_settings); StorageInMemoryMetadata metadata; metadata.setColumns(columns); @@ -726,3 +707,4 @@ void StorageObjectStorage::checkAlterIsPossible(const AlterCommands & commands, } } + diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index b2ccd8375b04..f101d1078141 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -875,8 +875,7 @@ def test_position_deletes_out_of_order(started_cluster, use_roaring_bitmaps): create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, additional_settings=["input_format_parquet_use_native_reader_v3=1", f"use_roaring_bitmap_iceberg_positional_deletes={use_roaring_bitmaps}"]) - # TODO: Replace WHERE with PREWHERE when we add prewhere support for datalakes. 
- assert get_array(instance.query(f"SELECT id FROM {TABLE_NAME} WHERE NOT sleepEachRow(1/100) order by id")) == list(range(10, 103)) + [104] + assert get_array(instance.query(f"SELECT id FROM {TABLE_NAME} PREWHERE NOT sleepEachRow(1/100) order by id")) == list(range(10, 103)) + [104] instance.query(f"DROP TABLE {TABLE_NAME}") From 70bf34d2aae7f1bf0d3fbaf94ffd67d8e58c8f6a Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 14 Nov 2025 17:44:51 +0000 Subject: [PATCH 4/7] Merge pull request #88827 from ClickHouse/pqe Enable parquet reader v3 by default --- src/Common/threadPoolCallbackRunner.cpp | 25 ++++++--- src/Core/FormatFactorySettings.h | 6 +-- src/Core/SettingsChangesHistory.cpp | 3 +- .../Formats/Impl/Parquet/Decoding.cpp | 8 +-- .../Formats/Impl/Parquet/Prefetcher.cpp | 2 +- .../Formats/Impl/Parquet/ReadManager.cpp | 2 +- .../Formats/Impl/Parquet/Reader.cpp | 51 ++++++++++++------ src/Processors/Formats/Impl/Parquet/Reader.h | 5 +- .../Formats/Impl/Parquet/SchemaConverter.cpp | 32 ++++++++--- .../Formats/Impl/ParquetBlockOutputFormat.cpp | 4 ++ .../ObjectStorage/StorageObjectStorage.cpp | 29 ++++++++-- .../ObjectStorage/StorageObjectStorage.h | 1 + tests/integration/test_storage_delta/test.py | 14 ++--- .../00900_long_parquet_load_2.reference | 16 ++++++ .../0_stateless/00900_long_parquet_load_2.sh | 2 +- .../data_parquet/hudi_array.parquet.gz | Bin 0 -> 1859 bytes .../hudi_array_nonnullable.parquet.gz | Bin 0 -> 1868 bytes .../data_parquet/hudi_nested_array.parquet.gz | Bin 0 -> 1872 bytes .../hudi_nested_array_nonnullable.parquet.gz | Bin 0 -> 1884 bytes 19 files changed, 147 insertions(+), 53 deletions(-) create mode 100644 tests/queries/0_stateless/data_parquet/hudi_array.parquet.gz create mode 100644 tests/queries/0_stateless/data_parquet/hudi_array_nonnullable.parquet.gz create mode 100644 tests/queries/0_stateless/data_parquet/hudi_nested_array.parquet.gz create mode 100644 
tests/queries/0_stateless/data_parquet/hudi_nested_array_nonnullable.parquet.gz diff --git a/src/Common/threadPoolCallbackRunner.cpp b/src/Common/threadPoolCallbackRunner.cpp index 81117d22ba3d..a97566c42daf 100644 --- a/src/Common/threadPoolCallbackRunner.cpp +++ b/src/Common/threadPoolCallbackRunner.cpp @@ -69,6 +69,7 @@ void ThreadPoolCallbackRunnerFast::operator()(std::function f) { std::unique_lock lock(mutex); queue.push_back(std::move(f)); + startMoreThreadsIfNeeded(active_tasks_, lock); } @@ -92,23 +93,35 @@ void ThreadPoolCallbackRunnerFast::bulkSchedule(std::vector chars, const UI { col_str.getChars().reserve(col_str.getChars().size() + (offsets[num_values - 1] - offsets[-1]) - separator_bytes * num_values); for (size_t i = 0; i < num_values; ++i) - col_str.insertData(chars.data() + offsets[i - 1], offsets[i] - offsets[i - 1] - separator_bytes); + col_str.insertData(chars.data() + offsets[ssize_t(i) - 1], offsets[i] - offsets[ssize_t(i) - 1] - separator_bytes); } } @@ -1345,8 +1345,8 @@ void BigEndianDecimalStringConverter::convertColumn(std::span cha for (size_t i = 0; i < num_values; ++i) { - const char * data = chars.data() + offsets[i - 1]; - size_t size = offsets[i] - offsets[i - 1] - separator_bytes; + const char * data = chars.data() + offsets[ssize_t(i) - 1]; + size_t size = offsets[i] - offsets[ssize_t(i) - 1] - separator_bytes; if (size > sizeof(T)) throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Unexpectedly wide Decimal value: {} > {} bytes", size, sizeof(T)); diff --git a/src/Processors/Formats/Impl/Parquet/Prefetcher.cpp b/src/Processors/Formats/Impl/Parquet/Prefetcher.cpp index 3e05029060fa..9cf2077b0c19 100644 --- a/src/Processors/Formats/Impl/Parquet/Prefetcher.cpp +++ b/src/Processors/Formats/Impl/Parquet/Prefetcher.cpp @@ -74,7 +74,7 @@ void Prefetcher::determineReadModeAndFileSize(ReadBuffer * reader_, const ReadOp if (!reader_->eof() && reader_->available() >= expected_prefix.size() && memcmp(reader_->position(), 
expected_prefix.data(), expected_prefix.size()) != 0) { - throw Exception(ErrorCodes::INCORRECT_DATA, "Not a parquet file (wrong magic bytes at the start)"); + throw Exception(ErrorCodes::INCORRECT_DATA, "Not a Parquet file (wrong magic bytes at the start)"); } WriteBufferFromVector> out(entire_file); diff --git a/src/Processors/Formats/Impl/Parquet/ReadManager.cpp b/src/Processors/Formats/Impl/Parquet/ReadManager.cpp index 0cea3ce8e407..333dc19aebab 100644 --- a/src/Processors/Formats/Impl/Parquet/ReadManager.cpp +++ b/src/Processors/Formats/Impl/Parquet/ReadManager.cpp @@ -846,7 +846,7 @@ ReadManager::ReadResult ReadManager::read() bool thread_pool_was_idle = parser_shared_resources->parsing_runner.isIdle(); if (exception) - std::rethrow_exception(exception); + std::rethrow_exception(copyMutableException(exception)); /// If `preserve_order`, only deliver chunks from `first_incomplete_row_group`. /// This ensures that row groups are delivered in order. Within a row group, row diff --git a/src/Processors/Formats/Impl/Parquet/Reader.cpp b/src/Processors/Formats/Impl/Parquet/Reader.cpp index 6bd568869669..f817068af107 100644 --- a/src/Processors/Formats/Impl/Parquet/Reader.cpp +++ b/src/Processors/Formats/Impl/Parquet/Reader.cpp @@ -15,6 +15,7 @@ #include #include +#include #if USE_SNAPPY #include @@ -28,6 +29,7 @@ namespace DB::ErrorCodes extern const int INCORRECT_DATA; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; + extern const int CHECKSUM_DOESNT_MATCH; } namespace DB::Parquet @@ -176,7 +178,7 @@ parq::FileMetaData Reader::readFileMetaData(Prefetcher & prefetcher) prefetcher.readSync(buf.data(), initial_read_size, file_size - initial_read_size); if (memcmp(buf.data() + initial_read_size - 4, "PAR1", 4) != 0) - throw Exception(ErrorCodes::INCORRECT_DATA, "Not a parquet file (wrong magic bytes at the end of file)"); + throw Exception(ErrorCodes::INCORRECT_DATA, "Not a Parquet file (wrong magic bytes at the end of file)"); int32_t 
metadata_size_i32; memcpy(&metadata_size_i32, buf.data() + initial_read_size - 8, 4); @@ -216,7 +218,7 @@ parq::FileMetaData Reader::readFileMetaData(Prefetcher & prefetcher) /// present. Instead, data_page_offset points to the dictionary page. /// (2) Old DuckDB versions (<= 0.10.2) wrote incorrect data_page_offset when dictionary is /// present. - /// We work around (1) in initializePage by allowing dictionary page in place of data page. + /// We work around (1) in initializeDataPage by allowing dictionary page in place of data page. /// We work around (2) here by converting it to case (1): /// data_page_offset = dictionary_page_offset /// dictionary_page_offset.reset() @@ -756,8 +758,9 @@ void Reader::processBloomFilterHeader(ColumnChunk & column, const PrimitiveColum bool Reader::decodeDictionaryPage(ColumnChunk & column, const PrimitiveColumnInfo & column_info) { auto data = prefetcher.getRangeData(column.dictionary_page_prefetch); - parq::PageHeader header; - size_t header_size = deserializeThriftStruct(header, data.data(), data.size()); + const char * data_ptr = data.data(); + const char * data_end = data.data() + data.size(); + auto [header, page_data] = decodeAndCheckPageHeader(data_ptr, data_end); if (header.type != parq::PageType::DICTIONARY_PAGE) { @@ -768,7 +771,7 @@ bool Reader::decodeDictionaryPage(ColumnChunk & column, const PrimitiveColumnInf return false; } - decodeDictionaryPageImpl(header, data.subspan(header_size), column, column_info); + decodeDictionaryPageImpl(header, page_data, column, column_info); return true; } @@ -776,7 +779,6 @@ void Reader::decodeDictionaryPageImpl(const parq::PageHeader & header, std::span { chassert(header.type == parq::PageType::DICTIONARY_PAGE); - /// TODO [parquet]: Check checksum. 
size_t compressed_page_size = size_t(header.compressed_page_size); if (header.compressed_page_size < 0 || compressed_page_size > data.size()) throw Exception(ErrorCodes::INCORRECT_DATA, "Dictionary page size out of bounds: {} > {}", header.compressed_page_size, data.size()); @@ -1381,7 +1383,7 @@ void Reader::skipToRow(size_t row_idx, ColumnChunk & column, const PrimitiveColu auto data = prefetcher.getRangeData(page_info.prefetch); const char * ptr = data.data(); - if (!initializePage(ptr, ptr + data.size(), first_row_idx, page_info.end_row_idx, row_idx, column, column_info)) + if (!initializeDataPage(ptr, ptr + data.size(), first_row_idx, page_info.end_row_idx, row_idx, column, column_info)) throw Exception(ErrorCodes::LOGICAL_ERROR, "Page doesn't contain requested row"); found_page = true; } @@ -1403,12 +1405,33 @@ void Reader::skipToRow(size_t row_idx, ColumnChunk & column, const PrimitiveColu chassert(column.next_page_offset <= all_pages.size()); const char * ptr = all_pages.data() + column.next_page_offset; const char * end = all_pages.data() + all_pages.size(); - initializePage(ptr, end, page.next_row_idx, /*end_row_idx=*/ std::nullopt, row_idx, column, column_info); + initializeDataPage(ptr, end, page.next_row_idx, /*end_row_idx=*/ std::nullopt, row_idx, column, column_info); column.next_page_offset = ptr - all_pages.data(); } } -bool Reader::initializePage(const char * & data_ptr, const char * data_end, size_t next_row_idx, std::optional end_row_idx, size_t target_row_idx, ColumnChunk & column, const PrimitiveColumnInfo & column_info) +std::tuple> Reader::decodeAndCheckPageHeader(const char * & data_ptr, const char * data_end) const +{ + parq::PageHeader header; + data_ptr += deserializeThriftStruct(header, data_ptr, data_end - data_ptr); + size_t compressed_page_size = size_t(header.compressed_page_size); + if (header.compressed_page_size < 0 || compressed_page_size > size_t(data_end - data_ptr)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Page size out 
of bounds: {} > {}", header.compressed_page_size, data_end - data_ptr); + + std::span page_data(data_ptr, compressed_page_size); + data_ptr += compressed_page_size; + + if (header.__isset.crc && options.format.parquet.verify_checksums) + { + uint32_t crc = arrow::internal::crc32(0, page_data.data(), page_data.size()); + if (crc != uint32_t(header.crc)) + throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Page CRC checksum verification failed"); + } + + return {header, page_data}; +} + +bool Reader::initializeDataPage(const char * & data_ptr, const char * data_end, size_t next_row_idx, std::optional end_row_idx, size_t target_row_idx, ColumnChunk & column, const PrimitiveColumnInfo & column_info) { PageState & page = column.page; /// We reuse PageState instance across pages to reuse memory in buffers like decompressed_buf. @@ -1425,13 +1448,7 @@ bool Reader::initializePage(const char * & data_ptr, const char * data_end, size /// Decode page header. parq::PageHeader header; - data_ptr += deserializeThriftStruct(header, data_ptr, data_end - data_ptr); - /// TODO [parquet]: Check checksum. - size_t compressed_page_size = size_t(header.compressed_page_size); - if (header.compressed_page_size < 0 || compressed_page_size > size_t(data_end - data_ptr)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Page size out of bounds: {} > {}", header.compressed_page_size, data_end - data_ptr); - page.data = std::span(data_ptr, compressed_page_size); - data_ptr += compressed_page_size; + std::tie(header, page.data) = decodeAndCheckPageHeader(data_ptr, data_end); /// Check if all rows of the page are filtered out, if we have enough information. 
@@ -1525,7 +1542,7 @@ bool Reader::initializePage(const char * & data_ptr, const char * data_end, size page.codec = parq::CompressionCodec::UNCOMPRESSED; } - if (encoded_def_size + encoded_rep_size > compressed_page_size) + if (encoded_def_size + encoded_rep_size > page.data.size()) throw Exception(ErrorCodes::INCORRECT_DATA, "Page data is too short (def+rep)"); encoded_rep = page.data.data(); encoded_def = page.data.data() + encoded_rep_size; diff --git a/src/Processors/Formats/Impl/Parquet/Reader.h b/src/Processors/Formats/Impl/Parquet/Reader.h index eb7cf2931f6f..8b319ec6ed96 100644 --- a/src/Processors/Formats/Impl/Parquet/Reader.h +++ b/src/Processors/Formats/Impl/Parquet/Reader.h @@ -61,7 +61,7 @@ namespace DB::Parquet // - no columns to read outside prewhere // - no columns to read, but not trivial count either // - ROW POLICY, with and without prewhere, with old and new reader -// - prewhere with defaults (it probably doesn't fill them correctly, see MergeTreeRangeReader::executeActionsBeforePrewhere) +// - prewhere and other skipping with defaults (it probably doesn't fill them correctly, see MergeTreeRangeReader::executeActionsBeforePrewhere) // - prewhere on virtual columns (do they end up in additional_columns?) 
// - prewhere with weird filter type (LowCardinality(UInt8), Nullable(UInt8), const UInt8) // - prewhere involving arrays and tuples @@ -523,7 +523,8 @@ struct Reader double estimateAverageStringLengthPerRow(const ColumnChunk & column, const RowGroup & row_group) const; void decodeDictionaryPageImpl(const parq::PageHeader & header, std::span data, ColumnChunk & column, const PrimitiveColumnInfo & column_info); void skipToRow(size_t row_idx, ColumnChunk & column, const PrimitiveColumnInfo & column_info); - bool initializePage(const char * & data_ptr, const char * data_end, size_t next_row_idx, std::optional end_row_idx, size_t target_row_idx, ColumnChunk & column, const PrimitiveColumnInfo & column_info); + std::tuple> decodeAndCheckPageHeader(const char * & data_ptr, const char * data_end) const; + bool initializeDataPage(const char * & data_ptr, const char * data_end, size_t next_row_idx, std::optional end_row_idx, size_t target_row_idx, ColumnChunk & column, const PrimitiveColumnInfo & column_info); void decompressPageIfCompressed(PageState & page); void createPageDecoder(PageState & page, ColumnChunk & column, const PrimitiveColumnInfo & column_info); bool skipRowsInPage(size_t target_row_idx, PageState & page, ColumnChunk & column, const PrimitiveColumnInfo & column_info); diff --git a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp index 3b1429c5231e..dfc26b7adcd3 100644 --- a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp +++ b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp @@ -133,7 +133,12 @@ std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElem return element.name; const auto & map = column_mapper->getFieldIdToClickHouseName(); if (!element.__isset.field_id) - throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Missing field_id for column {}", element.name); + { + /// Does iceberg require that parquet files have field ids? 
+ /// Our iceberg writer currently doesn't write them. + //throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Missing field_id for column {}", element.name); + return element.name; + } auto it = map.find(element.field_id); if (it == map.end()) throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Parquet file has column {} with field_id {} that is not in datalake metadata", element.name, element.field_id); @@ -270,15 +275,19 @@ void SchemaConverter::processSubtree(TraversalNode & node) } } -bool SchemaConverter::processSubtreePrimitive(TraversalNode & node) +static bool isPrimitiveNode(const parq::SchemaElement & elem) { /// `parquet.thrift` says "[num_children] is not set when the element is a primitive type". - /// If it's set but has value 0, logically it would make sense to interpret it as empty tuple/struct. + /// If it's set but has value 0, logically it should be an empty tuple/struct. /// But in practice some writers are sloppy about it and set this field to 0 (rather than unset) /// for primitive columns. E.g. /// tests/queries/0_stateless/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet - bool is_primitive = !node.element->__isset.num_children || (node.element->num_children == 0 && node.element->__isset.type); - if (!is_primitive) + return !elem.__isset.num_children || (elem.num_children == 0 && elem.__isset.type); +} + +bool SchemaConverter::processSubtreePrimitive(TraversalNode & node) +{ + if (!isPrimitiveNode(*node.element)) return false; primitive_column_idx += 1; @@ -468,13 +477,18 @@ bool SchemaConverter::processSubtreeMap(TraversalNode & node) bool SchemaConverter::processSubtreeArrayOuter(TraversalNode & node) { /// Array: - /// required group `name` (List): + /// required/optional group `name` (List): /// repeated group "list": /// "element" /// /// I.e. it's a double-wrapped burrito. 
To unwrap it into one Array, we have to coordinate /// across two levels of recursion: processSubtreeArrayOuter for the outer wrapper, /// processSubtreeArrayInner for the inner wrapper. + /// + /// But hudi writes arrays differently, without the inner group: + /// required/optional group `name` (List): + /// repeated "array" + /// This probably makes it indinsinguishable from a single-element tuple. if (node.element->converted_type != parq::ConvertedType::LIST && !node.element->logicalType.__isset.LIST) return false; @@ -483,10 +497,12 @@ bool SchemaConverter::processSubtreeArrayOuter(TraversalNode & node) if (node.element->num_children != 1) return false; const parq::SchemaElement & child = file_metadata.schema.at(schema_idx); - if (child.repetition_type != parq::FieldRepetitionType::REPEATED || child.num_children != 1) + if (child.repetition_type != parq::FieldRepetitionType::REPEATED) return false; - TraversalNode subnode = node.prepareToRecurse(SchemaContext::ListTuple, node.type_hint); + bool has_inner_group = child.num_children == 1; + + TraversalNode subnode = node.prepareToRecurse(has_inner_group ? 
SchemaContext::ListTuple : SchemaContext::ListElement, node.type_hint); processSubtree(subnode); if (!node.requested || !subnode.output_idx.has_value()) diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 3da015c2a46a..2649ea49233b 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -354,6 +354,10 @@ void ParquetBlockOutputFormat::writeUsingArrow(std::vector chunks) builder.version(getParquetVersion(format_settings)); auto compression_codec = getParquetCompression(format_settings.parquet.output_compression_method); builder.compression(compression_codec); + if (format_settings.parquet.max_dictionary_size == 0) + builder.disable_dictionary(); + else + builder.dictionary_pagesize_limit(format_settings.parquet.max_dictionary_size); if (arrow::util::Codec::SupportsCompressionLevel(compression_codec)) { diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 740c3b4d1efb..d66391f57671 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -203,7 +203,31 @@ StorageObjectStorage::StorageObjectStorage( sample_path); } - supports_prewhere = FormatFactory::instance().checkIfFormatSupportsPrewhere(configuration->getFormat(), context, format_settings); + bool format_supports_prewhere = FormatFactory::instance().checkIfFormatSupportsPrewhere(configuration->getFormat(), context, format_settings); + + /// TODO: Known problems with datalake prewhere: + /// * If the iceberg table went through schema evolution, columns read from file may need to + /// be renamed or typecast before applying prewhere. There's already a mechanism for + /// telling parquet reader to rename columns: ColumnMapper. And parquet reader already + /// automatically does type casts to requested types. 
But weirdly the iceberg reader uses + /// those mechanism to request the *old* name and type of the column, then has additional + /// code to do the renaming and casting as a separate step outside parquet reader. + /// We should probably change this and delete that additional code? + /// * Delta Lake can have "partition columns", which are columns with constant value specified + /// in the metadata, not present in parquet file. Like hive partitioning, but in metadata + /// files instead of path. Currently these columns are added to the block outside parquet + /// reader. If they appear in prewhere expression, parquet reader gets a "no column in block" + /// error. Unlike hive partitioning, we can't (?) just return these columns from + /// supportedPrewhereColumns() because at the time of the call the delta lake metadata hasn't + /// been read yet. So we should probably pass these columns to the parquet reader instead of + /// adding them outside. + /// * There's a bug in StorageObjectStorageSource::createReader: it makes a copy of + /// FormatFilterInfo, but for some reason unsets prewhere_info and row_level_filter_info. + /// There's probably no reason for this, and it should just copy those fields like the others. + /// * If the table contains files in different formats, with only some of them supporting + /// prewhere, things break. 
+ supports_prewhere = !configuration->isDataLakeConfiguration() && format_supports_prewhere; + supports_tuple_elements = format_supports_prewhere; StorageInMemoryMetadata metadata; metadata.setColumns(columns); @@ -357,7 +381,7 @@ void StorageObjectStorage::read( column_names, storage_snapshot, supportsSubsetOfColumns(local_context), - /*supports_tuple_elements=*/ supports_prewhere, + supports_tuple_elements, local_context, PrepareReadingFromFormatHiveParams { file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap() }); if (query_info.prewhere_info) @@ -707,4 +731,3 @@ void StorageObjectStorage::checkAlterIsPossible(const AlterCommands & commands, } } - diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 9b1fe6ea9aee..b4f029094631 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -199,6 +199,7 @@ class StorageObjectStorage : public IStorage /// (One of the reading replicas, not the initiator). const bool distributed_processing; bool supports_prewhere = false; + bool supports_tuple_elements = false; /// Whether we need to call `configuration->update()` /// (e.g. refresh configuration) on each read() method call. 
bool update_configuration_on_read_write = true; diff --git a/tests/integration/test_storage_delta/test.py b/tests/integration/test_storage_delta/test.py index 3b40e1eb4870..02c860280bff 100644 --- a/tests/integration/test_storage_delta/test.py +++ b/tests/integration/test_storage_delta/test.py @@ -66,7 +66,6 @@ "field_ids_struct_test/data/00000-1-7cad83a6-af90-42a9-8a10-114cbc862a42-0-00001.parquet", ] - def get_spark(): builder = ( pyspark.sql.SparkSession.builder.appName("spark_test") @@ -2295,21 +2294,23 @@ def test_column_pruning(started_cluster): query_id = f"query_{TABLE_NAME}_1" sum = int( instance.query( - f"SELECT sum(id) FROM {table_function} SETTINGS allow_experimental_delta_kernel_rs=0, max_read_buffer_size_remote_fs=100", + f"SELECT sum(id) FROM {table_function} SETTINGS allow_experimental_delta_kernel_rs=0, max_read_buffer_size_remote_fs=100, remote_read_min_bytes_for_seek=1, input_format_parquet_use_native_reader_v3=1", query_id=query_id, ) ) instance.query("SYSTEM FLUSH LOGS") - assert 107220 == int( + bytes_read = int( instance.query( f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE query_id = '{query_id}' and type = 'QueryFinish'" ) ) + # Slightly different number depending on reader implementation. + assert 107220 <= bytes_read <= 107232 query_id = f"query_{TABLE_NAME}_2" assert sum == int( instance.query( - f"SELECT sum(id) FROM {table_function} SETTINGS enable_filesystem_cache=0, max_read_buffer_size_remote_fs=100", + f"SELECT sum(id) FROM {table_function} SETTINGS enable_filesystem_cache=0, max_read_buffer_size_remote_fs=100, remote_read_min_bytes_for_seek=1, input_format_parquet_use_native_reader_v3=1", query_id=query_id, ) ) @@ -2319,12 +2320,13 @@ def test_column_pruning(started_cluster): f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id = '{query_id}' and type = 'QueryFinish'" ) ) - # Small diff because in case of delta-kernel metadata reading is not counted in the metric. 
- assert 105677 == int( + bytes_read = int( instance.query( f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE query_id = '{query_id}' and type = 'QueryFinish'" ) ) + # Small diff because in case of delta-kernel metadata reading is not counted in the metric. + assert 105677 <= bytes_read <= 105689 def test_concurrent_reads(started_cluster): diff --git a/tests/queries/0_stateless/00900_long_parquet_load_2.reference b/tests/queries/0_stateless/00900_long_parquet_load_2.reference index 955134ffd23c..26c944b1cda0 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load_2.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load_2.reference @@ -358,6 +358,22 @@ 2 1593604801 abc 42.125 1 9545089655608587545 \N 0 \N 4 14247877966938301061 +=== Try load data from hudi_array.parquet.gz +0 20251021133601353 20251021133601353_0_0 20251021133601353_31_0 02b5ff08-ab93-41f4-a646-2ae35d622787-0_0-16-109_20251021133601353.parquet ['str1','str2'] 1 10899520343274658172 + +\N \N \N \N \N \N [] 1 10899520343274658172 +=== Try load data from hudi_array_nonnullable.parquet.gz +0 20251021135411504 20251021135411504_0_0 20251021135411504_31_0 80554b2b-65e4-49de-8a21-dfd9d5cabb1b-0_0-16-109_20251021135411504.parquet ['str1','str2'] 1 2444990607497031498 + +\N \N \N \N \N \N [] 1 2444990607497031498 +=== Try load data from hudi_nested_array.parquet.gz +0 20251021135942392 20251021135942392_0_0 20251021135942392_31_0 8a501b64-754c-4677-b643-cc06603bf4a8-0_0-16-109_20251021135942392.parquet [['a','b'],['c','d']] 1 17699594303488273172 + +\N \N \N \N \N \N [] 1 17699594303488273172 +=== Try load data from hudi_nested_array_nonnullable.parquet.gz +0 20251021135758096 20251021135758096_0_0 20251021135758096_31_0 0f908091-95de-49c1-9ebe-213449570767-0_0-16-109_20251021135758096.parquet [['a','b'],['c','d']] 1 14842865479642761902 + +\N \N \N \N \N \N [] 1 14842865479642761902 === Try load data from int-list-zero-based-chunked-array.parquet 36993 
[36993,36994,36995,36996,36997,36998,36999,37000,37001] 1 13704691802482625316 36152 [36152,36153,36154,36155,36156,36157,36158,36159,36160] 1 17679192445898543326 diff --git a/tests/queries/0_stateless/00900_long_parquet_load_2.sh b/tests/queries/0_stateless/00900_long_parquet_load_2.sh index 34b668ca0f13..0070a97983fb 100755 --- a/tests/queries/0_stateless/00900_long_parquet_load_2.sh +++ b/tests/queries/0_stateless/00900_long_parquet_load_2.sh @@ -37,7 +37,7 @@ EXCLUDE=( 02716_data.parquet ) -for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_ALL=C sort | grep -vFf <(printf '%s\n' "${EXCLUDE[@]}")); do +for NAME in $(find "$DATA_DIR" -type f \( -iname '*.parquet' -o -iname '*.parquet.gz' \) -print0 | xargs -0 -n 1 basename | LC_ALL=C sort | grep -vFf <(printf '%s\n' "${EXCLUDE[@]}")); do echo "=== Try load data from $NAME" # We want to read the file once and get both a hash of the whole data and a sample of a few diff --git a/tests/queries/0_stateless/data_parquet/hudi_array.parquet.gz b/tests/queries/0_stateless/data_parquet/hudi_array.parquet.gz new file mode 100644 index 0000000000000000000000000000000000000000..01a117bb59901e8351d2bc5e4d64f42cab00556b GIT binary patch literal 1859 zcmZ{hdpHw%7{_&#(#X?6j@zb_gRJOjxg?k9W??9DC%0A9uxyOkI#J8qVhPDsM{BuP zn)^m6qNe*8ZJ5GnM$A1sM|yM~Pkw)WKkw)Ly}##qf1sod8$g*;r=_G%qk_)(`6H43 z7=v(sWCSWWQcB2mIID3=RjP-R(&q`)R=VL)QUFLlWPg<(j6d6)C^uvVz2s~S-EVu3 zcp~&w2M;Yov4`jH9kj9az94ODF>x%FN;K7ax>?)BG(2_fP(kY@VxXfk|F`m5*h2@l zEAHc2l6hM3mu2pYRQ-KXcSl7Q5YC(^oRS zuRf+!S2UVdi^*YPs5ZJ|PTBHSv+W=plcpQd?N*!EqGOxNAZ^5FZN$&^*4B$k7QUsm zMdfeFILE9qWz^lInasc_=6zVc>W5=<8WAzhkJY_$-6 zGA?8#c}0K=aJz2;@MDt_T%H<$V5XwJ>7q(`UMO|y&Sggt;Bv+oyvUx;qtdS37}tb0sDls1uBv5tfx0t2ys%b2Z6N%IZ5> zRO2rB1f!}N8r^V?*4m(;0X;=2i&WINOQF*}i{2KNJUepHt*KMoIiSEr;ULT^-k|(6 zzvKdf+d4eS73ypz4QREO=hN8N+|)I-ha(zi zcUe|fJNtwjnZUT|Y<@wn0-j5yV8tn3nX`4S8C4^}ewu#Wh3zEnr}+15we0O;4^>LnapJRbp*E08_7#3iX^yjg{$bz-5M#lZA3ps~id<__h2BhXwACt6B5Cc)_PH%yNZZLSRdd 
zwaG8daTqb7RWjSGTcQus_m&7DZQAmYIyATun4Q1hLOD{gEHi; z{c8VfAY%aWh(+$OIJGqNr!?1)9NhF1SEL@FvVJBg;)(gRvuq^YPE*{>SQ!H z9??@C1^>QoId=8Z_R=`>x72@iE!unjNb*WCy1z}X=I^=ty9Q;XqHTFS%f8{jDEHbj z{Rrj^aza`9+V9k3x%{Uq+j`kN~u5y2%DzGs^3Qs6Ea z3+4@$()crh4bxrk+EK`i&6ygP?aKNys2lyaLM{#Rr~nm`S=NBlRyiVEDuU literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/hudi_array_nonnullable.parquet.gz b/tests/queries/0_stateless/data_parquet/hudi_array_nonnullable.parquet.gz new file mode 100644 index 0000000000000000000000000000000000000000..7359e4ed648574fcd8440146421c8e7c36320cc4 GIT binary patch literal 1868 zcmZWndpOf;9M@Bi=|)fKR4&!2Q;M2Xl;yS)k>qSdPtGiIE0?j^Fkv|2bUW;n(A3ry zQFCn=W-FPaTw-M%m(B2-m<=(rjXB+1s`rog{d~Tk@8|h`-|zcwxvHVDC0!PvwlpBZ zFVM>yi}gP56^afGjXz3&bGgGk(%USZOM840 zwQuRia3i)|D1mu+yE(62vo=59U%aXFaRH+_nrvz)2RUo$LEIed^qjS(*Y+krq;E_v zURrwp(UtuBo9?ARb=w>7-i)BrX2dn%RcLJ%CFfE0hJ590Zq<67pyTD)yT zS53M|Z+JvfcXyw>6~icvZKF~#!FW--^Sh4jYSPL(UEQ%`&Qprll1YnNLC`sqkn!hc z%n}}oE%mD`)AyohxOLJ1b`{^vqrtIs{l)sk(RX=B!L=atZxPw_Vi)wOPH9TN+vuC9 zT(5zt%Q8JatN?XTxzZY6b(dSNKw%lP7}ca8yn!YXN8?Pog9>Gtdd$-$2zXyeyJXoe zQ1U$p17uG@ynm~?L6eK1c8J&Y<9z#kQyRhtH|rV^Qy*@KeVw};Cr4U07s6A)tK1(h zjRY=mMjnI7$|78Sj@faQ!O_}va^@Dz%nJ?FsxWn!rx|=Z^8lSzRIlLs;&W|NrbP51PB>93BbtlY`pBx#(=3x5vRs$*0gBnt)8L2(DH%6{E&TVkz!UW zuZG_yy+?8;W)gCFl4oLShhwe}t8_oroobL071P_rY2iyud-ATgOJT~17ed&BYc;<^ z9SFFPUXsbWFx(FP8wgs>8>el9KjR8$W{Ku}`e(qh7232f`3P0S>?cd-G|SlO{6k;JaCzNAd`|!7#6ZFG zwZ0J+ldUFjzfTP%5Zin{*IN*K9u(h*SfKN$M^Jq8Yux&m9~z%e`vfd+(td1m6uRg8 zzN{hfJ3)(`v40~M4RAsE&wYNBYHAqP6m{E~EejJ}$V-4H&5Z}M2D!+?GH6D8ch3da zBGhAx{aLh#FNS(sRWS$V0;7k2te-%n87z8G){J}O+y|Jm&7YVeg4M1hQPWu@JYQGZ z6lC;AU|H`_X@k@zE(hY8dd0-W)B|kiLU+$^wtws6MG(_vCugZtgy_z g5F(!J+q`T^)$=T8 ztiiVC6R%AnZXpvtG?uC1Vt2JZ!a48HdhFeqUmg4Y-l3Cj+JRWPqCx~Yhwc?sS@wKZKBP}{9u+=A$iO&d3DX~vz zw=odg`g@p_O1@{?Qu{|JL0DPtc4ud8RjvAfknC2;Nr~NBoECroCM}CeF>sAe3B6Bv&g|FkK z_Tt_v8CJdAwu9#K-zqShS~Zee#w5o&JD*ZX)EpozH1xxyvy9g{o|@2a4fLK6Ya$`~ 
z=Mp9LEECI|leQoV`A7!k1s)Ss`mQe3(U@%ReKKadil!dxZ9W-c;Cj9lkB?(?kSWg_ zu8gEV_|P$UL}XP9g8E&+tcXWg`O;Q=PY$-}0Ku7^H$?!0uS05>(h;-*NuE0zcjDAUSmjCppy6H5q=5ZW!ho;a{MkA9=l%@k=2O4Xsb~zjL+W5SS`my9x9^0F^iQ*Sx-q01=^3KtGMkFYa zOQ}?=i~E`Tu1^_GjqN>N@k{jzng*G2CN#t$offpV)HPkVJ#KC~g@SY+f9G~=z`{il zkQjBp{isFP8`A7};YTOKsgvb*^|vt!Vv4HTgGXG5Te@6Cu9PXSZv15$@g4b&LU8j9 z9!oRm1)fO}HP@Q2+JD7lpAV|Rcw#2;V`mS=5FDK^E1a+wNw{vBMy?SmfBYRzZ5nz~T#oSw=XI<;{(j$A*aI>as1F=zq&jaTeL zOkF_t@sZiy1@g|P4)rghKso9>FD~G~)jq+JFT#9av(9vy@fXwWJ*=3owgJVrERC6t zu>^jd_$X-Sm&C96KjS3Z;f3-42XUF==tTp+ZFcaY?|3v&;;`rbVkJp4oZP4WL~Cr` z<2|L?cZ*bi&G%4~MTV0M+428;)FfLLbmg)|<)1f#I+L%>%LsD!u+Q3mT_wMIj{b{7 z$?64r-!_v7)`0>hBQ#4b7Axrl)lbOJlqlVfss;c&;rA2$KXZVjBTv4%62Y2j`DPu{ zg-4D2kmGJls_aA@#%BzM(uEGNfMSo#?dzOQm*cbwdu})udF+U+cZbr6vR$Mz-Wiqy z(r(9zZhR%x4@CwUr|CEi)}J+8XE~la>hajhGk1r!YI!?npr`}xlaswuyCCcIH80oH o6n1?~*>0`FDO^^XS=#bCJwn|CMSa=APo-Q&@K-=l_iESS#g(xM{tSv=eVlPa2$y=A4oLpNwl}s-d z^S-hbXRAbo);`W=)0!B*{m_;10R2qU({jd zb(58O7j0V)3rEk)8{XuT9q36Y|9RKm>TW87U7<+qmX`*$g;Dz_dS{Ei7%B0-LL|g6;Mv!r~URw)FKKe#CSO9un@r1NI`~v zy>R28ycWxybQdKtWHx+iFcc2|ZKG1- z%38fT&vuzU4WY@Y87qYAg*liQpsw>Ff>m*t96=ydoF_WY56IuDl)dC{zG2RL_``kL z+0ITnBe$p7BR+r)fZ zZG9X}T{k%LJoFMSuWPUW)K;-^nwE-pW2cdrDtub1;wVBSLZ`tkV0YLl^6eT=;e@Vu zr22Fz*HWq|DrR_f!!vYzOj$VN zmzL1V5tF5+NPByaiDqJ>U_S&I=Um}>bQ`J1{g4vLc{Pb2 zw|s&&#pF^b;R)Az%Bvk!caGPJJ(!}V) z$uur%+w>8ZM1$lX(SO`yCmYTs{@rigySsbF7h|UVkMLCEFW>(Mcw4`Hv)+eg>cQlH zkblTeI?0V#{)s7Ri$Zb= z$vA+;&HX9RidKr?DU!CxiEkO$Ptka!H^GyKBm35r6xJ vi Date: Tue, 16 Dec 2025 17:53:57 +0100 Subject: [PATCH 5/7] update 00900_long_parquet_load_2 from the upstream --- .../00900_long_parquet_load_2.reference | 10 ++++++++++ .../conversion_to_datetime64_test.parquet | Bin 0 -> 1042 bytes .../0_stateless/data_parquet/not_utc.parquet | Bin 0 -> 547 bytes 3 files changed, 10 insertions(+) create mode 100644 tests/queries/0_stateless/data_parquet/conversion_to_datetime64_test.parquet create mode 100644 tests/queries/0_stateless/data_parquet/not_utc.parquet diff --git 
a/tests/queries/0_stateless/00900_long_parquet_load_2.reference b/tests/queries/0_stateless/00900_long_parquet_load_2.reference index 26c944b1cda0..3141bff40b3f 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load_2.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load_2.reference @@ -193,6 +193,10 @@ 1 456 2 1 10860017704905464526 \N \N \N 2 10884978688299004095 +=== Try load data from conversion_to_datetime64_test.parquet +0 1 2020-01-01 14:00:00 2020-01-01 14:00:00+00:00 1 14913180766044805002 + +\N \N \N \N 1 14913180766044805002 === Try load data from datapage_v2.snappy.parquet 0 abc 1 2 true [1,2,3] 1 16078188846132661856 4 abc 5 2 true [1,2] 1 6454514002962808519 @@ -609,6 +613,12 @@ 0 8 [-1] [[-1,-2],[]] {'k1':-1} [{},{'k1':1},{},{}] (-1,[-1],([[(-1,'nonnullable')]]),{}) 1 9565612059618739419 \N 0 [] [] {} [] (0,[],([]),{}) 1 9565612059618739419 +=== Try load data from not_utc.parquet +0 2023-01-01 12:00:00.000 1 2604320271863989052 +1 \N 1 \N +2 2023-01-02 13:00:00.000 1 1660056001302137693 + +\N \N 3 4264376273166126745 === Try load data from nullable.impala.parquet 0 1 [1,2,3] [[1,2],[3,4]] {'k1':1,'k2':100} [{'k1':1}] (1,[1],([[(10,'aaa'),(-10,'bbb')],[(11,'c')]]),{'foo':(([1.1]))}) 1 5288452023017997993 4 5 [] [] {} [] (NULL,[],([]),{'foo':(([2.2,3.3]))}) 1 5760559227194221665 diff --git a/tests/queries/0_stateless/data_parquet/conversion_to_datetime64_test.parquet b/tests/queries/0_stateless/data_parquet/conversion_to_datetime64_test.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fe7e6e12f894946d129b31cb1f4b28d1bbb9d34c GIT binary patch literal 1042 zcmaJ>&2G~`5FRhVDpiD{R=ap5Bl%Jbi6+<{HctN>hZ7y#f)kSkKH zld~aBY1$9qj^SH|Pq7=0AT10-0F}06P?p|$xk`b+3%sW1({lX#ZO@}s0k&F^W>q=o z2;1^eNq9Tf)r5B!(dr7?BH3CT`nLt$+W^Z9IZjmrqserlUZ{) zO%CJN7s+8d{pD6d+q&a|E$ey%%!XzJDU?vB>e3C|r9Ttb>ojJNG;N^E5YVAoQ@g;o z@E1Uu_0D z%bhF^g$et2S>;gOjG@U0!$IzhSW{go#DjSBWcN6FeC0>}rZ*z4^F9@APPr+HZtrY! 
zQkb`vf3jInhhCH+PC5SyxmoJx0K=^4lakBxl>7%=Im1s!K^*RL+8+C7@VBn0s|q0@khoOkLj}=RLd;PV1%}e#6uPpovG6CD z5I>KZJC|%MNIcp1&d+z}lkEx4`VwR;<5&uEgcL>v&&O}}#RE_Qd=2L8h5pVjDggL2 zH#Y0AIg*#> oQJLC@$9&nz%CeJMyI4g}xvlON_fdbp+w1nUyG~2bKB4dN4T%YL-~a#s literal 0 HcmV?d00001 From a2dcc745f9f8a981c17b86f08609ebfda65c198c Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Wed, 17 Dec 2025 14:36:01 +0100 Subject: [PATCH 6/7] Fix 00900_long_parquet_load_2.reference The version from the upstream contains changes specific to ClickHouse 25.10+, which are not applicable to our build. --- .../00900_long_parquet_load_2.reference | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/queries/0_stateless/00900_long_parquet_load_2.reference b/tests/queries/0_stateless/00900_long_parquet_load_2.reference index 3141bff40b3f..e3f9cd89e089 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load_2.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load_2.reference @@ -43,13 +43,13 @@ 75722 722 Hello722 2 1 8736384626514688031 39807 807 Hello807 \N 1 \N -\N 0 \N 100000 16777408911109712567 +\N \N \N \N 100000 16777408911109712567 === Try load data from 02725_data.parquet 0 0 1 5222652260262540557 1 1 1 2963573356302499406 2 2 1 787535183767392522 -\N 0 3 8973760800332432485 +\N \N 3 8973760800332432485 === Try load data from 02960_polygon_bound_bug.parquet 0 [[[(-157.9582991377384,21.28868463439519),(-157.9582991377384,21.35254833902807),(-157.89014212298457,21.35254833902807),(-157.89014212298457,21.28868463439519),(-157.9582991377384,21.28868463439519)]]] 1 2965019508272517993 1 [[[(174.76274050943928,-41.35902838296618),(174.76274050943928,-41.29535963198179),(174.8472037311771,-41.29535963198179),(174.8472037311771,-41.35902838296618),(174.76274050943928,-41.35902838296618)]]] 1 12759744261968863701 @@ -123,7 +123,7 @@ 5 idx6 [10.2] 1 3798195961072290386 2 idx3 [10.2,8.2] 1 1513321551386387090 -\N [] 10 16413140331168905886 +\N \N [] 10 
16413140331168905886 === Try load data from array_int.parquet 9 idx10 [100,101,102] 1 13206811767060123460 0 idx1 [100,101,102] 1 1468807849511680661 @@ -136,7 +136,7 @@ 5 idx6 [100,101] 1 4988466500581683121 2 idx3 [100,101,102,101] 1 13792751652532249689 -\N [] 10 14458819231670106154 +\N \N [] 10 14458819231670106154 === Try load data from array_string.parquet 9 idx10 ['This','is','a','test'] 1 8058991263431016296 0 idx1 ['This','is','a','test'] 1 4316848877505089401 @@ -204,7 +204,7 @@ 1 abc 2 3 true [] 1 243797108142459904 2 abc 3 4 true [] 1 16370920853893187024 -\N \N 0 0 false [] 5 2253932663712014071 +\N \N \N \N \N [] 5 2253932663712014071 === Try load data from datatype-date32.parquet 0 1925-01-01 1 95355579932678910 3 2282-12-31 1 1199432099726642882 @@ -361,7 +361,7 @@ 1 1593604800 def 7.7 1 5861304996888164348 2 1593604801 abc 42.125 1 9545089655608587545 -\N 0 \N 4 14247877966938301061 +\N \N \N \N 4 14247877966938301061 === Try load data from hudi_array.parquet.gz 0 20251021133601353 20251021133601353_0_0 20251021133601353_31_0 02b5ff08-ab93-41f4-a646-2ae35d622787-0_0-16-109_20251021133601353.parquet ['str1','str2'] 1 10899520343274658172 @@ -453,7 +453,7 @@ 1 2 1 463667963421364848 2 4 1 17956467173040956166 -\N 0 4 1916660908465950835 +\N \N 4 1916660908465950835 === Try load data from ipv6_bloom_filter.gz.parquet 0 zTNx7ꙕ 1 13288000489149796924 4 w*I\n+ 1 9468751629168416770 @@ -461,7 +461,7 @@ 1  }B,K32 1 7253289996785367618 2 ZD/y 1 842579928916594899 -\N \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 5 11185867936307116338 +\N \N 5 11185867936307116338 === Try load data from iris.parquet 73 6.1 2.8 4.7 1.2 Versicolor 1 18204632795337117686 89 5.5 2.5 4 1.3 Versicolor 1 4653652012410455888 @@ -513,7 +513,7 @@ 162 -61 16727 488 4294967364 10247569155516682409 true CGBM {"key":290, "value":"HXAPM"} QQKP {?GxDZ\r 1 14424261032199581839 958 37 11953 67 4294967298 6685068849521451102 true NBTZ {"key":681, "value":"RYARL"} IEBX 5I\'9~^8 1 4887123917339036857 -\N 0 
0 0 0 0 false \0\0\0\0 \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 1000 5163471426232818546 +\N \N \N \N \N \N \N \N \N \N \N 1000 5163471426232818546 === Try load data from nation.dict-malformed.parquet 24 24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be 1 12039456768470916139 20 20 SAUDI ARABIA 4 ts. silent requests haggle. closely express packages sleep across the blithely 1 1938971365265172649 @@ -569,7 +569,7 @@ 1 [[['a','b'],['c','d']],[[],['e']]] 1 1 8741655072046188693 2 [[['a','b'],['c','d'],['e']],[[],['f']]] 1 1 16465740724229345044 -\N [] 0 3 10822235990477762966 +\N [] \N 3 10822235990477762966 === Try load data from nested_maps.snappy.parquet 0 {'a':{1:true,2:false}} 1 1 1 13131726677860377679 4 {'e':{1:true}} 1 1 1 4543095021381921673 @@ -578,7 +578,7 @@ 5 {'f':{3:true,4:false,5:true}} 1 1 1 8080011959117096065 2 {'c':{}} 1 1 1 410499042983863970 -\N {} 0 0 6 7936364349798542993 +\N {} \N \N 6 7936364349798542993 === Try load data from nine_byte_decimals_from_spark.parquet 73 73 74 1 13383650340633121129 89 89 90 1 7405217149357997963 @@ -612,7 +612,7 @@ === Try load data from nonnullable.impala.parquet 0 8 [-1] [[-1,-2],[]] {'k1':-1} [{},{'k1':1},{},{}] (-1,[-1],([[(-1,'nonnullable')]]),{}) 1 9565612059618739419 -\N 0 [] [] {} [] (0,[],([]),{}) 1 9565612059618739419 +\N \N [] [] {} [] (NULL,[],([]),{}) 1 9565612059618739419 === Try load data from not_utc.parquet 0 2023-01-01 12:00:00.000 1 2604320271863989052 1 \N 1 \N @@ -647,7 +647,7 @@ 5 5 \N 1 \N 2 2 false 1 15522680419900289749 -\N 0 \N 10 5974982459813463751 +\N \N \N 10 5974982459813463751 === Try load data from nulls.snappy.parquet 0 (NULL) 1 2635166664326777715 4 (NULL) 1 16032008459108269792 From 9c390fc9bca5ea52e20d96e1e6f22256e1fe3385 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Wed, 17 Dec 2025 14:43:47 +0100 Subject: [PATCH 7/7] fix test_position_deletes_out_of_order With the backport of 88827, prewhere for 
datalakes was explicitly disabled. --- tests/integration/test_storage_iceberg/test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index f101d1078141..3d345134e1ee 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -875,7 +875,8 @@ def test_position_deletes_out_of_order(started_cluster, use_roaring_bitmaps): create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, additional_settings=["input_format_parquet_use_native_reader_v3=1", f"use_roaring_bitmap_iceberg_positional_deletes={use_roaring_bitmaps}"]) - assert get_array(instance.query(f"SELECT id FROM {TABLE_NAME} PREWHERE NOT sleepEachRow(1/100) order by id")) == list(range(10, 103)) + [104] + # TODO: Replace WHERE with PREWHERE when we add prewhere support for datalakes. + assert get_array(instance.query(f"SELECT id FROM {TABLE_NAME} WHERE NOT sleepEachRow(1/100) order by id")) == list(range(10, 103)) + [104] instance.query(f"DROP TABLE {TABLE_NAME}") @@ -3922,7 +3923,7 @@ def check_validity_and_get_prunned_files(select_expression): ) - + def test_iceberg_write_minmax(started_cluster): instance = started_cluster.instances["node1"] TABLE_NAME = "test_iceberg_write_minmax_" + get_uuid_str()