From 342c3c091151927e048f54049b1d045749b0e4c5 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky
Date: Tue, 22 Feb 2022 21:00:05 -0800
Subject: [PATCH 01/11] Add TPC-H Generator

---
 cpp/src/arrow/CMakeLists.txt                 |    1 +
 cpp/src/arrow/compute/exec/CMakeLists.txt    |    2 +
 cpp/src/arrow/compute/exec/tpch_benchmark.cc |  175 +
 cpp/src/arrow/compute/exec/tpch_node.cc      | 3704 ++++++++++++++++++
 cpp/src/arrow/compute/exec/tpch_node.h       |   69 +
 cpp/src/arrow/compute/kernels/vector_sort.cc |    5 +-
 6 files changed, 3954 insertions(+), 2 deletions(-)
 create mode 100644 cpp/src/arrow/compute/exec/tpch_benchmark.cc
 create mode 100644 cpp/src/arrow/compute/exec/tpch_node.cc
 create mode 100644 cpp/src/arrow/compute/exec/tpch_node.h

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index b984bc10425..d73de73565f 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -394,6 +394,7 @@ if(ARROW_COMPUTE)
        compute/exec/sink_node.cc
        compute/exec/source_node.cc
        compute/exec/task_util.cc
+       compute/exec/tpch_node.cc
        compute/exec/union_node.cc
        compute/exec/util.cc
        compute/function.cc
diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt
index 3832273593d..cf725667107 100644
--- a/cpp/src/arrow/compute/exec/CMakeLists.txt
+++ b/cpp/src/arrow/compute/exec/CMakeLists.txt
@@ -32,6 +32,8 @@ add_arrow_compute_test(util_test PREFIX "arrow-compute")
 
 add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute")
 
+add_arrow_benchmark(tpch_benchmark PREFIX "arrow-compute")
+
 if(ARROW_BUILD_OPENMP_BENCHMARKS)
   find_package(OpenMP REQUIRED)
   add_arrow_benchmark(hash_join_benchmark
diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc
new file mode 100644
index 00000000000..963782333cf
--- /dev/null
+++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
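+
+// Benchmark of TPC-H Query 1 (the "pricing summary report" query) running
+// against the TPC-H generator node. Plan_Q1 builds the plan:
+//   TpchGen Lineitem source -> filter (L_SHIPDATE <= 1998-09-02)
+//   -> project -> grouped aggregate keyed on (L_RETURNFLAG, L_LINESTATUS)
+//   -> order_by_sink, which BM_Tpch_Q1 drains via StartAndCollect.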
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/testing/future_util.h"
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/compute/exec/tpch_node.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/compute/cast.h"
+
+namespace arrow
+{
+namespace compute
+{
+
+std::shared_ptr<ExecPlan> Plan_Q1(AsyncGenerator<util::optional<ExecBatch>> &sink_gen, int scale_factor)
+{
+  ExecContext *ctx = default_exec_context();
+  *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool());
+  std::shared_ptr<ExecPlan> plan = *ExecPlan::Make(ctx);
+  TpchGen gen = *TpchGen::Make(plan.get(), scale_factor);
+
+  ExecNode *lineitem = *gen.Lineitem(
+      {
+          "L_QUANTITY",
+          "L_EXTENDEDPRICE",
+          "L_TAX",
+          "L_DISCOUNT",
+          "L_SHIPDATE",
+          "L_RETURNFLAG",
+          "L_LINESTATUS"
+      });
+
+  std::shared_ptr<Date32Scalar> sept_2_1998 = std::make_shared<Date32Scalar>(10471);  // September 2, 1998 is 10471 days after January 1, 1970
+  Expression filter = less_equal(field_ref("L_SHIPDATE"), literal(std::move(sept_2_1998)));
+  FilterNodeOptions filter_opts(filter);
+
+  Expression l_returnflag = field_ref("L_RETURNFLAG");
+  Expression l_linestatus = field_ref("L_LINESTATUS");
+  Expression quantity = field_ref("L_QUANTITY");
+  Expression base_price = field_ref("L_EXTENDEDPRICE");
+
+  std::shared_ptr<Decimal128Scalar> decimal_1 = std::make_shared<Decimal128Scalar>(Decimal128{0, 100}, decimal(12, 2));
+  Expression discount_multiplier = call("subtract", { literal(decimal_1), field_ref("L_DISCOUNT") });
+  Expression tax_multiplier = call("add", { literal(decimal_1), field_ref("L_TAX") });
+  Expression disc_price = call("multiply", { field_ref("L_EXTENDEDPRICE"), discount_multiplier });
+  Expression charge = call("multiply",
+      {
+          call("cast",
+              {
+                  call("multiply", { field_ref("L_EXTENDEDPRICE"), discount_multiplier })
+              }, compute::CastOptions::Unsafe(decimal(12, 2))),
+          tax_multiplier
+      });
+  Expression discount = field_ref("L_DISCOUNT");
+
+  std::vector<Expression> projection_list =
+  {
+      l_returnflag,
+      l_linestatus,
+      quantity,
+      base_price,
+      disc_price,
+      charge,
+      quantity,
+      base_price,
+      discount
+  };
+  std::vector<std::string> project_names =
+  {
+      "l_returnflag",
+      "l_linestatus",
+      "sum_qty",
+      "sum_base_price",
+      "sum_disc_price",
+      "sum_charge",
+      "avg_qty",
+      "avg_price",
+      "avg_disc"
+  };
+  ProjectNodeOptions project_opts(std::move(projection_list));
+
+  ScalarAggregateOptions sum_opts = ScalarAggregateOptions::Defaults();
+  CountOptions count_opts(CountOptions::CountMode::ALL);
+  std::vector<internal::Aggregate> aggs =
+  {
+      { "hash_sum", &sum_opts },
+      { "hash_sum", &sum_opts },
+      { "hash_sum", &sum_opts },
+      { "hash_sum", &sum_opts },
+      { "hash_mean", &sum_opts },
+      { "hash_mean", &sum_opts },
+      { "hash_mean", &sum_opts },
+      { "hash_count", &count_opts }
+  };
+
+  std::vector<FieldRef> cols =
+  {
+      2, 3, 4, 5, 6, 7, 8, 2
+  };
+
+  std::vector<std::string> names =
+  {
+      "sum_qty",
+      "sum_base_price",
+      "sum_disc_price",
+      "sum_charge",
+      "avg_qty",
+      "avg_price",
+      "avg_disc",
+      "count_order"
+  };
+
+  std::vector<FieldRef> keys = { "L_RETURNFLAG", "L_LINESTATUS" };
+  AggregateNodeOptions agg_opts(aggs, cols, names, keys);
+
+  SortKey l_returnflag_key("L_RETURNFLAG");
+  SortKey l_linestatus_key("L_LINESTATUS");
+  SortOptions sort_opts({ l_returnflag_key, l_linestatus_key });
+  OrderBySinkNodeOptions order_by_opts(sort_opts, &sink_gen);
+
+  Declaration filter_decl("filter", { Declaration::Input(lineitem) }, filter_opts);
+  Declaration project_decl("project", project_opts);
+  Declaration aggregate_decl("aggregate", agg_opts);
+  Declaration orderby_decl("order_by_sink", order_by_opts);
+
+  Declaration q1 = Declaration::Sequence(
+      {
+          filter_decl,
+          project_decl,
+          aggregate_decl,
+          orderby_decl
+      });
+  std::ignore = *q1.AddToPlan(plan.get());
+  return plan;
+}
+
+static void BM_Tpch_Q1(benchmark::State &st)
+{
+  for(auto _ : st)
+  {
+    st.PauseTiming();
+    AsyncGenerator<util::optional<ExecBatch>> sink_gen;
+    std::shared_ptr<ExecPlan> plan = Plan_Q1(sink_gen, st.range(0));
+    st.ResumeTiming();
+    auto fut = StartAndCollect(plan.get(), sink_gen);
+    auto res = *fut.MoveResult();
+#ifndef NDEBUG
+    st.PauseTiming();
+    for(auto &batch : res)
+      std::cout << batch.ToString() << std::endl;
+    st.ResumeTiming();
+#endif
+  }
+}
+
+//BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 1000)->ArgNames({ "SF" });
+BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 10)->ArgNames({ "SF" });
+}
+}
diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc
new file mode 100644
index 00000000000..842bf828574
--- /dev/null
+++ b/cpp/src/arrow/compute/exec/tpch_node.cc
@@ -0,0 +1,3704 @@
+#include "arrow/compute/exec/tpch_node.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/future.h"
+#include "arrow/util/unreachable.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace arrow
+{
+  using internal::checked_cast;
+
+  namespace compute
+  {
+    class TpchText
+    {
+    public:
+      Status Init();
+      Result<Datum> GenerateComments(
+          size_t num_comments,
+          size_t min_length,
+          size_t max_length,
+          random::pcg32_fast &rng);
+
+    private:
+      void GenerateWord(size_t &offset, const char **words, size_t num_choices);
+      void GenerateNoun(size_t &offset);
+      void GenerateVerb(size_t &offset);
+      void GenerateAdjective(size_t &offset);
+      void GenerateAdverb(size_t &offset);
+      void GeneratePreposition(size_t &offset);
+      void GenerateAuxiliary(size_t &offset);
+      void GenerateTerminator(size_t &offset);
+
+      void GenerateNounPhrase(size_t &offset);
+      void GenerateVerbPhrase(size_t &offset);
+      void GeneratePrepositionalPhrase(size_t &offset);
+
+      void GenerateSentence(size_t &offset);
+
+      std::unique_ptr<Buffer> text_;
+      random::pcg32_fast rng_;
+      static constexpr size_t kTextBytes = 300 * 1024 * 1024;  // 300 MB
+    };
+
+    class TpchTableGenerator
+    {
+    public:
+      using OutputBatchCallback = std::function;
+      using FinishedCallback = std::function;
+      using GenerateFn = std::function;
+      using ScheduleCallback = std::function;
+      using AbortCallback = std::function<void()>;
+
+      virtual Status Init(
+          std::vector<std::string> columns,
+          int scale_factor,
+          int64_t batch_size) = 0;
+
+      virtual Status StartProducing(
+          size_t num_threads,
+          OutputBatchCallback output_callback,
+          FinishedCallback finished_callback,
+          ScheduleCallback schedule_callback) = 0;
+
+      void Abort(AbortCallback abort_callback)
+      {
+        bool expected = false;
+        if(done_.compare_exchange_strong(expected, true))
+        {
+          abort_callback();
+        }
+      }
+
+      virtual std::shared_ptr<Schema> schema() const = 0;
+
+      virtual ~TpchTableGenerator() = default;
+
+    protected:
+      std::atomic<bool> done_ = { false };
+      std::atomic batches_generated_ = { 0 };
+    };
+
+    int GetNumDigits(int64_t x)
+    {
+      // This if statement chain is for MAXIMUM SPEED
+      /*
+             .,
+         .  _,'f----.._
+         |\ ,-'"/  |   ,'
+         |,_  ,--.      /
+         /,-. ,'`.     (_
+         f  o|  o|__    "`-.
+         ,-._.,--'_ `.
_.,-` + `"' ___.,'` j,-' + `-.__.,--' + */ + // Source: https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c + ARROW_DCHECK(x >= 0); + if(x < 10ll) return 1; + if(x < 100ll) return 2; + if(x < 1000ll) return 3; + if(x < 10000ll) return 4; + if(x < 100000ll) return 5; + if(x < 1000000ll) return 6; + if(x < 10000000ll) return 7; + if(x < 100000000ll) return 8; + if(x < 1000000000ll) return 9; + if(x < 10000000000ll) return 10; + if(x < 100000000000ll) return 11; + if(x < 1000000000000ll) return 12; + if(x < 10000000000000ll) return 13; + if(x < 100000000000000ll) return 14; + if(x < 1000000000000000ll) return 15; + if(x < 10000000000000000ll) return 16; + if(x < 100000000000000000ll) return 17; + if(x < 1000000000000000000ll) return 18; + return -1; + } + + void AppendNumberPaddedToNineDigits(char *out, int64_t x) + { + // We do all of this to avoid calling snprintf, which does a lot of crazy + // locale stuff. On Windows and MacOS this can get suuuuper slow + int num_digits = GetNumDigits(x); + int num_padding_zeros = std::max(9 - num_digits, 0); + std::memset(out, '0', static_cast(num_padding_zeros)); + while(x > 0) + { + *(out + num_padding_zeros + num_digits - 1) = ('0' + x % 10); + num_digits -= 1; + x /= 10; + } + } + + Result> SetOutputColumns( + const std::vector &columns, + const std::vector> &types, + const std::unordered_map &name_map, + std::vector &gen_list) + { + gen_list.clear(); + std::vector> fields; + if(columns.empty()) + { + for(auto pair : name_map) + { + int col_idx = pair.second; + fields.push_back(field(pair.first, types[col_idx])); + gen_list.push_back(col_idx); + } + return schema(std::move(fields)); + } + else + { + for(const std::string &col : columns) + { + auto entry = name_map.find(col); + if(entry == name_map.end()) + return Status::Invalid("Not a valid column name"); + int col_idx = static_cast(entry->second); + fields.push_back(field(col, types[col_idx])); + gen_list.push_back(col_idx); + } + return schema(std::move(fields)); + } + } + + static TpchText g_text; + + Status TpchText::Init() + { + ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); + size_t offset = 0; + while(offset < kTextBytes) + GenerateSentence(offset); + return Status::OK(); + } + + Result TpchText::GenerateComments( + size_t num_comments, + size_t min_length, + size_t max_length, + random::pcg32_fast &rng) + { + std::uniform_int_distribution length_dist(min_length, max_length); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); + int32_t *offsets = reinterpret_cast(offset_buffer->mutable_data()); + offsets[0] = 0; + for(size_t i = 1; i <= num_comments; i++) + offsets[i] = offsets[i - 1] + length_dist(rng); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr comment_buffer, AllocateBuffer(offsets[num_comments])); + char *comments = reinterpret_cast(comment_buffer->mutable_data()); + for(size_t i = 0; i < num_comments; i++) + { + size_t length = offsets[i + 1] - offsets[i]; + std::uniform_int_distribution offset_dist(0, kTextBytes - length); + size_t offset_in_text = offset_dist(rng); + std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); + } + ArrayData ad(utf8(), num_comments, { nullptr, std::move(comment_buffer), std::move(offset_buffer) }); + return std::move(ad); + } + + Result RandomVString( + random::pcg32_fast &rng, + int64_t num_rows, + int32_t min_length, + int32_t max_length) + { + std::uniform_int_distribution length_dist(min_length, max_length); + 
ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((num_rows + 1) * sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for(int64_t i = 1; i <= num_rows; i++) + offsets[i] = offsets[i - 1] + length_dist(rng); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[num_rows])); + char *str = reinterpret_cast(str_buff->mutable_data()); + + // Spec says to pick random alphanumeric characters from a set of at least + // 64 symbols. Now, let's think critically here: 26 letters in the alphabet, + // so 52 total for upper and lower case, and 10 possible digits gives 62 + // characters... + // dbgen solves this by including a space and a comma as well, so we'll + // copy that. + const char alpha_numerics[65] = + "0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,"; + std::uniform_int_distribution char_dist(0, 63); + for(int32_t i = 0; i < offsets[num_rows]; i++) + str[i] = alpha_numerics[char_dist(rng)]; + + ArrayData ad(utf8(), num_rows, { nullptr, std::move(str_buff), std::move(offset_buff) }); + return std::move(ad); + } + + void AppendNumber(char *&out, int num_digits, int32_t x) + { + out += (num_digits - 1); + while(x > 0) + { + *out-- = x % 10; + x /= 10; + } + x += num_digits; + } + + void GeneratePhoneNumber( + char *out, + random::pcg32_fast &rng, + int32_t country) + { + std::uniform_int_distribution three_digit(100, 999); + std::uniform_int_distribution four_digit(1000, 9999); + + int32_t country_code = country + 10; + int32_t l1 = three_digit(rng); + int32_t l2 = three_digit(rng); + int32_t l3 = four_digit(rng); + AppendNumber(out, 2, country_code); + *out++ = '-'; + AppendNumber(out, 3, l1); + *out++ = '-'; + AppendNumber(out, 3, l2); + *out++ = '-'; + AppendNumber(out, 4, l3); + } + + static constexpr uint32_t STARTDATE = 8035; // January 1, 1992 is 8035 days after January 1, 1970 + static constexpr uint32_t CURRENTDATE = 9298; // June 17, 1995 is 9298 days after January 1, 1970 + static constexpr uint32_t ENDDATE = 10591; // December 12, 1998 is 10591 days after January 1, 1970 + + const char *NameParts[] = + { + "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", "blanched", "blue", + "blush", "brown", "burlywood", "burnished", "chartreuse", "chiffon", "chocolate", "coral", + "cornflower", "cornsilk", "cream", "cyan", "dark", "deep", "dim", "dodger", "drab", "firebrick", + "floral", "forest", "frosted", "gainsboro", "ghost", "goldenrod", "green", "grey", "honeydew", + "hot", "indian", "ivory", "khaki", "lace", "lavender", "lawn", "lemon", "light", "lime", "linen", + "magenta", "maroon", "medium", "metallic", "midnight", "mint", "misty", "moccasin", "navajo", + "navy", "olive", "orange", "orchid", "pale", "papaya", "peach", "peru", "pink", "plum", "powder", + "puff", "purple", "red", "rose", "rosy", "royal", "saddle", "salmon", "sandy", "seashell", "sienna", + "sky", "slate", "smoke", "snow", "spring", "steel", "tan", "thistle", "tomato", "turquoise", "violet", + "wheat", "white", "yellow", + }; + static constexpr size_t kNumNameParts = sizeof(NameParts) / sizeof(NameParts[0]); + + const char *Types_1[] = + { + "STANDARD ", "SMALL ", "MEDIUM ", "LARGE ", "ECONOMY ", "PROMO ", + }; + static constexpr size_t kNumTypes_1 = sizeof(Types_1) / sizeof(Types_1[0]); + + const char *Types_2[] = + { + "ANODIZED ", "BURNISHED ", "PLATED ", "POLISHED ", "BRUSHED ", + }; + static constexpr size_t kNumTypes_2 = sizeof(Types_2) / sizeof(Types_2[0]); + + const char *Types_3[] = + 
{ + "TIN", "NICKEL", "BRASS", "STEEL", "COPPER", + }; + static constexpr size_t kNumTypes_3 = sizeof(Types_3) / sizeof(Types_3[0]); + + const char *Containers_1[] = + { + "SM ", "LG ", "MD ", "JUMBO ", "WRAP ", + }; + static constexpr size_t kNumContainers_1 = sizeof(Containers_1) / sizeof(Containers_1[0]); + + const char *Containers_2[] = + { + "CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM", + }; + static constexpr size_t kNumContainers_2 = sizeof(Containers_2) / sizeof(Containers_2[0]); + + const char *Segments[] = + { + "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD", + }; + static constexpr size_t kNumSegments = sizeof(Segments) / sizeof(Segments[0]); + + const char *Priorities[] = + { + "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW", + }; + static constexpr size_t kNumPriorities = sizeof(Priorities) / sizeof(Priorities[0]); + + const char *Instructions[] = + { + "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN", + }; + static constexpr size_t kNumInstructions = sizeof(Instructions) / sizeof(Instructions[0]); + + const char *Modes[] = + { + "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB", + }; + static constexpr size_t kNumModes = sizeof(Modes) / sizeof(Modes[0]); + + const char *Nouns[] = + { + "foxes ", "ideas ", "theodolites ", "pinto beans ", "instructions ", "dependencies ", "excuses ", + "platelets ", "asymptotes ", "courts ", "dolphins ", "multipliers ", "sautemes ", "warthogs ", "frets ", + "dinos ", "attainments ", "somas ", "Tiresias '", "patterns ", "forges ", "braids ", "hockey players ", "frays ", + "warhorses ", "dugouts ", "notomis ", "epitaphs ", "pearls ", "tithes ", "waters ", "orbits ", "gifts ", "sheaves ", + "depths ", "sentiments ", "decoys ", "realms ", "pains ", "grouches ", "escapades ", + }; + static constexpr size_t kNumNouns = sizeof(Nouns) / sizeof(Nouns[0]); + + const char *Verbs[] = + { + "sleep ", "wake ", "are ", "cajole ", "haggle ", "nag ", "use ", "boost ", "affix ", "detect ", "integrate ", + "maintain ", "nod ", "was ", "lose ", "sublate ", "solve ", "thrash ", "promise ", "engage ", "hinder ", + "print ", "x-ray ", "breach ", "eat ", "grow ", "impress ", "mold ", "poach ", "serve ", "run ", "dazzle ", + "snooze ", "doze ", "unwind ", "kindle ", "play ", "hang ", "believe ", "doubt ", + }; + static constexpr size_t kNumVerbs = sizeof(Verbs) / sizeof(Verbs[0]); + + const char *Adjectives[] = + { + "furious ", "sly ", "careful ", "blithe ", "quick ", "fluffy ", "slow ", "quiet ", "ruthless ", "thin ", + "close ", "dogged ", "daring ", "brave ", "stealthy ", "permanent ", "enticing ", "idle ", "busy ", + "regular ", "final ", "ironic ", "even ", "bold ", "silent ", + }; + static constexpr size_t kNumAdjectives = sizeof(Adjectives) / sizeof(Adjectives[0]); + + const char *Adverbs[] = + { + "sometimes ", "always ", "never ", "furiously ", "slyly ", "carefully ", "blithely ", "quickly ", "fluffily ", + "slowly ", "quietly ", "ruthlessly ", "thinly ", "closely ", "doggedly ", "daringly ", "bravely ", "stealthily ", + "permanently ", "enticingly ", "idly ", "busily ", "regularly ", "finally ", "ironically ", "evenly ", "boldly ", + "silently ", + }; + static constexpr size_t kNumAdverbs = sizeof(Adverbs) / sizeof(Adverbs[0]); + + const char *Prepositions[] = + { + "about ", "above ", "according to ", "across ", "after ", "against ", "along ", "alongside of ", "among ", + "around ", "at ", "atop ", "before ", "behind ", "beneath ", "beside ", "besides ", "between ", "beyond ", + "beyond ", "by 
", "despite ", "during ", "except ", "for ", "from ", "in place of ", "inside ", "instead of ", + "into ", "near ", "of ", "on ", "outside ", "over ", "past ", "since ", "through ", "throughout ", "to ", + "toward ", "under ", "until ", "up ", "upon ", "without ", "with ", "within ", + }; + static constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); + + const char *Auxiliaries[] = + { + "do ", "may ", "might ", "shall ", "will ", "would ", "can ", "could ", "should ", "ought to ", "must ", + "will have to ", "shall have to ", "could have to ", "should have to ", "must have to ", "need to ", "try to ", + }; + static constexpr size_t kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); + + const char *Terminators[] = + { + ".", ";", ":", "?", "!", "--", + }; + static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); + + void TpchText::GenerateWord(size_t &offset, const char **words, size_t num_choices) + { + std::uniform_int_distribution dist(0, num_choices - 1); + const char *word = words[dist(rng_)]; + size_t bytes_left = kTextBytes - offset; + size_t length = std::strlen(word); + size_t bytes_to_copy = std::min(bytes_left, length); + std::memcpy(text_->mutable_data() + offset, word, bytes_to_copy); + offset += bytes_to_copy; + } + + void TpchText::GenerateNoun(size_t &offset) + { + GenerateWord(offset, Nouns, kNumNouns); + } + + void TpchText::GenerateVerb(size_t &offset) + { + GenerateWord(offset, Verbs, kNumVerbs); + } + + void TpchText::GenerateAdjective(size_t &offset) + { + GenerateWord(offset, Adjectives, kNumAdjectives); + } + + void TpchText::GenerateAdverb(size_t &offset) + { + GenerateWord(offset, Adverbs, kNumAdverbs); + } + + void TpchText::GeneratePreposition(size_t &offset) + { + GenerateWord(offset, Prepositions, kNumPrepositions); + } + + void TpchText::GenerateAuxiliary(size_t &offset) + { + GenerateWord(offset, Auxiliaries, kNumAuxiliaries); + } + + void TpchText::GenerateTerminator(size_t &offset) + { + GenerateWord(offset, Terminators, kNumTerminators); + } + + void TpchText::GenerateNounPhrase(size_t &offset) + { + std::uniform_int_distribution dist(0, 3); + const char *comma_space = ", "; + switch(dist(rng_)) + { + case 0: + GenerateNoun(offset); + break; + case 1: + GenerateAdjective(offset); + GenerateNoun(offset); + break; + case 2: + GenerateAdjective(offset); + GenerateWord(offset, &comma_space, 1); + GenerateAdjective(offset); + GenerateNoun(offset); + break; + case 3: + GenerateAdverb(offset); + GenerateAdjective(offset); + GenerateNoun(offset); + break; + default: + Unreachable("Random number should be between 0 and 3 inclusive"); + break; + } + } + + void TpchText::GenerateVerbPhrase(size_t &offset) + { + std::uniform_int_distribution dist(0, 3); + switch(dist(rng_)) + { + case 0: + GenerateVerb(offset); + break; + case 1: + GenerateAuxiliary(offset); + GenerateVerb(offset); + break; + case 2: + GenerateVerb(offset); + GenerateAdverb(offset); + break; + case 3: + GenerateAuxiliary(offset); + GenerateVerb(offset); + GenerateAdverb(offset); + break; + default: + Unreachable("Random number should be between 0 and 3 inclusive"); + break; + } + } + + void TpchText::GeneratePrepositionalPhrase(size_t &offset) + { + const char *the_space = "the "; + GeneratePreposition(offset); + GenerateWord(offset, &the_space, 1); + GenerateNounPhrase(offset); + } + + void TpchText::GenerateSentence(size_t &offset) + { + std::uniform_int_distribution dist(0, 4); + switch(dist(rng_)) + { + case 0: + 
GenerateNounPhrase(offset); + GenerateVerbPhrase(offset); + GenerateTerminator(offset); + break; + case 1: + GenerateNounPhrase(offset); + GenerateVerbPhrase(offset); + GeneratePrepositionalPhrase(offset); + GenerateTerminator(offset); + break; + case 2: + GenerateNounPhrase(offset); + GenerateVerbPhrase(offset); + GenerateNounPhrase(offset); + GenerateTerminator(offset); + break; + case 3: + GenerateNounPhrase(offset); + GenerateVerbPhrase(offset); + GenerateNounPhrase(offset); + GenerateTerminator(offset); + break; + case 4: + GenerateNounPhrase(offset); + GeneratePrepositionalPhrase(offset); + GenerateVerbPhrase(offset); + GenerateNounPhrase(offset); + GenerateTerminator(offset); + break; + case 5: + GenerateNounPhrase(offset); + GeneratePrepositionalPhrase(offset); + GenerateVerbPhrase(offset); + GeneratePrepositionalPhrase(offset); + GenerateTerminator(offset); + break; + default: + Unreachable("Random number should be between 0 and 5 inclusive"); + break; + } + } + + using GenerateColumnFn = std::function; + class PartAndPartSupplierGenerator + { + public: + Status Init( + size_t num_threads, + int64_t batch_size, + int scale_factor) + { + if(!inited_) + { + inited_ = true; + batch_size_ = batch_size; + scale_factor_ = scale_factor; + + thread_local_data_.resize(num_threads); + for(ThreadLocalData &tld : thread_local_data_) + { + // 5 is the maximum number of different strings we need to concatenate + tld.string_indices.resize(5 * batch_size_); + } + part_rows_to_generate_ = scale_factor_ * 200000; + } + return Status::OK(); + } + + Result> SetPartOutputColumns(const std::vector &cols) + { + return SetOutputColumns(cols, part_types_, part_name_map_, part_cols_); + } + + Result> SetPartSuppOutputColumns(const std::vector &cols) + { + return SetOutputColumns(cols, partsupp_types_, partsupp_name_map_, partsupp_cols_); + } + + Result> NextPartBatch(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(part_output_queue_mutex_); + if(!part_output_queue_.empty()) + { + ExecBatch batch = std::move(part_output_queue_.front()); + part_output_queue_.pop(); + return std::move(batch); + } + else if(part_rows_generated_ == part_rows_to_generate_) + { + return util::nullopt; + } + else + { + tld.partkey_start = part_rows_generated_; + tld.part_to_generate = std::min( + batch_size_, + part_rows_to_generate_ - part_rows_generated_); + part_rows_generated_ += tld.part_to_generate; + ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); + } + } + tld.part.clear(); + tld.part.resize(PART::kNumCols); + RETURN_NOT_OK(InitPartsupp(thread_index)); + + for(int col : part_cols_) + RETURN_NOT_OK(part_generators_[col](thread_index)); + for(int col : partsupp_cols_) + RETURN_NOT_OK(partsupp_generators_[col](thread_index)); + + std::vector part_result(part_cols_.size()); + for(size_t i = 0; i < part_cols_.size(); i++) + { + int col_idx = part_cols_[i]; + part_result[i] = tld.part[col_idx]; + } + if(!partsupp_cols_.empty()) + { + std::vector partsupp_results; + for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) + { + std::vector partsupp_result(partsupp_cols_.size()); + for(size_t icol = 0; icol < partsupp_cols_.size(); icol++) + { + int col_idx = partsupp_cols_[icol]; + partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); + partsupp_results.emplace_back(std::move(eb)); + } + { + std::lock_guard guard(partsupp_output_queue_mutex_); + for(ExecBatch 
&eb : partsupp_results) + { + partsupp_output_queue_.emplace(std::move(eb)); + } + } + } + return ExecBatch::Make(std::move(part_result)); + } + + Result> NextPartSuppBatch(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(part_output_queue_mutex_); + if(!part_output_queue_.empty()) + { + ExecBatch batch = std::move(part_output_queue_.front()); + part_output_queue_.pop(); + return std::move(batch); + } + else if(part_rows_generated_ == part_rows_to_generate_) + { + return util::nullopt; + } + else + { + tld.partkey_start = part_rows_generated_; + tld.part_to_generate = std::min( + batch_size_, + part_rows_to_generate_ - part_rows_generated_); + part_rows_generated_ += tld.part_to_generate; + ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); + } + } + tld.part.clear(); + tld.part.resize(PART::kNumCols); + RETURN_NOT_OK(InitPartsupp(thread_index)); + + for(int col : part_cols_) + RETURN_NOT_OK(part_generators_[col](thread_index)); + for(int col : partsupp_cols_) + RETURN_NOT_OK(partsupp_generators_[col](thread_index)); + if(!part_cols_.empty()) + { + std::vector part_result(part_cols_.size()); + for(size_t i = 0; i < part_cols_.size(); i++) + { + int col_idx = part_cols_[i]; + part_result[i] = tld.part[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch part_batch, ExecBatch::Make(std::move(part_result))); + { + std::lock_guard lock(part_output_queue_mutex_); + part_output_queue_.emplace(std::move(part_batch)); + } + } + std::vector partsupp_results; + for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) + { + std::vector partsupp_result(partsupp_cols_.size()); + for(size_t icol = 0; icol < partsupp_cols_.size(); icol++) + { + int col_idx = partsupp_cols_[icol]; + partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); + partsupp_results.emplace_back(std::move(eb)); + } + // Return the first batch, enqueue the rest. 
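+            // Each call materializes one batch of parts, which expands to
+            // kPartSuppRowsPerPart (4) PARTSUPP rows per part, chunked into
+            // batch_size_-row batches; everything after partsupp_results[0]
+            // is pushed onto partsupp_output_queue_ below for later
+            // NextPartSuppBatch calls, possibly on other threads.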
+ { + std::lock_guard lock(partsupp_output_queue_mutex_); + for(size_t i = 1; i < partsupp_results.size(); i++) + partsupp_output_queue_.emplace(std::move(partsupp_results[i])); + } + return std::move(partsupp_results[0]); + } + + private: +#define FOR_EACH_PART_COLUMN(F) \ + F(P_PARTKEY) \ + F(P_NAME) \ + F(P_MFGR) \ + F(P_BRAND) \ + F(P_TYPE) \ + F(P_SIZE) \ + F(P_CONTAINER) \ + F(P_RETAILPRICE) \ + F(P_COMMENT) + +#define FOR_EACH_PARTSUPP_COLUMN(F) \ + F(PS_PARTKEY) \ + F(PS_SUPPKEY) \ + F(PS_AVAILQTY) \ + F(PS_SUPPLYCOST) \ + F(PS_COMMENT) \ + +#define MAKE_ENUM(col) col, + struct PART + { + enum + { + FOR_EACH_PART_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; + struct PARTSUPP + { + enum + { + FOR_EACH_PARTSUPP_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; + +#define MAKE_STRING_MAP(col) \ + { #col, PART::col }, + const std::unordered_map part_name_map_ = + { + FOR_EACH_PART_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_STRING_MAP(col) \ + { #col, PARTSUPP::col }, + const std::unordered_map partsupp_name_map_ = + { + FOR_EACH_PARTSUPP_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector part_generators_ = + { + FOR_EACH_PART_COLUMN(MAKE_FN_ARRAY) + }; + std::vector partsupp_generators_ = + { + FOR_EACH_PARTSUPP_COLUMN(MAKE_FN_ARRAY) + }; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_LINEITEM_COLUMN +#undef FOR_EACH_ORDERS_COLUMN + + const std::vector> part_types_ = + { + int32(), + utf8(), + fixed_size_binary(25), + fixed_size_binary(10), + utf8(), + int32(), + fixed_size_binary(10), + decimal(12, 2), + utf8(), + }; + + const std::vector> partsupp_types_ = + { + int32(), + int32(), + int32(), + decimal(12, 2), + utf8(), + }; + + Status AllocatePartBatch(size_t thread_index, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.part[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.part_to_generate * byte_width)); + ArrayData ad(part_types_[column], tld.part_to_generate, { nullptr, std::move(buff) }); + tld.part[column] = std::move(ad); + return Status::OK(); + } + + Status P_PARTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_PARTKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_PARTKEY)); + int32_t *p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.part_to_generate; i++) + { + p_partkey[i] = (tld.partkey_start + i + 1); + ARROW_DCHECK(1 <= p_partkey[i] && p_partkey[i] <= part_rows_to_generate_); + } + } + return Status::OK(); + } + + Status P_NAME(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_NAME].kind() == Datum::NONE) + { + std::uniform_int_distribution dist(0, static_cast(kNumNameParts - 1)); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + size_t string_length = 0; + for(int ipart = 0; ipart < 5; ipart++) + { + uint8_t name_part_index = dist(tld.rng); + tld.string_indices[irow * 5 + ipart] = name_part_index; + string_length += 
std::strlen(NameParts[name_part_index]); + } + // Add 4 because there is a space between each word (i.e. four spaces) + offsets[irow + 1] = offsets[irow] + string_length + 4; + } + // Add an extra byte for the space after in the very last string. + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate] + 1)); + char *strings = reinterpret_cast(string_buffer->mutable_data()); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + char *row = strings + offsets[irow]; + for(int ipart = 0; ipart < 5; ipart++) + { + uint8_t name_part_index = tld.string_indices[irow * 5 + ipart]; + const char *part = NameParts[name_part_index]; + size_t length = std::strlen(part); + std::memcpy(row, part, length); + row += length; + *row++ = ' '; + } + } + ArrayData ad(part_types_[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(string_buffer), std::move(offset_buff) }); + Datum datum(ad); + tld.part[PART::P_NAME] = std::move(datum); + } + return Status::OK(); + } + + Status P_MFGR(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_MFGR].kind() == Datum::NONE) + { + std::uniform_int_distribution dist(1, 5); + const char *manufacturer = "Manufacturer#"; + const size_t manufacturer_length = std::strlen(manufacturer); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_MFGR)); + char *p_mfgr = reinterpret_cast(tld.part[PART::P_MFGR].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_MFGR]); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + std::strncpy(p_mfgr + byte_width * irow, manufacturer, byte_width); + char mfgr_id = '0' + dist(tld.rng); + *(p_mfgr + byte_width * irow + manufacturer_length) = mfgr_id; + } + } + return Status::OK(); + } + + Status P_BRAND(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_MFGR].kind() == Datum::NONE) + { + RETURN_NOT_OK(P_MFGR(thread_index)); + std::uniform_int_distribution dist(1, 5); + const char *brand = "Brand#"; + const size_t brand_length = std::strlen(brand); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_BRAND)); + const char *p_mfgr = reinterpret_cast( + tld.part[PART::P_MFGR].array()->buffers[1]->data()); + char *p_brand = reinterpret_cast(tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_BRAND]); + int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_MFGR]); + const size_t mfgr_id_offset = std::strlen("Manufacturer#"); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + char mfgr_id = *(p_mfgr + irow * mfgr_byte_width + mfgr_id_offset); + char brand_id = '0' + dist(tld.rng); + std::strncpy(p_brand + byte_width * irow, brand, byte_width); + *(p_brand + byte_width * irow + brand_length) = mfgr_id; + *(p_brand + byte_width * irow + brand_length + 1) = brand_id; + } + } + return Status::OK(); + } + + Status P_TYPE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_TYPE].kind() == Datum::NONE) + { + using D = std::uniform_int_distribution; + D dists[] = + { + D{ 0, static_cast(kNumTypes_1 - 1) }, + D{ 0, static_cast(kNumTypes_2 - 1) }, + D{ 0, static_cast(kNumTypes_3 - 1) }, + }; + + const char **types[] = { Types_1, Types_2, Types_3 }; + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.part_to_generate + 1) * 
sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + size_t string_length = 0; + for(int ipart = 0; ipart < 3; ipart++) + { + uint8_t name_part_index = dists[ipart](tld.rng); + tld.string_indices[irow * 3 + ipart] = name_part_index; + string_length += std::strlen(types[ipart][name_part_index]); + } + // Add 4 because there is a space between each word (i.e. 2 spaces) + offsets[irow + 1] = offsets[irow] + string_length + 2; + } + // Add an extra byte for the space after in the very last string. + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate] + 1)); + char *strings = reinterpret_cast(string_buffer->mutable_data()); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + char *row = strings + offsets[irow]; + for(int ipart = 0; ipart < 3; ipart++) + { + uint8_t name_part_index = tld.string_indices[irow * 3 + ipart]; + const char *part = types[ipart][name_part_index]; + size_t length = std::strlen(part); + std::memcpy(row, part, length); + row += length; + *row++ = ' '; + } + } + ArrayData ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(string_buffer), std::move(offset_buff) }); + Datum datum(ad); + tld.part[PART::P_TYPE] = std::move(datum); + } + return Status::OK(); + } + + Status P_SIZE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_SIZE].kind() == Datum::NONE) + { + std::uniform_int_distribution dist(1, 50); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_SIZE)); + int32_t *p_size = reinterpret_cast( + tld.part[PART::P_SIZE].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.part_to_generate; i++) + p_size[i] = dist(tld.rng); + } + return Status::OK(); + } + + Status P_CONTAINER(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_CONTAINER].kind() == Datum::NONE) + { + std::uniform_int_distribution dist1(0, static_cast(kNumContainers_1 - 1)); + std::uniform_int_distribution dist2(0, static_cast(kNumContainers_2 - 1)); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_CONTAINER)); + char *p_container = reinterpret_cast( + tld.part[PART::P_CONTAINER].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_CONTAINER]); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + int container1_idx = dist1(tld.rng); + int container2_idx = dist2(tld.rng); + const char *container1 = Containers_1[container1_idx]; + const char *container2 = Containers_2[container2_idx]; + size_t container1_length = std::strlen(container1); + size_t container2_length = std::strlen(container2); + + char *row = p_container + byte_width * irow; + // Abuse strncpy to zero out the rest of the array + std::strncpy(row, container1, byte_width); + row[container1_length] = ' '; + std::memcpy(row + container1_length + 1, container2, container2_length); + } + } + return Status::OK(); + } + + Status P_RETAILPRICE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_RETAILPRICE].kind() == Datum::NONE) + { + RETURN_NOT_OK(P_PARTKEY(thread_index)); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_RETAILPRICE)); + const int32_t *p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + Decimal128 *p_retailprice = reinterpret_cast( + 
tld.part[PART::P_RETAILPRICE].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + int32_t partkey = p_partkey[irow]; + int64_t retail_price = (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); + p_retailprice[irow] = { retail_price }; + } + } + return Status::OK(); + } + + Status P_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.part[PART::P_COMMENT], g_text.GenerateComments(batch_size_, 5, 22, tld.rng)); + } + return Status::OK(); + } + + Status InitPartsupp(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + tld.generated_partsupp.reset(); + tld.partsupp.clear(); + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; + tld.partsupp.resize(num_batches); + for(std::vector &batch : tld.partsupp) + { + batch.clear(); + batch.resize(PARTSUPP::kNumCols); + } + return Status::OK(); + } + + Status AllocatePartSuppBatch(size_t thread_index, size_t ibatch, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + int32_t byte_width = arrow::internal::GetByteWidth(*partsupp_types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); + ArrayData ad(partsupp_types_[column], batch_size_, { nullptr, std::move(buff) }); + tld.partsupp[ibatch][column] = std::move(ad); + return Status::OK(); + } + + Status PS_PARTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_partsupp[PARTSUPP::PS_PARTKEY]) + { + tld.generated_partsupp[PARTSUPP::PS_PARTKEY] = true; + RETURN_NOT_OK(P_PARTKEY(thread_index)); + const int32_t *p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + int64_t ipartsupp = 0; + int64_t ipart = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + for(int64_t irow = 0; irow < ps_to_generate; ibatch++) + { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_PARTKEY)); + int32_t *ps_partkey = reinterpret_cast( + tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->buffers[1]->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + + int64_t batch_offset = 0; + for(int64_t irun = 0; irun < next_run;) + { + for(; ipartsupp < kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) + ps_partkey[batch_offset++] = p_partkey[ipart]; + if(ipartsupp == kPartSuppRowsPerPart) + { + ipartsupp = 0; + ipart++; + } + } + irow += next_run; + tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->length = batch_offset; + } + } + return Status::OK(); + } + + Status PS_SUPPKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_partsupp[PARTSUPP::PS_SUPPKEY]) + { + tld.generated_partsupp[PARTSUPP::PS_SUPPKEY] = true; + RETURN_NOT_OK(P_PARTKEY(thread_index)); + const int32_t *p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + int64_t ipartsupp = 0; + int64_t ipart = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + const int32_t S = scale_factor_ * 10000; + for(int64_t irow = 0; irow < ps_to_generate; ibatch++) + { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPKEY)); + int32_t 
*ps_suppkey = reinterpret_cast( + tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->buffers[1]->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + + int64_t batch_offset = 0; + for(int64_t irun = 0; irun < next_run;) + { + for(; ipartsupp < kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) + { + int32_t supplier = static_cast(ipartsupp); + int32_t partkey = p_partkey[ipart]; + ps_suppkey[batch_offset++] = (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; + } + if(ipartsupp == kPartSuppRowsPerPart) + { + ipartsupp = 0; + ipart++; + } + } + irow += next_run; + tld.partsupp[ibatch][PARTSUPP::PS_SUPPKEY].array()->length = batch_offset; + } + } + return Status::OK(); + } + + Status PS_AVAILQTY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_partsupp[PARTSUPP::PS_AVAILQTY]) + { + tld.generated_partsupp[PARTSUPP::PS_AVAILQTY] = true; + std::uniform_int_distribution dist(1, 9999); + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t ibatch = 0; + for(int64_t irow = 0; irow < ps_to_generate; ibatch++) + { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_AVAILQTY)); + int32_t *ps_availqty = reinterpret_cast( + tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->buffers[1]->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + for(int64_t irun = 0; irun < next_run; irun++) + ps_availqty[irun] = dist(tld.rng); + + tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run; + irow += next_run; + } + } + return Status::OK(); + } + + Status PS_SUPPLYCOST(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_partsupp[PARTSUPP::PS_SUPPLYCOST]) + { + tld.generated_partsupp[PARTSUPP::PS_SUPPLYCOST] = true; + std::uniform_int_distribution dist(100, 100000); + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t ibatch = 0; + for(int64_t irow = 0; irow < ps_to_generate; ibatch++) + { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPLYCOST)); + Decimal128 *ps_supplycost = reinterpret_cast( + tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST].array()->buffers[1]->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + for(int64_t irun = 0; irun < next_run; irun++) + ps_supplycost[irun] = { dist(tld.rng) }; + + tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run; + irow += next_run; + } + } + return Status::OK(); + } + + Status PS_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PARTSUPP::PS_COMMENT].kind() == Datum::NONE) + { + int64_t irow = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) + { + int64_t num_rows = std::min(batch_size_, ps_to_generate - irow); + ARROW_ASSIGN_OR_RAISE( + tld.partsupp[ibatch][PARTSUPP::PS_COMMENT], g_text.GenerateComments(num_rows, 49, 198, tld.rng)); + irow += num_rows; + } + } + return Status::OK(); + } + + struct ThreadLocalData + { + std::vector part; + std::vector string_indices; + int64_t part_to_generate; + int64_t partkey_start; + + std::vector> partsupp; + std::bitset generated_partsupp; + random::pcg32_fast rng; + }; + std::vector thread_local_data_; + + bool inited_ = false; + std::mutex part_output_queue_mutex_; + std::mutex partsupp_output_queue_mutex_; + 
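+        // Batches produced for whichever table was *not* requested are
+        // buffered in these queues (guarded by the mutexes above) until a
+        // matching NextPartBatch / NextPartSuppBatch call drains them.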
std::queue part_output_queue_; + std::queue partsupp_output_queue_; + int64_t batch_size_; + int scale_factor_; + int64_t part_rows_to_generate_; + int64_t part_rows_generated_; + std::vector part_cols_; + std::vector partsupp_cols_; + + static constexpr int64_t kPartSuppRowsPerPart = 4; + }; + + class OrdersAndLineItemGenerator + { + public: + Status Init( + size_t num_threads, + int64_t batch_size, + int scale_factor) + { + if(!inited_) + { + inited_ = true; + batch_size_ = batch_size; + scale_factor_ = scale_factor; + + thread_local_data_.resize(num_threads); + for(ThreadLocalData &tld : thread_local_data_) + { + tld.items_per_order.resize(batch_size_); + } + orders_rows_to_generate_ = scale_factor_ * 150000 * 10; + } + return Status::OK(); + } + + Result> SetOrdersOutputColumns(const std::vector &cols) + { + return SetOutputColumns(cols, orders_types_, orders_name_map_, orders_cols_); + } + + Result> SetLineItemOutputColumns(const std::vector &cols) + { + return SetOutputColumns(cols, lineitem_types_, lineitem_name_map_, lineitem_cols_); + } + + Result> NextOrdersBatch(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(orders_output_queue_mutex_); + if(!orders_output_queue_.empty()) + { + ExecBatch batch = std::move(orders_output_queue_.front()); + orders_output_queue_.pop(); + return std::move(batch); + } + else if(orders_rows_generated_ == orders_rows_to_generate_) + { + return util::nullopt; + } + else + { + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = std::min( + batch_size_, + orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); + } + } + tld.orders.clear(); + tld.orders.resize(ORDERS::kNumCols); + RETURN_NOT_OK(GenerateRowCounts(thread_index)); + tld.first_batch_offset = 0; + tld.generated_lineitem.reset(); + + for(int col : orders_cols_) + RETURN_NOT_OK(orders_generators_[col](thread_index)); + for(int col : lineitem_cols_) + RETURN_NOT_OK(lineitem_generators_[col](thread_index)); + + std::vector orders_result(orders_cols_.size()); + for(size_t i = 0; i < orders_cols_.size(); i++) + { + int col_idx = orders_cols_[i]; + orders_result[i] = tld.orders[col_idx]; + } + if(!lineitem_cols_.empty()) + { + std::vector lineitem_results; + for(size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) + { + std::vector lineitem_result(lineitem_cols_.size()); + for(size_t icol = 0; icol < lineitem_cols_.size(); icol++) + { + int col_idx = lineitem_cols_[icol]; + lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); + lineitem_results.emplace_back(std::move(eb)); + } + { + std::lock_guard guard(lineitem_output_queue_mutex_); + for(ExecBatch &eb : lineitem_results) + { + lineitem_output_queue_.emplace(std::move(eb)); + } + } + } + return ExecBatch::Make(std::move(orders_result)); + } + + Result> NextLineItemBatch(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ExecBatch queued; + bool from_queue = false; + { + std::lock_guard lock(lineitem_output_queue_mutex_); + if(!lineitem_output_queue_.empty()) + { + queued = std::move(lineitem_output_queue_.front()); + lineitem_output_queue_.pop(); + from_queue = true; + } + } + tld.first_batch_offset = 0; + if(from_queue) + { + ARROW_DCHECK(queued.length <= batch_size_); + tld.first_batch_offset = queued.length; + 
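+            // A partially filled batch pulled from the queue becomes the first
+            // output batch: first_batch_offset records how many rows it already
+            // holds, and the column generators below append rows after that point.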
if(queued.length == batch_size_) + return std::move(queued); + } + { + std::lock_guard lock(orders_output_queue_mutex_); + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = std::min( + batch_size_, + orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); + if(orders_rows_generated_ == orders_rows_to_generate_) + { + if(from_queue) + return std::move(queued); + return util::nullopt; + } + } + tld.orders.clear(); + tld.orders.resize(ORDERS::kNumCols); + RETURN_NOT_OK(GenerateRowCounts(thread_index)); + tld.generated_lineitem.reset(); + if(from_queue) + { + for(size_t i = 0; i < lineitem_cols_.size(); i++) + if(tld.lineitem[0][lineitem_cols_[i]].kind() == Datum::NONE) + tld.lineitem[0][lineitem_cols_[i]] = std::move(queued[i]); + } + + for(int col : orders_cols_) + RETURN_NOT_OK(orders_generators_[col](thread_index)); + for(int col : lineitem_cols_) + RETURN_NOT_OK(lineitem_generators_[col](thread_index)); + + if(!orders_cols_.empty()) + { + std::vector orders_result(orders_cols_.size()); + for(size_t i = 0; i < orders_cols_.size(); i++) + { + int col_idx = orders_cols_[i]; + orders_result[i] = tld.orders[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch orders_batch, ExecBatch::Make(std::move(orders_result))); + { + std::lock_guard lock(orders_output_queue_mutex_); + orders_output_queue_.emplace(std::move(orders_batch)); + } + } + std::vector lineitem_results; + for(size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) + { + std::vector lineitem_result(lineitem_cols_.size()); + for(size_t icol = 0; icol < lineitem_cols_.size(); icol++) + { + int col_idx = lineitem_cols_[icol]; + lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); + lineitem_results.emplace_back(std::move(eb)); + } + // Return the first batch, enqueue the rest. 
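+            // As with PART/PARTSUPP, ORDERS and LINEITEM are generated together:
+            // each order gets a random number of line items, so one batch of
+            // orders spills into several LINEITEM batches; the extras beyond the
+            // first are pushed onto lineitem_output_queue_ below for later calls.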
+ { + std::lock_guard lock(lineitem_output_queue_mutex_); + for(size_t i = 1; i < lineitem_results.size(); i++) + lineitem_output_queue_.emplace(std::move(lineitem_results[i])); + } + return std::move(lineitem_results[0]); + } + + private: +#define FOR_EACH_ORDERS_COLUMN(F) \ + F(O_ORDERKEY) \ + F(O_CUSTKEY) \ + F(O_ORDERSTATUS) \ + F(O_TOTALPRICE) \ + F(O_ORDERDATE) \ + F(O_ORDERPRIORITY) \ + F(O_CLERK) \ + F(O_SHIPPRIORITY) \ + F(O_COMMENT) + +#define FOR_EACH_LINEITEM_COLUMN(F) \ + F(L_ORDERKEY) \ + F(L_PARTKEY) \ + F(L_SUPPKEY) \ + F(L_LINENUMBER) \ + F(L_QUANTITY) \ + F(L_EXTENDEDPRICE) \ + F(L_DISCOUNT) \ + F(L_TAX) \ + F(L_RETURNFLAG) \ + F(L_LINESTATUS) \ + F(L_SHIPDATE) \ + F(L_COMMITDATE) \ + F(L_RECEIPTDATE) \ + F(L_SHIPINSTRUCT) \ + F(L_SHIPMODE) \ + F(L_COMMENT) + +#define MAKE_ENUM(col) col, + struct ORDERS + { + enum + { + FOR_EACH_ORDERS_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; + struct LINEITEM + { + enum + { + FOR_EACH_LINEITEM_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; + +#define MAKE_STRING_MAP(col) \ + { #col, ORDERS::col }, + const std::unordered_map orders_name_map_ = + { + FOR_EACH_ORDERS_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_STRING_MAP(col) \ + { #col, LINEITEM::col }, + const std::unordered_map lineitem_name_map_ = + { + FOR_EACH_LINEITEM_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector orders_generators_ = + { + FOR_EACH_ORDERS_COLUMN(MAKE_FN_ARRAY) + }; + std::vector lineitem_generators_ = + { + FOR_EACH_LINEITEM_COLUMN(MAKE_FN_ARRAY) + }; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_LINEITEM_COLUMN +#undef FOR_EACH_ORDERS_COLUMN + + const std::vector> orders_types_ = + { + int32(), + int32(), + fixed_size_binary(1), + decimal(12, 2), + date32(), + fixed_size_binary(15), + fixed_size_binary(15), + int32(), + utf8() + }; + + const std::vector> lineitem_types_ = + { + int32(), + int32(), + int32(), + int32(), + decimal(12, 2), + decimal(12, 2), + decimal(12, 2), + decimal(12, 2), + fixed_size_binary(1), + fixed_size_binary(1), + date32(), + date32(), + date32(), + fixed_size_binary(25), + fixed_size_binary(10), + utf8(), + }; + + Status AllocateOrdersBatch(size_t thread_index, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.orders[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.orders_to_generate * byte_width)); + ArrayData ad(orders_types_[column], tld.orders_to_generate, { nullptr, std::move(buff) }); + tld.orders[column] = std::move(ad); + return Status::OK(); + } + + Status O_ORDERKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_ORDERKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERKEY)); + int32_t *o_orderkey = reinterpret_cast( + tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + { + o_orderkey[i] = (tld.orderkey_start + i + 1); + ARROW_DCHECK(1 <= o_orderkey[i] && o_orderkey[i] <= orders_rows_to_generate_); + } + } + return Status::OK(); + } + + Status O_CUSTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_CUSTKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, 
ORDERS::O_CUSTKEY)); + + // Spec says it must be a random number between 1 and SF*150000 that is not + // divisible by 3. Rather than repeatedly generating numbers until we get to + // a non-divisible-by-3 number, we just generate a number between + // 0 and SF * 50000 - 1, multiply by 3, and then add either 1 or 2. + std::uniform_int_distribution base_dist(0, scale_factor_ * 50000 - 1); + std::uniform_int_distribution offset_dist(1, 2); + int32_t *o_custkey = reinterpret_cast( + tld.orders[ORDERS::O_CUSTKEY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + o_custkey[i] = 3 * base_dist(tld.rng) + offset_dist(tld.rng); + } + return Status::OK(); + } + + Status O_ORDERSTATUS(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_ORDERSTATUS].kind() == Datum::NONE) + { + RETURN_NOT_OK(L_LINESTATUS(thread_index)); + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERSTATUS)); + + char *o_orderstatus = reinterpret_cast( + tld.orders[ORDERS::O_ORDERSTATUS].array()->buffers[1]->mutable_data()); + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + bool all_f = true; + bool all_o = true; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + const char *l_linestatus = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->buffers[1]->data()); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++, batch_offset++) + { + all_f &= l_linestatus[batch_offset] == 'F'; + all_o &= l_linestatus[batch_offset] == 'O'; + } + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + ARROW_DCHECK(!(all_f && all_o)); + if(all_f) + o_orderstatus[iorder] = 'F'; + else if(all_o) + o_orderstatus[iorder] = 'O'; + else + o_orderstatus[iorder] = 'P'; + iorder++; + } + } + irow += next_run; + batch_offset = 0; + } + } + return Status::OK(); + } + + Status O_TOTALPRICE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_TOTALPRICE].kind() == Datum::NONE) + { + RETURN_NOT_OK(L_EXTENDEDPRICE(thread_index)); + RETURN_NOT_OK(L_TAX(thread_index)); + RETURN_NOT_OK(L_DISCOUNT(thread_index)); + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_TOTALPRICE)); + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + int64_t sum = 0; + Decimal128 *o_totalprice = reinterpret_cast( + tld.orders[ORDERS::O_TOTALPRICE].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + + const Decimal128 *l_extendedprice = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->buffers[1]->data()); + const Decimal128 *l_tax = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->data()); + const Decimal128 *l_discount = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->buffers[1]->data()); + + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++, 
batch_offset++) + { + int64_t eprice = static_cast(l_extendedprice[batch_offset]); + int64_t tax = static_cast(l_tax[batch_offset]); + int64_t discount = static_cast(l_discount[batch_offset]); + sum += (eprice * (100 + tax) * (100 - discount)); + } + if(iline == tld.items_per_order[iorder]) + { + sum /= 100 * 100; + o_totalprice[iorder] = { sum }; + iline = 0; + iorder++; + } + } + irow += next_run; + batch_offset = 0; + } + } + return Status::OK(); + } + + Status O_ORDERDATE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_ORDERDATE].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERDATE)); + + std::uniform_int_distribution dist(STARTDATE, ENDDATE - 151); + uint32_t *o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + o_orderdate[i] = dist(tld.rng); + } + return Status::OK(); + } + + Status O_ORDERPRIORITY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_ORDERPRIORITY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERPRIORITY)); + int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[ORDERS::O_ORDERPRIORITY]); + std::uniform_int_distribution dist(0, kNumPriorities - 1); + char *o_orderpriority = reinterpret_cast( + tld.orders[ORDERS::O_ORDERPRIORITY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + { + const char *str = Priorities[dist(tld.rng)]; + std::strncpy(o_orderpriority + i * byte_width, str, byte_width); + } + } + return Status::OK(); + } + + Status O_CLERK(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_CLERK].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); + int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[ORDERS::O_CLERK]); + std::uniform_int_distribution dist(1, scale_factor_ * 1000); + char *o_clerk = reinterpret_cast( + tld.orders[ORDERS::O_CLERK].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + { + const char *clerk = "Clerk#"; + const size_t clerk_length = std::strlen(clerk); + int64_t clerk_number = dist(tld.rng); + char *output = o_clerk + i * byte_width; + std::strncpy(output, clerk, byte_width); + AppendNumberPaddedToNineDigits(output + clerk_length, clerk_number); + } + } + return Status::OK(); + } + + Status O_SHIPPRIORITY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_SHIPPRIORITY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_SHIPPRIORITY)); + int32_t *o_shippriority = reinterpret_cast( + tld.orders[ORDERS::O_SHIPPRIORITY].array()->buffers[1]->mutable_data()); + std::memset(o_shippriority, 0, tld.orders_to_generate * sizeof(int32_t)); + } + return Status::OK(); + } + + Status O_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.orders[ORDERS::O_COMMENT], g_text.GenerateComments(batch_size_, 19, 78, tld.rng)); + } + return Status::OK(); + } + + Status GenerateRowCounts(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + std::uniform_int_distribution length_dist(1, 7); 
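// Illustrative sketch (not part of the patch): the O_TOTALPRICE arithmetic above, done
// directly on decimal(12, 2) values stored as integers scaled by 100. (100 + tax) and
// (100 - discount) are the scaled forms of (1 + L_TAX) and (1 - L_DISCOUNT), so the
// product carries a factor of 100^3 and dividing by 100 * 100 brings it back to the
// cents scale. Note that the running `sum` above does not appear to be reset between
// orders; this sketch accumulates per order.
#include <cstdint>
#include <cstdio>
#include <vector>

struct LineItemCents {
  int64_t extendedprice;  // 123456 represents 1234.56
  int64_t tax;            // 8 represents 0.08
  int64_t discount;       // 5 represents 0.05
};

int64_t TotalPriceCents(const std::vector<LineItemCents>& items) {
  int64_t sum = 0;
  for (const LineItemCents& li : items)
    sum += li.extendedprice * (100 + li.tax) * (100 - li.discount);
  return sum / (100 * 100);
}

int main() {
  // 1000.00 * 1.08 * 0.95 = 1026.00, i.e. 102600 at the cents scale.
  std::printf("%lld\n", static_cast<long long>(TotalPriceCents({{100000, 8, 5}})));
}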
+ tld.lineitem_to_generate = 0; + tld.items_per_order.clear(); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + { + int64_t length = length_dist(tld.rng); + tld.items_per_order.push_back(length); + tld.lineitem_to_generate += length; + } + size_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; + tld.lineitem.clear(); + tld.lineitem.resize(num_batches); + for(std::vector &batch : tld.lineitem) + { + batch.clear(); + batch.resize(LINEITEM::kNumCols); + } + return Status::OK(); + } + + Status AllocateLineItemBufferIfNeeded(size_t thread_index, size_t ibatch, int column, size_t &out_batch_offset) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.lineitem[ibatch][column].kind() == Datum::NONE) + { + int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); + ArrayData ad(lineitem_types_[column], batch_size_, { nullptr, std::move(buff) }); + tld.lineitem[ibatch][column] = std::move(ad); + out_batch_offset = 0; + } + if(ibatch == 0) + out_batch_offset = tld.first_batch_offset; + return Status::OK(); + } + + Status L_ORDERKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_ORDERKEY]) + { + tld.generated_lineitem[LINEITEM::L_ORDERKEY] = true; + RETURN_NOT_OK(O_ORDERKEY(thread_index)); + const int32_t *o_orderkey = reinterpret_cast( + tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_ORDERKEY, batch_offset)); + int32_t *l_linenumber = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_ORDERKEY].array()->buffers[1]->mutable_data()); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_linenumber[batch_offset++] = o_orderkey[iorder]; + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_ORDERKEY].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_PARTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_PARTKEY]) + { + tld.generated_lineitem[LINEITEM::L_PARTKEY] = true; + + size_t ibatch = 0; + std::uniform_int_distribution dist(1, scale_factor_ * 200000); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_PARTKEY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t *l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < next_run; i++, batch_offset++) + l_partkey[batch_offset] = dist(tld.rng); + + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + 
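// Illustrative sketch (not part of the patch): how one thread's chunk of lineitem rows
// is laid out across fixed-size batches, as GenerateRowCounts() above sets up. The
// first batch may already be partially filled (first_batch_offset), so the batch count
// comes from first_batch_offset + total rows, and every batch then takes a run of
// min(rows remaining, space remaining). All sizes below are demo values.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t batch_size = 32768;
  const int64_t first_batch_offset = 1000;  // rows carried over from the previous chunk
  const int64_t lineitem_to_generate = 70000;

  const int64_t num_batches =
      (first_batch_offset + lineitem_to_generate + batch_size - 1) / batch_size;
  std::printf("batches needed: %lld\n", static_cast<long long>(num_batches));

  int64_t batch_offset = first_batch_offset;
  for (int64_t irow = 0, ibatch = 0; irow < lineitem_to_generate; ibatch++) {
    const int64_t remaining_in_batch = batch_size - batch_offset;
    const int64_t next_run = std::min(lineitem_to_generate - irow, remaining_in_batch);
    std::printf("batch %lld: rows [%lld, %lld)\n", static_cast<long long>(ibatch),
                static_cast<long long>(batch_offset),
                static_cast<long long>(batch_offset + next_run));
    irow += next_run;
    batch_offset = 0;  // every batch after the first starts at offset 0
  }
}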
+ Status L_SUPPKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_SUPPKEY]) + { + tld.generated_lineitem[LINEITEM::L_SUPPKEY] = true; + RETURN_NOT_OK(L_PARTKEY(thread_index)); + + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 3); + const int32_t S = scale_factor_ * 10000; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset = 0; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SUPPKEY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t *l_suppkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SUPPKEY].array()->buffers[1]->mutable_data()); + const int32_t *l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); + for(int64_t i = 0; i < next_run; i++) + { + int32_t supplier = dist(tld.rng); + int32_t partkey = l_partkey[batch_offset]; + // Fun fact: the parentheses for this expression are unbalanced in the TPC-H spec. + l_suppkey[batch_offset++] = (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SUPPKEY].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_LINENUMBER(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_LINENUMBER]) + { + tld.generated_lineitem[LINEITEM::L_LINENUMBER] = true; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_LINENUMBER, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t *l_linenumber = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_LINENUMBER].array()->buffers[1]->mutable_data()); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + { + l_linenumber[batch_offset++] = (iline + 1); + ARROW_DCHECK(1 <= (iline + 1) && (iline + 1) <= 7); + } + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_LINENUMBER].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_QUANTITY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_QUANTITY]) + { + tld.generated_lineitem[LINEITEM::L_QUANTITY] = true; + + size_t ibatch = 0; + std::uniform_int_distribution dist(1, 50); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_QUANTITY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128 *l_quantity = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < next_run; i++) + { + // Multiply by 100 because the type is decimal(12, 2), so the decimal goes after 
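// Illustrative sketch (not part of the patch): the L_SUPPKEY formula above (TPC-H
// section 4.2.3), which maps a part key and a supplier slot in [0, 3] to one of the
// S = SF * 10000 supplier keys, always landing in [1, S]. Scale factor is a demo value.
#include <cassert>
#include <cstdint>
#include <cstdio>

int32_t SuppKeyFor(int32_t partkey, int32_t supplier /* 0..3 */, int32_t S) {
  return (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1;
}

int main() {
  const int32_t scale_factor = 1;  // demo assumption
  const int32_t S = scale_factor * 10000;
  for (int32_t partkey = 1; partkey <= scale_factor * 200000; partkey++)
    for (int32_t supplier = 0; supplier < 4; supplier++) {
      const int32_t k = SuppKeyFor(partkey, supplier, S);
      assert(1 <= k && k <= S);
    }
  std::printf("suppliers of part 42: %d %d %d %d\n", SuppKeyFor(42, 0, S),
              SuppKeyFor(42, 1, S), SuppKeyFor(42, 2, S), SuppKeyFor(42, 3, S));
}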
two digits + int64_t quantity = dist(tld.rng) * 100; + l_quantity[batch_offset++] = { quantity }; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_EXTENDEDPRICE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE]) + { + tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE] = true; + RETURN_NOT_OK(L_PARTKEY(thread_index)); + RETURN_NOT_OK(L_QUANTITY(thread_index)); + size_t ibatch = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_EXTENDEDPRICE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + const int32_t *l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); + const Decimal128 *l_quantity = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->buffers[1]->data()); + Decimal128 *l_extendedprice = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + int64_t partkey = static_cast(l_partkey[batch_offset]); + // Divide by 100 to recover the integer representation (not Decimal). + int64_t quantity = static_cast(l_quantity[batch_offset]) / 100; + + // Spec says to divide by 100, but that happens automatically due to this being stored + // to two decimal points. + int64_t retail_price = (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); + int64_t extended_price = retail_price * quantity; + l_extendedprice[batch_offset] = { extended_price }; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_DISCOUNT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_DISCOUNT]) + { + tld.generated_lineitem[LINEITEM::L_DISCOUNT] = true; + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 10); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_DISCOUNT, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128 *l_discount = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + l_discount[batch_offset] = { dist(tld.rng) }; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_TAX(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_TAX]) + { + tld.generated_lineitem[LINEITEM::L_TAX] = true; + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 8); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_TAX, batch_offset)); + int64_t 
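// Illustrative sketch (not part of the patch): the retail-price / extended-price
// arithmetic above, carried out on the cents-scaled decimal(12, 2) representation. The
// spec's "divide by 100" never has to happen explicitly because the stored value keeps
// two decimal places.
#include <cstdint>
#include <cstdio>

int64_t RetailPriceCents(int64_t partkey) {
  return 90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000);
}

int main() {
  const int64_t partkey = 12345;
  const int64_t quantity = 7;                         // L_QUANTITY as a plain integer
  const int64_t retail = RetailPriceCents(partkey);   // 90000 + 1234 + 34500 = 125734
  const int64_t extended = retail * quantity;         // 880138, i.e. 8801.38
  std::printf("retail %lld extended %lld\n", static_cast<long long>(retail),
              static_cast<long long>(extended));
}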
remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128 *l_tax = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + l_tax[batch_offset] = { dist(tld.rng) }; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_RETURNFLAG(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_RETURNFLAG]) + { + tld.generated_lineitem[LINEITEM::L_RETURNFLAG] = true; + RETURN_NOT_OK(L_RECEIPTDATE(thread_index)); + size_t ibatch = 0; + std::uniform_int_distribution dist; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_RETURNFLAG, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char *l_returnflag = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG].array()->buffers[1]->mutable_data()); + const uint32_t *l_receiptdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + if(l_receiptdate[batch_offset] <= CURRENTDATE) + { + uint32_t r = dist(tld.rng); + l_returnflag[batch_offset] = (r % 2 == 1) ? 'R' : 'A'; + } + else + { + l_returnflag[batch_offset] = 'N'; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_LINESTATUS(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_LINESTATUS]) + { + tld.generated_lineitem[LINEITEM::L_LINESTATUS] = true; + RETURN_NOT_OK(L_SHIPDATE(thread_index)); + size_t ibatch = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_LINESTATUS, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char *l_linestatus = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->buffers[1]->mutable_data()); + const uint32_t *l_shipdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + if(l_shipdate[batch_offset] > CURRENTDATE) + l_linestatus[batch_offset] = 'O'; + else + l_linestatus[batch_offset] = 'F'; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPDATE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_SHIPDATE]) + { + tld.generated_lineitem[LINEITEM::L_SHIPDATE] = true; + RETURN_NOT_OK(O_ORDERDATE(thread_index)); + const int32_t *o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); + std::uniform_int_distribution dist(1, 121); + size_t ibatch = 0; + 
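// Illustrative sketch (not part of the patch): the L_RETURNFLAG / L_LINESTATUS rules
// above as plain functions of the date columns. Dates are date32 day numbers since
// 1970-01-01; kCurrentDate is a stand-in for the CURRENTDATE constant defined elsewhere
// in this file (TPC-H pins it at 1995-06-17, which is day 9298).
#include <cstdint>
#include <cstdio>
#include <random>

constexpr uint32_t kCurrentDate = 9298;

char LineStatus(uint32_t shipdate) { return shipdate > kCurrentDate ? 'O' : 'F'; }

char ReturnFlag(uint32_t receiptdate, std::mt19937& rng) {
  if (receiptdate > kCurrentDate) return 'N';  // not yet received
  return (rng() % 2 == 1) ? 'R' : 'A';         // received: randomly returned or accepted
}

int main() {
  std::mt19937 rng(0);
  std::printf("%c %c\n", ReturnFlag(9200, rng), LineStatus(9400));
}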
size_t iorder = 0; + int32_t iline = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPDATE, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t *l_shipdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_shipdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_COMMITDATE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_COMMITDATE]) + { + tld.generated_lineitem[LINEITEM::L_COMMITDATE] = true; + const int32_t *o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); + std::uniform_int_distribution dist(30, 90); + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_COMMITDATE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t *l_commitdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_COMMITDATE].array()->buffers[1]->mutable_data()); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_commitdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_COMMITDATE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_RECEIPTDATE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_RECEIPTDATE]) + { + tld.generated_lineitem[LINEITEM::L_RECEIPTDATE] = true; + RETURN_NOT_OK(L_SHIPDATE(thread_index)); + size_t ibatch = 0; + std::uniform_int_distribution dist(1, 30); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_RECEIPTDATE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t *l_receiptdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->buffers[1]->mutable_data()); + const uint32_t *l_shipdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + l_receiptdate[batch_offset] = l_shipdate[batch_offset] + dist(tld.rng); + + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + 
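// Illustrative sketch (not part of the patch): the date relationships above, with every
// value a date32 day number. SHIPDATE falls 1-121 days after O_ORDERDATE, COMMITDATE
// 30-90 days after O_ORDERDATE, and RECEIPTDATE 1-30 days after SHIPDATE.
#include <cstdint>
#include <cstdio>
#include <random>

int main() {
  std::mt19937 rng(7);
  const uint32_t orderdate = 9000;  // arbitrary demo day number
  const uint32_t shipdate = orderdate + std::uniform_int_distribution<uint32_t>(1, 121)(rng);
  const uint32_t commitdate = orderdate + std::uniform_int_distribution<uint32_t>(30, 90)(rng);
  const uint32_t receiptdate = shipdate + std::uniform_int_distribution<uint32_t>(1, 30)(rng);
  std::printf("order %u ship %u commit %u receipt %u\n", orderdate, shipdate, commitdate,
              receiptdate);
}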
Status L_SHIPINSTRUCT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT]) + { + tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT] = true; + int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[LINEITEM::L_SHIPINSTRUCT]); + size_t ibatch = 0; + std::uniform_int_distribution dist(0, kNumInstructions - 1); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPINSTRUCT, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char *l_shipinstruct = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + const char *str = Instructions[dist(tld.rng)]; + // Note that we don't have to memset the buffer to 0 because strncpy pads each string + // with 0's anyway + std::strncpy(l_shipinstruct + batch_offset * byte_width, str, byte_width); + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPMODE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_SHIPMODE]) + { + tld.generated_lineitem[LINEITEM::L_SHIPMODE] = true; + int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[LINEITEM::L_SHIPMODE]); + size_t ibatch = 0; + std::uniform_int_distribution dist(0, kNumModes - 1); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPMODE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char *l_shipmode = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPMODE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + const char *str = Modes[dist(tld.rng)]; + std::strncpy(l_shipmode + batch_offset * byte_width, str, byte_width); + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPMODE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_COMMENT]) + { + tld.generated_lineitem[LINEITEM::L_COMMENT] = true; + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + // Comments are kind of sneaky: we always generate the full batch and then just bump the length + if(tld.lineitem[ibatch][LINEITEM::L_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.lineitem[ibatch][LINEITEM::L_COMMENT], g_text.GenerateComments(batch_size_, 10, 43, tld.rng)); + batch_offset = 0; + } + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + + batch_offset += next_run; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_COMMENT].array()->length = batch_offset; + } + } + return Status::OK(); + } + + struct 
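// Illustrative sketch (not part of the patch): why the fixed_size_binary columns above
// can skip a memset. std::strncpy copies the source and then fills the rest of the
// destination with '\0' up to the requested width, so every fixed-width slot ends up
// fully initialized by the single call.
#include <cstdio>
#include <cstring>

int main() {
  char slot[10];
  std::memset(slot, 'x', sizeof(slot));      // pretend the buffer holds garbage
  std::strncpy(slot, "RAIL", sizeof(slot));  // copies "RAIL", zero-fills the remainder
  for (size_t i = 0; i < sizeof(slot); i++)
    std::printf("%d ", slot[i]);             // 82 65 73 76 0 0 0 0 0 0
  std::printf("\n");
}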
ThreadLocalData + { + std::vector orders; + int64_t orders_to_generate; + int64_t orderkey_start; + + std::vector> lineitem; + std::vector items_per_order; + int64_t lineitem_to_generate; + int64_t first_batch_offset; + std::bitset generated_lineitem; + random::pcg32_fast rng; + }; + std::vector thread_local_data_; + + bool inited_ = false; + std::mutex orders_output_queue_mutex_; + std::mutex lineitem_output_queue_mutex_; + std::queue orders_output_queue_; + std::queue lineitem_output_queue_; + int64_t batch_size_; + int scale_factor_; + int64_t orders_rows_to_generate_; + int64_t orders_rows_generated_; + std::vector orders_cols_; + std::vector lineitem_cols_; + }; + + class SupplierGenerator : public TpchTableGenerator + { + public: + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + rows_to_generate_ = scale_factor_ * 10000; + rows_generated_.store(0); + ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( + columns, + types_, + name_map_, + gen_list_)); + + random::pcg32_fast rng; + std::uniform_int_distribution dist(0, rows_to_generate_ - 1); + size_t num_special_rows = static_cast(5 * scale_factor_); + std::unordered_set good_rows_set; + while(good_rows_set.size() < num_special_rows) + { + good_rows_set.insert(dist(rng)); + } + std::unordered_set bad_rows_set; + while(bad_rows_set.size() < num_special_rows) + { + int64_t bad_row; + do + { + bad_row = dist(rng); + } while(good_rows_set.find(bad_row) != good_rows_set.end()); + } + good_rows_.clear(); + bad_rows_.clear(); + good_rows_.insert(good_rows_.end(), good_rows_set.begin(), good_rows_set.end()); + bad_rows_.insert(bad_rows_.end(), bad_rows_set.begin(), bad_rows_set.end()); + std::sort(good_rows_.begin(), good_rows_.end()); + std::sort(bad_rows_.begin(), bad_rows_.end()); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + thread_local_data_.resize(num_threads); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: +#define FOR_EACH_COLUMN(F) \ + F(S_SUPPKEY) \ + F(S_NAME) \ + F(S_ADDRESS) \ + F(S_NATIONKEY) \ + F(S_PHONE) \ + F(S_ACCTBAL) \ + F(S_COMMENT) + +#define MAKE_ENUM(col) col, + struct SUPPLIER + { + enum + { + FOR_EACH_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; +#undef MAKE_ENUM +#define MAKE_STRING_MAP(col) \ + { #col, SUPPLIER::col }, + const std::unordered_map name_map_ = + { + FOR_EACH_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector generators_ = + { + FOR_EACH_COLUMN(MAKE_FN_ARRAY) + }; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_COLUMN + + std::vector> types_ = + { + int32(), + fixed_size_binary(25), + utf8(), + int32(), + fixed_size_binary(15), + decimal(12, 2), + utf8(), + }; + + Status ProduceCallback(size_t thread_index) + { + if(done_.load()) + return Status::OK(); + ThreadLocalData &tld = thread_local_data_[thread_index]; + tld.suppkey_start = rows_generated_.fetch_add(batch_size_); + if(tld.suppkey_start >= rows_to_generate_) + return Status::OK(); + + 
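// Illustrative sketch (not part of the patch): drawing the two disjoint sets of
// "special" supplier rows that Init() above intends for the Recommends/Complaints
// comments. Note that the second while loop in the draft never inserts bad_row into
// bad_rows_set, so as written it would not terminate; the sketch includes the missing
// insert. Sizes are demo values.
#include <algorithm>
#include <cstdint>
#include <random>
#include <unordered_set>
#include <vector>

int main() {
  const int64_t rows = 10000;    // stands in for SF * 10000
  const size_t num_special = 5;  // stands in for 5 * SF
  std::mt19937_64 rng(123);
  std::uniform_int_distribution<int64_t> dist(0, rows - 1);

  std::unordered_set<int64_t> good_set;
  while (good_set.size() < num_special) good_set.insert(dist(rng));

  std::unordered_set<int64_t> bad_set;
  while (bad_set.size() < num_special) {
    const int64_t candidate = dist(rng);
    if (good_set.count(candidate) == 0) bad_set.insert(candidate);  // keep the sets disjoint
  }

  // Sorted vectors enable the std::lower_bound lookup done per batch later on.
  std::vector<int64_t> good_rows(good_set.begin(), good_set.end());
  std::vector<int64_t> bad_rows(bad_set.begin(), bad_set.end());
  std::sort(good_rows.begin(), good_rows.end());
  std::sort(bad_rows.begin(), bad_rows.end());
  return 0;
}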
tld.to_generate = std::min(batch_size_, + rows_to_generate_ - tld.suppkey_start); + bool is_last_batch = tld.to_generate < batch_size_; + + tld.batch.clear(); + tld.batch.resize(SUPPLIER::kNumCols); + for(int col : gen_list_) + RETURN_NOT_OK(generators_[col](thread_index)); + + std::vector result(gen_list_.size()); + for(size_t i = 0; i < gen_list_.size(); i++) + { + int col_idx = gen_list_[i]; + result[i] = tld.batch[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); + batches_generated_++; + output_callback_(std::move(eb)); + if(is_last_batch) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + Status AllocateColumn(size_t thread_index, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); + ArrayData ad(types_[column], tld.to_generate, { nullptr, std::move(buff) }); + tld.batch[column] = std::move(ad); + return Status::OK(); + } + + Status S_SUPPKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_SUPPKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_SUPPKEY)); + int32_t *s_suppkey = reinterpret_cast( + tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + s_suppkey[irow] = (tld.suppkey_start + irow + 1); + } + } + return Status::OK(); + } + + Status S_NAME(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_NAME].kind() == Datum::NONE) + { + RETURN_NOT_OK(S_SUPPKEY(thread_index)); + const int32_t *s_suppkey = reinterpret_cast( + tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->data()); + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NAME)); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[SUPPLIER::S_NAME]); + char *s_name = reinterpret_cast( + tld.batch[SUPPLIER::S_NAME].array()->buffers[1]->mutable_data()); + // Look man, I'm just following the spec ok? 
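// Illustrative sketch (not part of the patch): the lock-free work distribution used by
// ProduceCallback() above. Each call claims the next block of row ids with fetch_add; a
// start at or past the total row count means nothing is left, and a short final block
// is what signals the last batch (the generators rely on the final block being shorter
// than batch_size).
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdio>

std::atomic<int64_t> rows_generated{0};
constexpr int64_t kRowsToGenerate = 100000;
constexpr int64_t kBatchSize = 32768;

void ProduceOneChunk() {
  const int64_t start = rows_generated.fetch_add(kBatchSize);
  if (start >= kRowsToGenerate) return;  // another call already claimed the tail
  const int64_t to_generate = std::min(kBatchSize, kRowsToGenerate - start);
  const bool is_last_batch = to_generate < kBatchSize;
  std::printf("rows [%lld, %lld) last=%d\n", static_cast<long long>(start),
              static_cast<long long>(start + to_generate), is_last_batch ? 1 : 0);
}

int main() {
  for (int i = 0; i < 5; i++) ProduceOneChunk();  // single-threaded demo of the claim loop
}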
Section 4.2.3 as of March 1 2022 + const char *supplier = "Supplie#r"; + const size_t supplier_length = std::strlen(supplier); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + char *out = s_name + byte_width * irow; + std::memcpy(out, supplier, supplier_length); + AppendNumberPaddedToNineDigits(out + supplier_length, s_suppkey[irow]); + } + } + return Status::OK(); + } + + Status S_ADDRESS(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_ADDRESS].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE( + tld.batch[SUPPLIER::S_ADDRESS], + RandomVString(tld.rng, tld.to_generate, 10, 40)); + } + return Status::OK(); + } + + Status S_NATIONKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_NATIONKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NATIONKEY)); + std::uniform_int_distribution dist(0, 24); + int32_t *s_nationkey = reinterpret_cast( + tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + s_nationkey[irow] = dist(tld.rng); + } + return Status::OK(); + } + + Status S_PHONE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_PHONE].kind() == Datum::NONE) + { + RETURN_NOT_OK(S_NATIONKEY(thread_index)); + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_PHONE)); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[SUPPLIER::S_PHONE]); + const int32_t *s_nationkey = reinterpret_cast( + tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->data()); + char *s_phone = reinterpret_cast( + tld.batch[SUPPLIER::S_PHONE].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + GeneratePhoneNumber( + s_phone + irow * byte_width, + tld.rng, + s_nationkey[irow]); + } + } + return Status::OK(); + } + + Status S_ACCTBAL(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_ACCTBAL].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_ACCTBAL)); + Decimal128 *s_acctbal = reinterpret_cast( + tld.batch[SUPPLIER::S_ACCTBAL].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(-99999, 999999); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + s_acctbal[irow] = { dist(tld.rng) }; + } + return Status::OK(); + } + + Status S_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], g_text.GenerateComments(batch_size_, 25, 100, tld.rng)); + ModifyComments(thread_index, "Recommends", good_rows_); + ModifyComments(thread_index, "Complaints", bad_rows_); + } + return Status::OK(); + } + + void ModifyComments( + size_t thread_index, + const char *review, + const std::vector &indices) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + const int32_t *offsets = reinterpret_cast( + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->data()); + char *str = reinterpret_cast( + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->mutable_data()); + const char *customer = "Customer"; + const size_t customer_length = std::strlen(customer); + const size_t review_length = std::strlen(review); + + auto it = std::lower_bound(indices.begin(), indices.end(), tld.suppkey_start); + for(; it != 
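// Illustrative sketch (not part of the patch): the "prefix + number padded to nine
// digits" convention used for S_NAME above and for O_CLERK / C_NAME elsewhere in this
// file. PadToNineDigits is a standalone stand-in for the patch's
// AppendNumberPaddedToNineDigits helper (defined earlier in tpch_node.cc), shown here
// with the O_CLERK prefix.
#include <cstdint>
#include <cstdio>
#include <cstring>

void PadToNineDigits(char* out, int64_t value) {
  char digits[10];
  std::snprintf(digits, sizeof(digits), "%09lld", static_cast<long long>(value));
  std::memcpy(out, digits, 9);  // fixed-width column: no trailing '\0' required
}

int main() {
  char o_clerk[16] = {};  // O_CLERK is fixed_size_binary(15); one extra byte for printing
  const char* prefix = "Clerk#";
  std::memcpy(o_clerk, prefix, std::strlen(prefix));
  PadToNineDigits(o_clerk + std::strlen(prefix), 42);
  std::printf("%s\n", o_clerk);  // Clerk#000000042
}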
indices.end() && *it < tld.suppkey_start + tld.to_generate; it++) + { + int64_t idx_in_batch = *it - tld.suppkey_start; + char *out = str + offsets[idx_in_batch]; + int32_t str_length = offsets[idx_in_batch + 1] - offsets[idx_in_batch]; + std::uniform_int_distribution gap_dist(0, str_length - customer_length - review_length); + int32_t gap = gap_dist(tld.rng); + int32_t total_length = customer_length + gap + review_length; + std::uniform_int_distribution start_dist(0, str_length - total_length); + int32_t start = start_dist(tld.rng); + std::memcpy(out + start, customer, customer_length); + std::memcpy(out + start + gap, review, review_length); + } + } + + struct ThreadLocalData + { + random::pcg32_fast rng; + int64_t suppkey_start; + int64_t to_generate; + std::vector batch; + }; + std::vector thread_local_data_; + std::vector good_rows_; + std::vector bad_rows_; + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t rows_to_generate_; + std::atomic rows_generated_; + int scale_factor_; + int64_t batch_size_; + std::vector gen_list_; + std::shared_ptr schema_; + }; + + class PartGenerator : public TpchTableGenerator + { + public: + PartGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) + { + batches_generated_.store(0); + } + + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, + gen_->SetPartOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + Status ProduceCallback(size_t thread_index) + { + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextPartBatch(thread_index)); + if(done_.load() || !maybe_batch.has_value()) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + batches_generated_++; + output_callback_(std::move(batch)); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + int64_t scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; + }; + + class PartSuppGenerator : public TpchTableGenerator + { + public: + PartSuppGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) + { + batches_generated_.store(0); + } + + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, + gen_->SetPartSuppOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback 
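// Illustrative sketch (not part of the patch): embedding "Customer ... Recommends" or
// "Customer ... Complaints" into an existing comment, as ModifyComments() above does.
// The spec wants the literal "Customer", some gap characters, then the review word.
// The draft writes the review at out + start + gap, which can overlap the "Customer"
// prefix whenever gap < strlen("Customer"); this sketch assumes the intent is
// start + strlen("Customer") + gap.
#include <cstdio>
#include <cstring>
#include <random>
#include <string>

void EmbedReview(std::string& comment, const char* review, std::mt19937& rng) {
  const char* customer = "Customer";
  const int cust_len = static_cast<int>(std::strlen(customer));
  const int rev_len = static_cast<int>(std::strlen(review));
  const int len = static_cast<int>(comment.size());

  const int gap = std::uniform_int_distribution<int>(0, len - cust_len - rev_len)(rng);
  const int total = cust_len + gap + rev_len;
  const int start = std::uniform_int_distribution<int>(0, len - total)(rng);

  comment.replace(start, cust_len, customer);
  comment.replace(start + cust_len + gap, rev_len, review);
}

int main() {
  std::mt19937 rng(1);
  std::string comment(40, '.');  // stand-in for a generated comment string
  EmbedReview(comment, "Recommends", rng);
  std::printf("%s\n", comment.c_str());
}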
schedule_callback) override + { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + Status ProduceCallback(size_t thread_index) + { + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextPartSuppBatch(thread_index)); + if(done_.load() || !maybe_batch.has_value()) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + batches_generated_++; + output_callback_(std::move(batch)); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + int64_t scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; + }; + + class CustomerGenerator : public TpchTableGenerator + { + public: + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + rows_to_generate_ = scale_factor_ * 150000; + rows_generated_.store(0); + ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( + columns, + types_, + name_map_, + gen_list_)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + thread_local_data_.resize(num_threads); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: +#define FOR_EACH_COLUMN(F) \ + F(C_CUSTKEY) \ + F(C_NAME) \ + F(C_ADDRESS) \ + F(C_NATIONKEY) \ + F(C_PHONE) \ + F(C_ACCTBAL) \ + F(C_MKTSEGMENT) \ + F(C_COMMENT) + +#define MAKE_ENUM(col) col, + struct CUSTOMER + { + enum + { + FOR_EACH_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; +#undef MAKE_ENUM +#define MAKE_STRING_MAP(col) \ + { #col, CUSTOMER::col }, + const std::unordered_map name_map_ = + { + FOR_EACH_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector generators_ = + { + FOR_EACH_COLUMN(MAKE_FN_ARRAY) + }; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_COLUMN + + std::vector> types_ = + { + int32(), + utf8(), + utf8(), + int32(), + fixed_size_binary(15), + decimal(12, 2), + fixed_size_binary(10), + utf8(), + }; + + Status ProduceCallback(size_t thread_index) + { + if(done_.load()) + return Status::OK(); + ThreadLocalData &tld = thread_local_data_[thread_index]; + tld.custkey_start = rows_generated_.fetch_add(batch_size_); + if(tld.custkey_start >= rows_to_generate_) + return Status::OK(); + + tld.to_generate = std::min(batch_size_, + rows_to_generate_ - tld.custkey_start); + bool is_last_batch = tld.to_generate < batch_size_; + + tld.batch.clear(); + 
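// Illustrative sketch (not part of the patch): the termination handshake used by the
// wrapper generators above. Several concurrent callbacks can observe "no more batches"
// at once, so the finished callback is guarded by a compare-exchange to guarantee it
// fires exactly once.
#include <atomic>
#include <cstdint>
#include <cstdio>

std::atomic<bool> done{false};

void OnStreamExhausted(int64_t batches_generated) {
  bool expected = false;
  if (done.compare_exchange_strong(expected, true)) {
    std::printf("finished after %lld batches\n", static_cast<long long>(batches_generated));
  }
}

int main() {
  OnStreamExhausted(10);
  OnStreamExhausted(10);  // later calls are no-ops; the callback already ran
}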
tld.batch.resize(CUSTOMER::kNumCols); + for(int col : gen_list_) + RETURN_NOT_OK(generators_[col](thread_index)); + + std::vector result(gen_list_.size()); + for(size_t i = 0; i < gen_list_.size(); i++) + { + int col_idx = gen_list_[i]; + result[i] = tld.batch[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); + batches_generated_++; + output_callback_(std::move(eb)); + if(is_last_batch) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + Status AllocateColumn(size_t thread_index, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); + ArrayData ad(types_[column], tld.to_generate, { nullptr, std::move(buff) }); + tld.batch[column] = std::move(ad); + return Status::OK(); + } + + Status C_CUSTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_CUSTKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_CUSTKEY)); + int32_t *c_custkey = reinterpret_cast( + tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + c_custkey[irow] = (tld.custkey_start + irow + 1); + } + } + return Status::OK(); + } + + Status C_NAME(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_NAME].kind() == Datum::NONE) + { + RETURN_NOT_OK(C_CUSTKEY(thread_index)); + const int32_t *c_custkey = reinterpret_cast( + tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.to_generate + 1) * sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + const char *customer = "Customer#"; + const size_t customer_length = std::strlen(customer); + offsets[0] = 0; + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + int num_digits = GetNumDigits(c_custkey[irow]); + int num_chars = std::max(num_digits, 9); + offsets[irow + 1] = offsets[irow] + num_chars + customer_length; + } + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[tld.to_generate])); + char *str = reinterpret_cast(str_buff->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + char *out = str + offsets[irow]; + std::memcpy(out, customer, customer_length); + AppendNumberPaddedToNineDigits(out + customer_length, c_custkey[irow]); + } + ArrayData ad(utf8(), tld.to_generate, { nullptr, std::move(str_buff), std::move(offset_buff) }); + tld.batch[CUSTOMER::C_NAME] = std::move(ad); + } + return Status::OK(); + } + + Status C_ADDRESS(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_ADDRESS].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE( + tld.batch[CUSTOMER::C_ADDRESS], + RandomVString(tld.rng, tld.to_generate, 10, 40)); + } + return Status::OK(); + } + + Status C_NATIONKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_NATIONKEY].kind() == Datum::NONE) + { + 
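// Illustrative sketch (not part of the patch): the two-pass construction C_NAME above
// uses for a variable-length string column: first compute the offsets (running total of
// string lengths), then fill one contiguous data buffer at those positions. Plain
// std::vector stands in for the Arrow buffers. For reference, Arrow's utf8 layout keeps
// the offsets buffer at ArrayData::buffers[1] and the character data at buffers[2]; the
// { nullptr, str_buff, offset_buff } order passed above looks reversed relative to that.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> values = {"Customer#000000001", "Customer#000000002"};

  // Pass 1: offsets. offsets[i + 1] - offsets[i] is the length of value i.
  std::vector<int32_t> offsets(values.size() + 1, 0);
  for (size_t i = 0; i < values.size(); i++)
    offsets[i + 1] = offsets[i] + static_cast<int32_t>(values[i].size());

  // Pass 2: one contiguous character buffer, filled at the precomputed positions.
  std::vector<char> data(offsets.back());
  for (size_t i = 0; i < values.size(); i++)
    std::memcpy(data.data() + offsets[i], values[i].data(), values[i].size());

  for (size_t i = 0; i < values.size(); i++)
    std::printf("[%d, %d) %.*s\n", offsets[i], offsets[i + 1],
                offsets[i + 1] - offsets[i], data.data() + offsets[i]);
}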
RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_NATIONKEY)); + std::uniform_int_distribution dist(0, 24); + int32_t *c_nationkey = reinterpret_cast( + tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + c_nationkey[irow] = dist(tld.rng); + } + return Status::OK(); + } + + Status C_PHONE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_PHONE].kind() == Datum::NONE) + { + RETURN_NOT_OK(C_NATIONKEY(thread_index)); + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_PHONE)); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[CUSTOMER::C_PHONE]); + const int32_t *c_nationkey = reinterpret_cast( + tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->data()); + char *c_phone = reinterpret_cast( + tld.batch[CUSTOMER::C_PHONE].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + GeneratePhoneNumber( + c_phone + irow * byte_width, + tld.rng, + c_nationkey[irow]); + } + } + return Status::OK(); + } + + Status C_ACCTBAL(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_ACCTBAL].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_ACCTBAL)); + Decimal128 *c_acctbal = reinterpret_cast( + tld.batch[CUSTOMER::C_ACCTBAL].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(-99999, 999999); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + c_acctbal[irow] = { dist(tld.rng) }; + } + return Status::OK(); + } + + Status C_MKTSEGMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_MKTSEGMENT].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_MKTSEGMENT)); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[CUSTOMER::C_MKTSEGMENT]); + char *c_mktsegment = reinterpret_cast( + tld.batch[CUSTOMER::C_MKTSEGMENT].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(0, kNumSegments - 1); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + char *out = c_mktsegment + irow * byte_width; + int str_idx = dist(tld.rng); + std::strncpy(out, Segments[str_idx], byte_width); + } + } + return Status::OK(); + } + + Status C_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], g_text.GenerateComments(batch_size_, 29, 116, tld.rng)); + } + return Status::OK(); + } + + struct ThreadLocalData + { + random::pcg32_fast rng; + int64_t custkey_start; + int64_t to_generate; + std::vector batch; + }; + std::vector thread_local_data_; + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t rows_to_generate_; + std::atomic rows_generated_; + int scale_factor_; + int64_t batch_size_; + std::vector gen_list_; + std::shared_ptr schema_; + }; + + class OrdersGenerator : public TpchTableGenerator + { + public: + OrdersGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) + { + batches_generated_.store(0); + } + + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, + gen_->SetOrdersOutputColumns(columns)); + return Status::OK(); 
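// Illustrative sketch (not part of the patch): the account-balance columns above
// (C_ACCTBAL, and S_ACCTBAL earlier) draw a uniform integer in [-99999, 999999], which
// is the cents-scaled form of the spec's [-999.99, 9999.99] range for a decimal(12, 2)
// value.
#include <cstdio>
#include <random>

int main() {
  std::mt19937 rng(3);
  std::uniform_int_distribution<int> dist(-99999, 999999);
  const int cents = dist(rng);
  std::printf("%d at the cents scale == %.2f\n", cents, cents / 100.0);
}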
+ } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + Status ProduceCallback(size_t thread_index) + { + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextOrdersBatch(thread_index)); + if(done_.load() || !maybe_batch.has_value()) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + batches_generated_++; + output_callback_(std::move(batch)); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + int64_t scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; + }; + + class LineitemGenerator : public TpchTableGenerator + { + public: + LineitemGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) + {} + + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, + gen_->SetLineItemOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + Status ProduceCallback(size_t thread_index) + { + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextLineItemBatch(thread_index)); + if(!maybe_batch.has_value()) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + batches_generated_++; + output_callback_(std::move(batch)); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + int64_t scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; + }; + + class NationGenerator : public TpchTableGenerator + { + public: + Status Init( + std::vector columns, + int /*scale_factor*/, + int64_t /*batch_size*/) override + { + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns( + columns, + types_, + name_map_, + column_indices_)); + return Status::OK(); + } + + Status StartProducing( + size_t /*num_threads*/, + OutputBatchCallback 
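// Illustrative sketch (not part of the patch): why OrdersGenerator and
// LineitemGenerator above both wrap one shared OrdersAndLineItemGenerator (and Part /
// PartSupp share a PartAndPartSupplierGenerator): the lineitem rows are derived from
// the same per-order state (keys, order dates, items-per-order counts), so two
// independent generators could not stay consistent. Minimal stand-in:
#include <cstdio>
#include <memory>
#include <numeric>
#include <vector>

struct SharedOrderState {
  std::vector<int> items_per_order = {3, 1, 2};  // demo: three orders
};

struct OrdersView {
  std::shared_ptr<SharedOrderState> state;
  size_t NumRows() const { return state->items_per_order.size(); }
};

struct LineitemView {
  std::shared_ptr<SharedOrderState> state;
  int NumRows() const {
    return std::accumulate(state->items_per_order.begin(), state->items_per_order.end(), 0);
  }
};

int main() {
  auto state = std::make_shared<SharedOrderState>();
  OrdersView orders{state};
  LineitemView lineitem{state};
  std::printf("%zu orders, %d lineitems\n", orders.NumRows(), lineitem.NumRows());
}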
output_callback, + FinishedCallback finished_callback, + ScheduleCallback /*schedule_task_callback*/) override + { + std::shared_ptr N_NATIONKEY_buffer = Buffer::Wrap(N_NATIONKEY, sizeof(N_NATIONKEY)); + ArrayData N_NATIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(N_NATIONKEY_buffer) }); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr N_NAME_buffer, AllocateBuffer(kRowCount * kNameByteWidth)); + char *N_NAME = reinterpret_cast(N_NAME_buffer->mutable_data()); + for(size_t i = 0; i < kRowCount; i++) + std::strncpy(N_NAME + kNameByteWidth * i, country_names_[i], kNameByteWidth); + ArrayData N_NAME_arraydata(fixed_size_binary(kNameByteWidth), kRowCount, { nullptr, std::move(N_NAME_buffer) }); + + std::shared_ptr N_REGIONKEY_buffer = Buffer::Wrap(N_REGIONKEY, sizeof(N_REGIONKEY)); + ArrayData N_REGIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(N_REGIONKEY_buffer) }); + + ARROW_ASSIGN_OR_RAISE(Datum N_COMMENT_datum, g_text.GenerateComments(kRowCount, 31, 114, rng_)); + + std::vector fields = + { + std::move(N_NATIONKEY_arraydata), + std::move(N_NAME_arraydata), + std::move(N_REGIONKEY_arraydata), + std::move(N_COMMENT_datum) + }; + + std::vector result; + for(const int &col : column_indices_) + result.push_back(fields[col]); + ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); + output_callback(std::move(batch)); + finished_callback(static_cast(1)); + return Status::OK(); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + random::pcg32_fast rng_; + + static constexpr size_t kRowCount = 25; + static constexpr int32_t kNameByteWidth = 25; + const int32_t N_NATIONKEY[kRowCount] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; + const char *country_names_[kRowCount] = + { + "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", "FRANCE", "GERMANY", + "INDONESIA", "IRAQ", "IRAN", "JAPAN", "JORDAN", "KENYA", "MOROCCO", "MOZAMBIQUE", "PERU", + "CHINA", "ROMANIA", "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES" + }; + const int32_t N_REGIONKEY[kRowCount] = { 0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1 }; + + struct NATION + { + enum + { + N_NATIONKEY, + N_NAME, + N_REGIONKEY, + N_COMMENT, + }; + }; + + const std::unordered_map name_map_ = + { + { "N_NATIONKEY", NATION::N_NATIONKEY }, + { "N_NAME", NATION::N_NAME }, + { "N_REGIONKEY", NATION::N_REGIONKEY }, + { "N_COMMENT", NATION::N_COMMENT }, + }; + + std::vector> types_ = + { + int32(), + fixed_size_binary(kNameByteWidth), + int32(), + utf8(), + }; + + std::shared_ptr schema_; + std::vector column_indices_; + }; + + class RegionGenerator : public TpchTableGenerator + { + public: + Status Init( + std::vector columns, + int /*scale_factor*/, + int64_t /*batch_size*/) override + { + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns( + columns, + types_, + name_map_, + column_indices_)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback /*schedule_task_callback*/) override + { + std::shared_ptr R_REGIONKEY_buffer = Buffer::Wrap(R_REGIONKEY, sizeof(R_REGIONKEY)); + ArrayData R_REGIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(R_REGIONKEY_buffer) }); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr R_NAME_buffer, AllocateBuffer(kRowCount * kNameByteWidth)); + char *R_NAME_data = reinterpret_cast(R_NAME_buffer->mutable_data()); + for(size_t i = 0; 
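// Illustrative sketch (not part of the patch): the TPC-H nation table has 25 rows, and
// the N_REGIONKEY array above does have 25 entries, but country_names_ appears to list
// only 24 names -- INDIA (nation key 8, region 2) seems to be missing, and IRAN/IRAQ
// appear swapped relative to the canonical dbgen ordering shown below. A compile-time
// size check catches that kind of drift.
#include <cstddef>

constexpr const char* kNationNames[] = {
    "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", "FRANCE",
    "GERMANY", "INDIA", "INDONESIA", "IRAN", "IRAQ", "JAPAN", "JORDAN", "KENYA",
    "MOROCCO", "MOZAMBIQUE", "PERU", "CHINA", "ROMANIA", "SAUDI ARABIA", "VIETNAM",
    "RUSSIA", "UNITED KINGDOM", "UNITED STATES"};
constexpr int kNationRegion[] = {0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2,
                                 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1};

static_assert(sizeof(kNationNames) / sizeof(kNationNames[0]) == 25,
              "TPC-H defines exactly 25 nations");
static_assert(sizeof(kNationNames) / sizeof(kNationNames[0]) ==
                  sizeof(kNationRegion) / sizeof(kNationRegion[0]),
              "every nation needs a region key");

int main() { return 0; }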
i < kRowCount; i++) + std::strncpy(R_NAME_data + kNameByteWidth * i, region_names_[i], kNameByteWidth); + ArrayData R_NAME_arraydata(types_[static_cast(REGION::R_NAME)], kRowCount, { nullptr, std::move(R_NAME_buffer) }); + + ARROW_ASSIGN_OR_RAISE(Datum R_COMMENT_datum, g_text.GenerateComments(kRowCount, 31, 115, rng_)); + + std::vector fields = { std::move(R_REGIONKEY_arraydata), std::move(R_NAME_arraydata), std::move(R_COMMENT_datum) }; + std::vector result; + for(const int &col : column_indices_) + result.push_back(fields[col]); + ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); + output_callback(std::move(batch)); + finished_callback(static_cast(1)); + return Status::OK(); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + random::pcg32_fast rng_; + + static constexpr size_t kRowCount = 5; + static constexpr int32_t kNameByteWidth = 25; + const int32_t R_REGIONKEY[kRowCount] = { 0, 1, 2, 3, 4 }; + const char *region_names_[kRowCount] = + { + "AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST" + }; + + struct REGION + { + enum + { + R_REGIONKEY, + R_NAME, + R_COMMENT, + kNumColumns, + }; + }; + + const std::unordered_map name_map_ = + { + { "R_REGIONKEY", REGION::R_REGIONKEY }, + { "R_NAME", REGION::R_NAME }, + { "R_COMMENT", REGION::R_COMMENT }, + }; + + const std::vector> types_ = + { + int32(), + fixed_size_binary(kNameByteWidth), + utf8(), + }; + + std::shared_ptr schema_; + std::vector column_indices_; + }; + + class TpchNode : public ExecNode + { + public: + TpchNode(ExecPlan *plan, + std::unique_ptr generator) + : ExecNode(plan, {}, {}, generator->schema(), /*num_outputs=*/1), + generator_(std::move(generator)) + { + } + + const char *kind_name() const override + { + return "TpchNode"; + } + + [[noreturn]] + static void NoInputs() + { + Unreachable("TPC-H node should never have any inputs"); + } + + [[noreturn]] + void InputReceived(ExecNode *, ExecBatch) override + { + NoInputs(); + } + + [[noreturn]] + void ErrorReceived(ExecNode *, Status) override + { + NoInputs(); + } + + [[noreturn]] + void InputFinished(ExecNode *, int) override + { + NoInputs(); + } + + Status StartProducing() override + { + finished_ = Future<>::Make(); + return generator_->StartProducing( + thread_indexer_.Capacity(), + [this](ExecBatch batch) { this->OutputBatchCallback(std::move(batch)); }, + [this](int64_t num_batches) { this->FinishedCallback(num_batches); }, + [this](std::function func) -> Status { return this->ScheduleTaskCallback(std::move(func)); } + ); + } + + void PauseProducing(ExecNode *output) override {} + void ResumeProducing(ExecNode *output) override {} + + void StopProducing(ExecNode *output) override + { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); + } + + void StopProducing() override + { + generator_->Abort([this]() { this->finished_.MarkFinished(); }); + } + + Future<> finished() override + { + return finished_; + } + + private: + void OutputBatchCallback(ExecBatch batch) + { + outputs_[0]->InputReceived(this, std::move(batch)); + } + + void FinishedCallback(int64_t total_num_batches) + { + outputs_[0]->InputFinished(this, static_cast(total_num_batches)); + finished_.MarkFinished(); + } + + Status ScheduleTaskCallback(std::function func) + { + auto executor = plan_->exec_context()->executor(); + if (executor) + { + RETURN_NOT_OK(executor->Spawn([this, func] + { + size_t thread_index = thread_indexer_(); + Status status = func(thread_index); + if (!status.ok()) + { + StopProducing(); + ErrorIfNotOk(status); + return; + } 
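// Illustrative sketch (not part of the patch): the dispatch policy of
// ScheduleTaskCallback() above -- hand the task to the plan's executor when one exists,
// otherwise run it inline as "thread 0" (the serial path). DemoExecContext and the
// joined std::thread are stand-ins used only to keep the demo self-contained.
#include <cstddef>
#include <functional>
#include <thread>

struct DemoExecContext { bool has_executor; };

void ScheduleTask(const DemoExecContext& ctx, std::function<void(size_t)> task) {
  if (ctx.has_executor) {
    std::thread worker([task] { task(/*thread_index=*/1); });  // async path
    worker.join();  // joined immediately only to keep the demo simple
  } else {
    task(/*thread_index=*/0);  // serial fallback when the exec context has no executor
  }
}

int main() {
  ScheduleTask({false}, [](size_t) { /* generate one batch */ });
  ScheduleTask({true}, [](size_t) { /* generate one batch */ });
}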
+ })); + } + else + { + return func(0); + } + return Status::OK(); + } + + std::unique_ptr generator_; + + Future<> finished_ = Future<>::MakeFinished(); + ThreadIndexer thread_indexer_; + }; + + Result TpchGen::Make(ExecPlan *plan, int scale_factor, int64_t batch_size) + { + static bool has_inited_text = false; + if(!has_inited_text) + { + RETURN_NOT_OK(g_text.Init()); + has_inited_text = true; + } + TpchGen result(plan, scale_factor, batch_size); + return result; + } + + template + Result TpchGen::CreateNode(std::vector columns) + { + std::unique_ptr generator = arrow::internal::make_unique(); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::Supplier(std::vector columns) + { + return CreateNode(std::move(columns)); + } + + Result TpchGen::Part(std::vector columns) + { + if(!part_and_part_supp_generator_) + { + part_and_part_supp_generator_ = std::make_shared(); + } + std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::PartSupp(std::vector columns) + { + if(!part_and_part_supp_generator_) + { + part_and_part_supp_generator_ = std::make_shared(); + } + std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::Customer(std::vector columns) + { + return CreateNode(std::move(columns)); + } + + Result TpchGen::Orders(std::vector columns) + { + if(!orders_and_line_item_generator_) + { + orders_and_line_item_generator_ = std::make_shared(); + } + std::unique_ptr generator = arrow::internal::make_unique(orders_and_line_item_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::Lineitem(std::vector columns) + { + if(!orders_and_line_item_generator_) + { + orders_and_line_item_generator_ = std::make_shared(); + } + std::unique_ptr generator = arrow::internal::make_unique(orders_and_line_item_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::Nation(std::vector columns) + { + return CreateNode(std::move(columns)); + } + + Result TpchGen::Region(std::vector columns) + { + return CreateNode(std::move(columns)); + } + } +} diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h new file mode 100644 index 00000000000..dc282aae981 --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/pcg_random.h" +#include +#include + +namespace arrow +{ + namespace compute + { + class OrdersAndLineItemGenerator; + class PartAndPartSupplierGenerator; + + class TpchGen + { + public: + static Result Make(ExecPlan *plan, int scale_factor = 1, int64_t batch_size = 4096); + + Result Supplier(std::vector columns = {}); + Result Part(std::vector columns = {}); + Result PartSupp(std::vector columns = {}); + Result Customer(std::vector columns = {}); + Result Orders(std::vector columns = {}); + Result Lineitem(std::vector columns = {}); + Result Nation(std::vector columns = {}); + Result Region(std::vector columns = {}); + + private: + TpchGen(ExecPlan *plan, int scale_factor, int64_t batch_size) + : plan_(plan), + scale_factor_(scale_factor), + batch_size_(batch_size), + orders_and_line_item_generator_(nullptr) + {} + + template + Result CreateNode(std::vector columns); + + ExecPlan *plan_; + int scale_factor_; + int64_t batch_size_; + + std::shared_ptr part_and_part_supp_generator_; + std::shared_ptr orders_and_line_item_generator_; + }; + } +} diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index dd5bead58aa..0bf7e5422b2 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -893,7 +893,8 @@ class TableSorter { TableSorter(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const Table& table, const SortOptions& options) - : ctx_(ctx), + : status_(), + ctx_(ctx), table_(table), batches_(MakeBatches(table, &status_)), options_(options), @@ -1138,6 +1139,7 @@ class TableSorter { MergeNullsOnly(range_begin, range_middle, range_end, temp_indices, null_count); } + Status status_; ExecContext* ctx_; const Table& table_; const RecordBatchVector batches_; @@ -1148,7 +1150,6 @@ class TableSorter { uint64_t* indices_begin_; uint64_t* indices_end_; Comparator comparator_; - Status status_; }; // ---------------------------------------------------------------------- From c4495dcd003614734d08cf6d77d2d8b09dacaf65 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 2 Mar 2022 15:08:17 -0600 Subject: [PATCH 02/11] Draft of R bindings --- r/DESCRIPTION | 1 + r/NAMESPACE | 1 + r/R/arrowExports.R | 5 +++ r/R/tpch.R | 36 ++++++++++++++++++++++ r/man/tpch_dbgen.Rd | 20 ++++++++++++ r/src/arrowExports.cpp | 18 +++++++++++ r/src/compute-exec.cpp | 56 ++++++++++++++++++++++++++++++++++ r/tests/testthat/test-tpch.R | 59 ++++++++++++++++++++++++++++++++++++ 8 files changed, 196 insertions(+) create mode 100644 r/R/tpch.R create mode 100644 r/man/tpch_dbgen.Rd create mode 100644 r/tests/testthat/test-tpch.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index ae4bbcb8c38..17d97bebe08 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -125,4 +125,5 @@ Collate: 'reexports-bit64.R' 'reexports-tidyselect.R' 'schema.R' + 'tpch.R' 'util.R' diff --git a/r/NAMESPACE b/r/NAMESPACE 
index d841bb29072..029177df0aa 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -282,6 +282,7 @@ export(time64) export(timestamp) export(to_arrow) export(to_duckdb) +export(tpch_dbgen) export(type) export(uint16) export(uint32) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index d6cf785a650..8bfd08b7a1e 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -412,6 +412,10 @@ ExecNode_ReadFromRecordBatchReader <- function(plan, reader) { .Call(`_arrow_ExecNode_ReadFromRecordBatchReader`, plan, reader) } +Tpch_Dbgen <- function(plan, scale_factor, table_name) { + .Call(`_arrow_Tpch_Dbgen`, plan, scale_factor, table_name) +} + RecordBatch__cast <- function(batch, schema, options) { .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } @@ -1851,3 +1855,4 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } + diff --git a/r/R/tpch.R b/r/R/tpch.R new file mode 100644 index 00000000000..78c2d112584 --- /dev/null +++ b/r/R/tpch.R @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
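# Usage sketch, mirroring r/tests/testthat/test-tpch.R (read_table() drains the
# returned RecordBatchReader into an arrow Table):
#
#   reader <- tpch_dbgen("region", scale_factor = 1)
#   tab <- reader$read_table()   # REGION at scale factor 1 is 5 rows x 3 columns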
+ +tpch_tables <- c("customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier") + + +#' Generate a RecordBatchReader with TPC-H data in it +#' +#' @param table the table to generate +#' @param scale_factor the scale factor to generate +#' +#' @return a RecordBatchReader that will contain the generated data +#' @export +#' +#' @keywords internal +tpch_dbgen <- function(table = tpch_tables, scale_factor) { + table <- match.arg(table) + + Tpch_Dbgen(arrow:::ExecPlan$create(), scale_factor, table) +} + + diff --git a/r/man/tpch_dbgen.Rd b/r/man/tpch_dbgen.Rd new file mode 100644 index 00000000000..88cc1cf1857 --- /dev/null +++ b/r/man/tpch_dbgen.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tpch.R +\name{tpch_dbgen} +\alias{tpch_dbgen} +\title{Generate a RecordBatchReader with TPC-H data in it} +\usage{ +tpch_dbgen(table = tpch_tables, scale_factor) +} +\arguments{ +\item{table}{the table to generate} + +\item{scale_factor}{the scale factor to generate} +} +\value{ +a RecordBatchReader that will contain the generated data +} +\description{ +Generate a RecordBatchReader with TPC-H data in it +} +\keyword{internal} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 3e4196421c9..e3cc6d79933 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1626,6 +1626,23 @@ extern "C" SEXP _arrow_ExecNode_ReadFromRecordBatchReader(SEXP plan_sexp, SEXP r } #endif +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr Tpch_Dbgen(const std::shared_ptr& plan, int scale_factor, std::string table_name); +extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type plan(plan_sexp); + arrow::r::Input::type scale_factor(scale_factor_sexp); + arrow::r::Input::type table_name(table_name_sexp); + return cpp11::as_sexp(Tpch_Dbgen(plan, scale_factor, table_name)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ + Rf_error("Cannot call Tpch_Dbgen(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); +} +#endif + // compute.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, cpp11::list options); @@ -7472,6 +7489,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5}, { "_arrow_ExecNode_Join", (DL_FUNC) &_arrow_ExecNode_Join, 7}, { "_arrow_ExecNode_ReadFromRecordBatchReader", (DL_FUNC) &_arrow_ExecNode_ReadFromRecordBatchReader, 2}, + { "_arrow_Tpch_Dbgen", (DL_FUNC) &_arrow_Tpch_Dbgen, 3}, { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 3982af4f7f5..0d556d102a7 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -277,4 +278,59 @@ std::shared_ptr ExecNode_ReadFromRecordBatchReader( return MakeExecNodeOrStop("source", plan.get(), {}, options); } +// [[arrow::export]] +std::shared_ptr Tpch_Dbgen( + const std::shared_ptr& plan, + int scale_factor, + std::string table_name + ) { + + auto gen = ValueOrStop(arrow::compute::TpchGen::Make(plan.get(), scale_factor)); + + compute::ExecNode *table; + if (table_name == "part") { + table = ValueOrStop(gen.Part()); + } else if (table_name == "supplier") { + table = ValueOrStop(gen.Supplier()); + } else if (table_name == "partsupp") { + table = ValueOrStop(gen.PartSupp()); + } else if (table_name == "customer") { + table = ValueOrStop(gen.Customer()); + } else if (table_name == "nation") { + table = ValueOrStop(gen.Nation()); + } else if (table_name == "lineitem") { + table = ValueOrStop(gen.Lineitem()); + } else if (table_name == "region") { + table = ValueOrStop(gen.Region()); + } else if (table_name == "orders") { + table = ValueOrStop(gen.Orders()); + } else { + cpp11::stop("That's not a valid table name"); + } + + arrow::AsyncGenerator> sink_gen; + + MakeExecNodeOrStop("sink", plan.get(), {table}, + compute::SinkNodeOptions{&sink_gen}); + + StopIfNotOk(plan->Validate()); + StopIfNotOk(plan->StartProducing()); + + // If the generator is destroyed before being completely drained, inform plan + std::shared_ptr stop_producing{nullptr, [plan](...) { + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); + + if (not_finished_yet) { + plan->StopProducing(); + } + }}; + + return compute::MakeGeneratorReader( + table->output_schema(), + [stop_producing, plan, sink_gen] { return sink_gen(); }, gc_memory_pool()); +} + #endif diff --git a/r/tests/testthat/test-tpch.R b/r/tests/testthat/test-tpch.R new file mode 100644 index 00000000000..8077f76e4fd --- /dev/null +++ b/r/tests/testthat/test-tpch.R @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("tpch_dbgen()", { + lineitem_rbr <- tpch_dbgen("lineitem", 1) + lineitem_tab <- lineitem_rbr$read_table() + expect_identical(ncol(lineitem_tab), 16L) + + # and check a handful of types + expect_type_equal(lineitem_tab[["L_ORDERKEY"]], int32()) + expect_type_equal(lineitem_tab[["L_RECEIPTDATE"]], date32()) + + region_rbr <- tpch_dbgen("region", 1) + region_tab <- region_rbr$read_table() + expect_identical(dim(region_tab), c(5L, 3L)) + + # and check a handful of types + expect_type_equal(region_tab[["R_REGIONKEY"]], int32()) + expect_type_equal(region_tab[["R_COMMENT"]], string()) + + part_rbr <- tpch_dbgen("part", 1) + part_tab <- part_rbr$read_table() + expect_identical(dim(part_tab), c(200000L, 9L)) + + # and check a handful of types + expect_type_equal(part_tab[["R_PARTKEY"]], int32()) +}) + +# these two are tested above +tpch_tables_up <- setdiff(tpch_tables, c("lineitem", "region")) + +# nation segfaults +# supplier hangs +tpch_tables_up <- setdiff(tpch_tables_up, c("nation", "supplier")) + +# all of the rest below have an error with: +# Invalid: Arrays used to construct an ExecBatch must have equal length + +for (table_name in tpch_tables_up) { + test_that(paste0("Generating table: ", table_name), { + rbr <- tpch_dbgen(table_name, 1) + tab <- rbr$read_table() + expect_r6_class(tab, "Table") + }) +} From d7c508c36467c9d97fbfc8c5c7ec4cd4b5c1e871 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 2 Mar 2022 18:08:41 -0800 Subject: [PATCH 03/11] Fix bugs, parallel text generation, rudimentary tests --- cpp/src/arrow/compute/exec/CMakeLists.txt | 1 + cpp/src/arrow/compute/exec/tpch_node.cc | 302 +++++++++++-------- cpp/src/arrow/compute/exec/tpch_node_test.cc | 203 +++++++++++++ 3 files changed, 382 insertions(+), 124 deletions(-) create mode 100644 cpp/src/arrow/compute/exec/tpch_node_test.cc diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index cf725667107..452cda8b914 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -26,6 +26,7 @@ add_arrow_compute_test(expression_test add_arrow_compute_test(plan_test PREFIX "arrow-compute") add_arrow_compute_test(hash_join_node_test PREFIX "arrow-compute") +add_arrow_compute_test(tpch_node_test PREFIX "arrow-compute") add_arrow_compute_test(union_node_test PREFIX "arrow-compute") add_arrow_compute_test(util_test PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 842bf828574..445df7d08b9 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -22,7 +22,7 @@ namespace arrow class TpchText { public: - Status Init(); + Status InitIfNeeded(random::pcg32_fast &rng); Result GenerateComments( size_t num_comments, size_t min_length, @@ -30,24 +30,28 @@ namespace arrow random::pcg32_fast &rng); private: - void GenerateWord(size_t &offset, const char **words, size_t num_choices); - void GenerateNoun(size_t &offset); - void GenerateVerb(size_t &offset); - void GenerateAdjective(size_t &offset); - void 
GenerateAdverb(size_t &offset); - void GeneratePreposition(size_t &offset); - void GenerateAuxiliary(size_t &offset); - void GenerateTerminator(size_t &offset); + bool GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices); + bool GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr); - void GenerateNounPhrase(size_t &offset); - void GenerateVerbPhrase(size_t &offset); - void GeneratePrepositionalPhrase(size_t &offset); + bool GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); - void GenerateSentence(size_t &offset); + bool GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr); + std::atomic done_ = { false }; + int64_t generated_offset_ = 0; + std::mutex text_guard_; std::unique_ptr text_; random::pcg32_fast rng_; - static constexpr size_t kTextBytes = 300 * 1024 * 1024; // 300 MB + static constexpr int64_t kChunkSize = 8192; + static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB }; class TpchTableGenerator @@ -150,11 +154,13 @@ namespace arrow std::vector> fields; if(columns.empty()) { + fields.resize(name_map.size()); + gen_list.resize(name_map.size()); for(auto pair : name_map) { int col_idx = pair.second; - fields.push_back(field(pair.first, types[col_idx])); - gen_list.push_back(col_idx); + fields[col_idx] = field(pair.first, types[col_idx]); + gen_list[col_idx] = col_idx; } return schema(std::move(fields)); } @@ -175,12 +181,39 @@ namespace arrow static TpchText g_text; - Status TpchText::Init() + Status TpchText::InitIfNeeded(random::pcg32_fast &rng) { - ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); - size_t offset = 0; - while(offset < kTextBytes) - GenerateSentence(offset); + if(done_.load()) + return Status::OK(); + + { + std::lock_guard lock(text_guard_); + if(!text_) + { + ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); + } + } + char *out = reinterpret_cast(text_->mutable_data()); + char temp_buff[kChunkSize]; + while(done_.load() == false) + { + int64_t current_offset = 0; + int64_t offset = 0; + while(GenerateSentence(offset, rng, temp_buff)) + current_offset = offset; + + { + std::lock_guard lock(text_guard_); + if(done_.load()) + return Status::OK(); + int64_t bytes_remaining = kTextBytes - generated_offset_; + int64_t memcpy_size = std::min(offset, bytes_remaining); + std::memcpy(out + generated_offset_, temp_buff, memcpy_size); + generated_offset_ += memcpy_size; + if(generated_offset_ == kTextBytes) + done_.store(true); + } + } return Status::OK(); } @@ -190,6 +223,7 @@ namespace arrow size_t max_length, random::pcg32_fast &rng) { + RETURN_NOT_OK(InitIfNeeded(rng)); std::uniform_int_distribution length_dist(min_length, max_length); ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); int32_t *offsets = reinterpret_cast(offset_buffer->mutable_data()); @@ 
-206,7 +240,7 @@ namespace arrow size_t offset_in_text = offset_dist(rng); std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); } - ArrayData ad(utf8(), num_comments, { nullptr, std::move(comment_buffer), std::move(offset_buffer) }); + ArrayData ad(utf8(), num_comments, { nullptr, std::move(offset_buffer), std::move(comment_buffer) }); return std::move(ad); } @@ -237,7 +271,7 @@ namespace arrow for(int32_t i = 0; i < offsets[num_rows]; i++) str[i] = alpha_numerics[char_dist(rng)]; - ArrayData ad(utf8(), num_rows, { nullptr, std::move(str_buff), std::move(offset_buff) }); + ArrayData ad(utf8(), num_rows, { nullptr, std::move(offset_buff), std::move(str_buff) }); return std::move(ad); } @@ -246,10 +280,10 @@ namespace arrow out += (num_digits - 1); while(x > 0) { - *out-- = x % 10; + *out-- = '0' + (x % 10); x /= 10; } - x += num_digits; + out += num_digits; } void GeneratePhoneNumber( @@ -405,163 +439,176 @@ namespace arrow }; static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); - void TpchText::GenerateWord(size_t &offset, const char **words, size_t num_choices) + bool TpchText::GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices) { std::uniform_int_distribution dist(0, num_choices - 1); - const char *word = words[dist(rng_)]; - size_t bytes_left = kTextBytes - offset; + const char *word = words[dist(rng)]; size_t length = std::strlen(word); - size_t bytes_to_copy = std::min(bytes_left, length); - std::memcpy(text_->mutable_data() + offset, word, bytes_to_copy); - offset += bytes_to_copy; + if(offset + length > kChunkSize) + return false; + std::memcpy(arr + offset, word, length); + offset += length; + return true; } - void TpchText::GenerateNoun(size_t &offset) + bool TpchText::GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Nouns, kNumNouns); + return GenerateWord(offset, rng, arr, Nouns, kNumNouns); } - void TpchText::GenerateVerb(size_t &offset) + bool TpchText::GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Verbs, kNumVerbs); + return GenerateWord(offset, rng, arr, Verbs, kNumVerbs); } - void TpchText::GenerateAdjective(size_t &offset) + bool TpchText::GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Adjectives, kNumAdjectives); + return GenerateWord(offset, rng, arr, Adjectives, kNumAdjectives); } - void TpchText::GenerateAdverb(size_t &offset) + bool TpchText::GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Adverbs, kNumAdverbs); + return GenerateWord(offset, rng, arr, Adverbs, kNumAdverbs); } - void TpchText::GeneratePreposition(size_t &offset) + bool TpchText::GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Prepositions, kNumPrepositions); + return GenerateWord(offset, rng, arr, Prepositions, kNumPrepositions); } - void TpchText::GenerateAuxiliary(size_t &offset) + bool TpchText::GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Auxiliaries, kNumAuxiliaries); + return GenerateWord(offset, rng, arr, Auxiliaries, kNumAuxiliaries); } - void TpchText::GenerateTerminator(size_t &offset) + bool TpchText::GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Terminators, kNumTerminators); + bool result = GenerateWord(offset, rng, arr, Terminators, kNumTerminators); + 
// Swap the space with the terminator + if(result) + std::swap(*(arr + offset - 2), *(arr + offset - 1)); + return result; } - void TpchText::GenerateNounPhrase(size_t &offset) + bool TpchText::GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 3); const char *comma_space = ", "; + bool success = true; switch(dist(rng_)) { case 0: - GenerateNoun(offset); + success &= GenerateNoun(offset, rng, arr); break; case 1: - GenerateAdjective(offset); - GenerateNoun(offset); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); break; case 2: - GenerateAdjective(offset); - GenerateWord(offset, &comma_space, 1); - GenerateAdjective(offset); - GenerateNoun(offset); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateWord(offset, rng, arr, &comma_space, 1); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); break; case 3: - GenerateAdverb(offset); - GenerateAdjective(offset); - GenerateNoun(offset); + GenerateAdverb(offset, rng, arr); + GenerateAdjective(offset, rng, arr); + GenerateNoun(offset, rng, arr); break; default: Unreachable("Random number should be between 0 and 3 inclusive"); break; } + return success; } - void TpchText::GenerateVerbPhrase(size_t &offset) + bool TpchText::GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 3); + bool success = true; switch(dist(rng_)) { case 0: - GenerateVerb(offset); + success &= GenerateVerb(offset, rng, arr); break; case 1: - GenerateAuxiliary(offset); - GenerateVerb(offset); + success &= GenerateAuxiliary(offset, rng, arr); + success &= GenerateVerb(offset, rng, arr); break; case 2: - GenerateVerb(offset); - GenerateAdverb(offset); + success &= GenerateVerb(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); break; case 3: - GenerateAuxiliary(offset); - GenerateVerb(offset); - GenerateAdverb(offset); + success &= GenerateAuxiliary(offset, rng, arr); + success &= GenerateVerb(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); break; default: Unreachable("Random number should be between 0 and 3 inclusive"); break; } + return success; } - void TpchText::GeneratePrepositionalPhrase(size_t &offset) + bool TpchText::GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { const char *the_space = "the "; - GeneratePreposition(offset); - GenerateWord(offset, &the_space, 1); - GenerateNounPhrase(offset); + bool success = true; + success &= GeneratePreposition(offset, rng, arr); + success &= GenerateWord(offset, rng, arr, &the_space, 1); + success &= GenerateNounPhrase(offset, rng, arr); + return success; } - void TpchText::GenerateSentence(size_t &offset) + bool TpchText::GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 4); + bool success = true; switch(dist(rng_)) { case 0: - GenerateNounPhrase(offset); - GenerateVerbPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 1: - GenerateNounPhrase(offset); - GenerateVerbPhrase(offset); - GeneratePrepositionalPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= 
GenerateTerminator(offset, rng, arr); break; case 2: - GenerateNounPhrase(offset); - GenerateVerbPhrase(offset); - GenerateNounPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 3: - GenerateNounPhrase(offset); - GenerateVerbPhrase(offset); - GenerateNounPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 4: - GenerateNounPhrase(offset); - GeneratePrepositionalPhrase(offset); - GenerateVerbPhrase(offset); - GenerateNounPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 5: - GenerateNounPhrase(offset); - GeneratePrepositionalPhrase(offset); - GenerateVerbPhrase(offset); - GeneratePrepositionalPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; default: Unreachable("Random number should be between 0 and 5 inclusive"); break; } + return success; } using GenerateColumnFn = std::function; @@ -669,14 +716,17 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; { - std::lock_guard lock(part_output_queue_mutex_); - if(!part_output_queue_.empty()) + std::lock_guard lock(partsupp_output_queue_mutex_); + if(!partsupp_output_queue_.empty()) { - ExecBatch batch = std::move(part_output_queue_.front()); - part_output_queue_.pop(); - return std::move(batch); + ExecBatch result = std::move(partsupp_output_queue_.front()); + partsupp_output_queue_.pop(); + return std::move(result); } - else if(part_rows_generated_ == part_rows_to_generate_) + } + { + std::lock_guard lock(part_output_queue_mutex_); + if(part_rows_generated_ == part_rows_to_generate_) { return util::nullopt; } @@ -885,7 +935,7 @@ namespace arrow *row++ = ' '; } } - ArrayData ad(part_types_[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(string_buffer), std::move(offset_buff) }); + ArrayData ad(part_types_[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); Datum datum(ad); tld.part[PART::P_NAME] = std::move(datum); } @@ -916,7 +966,7 @@ namespace arrow Status P_BRAND(size_t thread_index) { ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_MFGR].kind() == Datum::NONE) + if(tld.part[PART::P_BRAND].kind() == Datum::NONE) { RETURN_NOT_OK(P_MFGR(thread_index)); std::uniform_int_distribution dist(1, 5); @@ -987,7 +1037,7 @@ namespace arrow *row++ = ' '; } } - ArrayData ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(string_buffer), std::move(offset_buff) }); + ArrayData ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); Datum datum(ad); tld.part[PART::P_TYPE] = std::move(datum); } @@ -1065,7 +1115,7 @@ 
namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.part[PART::P_COMMENT].kind() == Datum::NONE) { - ARROW_ASSIGN_OR_RAISE(tld.part[PART::P_COMMENT], g_text.GenerateComments(batch_size_, 5, 22, tld.rng)); + ARROW_ASSIGN_OR_RAISE(tld.part[PART::P_COMMENT], g_text.GenerateComments(tld.part_to_generate, 5, 22, tld.rng)); } return Status::OK(); } @@ -1222,7 +1272,7 @@ namespace arrow for(int64_t irun = 0; irun < next_run; irun++) ps_supplycost[irun] = { dist(tld.rng) }; - tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run; + tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST].array()->length = next_run; irow += next_run; } } @@ -1594,8 +1644,11 @@ namespace arrow tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->mutable_data()); for(int64_t i = 0; i < tld.orders_to_generate; i++) { - o_orderkey[i] = (tld.orderkey_start + i + 1); - ARROW_DCHECK(1 <= o_orderkey[i] && o_orderkey[i] <= orders_rows_to_generate_); + int32_t orderkey_index = tld.orderkey_start + i; + int32_t index_of_run = orderkey_index / 8; + int32_t index_in_run = orderkey_index % 8; + o_orderkey[i] = (index_of_run * 32 + index_in_run + 1); + ARROW_DCHECK(1 <= o_orderkey[i] && o_orderkey[i] <= 4 * orders_rows_to_generate_); } } return Status::OK(); @@ -1802,7 +1855,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.orders[ORDERS::O_COMMENT].kind() == Datum::NONE) { - ARROW_ASSIGN_OR_RAISE(tld.orders[ORDERS::O_COMMENT], g_text.GenerateComments(batch_size_, 19, 78, tld.rng)); + ARROW_ASSIGN_OR_RAISE(tld.orders[ORDERS::O_COMMENT], g_text.GenerateComments(tld.orders_to_generate, 19, 78, tld.rng)); } return Status::OK(); } @@ -2444,6 +2497,7 @@ namespace arrow { bad_row = dist(rng); } while(good_rows_set.find(bad_row) != good_rows_set.end()); + bad_rows_set.insert(bad_row); } good_rows_.clear(); bad_rows_.clear(); @@ -2680,7 +2734,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.batch[SUPPLIER::S_COMMENT].kind() == Datum::NONE) { - ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], g_text.GenerateComments(batch_size_, 25, 100, tld.rng)); + ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], g_text.GenerateComments(tld.to_generate, 25, 100, tld.rng)); ModifyComments(thread_index, "Recommends", good_rows_); ModifyComments(thread_index, "Complaints", bad_rows_); } @@ -2694,9 +2748,9 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; const int32_t *offsets = reinterpret_cast( - tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->data()); + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->data()); char *str = reinterpret_cast( - tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->mutable_data()); + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->mutable_data()); const char *customer = "Customer"; const size_t customer_length = std::strlen(customer); const size_t review_length = std::strlen(review); @@ -3057,7 +3111,7 @@ namespace arrow std::memcpy(out, customer, customer_length); AppendNumberPaddedToNineDigits(out + customer_length, c_custkey[irow]); } - ArrayData ad(utf8(), tld.to_generate, { nullptr, std::move(str_buff), std::move(offset_buff) }); + ArrayData ad(utf8(), tld.to_generate, { nullptr, std::move(offset_buff), std::move(str_buff) }); tld.batch[CUSTOMER::C_NAME] = std::move(ad); } return Status::OK(); @@ -3153,7 +3207,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.batch[CUSTOMER::C_COMMENT].kind() == Datum::NONE) { - 
ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], g_text.GenerateComments(batch_size_, 29, 116, tld.rng)); + ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], g_text.GenerateComments(tld.to_generate, 29, 116, tld.rng)); } return Status::OK(); } @@ -3381,9 +3435,15 @@ namespace arrow const int32_t N_NATIONKEY[kRowCount] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; const char *country_names_[kRowCount] = { - "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", "FRANCE", "GERMANY", - "INDONESIA", "IRAQ", "IRAN", "JAPAN", "JORDAN", "KENYA", "MOROCCO", "MOZAMBIQUE", "PERU", - "CHINA", "ROMANIA", "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES" + "ALGERIA", "ARGENTINA", "BRAZIL", + "CANADA", "EGYPT", "ETHIOPIA", + "FRANCE", "GERMANY", "INDIA", + "INDONESIA", "IRAN", "IRAQ", + "JAPAN", "JORDAN", "KENYA", + "MOROCCO", "MOZAMBIQUE", "PERU", + "CHINA", "ROMANIA", "SAUDI ARABIA", + "VIETNAM", "RUSSIA", "UNITED KINGDOM", + "UNITED STATES" }; const int32_t N_REGIONKEY[kRowCount] = { 0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1 }; @@ -3619,12 +3679,6 @@ namespace arrow Result TpchGen::Make(ExecPlan *plan, int scale_factor, int64_t batch_size) { - static bool has_inited_text = false; - if(!has_inited_text) - { - RETURN_NOT_OK(g_text.Init()); - has_inited_text = true; - } TpchGen result(plan, scale_factor, batch_size); return result; } @@ -3659,7 +3713,7 @@ namespace arrow { part_and_part_supp_generator_ = std::make_shared(); } - std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); + std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); return plan_->EmplaceNode(plan_, std::move(generator)); } diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc new file mode 100644 index 00000000000..c844d7e88c1 --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
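// Rudimentary coverage for the TPC-H generator node: each test wires a single
// table generator into an ExecPlan on the CPU thread pool, drains it through a
// sink with StartAndCollect, validates every produced array, and (except for
// LINEITEM, whose row count varies) checks the expected scale-factor-1 sizes:
// SUPPLIER 10,000, PART 200,000, PARTSUPP 800,000, CUSTOMER 150,000,
// ORDERS 1,500,000, NATION 25, REGION 5 rows.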
+ +#include + +#include "arrow/api.h" +#include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/test_util.h" +#include "arrow/compute/exec/util.h" +#include "arrow/compute/kernels/row_encoder.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/exec/tpch_node.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" +#include "arrow/testing/random.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/make_unique.h" +#include "arrow/util/pcg_random.h" +#include "arrow/util/thread_pool.h" +#include "arrow/array/validate.h" + +namespace arrow +{ + namespace compute + { + void ValidateBatch(const ExecBatch &batch) + { + for(const Datum &d : batch.values) + ASSERT_OK(arrow::internal::ValidateArray(*d.array())); + } + + TEST(TpchNode, Supplier) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Supplier(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 10000); + } + + TEST(TpchNode, Part) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Part(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 200000); + } + + TEST(TpchNode, PartSupp) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.PartSupp(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 800000); + } + + TEST(TpchNode, Customer) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Customer(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 150000); + } + + TEST(TpchNode, Orders) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Orders(); + AsyncGenerator> 
sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 1500000); + } + + TEST(TpchNode, Lineitem) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Lineitem(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + for(auto &batch : res) + { + ValidateBatch(batch); + } + } + + TEST(TpchNode, Nation) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Nation(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 25); + } + + TEST(TpchNode, Region) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Region(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 5); + } + } +} From 289337ea518c3617f7db41cf7851b89eaa3eb9df Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 3 Mar 2022 08:14:37 -0600 Subject: [PATCH 04/11] Uncommenting R tests, and a first stab at the filewriter C++ --- r/R/arrowExports.R | 4 ++ r/R/tpch.R | 20 ++++++++- r/src/arrowExports.cpp | 23 ++++++++++ r/src/compute-exec.cpp | 85 ++++++++++++++++++++++++++++++++++++ r/tests/testthat/test-tpch.R | 14 ++---- 5 files changed, 135 insertions(+), 11 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 8bfd08b7a1e..c20ecd188bc 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -416,6 +416,10 @@ Tpch_Dbgen <- function(plan, scale_factor, table_name) { .Call(`_arrow_Tpch_Dbgen`, plan, scale_factor, table_name) } +Tpch_Dbgen_Write <- function(plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions) { + invisible(.Call(`_arrow_Tpch_Dbgen_Write`, plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions)) +} + RecordBatch__cast <- function(batch, schema, options) { .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } diff --git a/r/R/tpch.R b/r/R/tpch.R index 78c2d112584..ef0e002a6e5 100644 --- a/r/R/tpch.R +++ b/r/R/tpch.R @@ -30,7 +30,25 @@ tpch_tables <- c("customer", "lineitem", "nation", "orders", "part", "partsupp", tpch_dbgen <- function(table = tpch_tables, 
scale_factor) { table <- match.arg(table) - Tpch_Dbgen(arrow:::ExecPlan$create(), scale_factor, table) + Tpch_Dbgen(ExecPlan$create(), scale_factor, table) } +tpch_dbgen_write <- function(table = tpch_tables, scale_factor, path, ...) { + table <- match.arg(table) + + path_and_fs <- get_path_and_filesystem(path) + + existing_data_behavior <- 0L + max_partitions <- 1024L + + Tpch_Dbgen_Write( + ExecPlan$create(), + scale_factor, + table, + path_and_fs$fs, + path_and_fs$path, + existing_data_behavior, + max_partitions + ) +} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index e3cc6d79933..bce8a52a7f9 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1643,6 +1643,28 @@ extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP t } #endif +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +void Tpch_Dbgen_Write(const std::shared_ptr& plan, int scale_factor, std::string table_name, const std::shared_ptr& filesystem, std::string base_dir, arrow::dataset::ExistingDataBehavior existing_data_behavior, int max_partitions); +extern "C" SEXP _arrow_Tpch_Dbgen_Write(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP existing_data_behavior_sexp, SEXP max_partitions_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type plan(plan_sexp); + arrow::r::Input::type scale_factor(scale_factor_sexp); + arrow::r::Input::type table_name(table_name_sexp); + arrow::r::Input&>::type filesystem(filesystem_sexp); + arrow::r::Input::type base_dir(base_dir_sexp); + arrow::r::Input::type existing_data_behavior(existing_data_behavior_sexp); + arrow::r::Input::type max_partitions(max_partitions_sexp); + Tpch_Dbgen_Write(plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions); + return R_NilValue; +END_CPP11 +} +#else +extern "C" SEXP _arrow_Tpch_Dbgen_Write(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP existing_data_behavior_sexp, SEXP max_partitions_sexp){ + Rf_error("Cannot call Tpch_Dbgen_Write(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); +} +#endif + // compute.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, cpp11::list options); @@ -7490,6 +7512,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecNode_Join", (DL_FUNC) &_arrow_ExecNode_Join, 7}, { "_arrow_ExecNode_ReadFromRecordBatchReader", (DL_FUNC) &_arrow_ExecNode_ReadFromRecordBatchReader, 2}, { "_arrow_Tpch_Dbgen", (DL_FUNC) &_arrow_Tpch_Dbgen, 3}, + { "_arrow_Tpch_Dbgen_Write", (DL_FUNC) &_arrow_Tpch_Dbgen_Write, 7}, { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 0d556d102a7..075cb030fcd 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -24,6 +24,11 @@ #include #include #include +// TODO: We probably don't want to add dataset + filesystem here, so instead we'll probably +// want to move the definition of Tpch_Dbgen_Write if it works +#include +#include +#include #include #include #include @@ -33,6 +38,10 @@ #include namespace compute = ::arrow::compute; +// TODO: We probably don't want to add dataset + fs here, so instead we'll probably +// want to move the definition of Tpch_Dbgen_Write if it works +namespace ds = ::arrow::dataset; +namespace fs = ::arrow::fs; std::shared_ptr make_compute_options(std::string func_name, cpp11::list options); @@ -333,4 +342,80 @@ std::shared_ptr Tpch_Dbgen( [stop_producing, plan, sink_gen] { return sink_gen(); }, gc_memory_pool()); } +// [[arrow::export]] +void Tpch_Dbgen_Write( + const std::shared_ptr& plan, + int scale_factor, + std::string table_name, + const std::shared_ptr& filesystem, std::string base_dir, + arrow::dataset::ExistingDataBehavior existing_data_behavior, int max_partitions +) { + auto gen = ValueOrStop(arrow::compute::TpchGen::Make(plan.get(), scale_factor)); + + compute::ExecNode *table; + if (table_name == "part") { + table = ValueOrStop(gen.Part()); + } else if (table_name == "supplier") { + table = ValueOrStop(gen.Supplier()); + } else if (table_name == "partsupp") { + table = ValueOrStop(gen.PartSupp()); + } else if (table_name == "customer") { + table = ValueOrStop(gen.Customer()); + } else if (table_name == "nation") { + table = ValueOrStop(gen.Nation()); + } else if (table_name == "lineitem") { + table = ValueOrStop(gen.Lineitem()); + } else if (table_name == "region") { + table = ValueOrStop(gen.Region()); + } else if (table_name == "orders") { + table = ValueOrStop(gen.Orders()); + } else { + cpp11::stop("That's not a valid table name"); + } + + // TODO: unhardcode this once it's working + auto base_path = base_dir + "/parquet_dataset"; + filesystem->CreateDir(base_path); + + auto format = std::make_shared(); + + ds::FileSystemDatasetWriteOptions write_options; + write_options.file_write_options = format->DefaultWriteOptions(); + write_options.existing_data_behavior = ds::ExistingDataBehavior::kDeleteMatchingPartitions; + write_options.filesystem = filesystem; + write_options.base_dir = base_path; + write_options.partitioning = arrow::dataset::Partitioning::Default(); + write_options.basename_template = "part{i}.parquet"; + write_options.max_partitions = 1024; + + // TODO: this had a checked_cast in front of it in the code I adapted it from + // but I ran into namespace issues when doing it so I took it out to see if it + // worked, but maybe that's what's 
causing the sefault? + const ds::WriteNodeOptions options = + ds::WriteNodeOptions{write_options, table->output_schema()}; + + + MakeExecNodeOrStop("consuming_sink", plan.get(), {table}, options); + + cpp11::message("Just after consume"); + + StopIfNotOk(plan->Validate()); + + cpp11::message("Just after validate"); + + StopIfNotOk(plan->StartProducing()); + + // If the generator is destroyed before being completely drained, inform plan + std::shared_ptr stop_producing{nullptr, [plan](...) { + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); + + if (not_finished_yet) { + plan->StopProducing(); + } + }}; +} + #endif diff --git a/r/tests/testthat/test-tpch.R b/r/tests/testthat/test-tpch.R index 8077f76e4fd..eedf8954807 100644 --- a/r/tests/testthat/test-tpch.R +++ b/r/tests/testthat/test-tpch.R @@ -37,18 +37,12 @@ test_that("tpch_dbgen()", { expect_identical(dim(part_tab), c(200000L, 9L)) # and check a handful of types - expect_type_equal(part_tab[["R_PARTKEY"]], int32()) + expect_type_equal(part_tab[["P_PARTKEY"]], int32()) + expect_type_equal(part_tab[["P_NAME"]], string()) }) -# these two are tested above -tpch_tables_up <- setdiff(tpch_tables, c("lineitem", "region")) - -# nation segfaults -# supplier hangs -tpch_tables_up <- setdiff(tpch_tables_up, c("nation", "supplier")) - -# all of the rest below have an error with: -# Invalid: Arrays used to construct an ExecBatch must have equal length +# these three are tested above, but test that we can get tables for all the rest +tpch_tables_up <- setdiff(tpch_tables, c("lineitem", "region", "part")) for (table_name in tpch_tables_up) { test_that(paste0("Generating table: ", table_name), { From 2c580acee7786cf4f027fb1eb8a862298b5d480b Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 4 Mar 2022 23:55:24 -0800 Subject: [PATCH 05/11] Make it actually multithreaded --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 3 +- cpp/src/arrow/compute/exec/tpch_node.cc | 221 +++++++++++++------ cpp/src/arrow/compute/exec/tpch_node_test.cc | 1 + 3 files changed, 153 insertions(+), 72 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 963782333cf..9b4fad177e4 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -170,6 +170,7 @@ static void BM_Tpch_Q1(benchmark::State &st) } //BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 1000)->ArgNames({ "SF" }); -BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 10)->ArgNames({ "SF" }); +//BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 10)->ArgNames({ "SF" }); +BENCHMARK(BM_Tpch_Q1)->Args({1})->ArgNames({ "SF" }); } } diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 445df7d08b9..f9367b1131a 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -89,7 +89,7 @@ namespace arrow protected: std::atomic done_ = { false }; - std::atomic batches_generated_ = { 0 }; + std::atomic batches_outputted_ = { 0 }; }; int GetNumDigits(int64_t x) @@ -197,17 +197,17 @@ namespace arrow char temp_buff[kChunkSize]; while(done_.load() == false) { - int64_t current_offset = 0; - int64_t offset = 0; - while(GenerateSentence(offset, rng, temp_buff)) - current_offset = offset; + int64_t known_valid_offset = 0; + int64_t try_offset = 0; + while(GenerateSentence(try_offset, rng, temp_buff)) + known_valid_offset = try_offset; { 
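                // Each worker fills its thread-local temp_buff with whole sentences,
                // then appends only that complete prefix (known_valid_offset bytes)
                // to the shared corpus under the mutex; the thread whose copy reaches
                // kTextBytes flips done_ so the remaining workers stop.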
std::lock_guard lock(text_guard_); if(done_.load()) return Status::OK(); int64_t bytes_remaining = kTextBytes - generated_offset_; - int64_t memcpy_size = std::min(offset, bytes_remaining); + int64_t memcpy_size = std::min(known_valid_offset, bytes_remaining); std::memcpy(out + generated_offset_, temp_buff, memcpy_size); generated_offset_ += memcpy_size; if(generated_offset_ == kTextBytes) @@ -283,7 +283,7 @@ namespace arrow *out-- = '0' + (x % 10); x /= 10; } - out += num_digits; + out += (num_digits + 1); } void GeneratePhoneNumber( @@ -506,7 +506,7 @@ namespace arrow break; case 2: success &= GenerateAdjective(offset, rng, arr); - success &= GenerateWord(offset, rng, arr, &comma_space, 1); + success &= GenerateWord(--offset, rng, arr, &comma_space, 1); success &= GenerateAdjective(offset, rng, arr); success &= GenerateNoun(offset, rng, arr); break; @@ -637,6 +637,16 @@ namespace arrow return Status::OK(); } + int64_t part_batches_generated() const + { + return part_batches_generated_.load(); + } + + int64_t partsupp_batches_generated() const + { + return partsupp_batches_generated_.load(); + } + Result> SetPartOutputColumns(const std::vector &cols) { return SetOutputColumns(cols, part_types_, part_name_map_, part_cols_); @@ -647,18 +657,20 @@ namespace arrow return SetOutputColumns(cols, partsupp_types_, partsupp_name_map_, partsupp_cols_); } - Result> NextPartBatch(size_t thread_index) + Result> NextPartBatch() { + size_t thread_index = thread_indexer_(); ThreadLocalData &tld = thread_local_data_[thread_index]; { std::lock_guard lock(part_output_queue_mutex_); + bool all_generated = part_rows_generated_ == part_rows_to_generate_; if(!part_output_queue_.empty()) { ExecBatch batch = std::move(part_output_queue_.front()); part_output_queue_.pop(); return std::move(batch); } - else if(part_rows_generated_ == part_rows_to_generate_) + else if(all_generated) { return util::nullopt; } @@ -669,6 +681,10 @@ namespace arrow batch_size_, part_rows_to_generate_ - part_rows_generated_); part_rows_generated_ += tld.part_to_generate; + + int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index); + part_batches_generated_.fetch_add(1); + partsupp_batches_generated_.fetch_add(num_ps_batches); ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); } } @@ -712,8 +728,9 @@ namespace arrow return ExecBatch::Make(std::move(part_result)); } - Result> NextPartSuppBatch(size_t thread_index) + Result> NextPartSuppBatch() { + size_t thread_index = thread_indexer_(); ThreadLocalData &tld = thread_local_data_[thread_index]; { std::lock_guard lock(partsupp_output_queue_mutex_); @@ -737,6 +754,9 @@ namespace arrow batch_size_, part_rows_to_generate_ - part_rows_generated_); part_rows_generated_ += tld.part_to_generate; + int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index); + part_batches_generated_.fetch_add(1); + partsupp_batches_generated_.fetch_add(num_ps_batches); ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); } } @@ -1120,13 +1140,20 @@ namespace arrow return Status::OK(); } + int64_t PartsuppBatchesToGenerate(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; + return num_batches; + } + Status InitPartsupp(size_t thread_index) { ThreadLocalData &tld = thread_local_data_[thread_index]; tld.generated_partsupp.reset(); tld.partsupp.clear(); - int64_t ps_to_generate = kPartSuppRowsPerPart * 
tld.part_to_generate; - int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; + int64_t num_batches = PartsuppBatchesToGenerate(thread_index); tld.partsupp.resize(num_batches); for(std::vector &batch : tld.partsupp) { @@ -1321,7 +1348,10 @@ namespace arrow int64_t part_rows_generated_; std::vector part_cols_; std::vector partsupp_cols_; - + ThreadIndexer thread_indexer_; + + std::atomic part_batches_generated_ = { 0 }; + std::atomic partsupp_batches_generated_ = { 0 }; static constexpr int64_t kPartSuppRowsPerPart = 4; }; @@ -1349,6 +1379,16 @@ namespace arrow return Status::OK(); } + int64_t orders_batches_generated() const + { + return orders_batches_generated_.load(); + } + + int64_t lineitem_batches_generated() const + { + return lineitem_batches_generated_.load(); + } + Result> SetOrdersOutputColumns(const std::vector &cols) { return SetOutputColumns(cols, orders_types_, orders_name_map_, orders_cols_); @@ -1359,8 +1399,9 @@ namespace arrow return SetOutputColumns(cols, lineitem_types_, lineitem_name_map_, lineitem_cols_); } - Result> NextOrdersBatch(size_t thread_index) + Result> NextOrdersBatch() { + size_t thread_index = thread_indexer_(); ThreadLocalData &tld = thread_local_data_[thread_index]; { std::lock_guard lock(orders_output_queue_mutex_); @@ -1381,6 +1422,7 @@ namespace arrow batch_size_, orders_rows_to_generate_ - orders_rows_generated_); orders_rows_generated_ += tld.orders_to_generate; + orders_batches_generated_.fetch_add(1); ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); } } @@ -1426,8 +1468,9 @@ namespace arrow return ExecBatch::Make(std::move(orders_result)); } - Result> NextLineItemBatch(size_t thread_index) + Result> NextLineItemBatch() { + size_t thread_index = thread_indexer_(); ThreadLocalData &tld = thread_local_data_[thread_index]; ExecBatch queued; bool from_queue = false; @@ -1450,18 +1493,20 @@ namespace arrow } { std::lock_guard lock(orders_output_queue_mutex_); - tld.orderkey_start = orders_rows_generated_; - tld.orders_to_generate = std::min( - batch_size_, - orders_rows_to_generate_ - orders_rows_generated_); - orders_rows_generated_ += tld.orders_to_generate; - ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); if(orders_rows_generated_ == orders_rows_to_generate_) { if(from_queue) return std::move(queued); return util::nullopt; } + + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = std::min( + batch_size_, + orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + orders_batches_generated_.fetch_add(1ll); + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); } tld.orders.clear(); tld.orders.resize(ORDERS::kNumCols); @@ -1469,6 +1514,7 @@ namespace arrow tld.generated_lineitem.reset(); if(from_queue) { + lineitem_batches_generated_.fetch_sub(1); for(size_t i = 0; i < lineitem_cols_.size(); i++) if(tld.lineitem[0][lineitem_cols_[i]].kind() == Datum::NONE) tld.lineitem[0][lineitem_cols_[i]] = std::move(queued[i]); @@ -1505,6 +1551,7 @@ namespace arrow ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); lineitem_results.emplace_back(std::move(eb)); } + lineitem_batches_generated_.fetch_add(static_cast(lineitem_results.size())); // Return the first batch, enqueue the rest. 
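        // lineitem_batches_generated_ now counts every batch materialized above;
        // the lineitem generator's ProduceCallback compares this total against the
        // number of batches it has already output to decide when to report the
        // table as finished.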
{ std::lock_guard lock(lineitem_output_queue_mutex_); @@ -1872,7 +1919,7 @@ namespace arrow tld.items_per_order.push_back(length); tld.lineitem_to_generate += length; } - size_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; + int64_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; tld.lineitem.clear(); tld.lineitem.resize(num_batches); for(std::vector &batch : tld.lineitem) @@ -1889,13 +1936,17 @@ namespace arrow if(tld.lineitem[ibatch][column].kind() == Datum::NONE) { int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[column]); + std::printf("Thread %lu, byte size %d\n", thread_index, byte_width); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); ArrayData ad(lineitem_types_[column], batch_size_, { nullptr, std::move(buff) }); tld.lineitem[ibatch][column] = std::move(ad); out_batch_offset = 0; } - if(ibatch == 0) + else + { + ARROW_DCHECK(ibatch == 0); out_batch_offset = tld.first_batch_offset; + } return Status::OK(); } @@ -2461,6 +2512,10 @@ namespace arrow int64_t orders_rows_generated_; std::vector orders_cols_; std::vector lineitem_cols_; + ThreadIndexer thread_indexer_; + + std::atomic orders_batches_generated_ = { 0 }; + std::atomic lineitem_batches_generated_ = { 0 }; }; class SupplierGenerator : public TpchTableGenerator @@ -2518,7 +2573,9 @@ namespace arrow output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -2584,7 +2641,6 @@ namespace arrow tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.suppkey_start); - bool is_last_batch = tld.to_generate < batch_size_; tld.batch.clear(); tld.batch.resize(SUPPLIER::kNumCols); @@ -2598,15 +2654,14 @@ namespace arrow result[i] = tld.batch[col_idx]; } ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); - batches_generated_++; + int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; + int64_t batches_outputted_before_this_one = batches_outputted_.fetch_add(1); + bool is_last_batch = batches_outputted_before_this_one == (batches_to_generate - 1); output_callback_(std::move(eb)); if(is_last_batch) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) - { - finished_callback_(batches_generated_.load()); - } + done_.store(true); + finished_callback_(batches_outputted_.load()); return Status::OK(); } return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); @@ -2657,7 +2712,7 @@ namespace arrow for(int64_t irow = 0; irow < tld.to_generate; irow++) { char *out = s_name + byte_width * irow; - std::memcpy(out, supplier, supplier_length); + std::strncpy(out, supplier, byte_width); AppendNumberPaddedToNineDigits(out + supplier_length, s_suppkey[irow]); } } @@ -2799,7 +2854,6 @@ namespace arrow PartGenerator(std::shared_ptr gen) : gen_(std::move(gen)) { - batches_generated_.store(0); } Status Init( @@ -2825,7 +2879,9 @@ namespace arrow finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return 
schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -2834,22 +2890,26 @@ namespace arrow } private: - Status ProduceCallback(size_t thread_index) + Status ProduceCallback(size_t) { + if(done_.load()) + return Status::OK(); ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextPartBatch(thread_index)); - if(done_.load() || !maybe_batch.has_value()) + gen_->NextPartBatch()); + if(!maybe_batch.has_value()) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) + int64_t batches_generated = gen_->part_batches_generated(); + if(batches_generated == batches_outputted_.load()) { - finished_callback_(batches_generated_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); } return Status::OK(); } ExecBatch batch = std::move(*maybe_batch); - batches_generated_++; output_callback_(std::move(batch)); + batches_outputted_++; return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); } @@ -2868,7 +2928,6 @@ namespace arrow PartSuppGenerator(std::shared_ptr gen) : gen_(std::move(gen)) { - batches_generated_.store(0); } Status Init( @@ -2894,7 +2953,9 @@ namespace arrow finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -2903,22 +2964,26 @@ namespace arrow } private: - Status ProduceCallback(size_t thread_index) + Status ProduceCallback(size_t) { + if(done_.load()) + return Status::OK(); ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextPartSuppBatch(thread_index)); - if(done_.load() || !maybe_batch.has_value()) + gen_->NextPartSuppBatch()); + if(!maybe_batch.has_value()) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) + int64_t batches_generated = gen_->partsupp_batches_generated(); + if(batches_generated == batches_outputted_.load()) { - finished_callback_(batches_generated_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); } return Status::OK(); } ExecBatch batch = std::move(*maybe_batch); - batches_generated_++; output_callback_(std::move(batch)); + batches_outputted_++; return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); } @@ -2961,7 +3026,9 @@ namespace arrow output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -3029,7 +3096,6 @@ namespace arrow tld.to_generate = std::min(batch_size_, rows_to_generate_ - 
tld.custkey_start); - bool is_last_batch = tld.to_generate < batch_size_; tld.batch.clear(); tld.batch.resize(CUSTOMER::kNumCols); @@ -3043,14 +3109,16 @@ namespace arrow result[i] = tld.batch[col_idx]; } ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); - batches_generated_++; + int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; + int64_t batches_generated_before_this_one = batches_outputted_.fetch_add(1); + bool is_last_batch = batches_generated_before_this_one == (batches_to_generate - 1); output_callback_(std::move(eb)); if(is_last_batch) { bool expected = false; if(done_.compare_exchange_strong(expected, true)) { - finished_callback_(batches_generated_.load()); + finished_callback_(batches_outputted_.load()); } return Status::OK(); } @@ -3238,7 +3306,6 @@ namespace arrow OrdersGenerator(std::shared_ptr gen) : gen_(std::move(gen)) { - batches_generated_.store(0); } Status Init( @@ -3264,7 +3331,9 @@ namespace arrow finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -3273,22 +3342,26 @@ namespace arrow } private: - Status ProduceCallback(size_t thread_index) + Status ProduceCallback(size_t) { + if(done_.load()) + return Status::OK(); ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextOrdersBatch(thread_index)); - if(done_.load() || !maybe_batch.has_value()) + gen_->NextOrdersBatch()); + if(!maybe_batch.has_value()) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) + int64_t batches_generated = gen_->orders_batches_generated(); + if(batches_generated == batches_outputted_.load()) { - finished_callback_(batches_generated_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); } return Status::OK(); } ExecBatch batch = std::move(*maybe_batch); - batches_generated_++; output_callback_(std::move(batch)); + batches_outputted_++; return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); } @@ -3331,7 +3404,9 @@ namespace arrow finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -3340,22 +3415,26 @@ namespace arrow } private: - Status ProduceCallback(size_t thread_index) + Status ProduceCallback(size_t) { + if(done_.load()) + return Status::OK(); ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextLineItemBatch(thread_index)); + gen_->NextLineItemBatch()); if(!maybe_batch.has_value()) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) + int64_t batches_generated = gen_->lineitem_batches_generated(); + if(batches_generated == batches_outputted_.load()) { - finished_callback_(batches_generated_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + 
finished_callback_(batches_outputted_.load()); } return Status::OK(); } ExecBatch batch = std::move(*maybe_batch); - batches_generated_++; output_callback_(std::move(batch)); + batches_outputted_++; return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); } diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index c844d7e88c1..4273e18d4eb 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -58,6 +58,7 @@ namespace arrow for(auto &batch : res) { ValidateBatch(batch); + std::cout << batch.ToString() << std::endl; num_rows += batch.length; } ASSERT_EQ(num_rows, 10000); From de2305a81cd40b5633f931fc571d7e20b943066a Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Sat, 5 Mar 2022 12:47:39 -0800 Subject: [PATCH 06/11] Fill new arrays with empty Datums explicitly --- cpp/src/arrow/compute/exec/tpch_node.cc | 25 ++++++++------------ cpp/src/arrow/compute/exec/tpch_node_test.cc | 1 - 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index f9367b1131a..877fc85ab63 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -688,8 +688,8 @@ namespace arrow ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); } } - tld.part.clear(); tld.part.resize(PART::kNumCols); + std::fill(tld.part.begin(), tld.part.end(), Datum()); RETURN_NOT_OK(InitPartsupp(thread_index)); for(int col : part_cols_) @@ -760,8 +760,8 @@ namespace arrow ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); } } - tld.part.clear(); tld.part.resize(PART::kNumCols); + std::fill(tld.part.begin(), tld.part.end(), Datum()); RETURN_NOT_OK(InitPartsupp(thread_index)); for(int col : part_cols_) @@ -1152,13 +1152,12 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; tld.generated_partsupp.reset(); - tld.partsupp.clear(); int64_t num_batches = PartsuppBatchesToGenerate(thread_index); tld.partsupp.resize(num_batches); for(std::vector &batch : tld.partsupp) { - batch.clear(); batch.resize(PARTSUPP::kNumCols); + std::fill(batch.begin(), batch.end(), Datum()); } return Status::OK(); } @@ -1426,8 +1425,8 @@ namespace arrow ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); } } - tld.orders.clear(); tld.orders.resize(ORDERS::kNumCols); + std::fill(tld.orders.begin(), tld.orders.end(), Datum()); RETURN_NOT_OK(GenerateRowCounts(thread_index)); tld.first_batch_offset = 0; tld.generated_lineitem.reset(); @@ -1508,8 +1507,8 @@ namespace arrow orders_batches_generated_.fetch_add(1ll); ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); } - tld.orders.clear(); tld.orders.resize(ORDERS::kNumCols); + std::fill(tld.orders.begin(), tld.orders.end(), Datum()); RETURN_NOT_OK(GenerateRowCounts(thread_index)); tld.generated_lineitem.reset(); if(from_queue) @@ -1920,12 +1919,11 @@ namespace arrow tld.lineitem_to_generate += length; } int64_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; - tld.lineitem.clear(); tld.lineitem.resize(num_batches); for(std::vector &batch : tld.lineitem) { - batch.clear(); batch.resize(LINEITEM::kNumCols); + std::fill(batch.begin(), batch.end(), Datum()); } return Status::OK(); } @@ -1936,17 +1934,14 @@ namespace arrow if(tld.lineitem[ibatch][column].kind() == Datum::NONE) { int32_t byte_width = 
arrow::internal::GetByteWidth(*lineitem_types_[column]); - std::printf("Thread %lu, byte size %d\n", thread_index, byte_width); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); ArrayData ad(lineitem_types_[column], batch_size_, { nullptr, std::move(buff) }); tld.lineitem[ibatch][column] = std::move(ad); out_batch_offset = 0; } - else - { - ARROW_DCHECK(ibatch == 0); + if(ibatch == 0) out_batch_offset = tld.first_batch_offset; - } + return Status::OK(); } @@ -2642,8 +2637,8 @@ namespace arrow tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.suppkey_start); - tld.batch.clear(); tld.batch.resize(SUPPLIER::kNumCols); + std::fill(tld.batch.begin(), tld.batch.end(), Datum()); for(int col : gen_list_) RETURN_NOT_OK(generators_[col](thread_index)); @@ -3097,8 +3092,8 @@ namespace arrow tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.custkey_start); - tld.batch.clear(); tld.batch.resize(CUSTOMER::kNumCols); + std::fill(tld.batch.begin(), tld.batch.end(), Datum()); for(int col : gen_list_) RETURN_NOT_OK(generators_[col](thread_index)); diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index 4273e18d4eb..c844d7e88c1 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -58,7 +58,6 @@ namespace arrow for(auto &batch : res) { ValidateBatch(batch); - std::cout << batch.ToString() << std::endl; num_rows += batch.length; } ASSERT_EQ(num_rows, 10000); From 3eb99c6f31fdd0fafc34d4109940bc2676219316 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 8 Mar 2022 12:01:15 -0800 Subject: [PATCH 07/11] Add some tests, fix some bugs --- cpp/src/arrow/compute/exec/tpch_node.cc | 95 +++--- cpp/src/arrow/compute/exec/tpch_node.h | 6 +- cpp/src/arrow/compute/exec/tpch_node_test.cc | 288 ++++++++++++++++++- 3 files changed, 338 insertions(+), 51 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 877fc85ab63..496b44a1dc0 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -49,7 +49,6 @@ namespace arrow int64_t generated_offset_ = 0; std::mutex text_guard_; std::unique_ptr text_; - random::pcg32_fast rng_; static constexpr int64_t kChunkSize = 8192; static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB }; @@ -65,7 +64,7 @@ namespace arrow virtual Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) = 0; virtual Status StartProducing( @@ -495,7 +494,7 @@ namespace arrow std::uniform_int_distribution dist(0, 3); const char *comma_space = ", "; bool success = true; - switch(dist(rng_)) + switch(dist(rng)) { case 0: success &= GenerateNoun(offset, rng, arr); @@ -526,7 +525,7 @@ namespace arrow { std::uniform_int_distribution dist(0, 3); bool success = true; - switch(dist(rng_)) + switch(dist(rng)) { case 0: success &= GenerateVerb(offset, rng, arr); @@ -565,7 +564,7 @@ namespace arrow { std::uniform_int_distribution dist(0, 4); bool success = true; - switch(dist(rng_)) + switch(dist(rng)) { case 0: success &= GenerateNounPhrase(offset, rng, arr); @@ -618,7 +617,7 @@ namespace arrow Status Init( size_t num_threads, int64_t batch_size, - int scale_factor) + float scale_factor) { if(!inited_) { @@ -632,7 +631,7 @@ namespace arrow // 5 is the maximum number of different strings we need to concatenate tld.string_indices.resize(5 * batch_size_); } - part_rows_to_generate_ = 
scale_factor_ * 200000; + part_rows_to_generate_ = static_cast(scale_factor_ * 200000); } return Status::OK(); } @@ -693,7 +692,9 @@ namespace arrow RETURN_NOT_OK(InitPartsupp(thread_index)); for(int col : part_cols_) + { RETURN_NOT_OK(part_generators_[col](thread_index)); + } for(int col : partsupp_cols_) RETURN_NOT_OK(partsupp_generators_[col](thread_index)); @@ -995,17 +996,20 @@ namespace arrow RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_BRAND)); const char *p_mfgr = reinterpret_cast( tld.part[PART::P_MFGR].array()->buffers[1]->data()); - char *p_brand = reinterpret_cast(tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); + char *p_brand = reinterpret_cast( + tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_BRAND]); int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_MFGR]); const size_t mfgr_id_offset = std::strlen("Manufacturer#"); for(int64_t irow = 0; irow < tld.part_to_generate; irow++) { + char *row = p_brand + byte_width * irow; char mfgr_id = *(p_mfgr + irow * mfgr_byte_width + mfgr_id_offset); char brand_id = '0' + dist(tld.rng); - std::strncpy(p_brand + byte_width * irow, brand, byte_width); - *(p_brand + byte_width * irow + brand_length) = mfgr_id; - *(p_brand + byte_width * irow + brand_length + 1) = brand_id; + std::strncpy(row, brand, byte_width); + *(row + brand_length) = mfgr_id; + *(row + brand_length + 1) = brand_id; + irow += 0; } } return Status::OK(); @@ -1038,11 +1042,9 @@ namespace arrow tld.string_indices[irow * 3 + ipart] = name_part_index; string_length += std::strlen(types[ipart][name_part_index]); } - // Add 4 because there is a space between each word (i.e. 2 spaces) - offsets[irow + 1] = offsets[irow] + string_length + 2; + offsets[irow + 1] = offsets[irow] + string_length; } - // Add an extra byte for the space after in the very last string. 
- ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate] + 1)); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate])); char *strings = reinterpret_cast(string_buffer->mutable_data()); for(int64_t irow = 0; irow < tld.part_to_generate; irow++) { @@ -1054,7 +1056,6 @@ namespace arrow size_t length = std::strlen(part); std::memcpy(row, part, length); row += length; - *row++ = ' '; } } ArrayData ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); @@ -1100,10 +1101,8 @@ namespace arrow size_t container2_length = std::strlen(container2); char *row = p_container + byte_width * irow; - // Abuse strncpy to zero out the rest of the array std::strncpy(row, container1, byte_width); - row[container1_length] = ' '; - std::memcpy(row + container1_length + 1, container2, container2_length); + std::memcpy(row + container1_length, container2, container2_length); } } return Status::OK(); @@ -1225,7 +1224,7 @@ namespace arrow int64_t ipartsupp = 0; int64_t ipart = 0; int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - const int32_t S = scale_factor_ * 10000; + const int32_t S = static_cast(scale_factor_ * 10000); for(int64_t irow = 0; irow < ps_to_generate; ibatch++) { RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPKEY)); @@ -1342,7 +1341,7 @@ namespace arrow std::queue part_output_queue_; std::queue partsupp_output_queue_; int64_t batch_size_; - int scale_factor_; + float scale_factor_; int64_t part_rows_to_generate_; int64_t part_rows_generated_; std::vector part_cols_; @@ -1360,7 +1359,7 @@ namespace arrow Status Init( size_t num_threads, int64_t batch_size, - int scale_factor) + float scale_factor) { if(!inited_) { @@ -1373,7 +1372,7 @@ namespace arrow { tld.items_per_order.resize(batch_size_); } - orders_rows_to_generate_ = scale_factor_ * 150000 * 10; + orders_rows_to_generate_ = static_cast(scale_factor_ * 150000 * 10); } return Status::OK(); } @@ -1711,7 +1710,8 @@ namespace arrow // divisible by 3. Rather than repeatedly generating numbers until we get to // a non-divisible-by-3 number, we just generate a number between // 0 and SF * 50000 - 1, multiply by 3, and then add either 1 or 2. 
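            // For example, at SF = 1 the base is drawn from [0, 49999], so
            // 3 * base + {1, 2} yields exactly the keys in [1, 149999] that are
            // not multiples of 3.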
- std::uniform_int_distribution base_dist(0, scale_factor_ * 50000 - 1); + int32_t sf_50k = static_cast(scale_factor_ * 50000); + std::uniform_int_distribution base_dist(0, sf_50k - 1); std::uniform_int_distribution offset_dist(1, 2); int32_t *o_custkey = reinterpret_cast( tld.orders[ORDERS::O_CUSTKEY].array()->buffers[1]->mutable_data()); @@ -1867,7 +1867,8 @@ namespace arrow { RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[ORDERS::O_CLERK]); - std::uniform_int_distribution dist(1, scale_factor_ * 1000); + int64_t max_clerk_id = static_cast(scale_factor_ * 1000); + std::uniform_int_distribution dist(1, max_clerk_id); char *o_clerk = reinterpret_cast( tld.orders[ORDERS::O_CLERK].array()->buffers[1]->mutable_data()); for(int64_t i = 0; i < tld.orders_to_generate; i++) @@ -1991,7 +1992,8 @@ namespace arrow tld.generated_lineitem[LINEITEM::L_PARTKEY] = true; size_t ibatch = 0; - std::uniform_int_distribution dist(1, scale_factor_ * 200000); + int32_t max_partkey = static_cast(scale_factor_ * 200000); + std::uniform_int_distribution dist(1, max_partkey); for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { size_t batch_offset; @@ -2020,7 +2022,7 @@ namespace arrow size_t ibatch = 0; std::uniform_int_distribution dist(0, 3); - const int32_t S = scale_factor_ * 10000; + const int32_t S = static_cast(scale_factor_ * 10000); for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { size_t batch_offset = 0; @@ -2502,7 +2504,7 @@ namespace arrow std::queue orders_output_queue_; std::queue lineitem_output_queue_; int64_t batch_size_; - int scale_factor_; + float scale_factor_; int64_t orders_rows_to_generate_; int64_t orders_rows_generated_; std::vector orders_cols_; @@ -2518,12 +2520,12 @@ namespace arrow public: Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; batch_size_ = batch_size; - rows_to_generate_ = scale_factor_ * 10000; + rows_to_generate_ = static_cast(scale_factor_ * 10000); rows_generated_.store(0); ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( columns, @@ -2537,7 +2539,8 @@ namespace arrow std::unordered_set good_rows_set; while(good_rows_set.size() < num_special_rows) { - good_rows_set.insert(dist(rng)); + int64_t row = dist(rng); + good_rows_set.insert(row); } std::unordered_set bad_rows_set; while(bad_rows_set.size() < num_special_rows) @@ -2817,7 +2820,7 @@ namespace arrow std::uniform_int_distribution start_dist(0, str_length - total_length); int32_t start = start_dist(tld.rng); std::memcpy(out + start, customer, customer_length); - std::memcpy(out + start + gap, review, review_length); + std::memcpy(out + start + customer_length + gap, review, review_length); } } @@ -2837,7 +2840,7 @@ namespace arrow ScheduleCallback schedule_callback_; int64_t rows_to_generate_; std::atomic rows_generated_; - int scale_factor_; + float scale_factor_; int64_t batch_size_; std::vector gen_list_; std::shared_ptr schema_; @@ -2853,7 +2856,7 @@ namespace arrow Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -2912,7 +2915,7 @@ namespace arrow FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - int64_t scale_factor_; + float scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -2927,7 +2930,7 @@ namespace arrow Status Init( std::vector columns, - int 
scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -2986,7 +2989,7 @@ namespace arrow FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - int64_t scale_factor_; + float scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -2996,7 +2999,7 @@ namespace arrow public: Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -3289,7 +3292,7 @@ namespace arrow ScheduleCallback schedule_callback_; int64_t rows_to_generate_; std::atomic rows_generated_; - int scale_factor_; + float scale_factor_; int64_t batch_size_; std::vector gen_list_; std::shared_ptr schema_; @@ -3305,7 +3308,7 @@ namespace arrow Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -3364,7 +3367,7 @@ namespace arrow FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - int64_t scale_factor_; + float scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -3378,7 +3381,7 @@ namespace arrow Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -3437,7 +3440,7 @@ namespace arrow FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - int64_t scale_factor_; + float scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -3447,7 +3450,7 @@ namespace arrow public: Status Init( std::vector columns, - int /*scale_factor*/, + float /*scale_factor*/, int64_t /*batch_size*/) override { ARROW_ASSIGN_OR_RAISE(schema_, @@ -3557,7 +3560,7 @@ namespace arrow public: Status Init( std::vector columns, - int /*scale_factor*/, + float /*scale_factor*/, int64_t /*batch_size*/) override { ARROW_ASSIGN_OR_RAISE(schema_, @@ -3751,7 +3754,7 @@ namespace arrow ThreadIndexer thread_indexer_; }; - Result TpchGen::Make(ExecPlan *plan, int scale_factor, int64_t batch_size) + Result TpchGen::Make(ExecPlan *plan, float scale_factor, int64_t batch_size) { TpchGen result(plan, scale_factor, batch_size); return result; diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index dc282aae981..1d904a2b5f0 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -36,7 +36,7 @@ namespace arrow class TpchGen { public: - static Result Make(ExecPlan *plan, int scale_factor = 1, int64_t batch_size = 4096); + static Result Make(ExecPlan *plan, float scale_factor = 1.0f, int64_t batch_size = 4096); Result Supplier(std::vector columns = {}); Result Part(std::vector columns = {}); @@ -48,7 +48,7 @@ namespace arrow Result Region(std::vector columns = {}); private: - TpchGen(ExecPlan *plan, int scale_factor, int64_t batch_size) + TpchGen(ExecPlan *plan, float scale_factor, int64_t batch_size) : plan_(plan), scale_factor_(scale_factor), batch_size_(batch_size), @@ -59,7 +59,7 @@ namespace arrow Result CreateNode(std::vector columns); ExecPlan *plan_; - int scale_factor_; + float scale_factor_; int64_t batch_size_; std::shared_ptr part_and_part_supp_generator_; diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index c844d7e88c1..6253075b85f 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -33,6 +33,8 @@ #include 
"arrow/util/thread_pool.h" #include "arrow/array/validate.h" +#include + namespace arrow { namespace compute @@ -43,6 +45,227 @@ namespace arrow ASSERT_OK(arrow::internal::ValidateArray(*d.array())); } + void VerifyUniqueKey( + std::unordered_set &seen, + const Datum &d, + int32_t min, + int32_t max) + { + const int32_t *keys = reinterpret_cast(d.array()->buffers[1]->data()); + int64_t num_keys = d.length(); + for(int64_t i = 0; i < num_keys; i++) + { + ASSERT_TRUE(seen.find(keys[i]) == seen.end()); + ASSERT_LE(keys[i], max); + ASSERT_GE(keys[i], min); + seen.insert(keys[i]); + } + } + + void VerifyStringAndNumber_FixedWidth( + const Datum &strings, + const Datum &numbers, + int byte_width, + const char *prefix, + bool verify_padding = true) + { + int64_t length = strings.length(); + const char *str = reinterpret_cast( + strings.array()->buffers[1]->data()); + + const int32_t *nums = nullptr; + if(numbers.kind() != Datum::NONE) + { + ASSERT_EQ(length, numbers.length()); + nums = reinterpret_cast( + numbers.array()->buffers[1]->data()); + } + + size_t num_offset = std::strlen(prefix); + for(int64_t i = 0; i < length; i++) + { + const char *row = str + i * byte_width; + ASSERT_EQ(std::memcmp(row, prefix, num_offset), 0) << row << ", prefix=" << prefix << ", i=" << i; + const char *num_str = row + num_offset; + int64_t num = 0; + int ibyte = static_cast(num_offset); + for(; *num_str && ibyte < byte_width; ibyte++) + { + num *= 10; + ASSERT_TRUE(std::isdigit(*num_str)); + num += *num_str++ - '0'; + } + if(nums) + { + ASSERT_EQ(static_cast(num), nums[i]); + } + if(verify_padding) + { + int num_chars = ibyte - num_offset; + ASSERT_GE(num_chars, 9); + } + } + } + + void VerifyVString(const Datum &d, int min_length, int max_length) + { + int64_t length = d.length(); + const int32_t *off = reinterpret_cast( + d.array()->buffers[1]->data()); + const char *str = reinterpret_cast( + d.array()->buffers[2]->data()); + for(int64_t i = 0; i < length; i++) + { + int32_t start = off[i]; + int32_t end = off[i + 1]; + int32_t length = end - start; + ASSERT_LE(length, max_length); + ASSERT_GE(length, min_length); + for(int32_t i = start; i < end; i++) + { + bool is_valid = std::isdigit(str[i]) || std::isalpha(str[i]) || str[i] == ',' || str[i] == ' '; + ASSERT_TRUE(is_valid) << "Character " << str[i] << " is not a digit, a letter, a comma, or a space"; + } + } + } + + void VerifyAllBetween(const Datum &d, int32_t min, int32_t max) + { + int64_t length = d.length(); + const int32_t *n = reinterpret_cast(d.array()->buffers[1]->data()); + for(int64_t i = 0; i < length; i++) + { + ASSERT_GE(n[i], min) << "Value must be between " << min << " and " << max << ", got " << n[i]; + ASSERT_LE(n[i], max) << "Value must be between " << min << " and " << max << ", got " << n[i]; + } + } + + void VerifyNationKey(const Datum &d) + { + VerifyAllBetween(d, 0, 24); + } + + void VerifyPhone(const Datum &d) + { + int64_t length = d.length(); + const char *phones = reinterpret_cast(d.array()->buffers[1]->data()); + constexpr int kByteWidth = 15; // This is common for all PHONE columns + for(int64_t i = 0; i < length; i++) + { + const char *row = phones + i * kByteWidth; + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, '-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, '-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, 
'-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + } + } + + void VerifyDecimalsBetween(const Datum &d, int64_t min, int64_t max) + { + int64_t length = d.length(); + const Decimal128 *decs = reinterpret_cast( + d.array()->buffers[1]->data()); + for(int64_t i = 0; i < length; i++) + { + int64_t val = static_cast(decs[i]); + ASSERT_LE(val, max); + ASSERT_GE(val, min); + } + } + + void VerifyCorrectNumberOfWords_Varlen(const Datum &d, int num_words) + { + int expected_num_spaces = num_words - 1; + int64_t length = d.length(); + const int32_t *offsets = reinterpret_cast( + d.array()->buffers[1]->data()); + const char *str = reinterpret_cast( + d.array()->buffers[2]->data()); + + for(int64_t i = 0; i < length; i++) + { + int actual_num_spaces = 0; + + int32_t start = offsets[i]; + int32_t end = offsets[i + 1]; + int32_t str_len = end - start; + char tmp_str[256] = {}; + std::memcpy(tmp_str, str + start, str_len); + bool is_only_alphas_or_spaces = true; + for(int32_t j = offsets[i]; j < offsets[i + 1]; j++) + { + bool is_space = str[j] == ' '; + actual_num_spaces += is_space; + is_only_alphas_or_spaces &= (is_space || std::isalpha(str[j])); + } + ASSERT_TRUE(is_only_alphas_or_spaces) << "Words must be composed only of letters, got " << tmp_str; + ASSERT_EQ(actual_num_spaces, expected_num_spaces) << "Wrong number of spaces in " << tmp_str; + } + } + + void VerifyCorrectNumberOfWords_FixedWidth(const Datum &d, int num_words, int byte_width) + { + int expected_num_spaces = num_words - 1; + int64_t length = d.length(); + const char *str = reinterpret_cast( + d.array()->buffers[1]->data()); + + for(int64_t i = 0; i < length; i++) + { + int actual_num_spaces = 0; + const char *row = str + i * byte_width; + bool is_only_alphas_or_spaces = true; + for(int32_t j = 0; j < byte_width && row[j]; j++) + { + bool is_space = row[j] == ' '; + actual_num_spaces += is_space; + is_only_alphas_or_spaces &= (is_space || std::isalpha(row[j])); + } + ASSERT_TRUE(is_only_alphas_or_spaces) << "Words must be composed only of letters, got " << row; + ASSERT_EQ(actual_num_spaces, expected_num_spaces) << "Wrong number of spaces in " << row; + } + } + + void CountModifiedComments(const Datum &d, int &good_count, int &bad_count) + { + int64_t length = d.length(); + const int32_t *offsets = reinterpret_cast( + d.array()->buffers[1]->data()); + const char *str = reinterpret_cast( + d.array()->buffers[2]->data()); + // Length of S_COMMENT is at most 100 + char tmp_string[101]; + for(int64_t i = 0; i < length; i++) + { + const char *row = str + offsets[i]; + int32_t row_length = offsets[i + 1] - offsets[i]; + std::memset(tmp_string, 0, sizeof(tmp_string)); + std::memcpy(tmp_string, row, row_length); + char *customer = std::strstr(tmp_string, "Customer"); + char *recommends = std::strstr(tmp_string, "Recommends"); + char *complaints = std::strstr(tmp_string, "Complaints"); + if(customer) + { + ASSERT_TRUE((recommends != nullptr) ^ (complaints != nullptr)); + if(recommends) + good_count++; + if(complaints) + bad_count++; + } + } + } + TEST(TpchNode, Supplier) { ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); @@ -54,13 +277,34 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + int64_t kExpectedRows = 10000; int64_t num_rows = 0; + + std::unordered_set seen_suppkey; + int good_count = 0; + int bad_count = 0; 
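        // The loop below validates each batch column by column: unique supplier keys
        // in [1, 10000], the fixed-width name and phone formats, the account-balance
        // range, and the planted comment strings; the assertions after the loop expect
        // exactly five "Recommends" and five "Complaints" comments at the default SF = 1.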
for(auto &batch : res) { ValidateBatch(batch); + VerifyUniqueKey( + seen_suppkey, + batch[0], + /*min=*/1, + /*max=*/static_cast(kExpectedRows)); + VerifyStringAndNumber_FixedWidth(batch[1], batch[0], /*byte_width=*/25, "Supplie#r"); + VerifyVString(batch[2], /*min_length=*/10, /*max_length=*/40); + VerifyNationKey(batch[3]); + VerifyPhone(batch[4]); + VerifyDecimalsBetween(batch[5], -99999, 999999); + CountModifiedComments(batch[6], good_count, bad_count); num_rows += batch.length; } - ASSERT_EQ(num_rows, 10000); + ASSERT_EQ(seen_suppkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + ASSERT_EQ(good_count, 5); + ASSERT_EQ(bad_count, 5); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Part) @@ -74,13 +318,47 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + int64_t kExpectedRows = 200000; int64_t num_rows = 0; + + std::unordered_set seen_partkey; for(auto &batch : res) { ValidateBatch(batch); + VerifyUniqueKey( + seen_partkey, + batch[0], + /*min=*/1, + /*max=*/static_cast(kExpectedRows)); + VerifyCorrectNumberOfWords_Varlen( + batch[1], + /*num_words*=*/5); + VerifyStringAndNumber_FixedWidth( + batch[2], + Datum(), + /*byte_width=*/25, + "Manufacturer#", + /*verify_padding=*/false); + VerifyStringAndNumber_FixedWidth( + batch[3], + Datum(), + /*byte_width=*/10, + "Brand#", + /*verify_padding=*/false); + VerifyCorrectNumberOfWords_Varlen( + batch[4], + /*num_words=*/3); + VerifyAllBetween(batch[5], /*min=*/1, /*max=*/50); + VerifyCorrectNumberOfWords_FixedWidth( + batch[6], + /*num_words=*/2, + /*byte_width=*/10); num_rows += batch.length; } - ASSERT_EQ(num_rows, 200000); + ASSERT_EQ(seen_partkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, PartSupp) @@ -101,6 +379,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 800000); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Customer) @@ -121,6 +400,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 150000); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Orders) @@ -141,6 +421,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 1500000); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Lineitem) @@ -158,6 +439,7 @@ namespace arrow { ValidateBatch(batch); } + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Nation) @@ -178,6 +460,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 25); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Region) @@ -198,6 +481,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 5); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } } } From 7f3e6bc57b6ad0bfd9fcad694faa7a090c9091ae Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 8 Mar 2022 12:50:14 -1000 Subject: [PATCH 08/11] First pass at a query testing tool. 
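The tool resolves queries by name through the registry declared in builtin_queries.h below. As an illustration only, and not part of this patch, a helper that looks up a built-in query and builds its plan could look like the sketch that follows; MakeBuiltinPlan and its error message are hypothetical, while GetBuiltinQueries, QueryPlanFactory, and the factory's consumer argument come from the files added here (the sketch assumes builtin_queries.h pulls in the Arrow compute headers it needs).

#include <memory>
#include <string>
#include <utility>

#include "builtin_queries.h"

namespace cp = arrow::compute;

namespace arrow::qtest {

// Hypothetical helper: resolve a built-in query by name and build its ExecPlan.
Result<std::shared_ptr<cp::ExecPlan>> MakeBuiltinPlan(
    const std::string& name, std::shared_ptr<cp::SinkNodeConsumer> consumer) {
  const auto& queries = GetBuiltinQueries();
  auto it = queries.find(name);
  if (it == queries.end()) {
    return Status::Invalid("No built-in query named '", name, "'");
  }
  // Each QueryPlanFactory receives the consumer that will absorb the results.
  return it->second(std::move(consumer));
}

}  // namespace arrow::qtest

With the flags defined in query_tester.cc below, an invocation along the lines of "query_tester tpch-1 --validate --num-iterations 2" would pass a registered query name together with the iteration and validation options that the program parses.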
--- dev/qtester/.clang-tidy | 22 + dev/qtester/.gitignore | 45 ++ dev/qtester/CMakeLists.txt | 59 ++ dev/qtester/builtin_queries.cc | 101 +++ dev/qtester/builtin_queries.h | 17 + dev/qtester/queries/tpch1.substrait.pb.json | 749 ++++++++++++++++++++ dev/qtester/query_tester.cc | 51 ++ dev/qtester/test_runner.cc | 219 ++++++ dev/qtester/test_runner.h | 113 +++ 9 files changed, 1376 insertions(+) create mode 100644 dev/qtester/.clang-tidy create mode 100644 dev/qtester/.gitignore create mode 100644 dev/qtester/CMakeLists.txt create mode 100644 dev/qtester/builtin_queries.cc create mode 100644 dev/qtester/builtin_queries.h create mode 100644 dev/qtester/queries/tpch1.substrait.pb.json create mode 100644 dev/qtester/query_tester.cc create mode 100644 dev/qtester/test_runner.cc create mode 100644 dev/qtester/test_runner.h diff --git a/dev/qtester/.clang-tidy b/dev/qtester/.clang-tidy new file mode 100644 index 00000000000..bcdacd174be --- /dev/null +++ b/dev/qtester/.clang-tidy @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +--- +Checks: '*,-llvmlibc*,-cert-err58-cpp,-modernize-use-trailing-return-type,-fuchsia-*,-cppcoreguidelines-*, + -readability-magic-numbers,-clang-analyzer-cplusplus.NewDelete,-clang-analyzer-cplusplus.NewDeleteLeaks, + -readability-function-cognitive-complexity, -hicpp-special-member-functions, -bugprone-exception-escape' +WarningsAsErrors: '*' +FormatStyle: 'file' diff --git a/dev/qtester/.gitignore b/dev/qtester/.gitignore new file mode 100644 index 00000000000..e1e921762f9 --- /dev/null +++ b/dev/qtester/.gitignore @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +thirdparty/*.tar* +CMakeFiles/ +CMakeCache.txt +CMakeUserPresets.json +CTestTestfile.cmake +Makefile +cmake_install.cmake +build/ +*-build/ +Testing/ +build-support/boost_* +vcpkg_installed/ + +# Build directories created by Clion +cmake-build-*/ + +######################################### +# Editor temporary/working/backup files # +.#* +*\#*\# +[#]*# +*~ +*$ +*.bak +*flymake* +*.kdev4 +*.log +*.swp diff --git a/dev/qtester/CMakeLists.txt b/dev/qtester/CMakeLists.txt new file mode 100644 index 00000000000..b35260f4134 --- /dev/null +++ b/dev/qtester/CMakeLists.txt @@ -0,0 +1,59 @@ +cmake_minimum_required(VERSION 3.19) +project(arrow-query-tester) + +set(CMAKE_CXX_STANDARD 17) +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++") +endif() +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +include(ExternalProject) + +# Add Arrow +find_package(Arrow REQUIRED COMPONENTS dataset parquet engine) +# Argparse is a modern library for interpreting CLI args +set(ARGPARSE_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/argparse_ep-install") +set(ARGPARSE_CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${ARGPARSE_PREFIX}") +set(ARGPARSE_INCLUDE_DIR "${ARGPARSE_PREFIX}/include") +externalproject_add(argparse + CMAKE_ARGS ${ARGPARSE_CMAKE_ARGS} + INSTALL_DIR ${ARGPARSE_PREFIX} + URL https://github.com/p-ranav/argparse/archive/refs/tags/v2.2.tar.gz + URL_HASH "SHA256=f0fc6ab7e70ac24856c160f44ebb0dd79dc1f7f4a614ee2810d42bb73799872b") + +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(CMAKE_CXX_CLANG_TIDY "clang-tidy-12") +endif() + +function(ADD_PROGRAM TARGET) + set(options) + set(one_value_args) + set(multi_value_args EXTRA_SOURCES) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + add_executable( + ${TARGET} + ${TARGET}.cc + ${ARG_EXTRA_SOURCES} + ) + add_dependencies(${TARGET} argparse) + target_include_directories(${TARGET} SYSTEM PRIVATE "${ARGPARSE_INCLUDE_DIR}") + target_link_libraries( + ${TARGET} + arrow_shared + arrow_dataset + arrow_engine + parquet + ) + if (MSVC) + target_compile_options(${TARGET} PRIVATE /W4 /WX) + else () + target_compile_options(${TARGET} PRIVATE -Wall -Wextra -Wpedantic -Werror) + endif () + +endfunction() + +add_program(query_tester EXTRA_SOURCES builtin_queries.cc test_runner.cc) diff --git a/dev/qtester/builtin_queries.cc b/dev/qtester/builtin_queries.cc new file mode 100644 index 00000000000..d4a6da5f404 --- /dev/null +++ b/dev/qtester/builtin_queries.cc @@ -0,0 +1,101 @@ +#include "builtin_queries.h" + +#include +#include +#include + +namespace cp = arrow::compute; + +namespace arrow::qtest { + +namespace { + +Result> Tpch1( + std::shared_ptr consumer) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, cp::ExecPlan::Make()); + ARROW_ASSIGN_OR_RAISE(cp::TpchGen gen, cp::TpchGen::Make(plan.get(), 1)); + + ARROW_ASSIGN_OR_RAISE( + cp::ExecNode * lineitem, + gen.Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", "L_SHIPDATE", + "L_RETURNFLAG", "L_LINESTATUS"})); + + std::shared_ptr sept_2_1998 = std::make_shared( + 10471); // September 2, 1998 is 10471 days after January 1, 1970 + cp::Expression filter = + cp::less_equal(cp::field_ref("L_SHIPDATE"), cp::literal(std::move(sept_2_1998))); + cp::FilterNodeOptions filter_opts(filter); + + cp::Expression l_returnflag = cp::field_ref("L_RETURNFLAG"); + cp::Expression l_linestatus = cp::field_ref("L_LINESTATUS"); + cp::Expression quantity = cp::field_ref("L_QUANTITY"); + cp::Expression base_price = cp::field_ref("L_EXTENDEDPRICE"); + + 
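  // Decimal128{0, 100} with type decimal(12, 2) encodes the value 1.00 (high word 0,
  // low word 100, interpreted at scale 2); it is used below to build the
  // (1 - L_DISCOUNT) and (1 + L_TAX) factors.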
std::shared_ptr decimal_1 = + std::make_shared(Decimal128{0, 100}, decimal(12, 2)); + cp::Expression discount_multiplier = + cp::call("subtract", {cp::literal(decimal_1), cp::field_ref("L_DISCOUNT")}); + cp::Expression tax_multiplier = + cp::call("add", {cp::literal(decimal_1), cp::field_ref("L_TAX")}); + cp::Expression disc_price = + cp::call("multiply", {cp::field_ref("L_EXTENDEDPRICE"), discount_multiplier}); + cp::Expression charge = cp::call( + "multiply", {cp::call("cast", + {cp::call("multiply", {cp::field_ref("L_EXTENDEDPRICE"), + discount_multiplier})}, + cp::CastOptions::Unsafe(decimal(12, 2))), + tax_multiplier}); + cp::Expression discount = cp::field_ref("L_DISCOUNT"); + + std::vector projection_list = {l_returnflag, l_linestatus, quantity, + base_price, disc_price, charge, + quantity, base_price, discount}; + std::vector project_names = { + "l_returnflag", "l_linestatus", "sum_qty", "sum_base_price", "sum_disc_price", + "sum_charge", "avg_qty", "avg_price", "avg_disc"}; + cp::ProjectNodeOptions project_opts(std::move(projection_list)); + + cp::ScalarAggregateOptions sum_opts = cp::ScalarAggregateOptions::Defaults(); + cp::CountOptions count_opts(cp::CountOptions::CountMode::ALL); + std::vector aggs = { + {"hash_sum", &sum_opts}, {"hash_sum", &sum_opts}, {"hash_sum", &sum_opts}, + {"hash_sum", &sum_opts}, {"hash_mean", &sum_opts}, {"hash_mean", &sum_opts}, + {"hash_mean", &sum_opts}, {"hash_count", &count_opts}}; + + std::vector cols = {2, 3, 4, 5, 6, 7, 8, 2}; + + std::vector names = {"sum_qty", "sum_base_price", "sum_disc_price", + "sum_charge", "avg_qty", "avg_price", + "avg_disc", "count_order"}; + + std::vector keys = {"L_RETURNFLAG", "L_LINESTATUS"}; + cp::AggregateNodeOptions agg_opts(aggs, cols, names, keys); + + cp::ConsumingSinkNodeOptions sink_opts(std::move(consumer)); + + cp::Declaration filter_decl("filter", {cp::Declaration::Input(lineitem)}, filter_opts); + cp::Declaration project_decl("project", project_opts); + cp::Declaration aggregate_decl("aggregate", agg_opts); + cp::Declaration sink_decl("consuming_sink", sink_opts); + + cp::Declaration q1 = + cp::Declaration::Sequence({filter_decl, project_decl, aggregate_decl, sink_decl}); + std::ignore = *q1.AddToPlan(plan.get()); + return plan; +} + +std::unordered_map CreateBuiltinQueriesMap() { + std::unordered_map builtin_queries_map; + builtin_queries_map.insert({"tpch-1", Tpch1}); + return builtin_queries_map; +} + +} // namespace + +const std::unordered_map& GetBuiltinQueries() { + static std::unordered_map builtin_queries_map = + CreateBuiltinQueriesMap(); + return builtin_queries_map; +} + +} // namespace arrow::qtest \ No newline at end of file diff --git a/dev/qtester/builtin_queries.h b/dev/qtester/builtin_queries.h new file mode 100644 index 00000000000..b84e8c98f0e --- /dev/null +++ b/dev/qtester/builtin_queries.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace arrow::qtest { + +using QueryPlanFactory = std::function>( + std::shared_ptr)>; + +const std::unordered_map& GetBuiltinQueries(); + +} // namespace arrow::qtest \ No newline at end of file diff --git a/dev/qtester/queries/tpch1.substrait.pb.json b/dev/qtester/queries/tpch1.substrait.pb.json new file mode 100644 index 00000000000..4b0ddaa6bc2 --- /dev/null +++ b/dev/qtester/queries/tpch1.substrait.pb.json @@ -0,0 +1,749 @@ +{ + "extensionUris": [{ + "extensionUriAnchor": 3, + "uri": "/functions_aggregate_generic.yaml" + }, { + "extensionUriAnchor": 2, + "uri": "/functions_arithmetic_decimal.yaml" + 
}, { + "extensionUriAnchor": 1, + "uri": "/functions_datetime.yaml" + }], + "extensions": [{ + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "lte:date_date" + } + }, { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 1, + "name": "subtract:date_day" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 2, + "name": "multiply:opt_decimal_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 3, + "name": "subtract:opt_decimal_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 4, + "name": "add:opt_decimal_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 5, + "name": "sum:opt_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 6, + "name": "avg:opt_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 7, + "name": "count:opt" + } + }], + "relations": [{ + "root": { + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [16, 17, 18, 19, 20, 21, 22] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX", "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE", "L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["LINEITEM"] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + }, { + "scalarFunction": { + "functionReference": 1, + "args": [{ + "literal": { + "date": 10561, + "nullable": false + } + }, { + "literal": { + "intervalDayToSecond": { + "days": 120, + "seconds": 0 + }, + "nullable": false + } + }], + "outputType": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + } + } + }], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + } + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + }, { + "scalarFunction": { + "functionReference": 2, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + }, { + "scalarFunction": { + "functionReference": 3, + "args": [{ + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false + } + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + }], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "scalarFunction": { + "functionReference": 2, + "args": [{ + "scalarFunction": { + "functionReference": 2, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + }, { + "scalarFunction": { + "functionReference": 3, + "args": [{ + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false + } + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + }], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "scalarFunction": { + "functionReference": 4, + "args": [{ + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false + } + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "rootReference": { + } + } + }], + "outputType": { + "decimal": { + "scale": 
0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + }] + } + }, + "groupings": [{ + "groupingExpressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }] + }, { + "groupingExpressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }] + }], + "measures": [{ + "measure": { + "functionReference": 5, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "measure": { + "functionReference": 5, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "measure": { + "functionReference": 5, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + }], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "measure": { + "functionReference": 5, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + }], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "measure": { + "functionReference": 6, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "measure": { + "functionReference": 6, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "measure": { + "functionReference": 6, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + }], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + 
"outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "measure": { + "functionReference": 7, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + } + } + }] + } + }, + "sorts": [{ + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }, { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }] + } + }, + "names": ["L_RETURNFLAG", "L_LINESTATUS", "SUM_QTY", "SUM_BASE_PRICE", "SUM_DISC_PRICE", "SUM_CHARGE", "AVG_QTY", "AVG_PRICE", "AVG_DISC", "COUNT_ORDER"] + } + }], + "expectedTypeUrls": [] +} diff --git a/dev/qtester/query_tester.cc b/dev/qtester/query_tester.cc new file mode 100644 index 00000000000..f9d924532a3 --- /dev/null +++ b/dev/qtester/query_tester.cc @@ -0,0 +1,51 @@ +#include + +#include "test_runner.h" + +int main(int argc, char* argv[]) { + argparse::ArgumentParser program("query_tester"); + + program.add_argument("query").required().help("name of the query to run"); + program.add_argument("--num-iterations").default_value(1).scan<'i', int>(); + program.add_argument("--cpu-threads") + .help("size to use for the CPU thread pool, default controlled by Arrow") + .scan<'i', int>(); + program.add_argument("--io-threads") + .help("size to use for the I/O thread pool, default controlled by Arrow") + .scan<'i', int>(); + program.add_argument("--validate") + .help("if set the program will validate the query results") + .default_value(false) + .implicit_value(true); + + try { + program.parse_args(argc, argv); + } catch (const std::runtime_error& err) { + std::cerr << err.what() << std::endl; + std::cerr << program; + return 1; + } + + arrow::qtest::QueryTestOptions options; + options.query_name = program.get("query"); + options.cpu_threads = program.present("--cpu-threads"); + options.io_threads = program.present("--io-threads"); + options.validate = program.get("--validate"); + options.num_iterations = program.get("--num-iterations"); + options.executable_path = argv[0]; + + arrow::Result result = + arrow::qtest::RunQueryTest(options); + if (!result.ok()) { + std::cout << "Error encountered running test: " << result.status() << std::endl; + return 1; + } + + arrow::Status report_status = arrow::qtest::ReportResult(*result); + if (!report_status.ok()) { + std::cout << "Error encountered reporting status: " << result.status() << std::endl; + return 1; + } + + return 0; +} \ No newline at end of file diff --git a/dev/qtester/test_runner.cc b/dev/qtester/test_runner.cc new file mode 100644 index 00000000000..72f3eaf1a23 --- /dev/null +++ b/dev/qtester/test_runner.cc @@ -0,0 +1,219 @@ +#include "test_runner.h" +#include "builtin_queries.h" + +#include + +#include +#include +#include + +namespace std_fs = std::filesystem; +namespace cp = arrow::compute; + +namespace arrow::qtest { + +Status ValidateOptions(const QueryTestOptions& options) { + if (options.cpu_threads && *options.cpu_threads <= 0) { + return Status::Invalid("cpu-threads must be > 0"); + } + if (options.io_threads && *options.io_threads <= 0) { + return Status::Invalid("io-threads must be > 0"); + } + if (options.num_iterations <= 0) { + return 
Status::Invalid("num-iterations must be > 0"); + } + if (options.validate) { + return Status::NotImplemented("validation has not yet been implemented"); + } + return Status::OK(); +} + +namespace { +Result DoGetRootDirectory(const std::string& executable_path) { + std_fs::path path = std_fs::absolute(std_fs::path(executable_path)); + while (true) { + if (std_fs::is_directory(path / "queries") && + std_fs::is_directory(path / "datasets")) { + return path; + } + if (path.has_parent_path() && path != path.parent_path()) { + path = path.parent_path(); + } else { + return Status::Invalid( + "Could not locate the root directory. Did you perhaps move or copy the " + "query_tester executable outside of the project directory?"); + } + } +} + +Result GetRootDirectory(const std::string& executable) { + static Result cached_root_directory = DoGetRootDirectory(executable); + return cached_root_directory; +} + +Result> PathToBuffer(const std_fs::path& path) { + fs::LocalFileSystem local_fs; + ARROW_ASSIGN_OR_RAISE(fs::FileInfo file_info, local_fs.GetFileInfo(path)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr in_stream, + local_fs.OpenInputStream(path)); + return in_stream->Read(file_info.size()); +} + +Result> DeclsToPlan( + const std::vector& decls) { + ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make()); + for (const auto& decl : decls) { + ARROW_RETURN_NOT_OK(decl.AddToPlan(plan.get())); + } + return plan; +} + +Result> LoadQueryFromSubstraitJson( + const std_fs::path& path, const engine::ConsumerFactory& consumer_factory) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr json_bytes, PathToBuffer(path)); + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr plan_bytes, + engine::internal::SubstraitFromJSON("Plan", json_bytes->ToString())); + ARROW_ASSIGN_OR_RAISE(std::vector decls, + engine::DeserializePlan(*plan_bytes, consumer_factory)); + return DeclsToPlan(decls); +} + +Result> LoadQueryFromSubstraitBinary( + const std_fs::path& path, const engine::ConsumerFactory& consumer_factory) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan_bytes, PathToBuffer(path)); + ARROW_ASSIGN_OR_RAISE(std::vector decls, + engine::DeserializePlan(*plan_bytes, consumer_factory)); + return DeclsToPlan(decls); +} + +Result> LoadQueryFromPath( + const std_fs::path& path, const std::string& extension, + const engine::ConsumerFactory& consumer_factory) { + if (extension == "substrait.pb.json") { + return LoadQueryFromSubstraitJson(path, consumer_factory); + } + if (extension == "substrait.pb") { + return LoadQueryFromSubstraitBinary(path, consumer_factory); + } + + return Status::Invalid("No handler for query file format ", extension); +} + +class QueryResultUpdatingConsumer : public cp::SinkNodeConsumer { + public: + explicit QueryResultUpdatingConsumer(QueryTestResult* result) : result_(result) {} + + arrow::Status Consume(cp::ExecBatch batch) override { + std::lock_guard lg(mutex_); + result_->iterations[iteration_].num_rows_processed += batch.length; + result_->iterations[iteration_].num_bytes_processed += batch.TotalBufferSize(); + return arrow::Status::OK(); + } + + arrow::Future<> Finish() override { + result_->iterations[iteration_].end_time = std::chrono::high_resolution_clock::now(); + return arrow::Future<>::MakeFinished(); + } + + void Start(std::size_t iteration) { + iteration_ = iteration; + result_->iterations.emplace_back(); + result_->iterations[iteration_].start_time = + std::chrono::high_resolution_clock::now(); + } + + private: + QueryTestResult* result_; + std::mutex mutex_; + std::size_t iteration_ = 0; +}; + 
+Result>> LoadQueryFromFiles( + const std::string& root_path, const std::string& query_name, + const engine::ConsumerFactory& consumer_factory) { + for (const auto& entry : + std_fs::directory_iterator(std_fs::path(root_path) / "queries")) { + auto entry_path_str = entry.path().filename().string(); + auto first_dot_idx = entry_path_str.find('.'); + if (first_dot_idx != std::string::npos) { + auto stem = entry_path_str.substr(0, first_dot_idx); + if (stem == query_name) { + auto extension = entry_path_str.substr(first_dot_idx + 1); + return LoadQueryFromPath(entry.path(), extension, consumer_factory); + } + } + } + return std::nullopt; +} + +Result>> LoadQueryFromBuiltin( + const std::string& query_name, const engine::ConsumerFactory& consumer_factory) { + const auto& builtin_queries_map = GetBuiltinQueries(); + const auto& query = builtin_queries_map.find(query_name); + if (query == builtin_queries_map.end()) { + return std::nullopt; + } + std::shared_ptr consumer = consumer_factory(); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, query->second(consumer)); + return plan; +} + +Status InitializeArrow(const QueryTestOptions& options) { + if (options.cpu_threads) { + ARROW_RETURN_NOT_OK( + arrow::internal::GetCpuThreadPool()->SetCapacity(*options.cpu_threads)); + } + if (options.io_threads) { + ARROW_RETURN_NOT_OK(arrow::io::SetIOThreadPoolCapacity(*options.io_threads)); + } + return Status::OK(); +} + +} // namespace + +Result> LoadQuery( + const std::string& root_path, const std::string& query_name, + const engine::ConsumerFactory& consumer_factory) { + ARROW_ASSIGN_OR_RAISE(std::optional> maybe_query, + LoadQueryFromFiles(root_path, query_name, consumer_factory)); + if (maybe_query) { + return *maybe_query; + } + + ARROW_ASSIGN_OR_RAISE(maybe_query, LoadQueryFromBuiltin(query_name, consumer_factory)); + if (maybe_query) { + return *maybe_query; + } + + return Status::Invalid("Could not find any query file or builtin query named ", + query_name); +} + +Result RunQueryTest(const QueryTestOptions& options) { + ARROW_ASSIGN_OR_RAISE(auto root_path, GetRootDirectory(options.executable_path)); + ARROW_RETURN_NOT_OK(ValidateOptions(options)); + ARROW_RETURN_NOT_OK(InitializeArrow(options)); + QueryTestResult result; + auto consumer = std::make_shared(&result); + auto consumer_factory = [consumer] { return consumer; }; + for (int i = 0; i < options.num_iterations; i++) { + consumer->Start(i); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, + LoadQuery(root_path, options.query_name, consumer_factory)); + ARROW_RETURN_NOT_OK(plan->StartProducing()); + ARROW_RETURN_NOT_OK(plan->finished().status()); + } + + return result; +} + +Status ReportResult(const QueryTestResult& result) { + std::cout << "Average Duration: " << result.average_duration_seconds() + << "s (+/- " << result.stderr_duration_seconds() << "s)" << std::endl; + std::cout << "Average Output Rows/S: " << result.average_rps() << "rps" << std::endl; + std::cout << "Average Output Bytes/S: " << result.average_bps() << "bps" << std::endl; + return Status::OK(); +} + +} // namespace arrow::qtest \ No newline at end of file diff --git a/dev/qtester/test_runner.h b/dev/qtester/test_runner.h new file mode 100644 index 00000000000..74685cd471e --- /dev/null +++ b/dev/qtester/test_runner.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace arrow { +namespace qtest { + +struct QueryTestOptions { + /// Name of the query to run, will look for a query input file in the queries folder + std::string 
query_name; + /// Number of CPU threads to initialize Arrow with. By default Arrow will base this + /// on std::thread::hardware_concurrency + std::optional cpu_threads; + /// Number of I/O threads to initialize Arrow with. By default Arrow will use 8 + std::optional io_threads; + /// Number of iterations of the query to run, defaults to a single run + int num_iterations = 1; + /// If true, validate the query results, if possible + bool validate = false; + /// Path to the query_tester executable, used to locate queries & datasets + std::string executable_path; +}; + +struct QueryIterationResult { + uint64_t num_rows_processed = 0; + uint64_t num_bytes_processed = 0; + std::chrono::high_resolution_clock::time_point start_time; + std::chrono::high_resolution_clock::time_point end_time; + + double duration_seconds() const { + return std::chrono::duration(end_time - start_time).count(); + } +}; + +struct QueryTestResult { + std::vector iterations; + + inline uint64_t total_bytes_processed() const { + uint64_t sum = 0; + for (const auto& iteration : iterations) { + sum += iteration.num_bytes_processed; + } + return sum; + } + + inline uint64_t total_rows_processed() const { + uint64_t sum = 0; + for (const auto& iteration : iterations) { + sum += iteration.num_rows_processed; + } + return sum; + } + + inline double total_duration_seconds() const { + double sum = 0; + for (const auto& iteration : iterations) { + sum += iteration.duration_seconds(); + } + return sum; + } + + inline double average_duration_seconds() const { + return total_duration_seconds() / iterations.size(); + } + + inline double stderr_duration_seconds() const { + double avg = average_duration_seconds(); + double err_sum = 0; + for (const auto& iteration : iterations) { + err_sum += std::abs(iteration.duration_seconds() - avg); + } + return err_sum / iterations.size(); + } + + inline double average_bps() const { + return total_bytes_processed() / total_duration_seconds(); + } + + inline double average_rps() const { + return total_rows_processed() / total_duration_seconds(); + } +}; + +/// Load a query and return the execution plan +/// +/// The folder ${root_path}/queries will be searched for a file whose basename (everything +/// before the first '.') matches query_name. The extension will be used to figure +/// out how to convert the file to an execution plan. Supported extensions are: +/// +/// .substrait.pb.json - Loads a Substrait plan using the JSON protobuf format +/// .substrait.pb - Loads a Substrait plan using the binary protobuf format +Result> LoadQuery( + const std::string& root_path, const std::string& query_name, + const engine::ConsumerFactory& consumer_factory); +/// Validate the options (will be run automatically by RunQueryTest) +Status ValidateOptions(const QueryTestOptions& options); +/// Run a query test. +/// +/// This will load the query, download and prepare any necessary data, +/// run the query the specified number of times, and then generate a report +Result RunQueryTest(const QueryTestOptions& options); + +/// Print a query test result +Status ReportResult(const QueryTestResult& result); + +} // namespace qtest +} // namespace arrow From 6848a8b60eaf661813e7a78d1f9ed58e646975de Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 8 Mar 2022 15:40:11 -1000 Subject: [PATCH 09/11] Added an empty datasets directory. It will be a destination for downloaded datasets in the future and is needed for the query tester to recognize the root directory.
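The datasets/ directory matters because the root-directory probe in test_runner.cc only treats a directory as the project root when it contains both a queries/ and a datasets/ subdirectory, so an otherwise empty datasets/ directory has to be committed. A minimal restatement of that check, for illustration only (the helper name LooksLikeQueryTesterRoot is not taken from the patch):

#include <filesystem>

// A directory qualifies as the query-tester root only when both expected
// subdirectories are present; datasets/ may be empty but must exist.
bool LooksLikeQueryTesterRoot(const std::filesystem::path& dir) {
  return std::filesystem::is_directory(dir / "queries") &&
         std::filesystem::is_directory(dir / "datasets");
}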
--- dev/qtester/datasets/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 dev/qtester/datasets/.gitignore diff --git a/dev/qtester/datasets/.gitignore b/dev/qtester/datasets/.gitignore new file mode 100644 index 00000000000..5e7d2734cfc --- /dev/null +++ b/dev/qtester/datasets/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore From 239b20fd9d9db8055912873147c406e2b072468e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 8 Mar 2022 17:38:04 -1000 Subject: [PATCH 10/11] ARROW-15877: Moved the standalone query-tester executable into the cpp directory --- cpp/CMakeLists.txt | 8 ++ cpp/cmake_modules/DefineOptions.cmake | 2 + .../tools/query-tester}/.gitignore | 0 .../tools/query-tester}/CMakeLists.txt | 28 +------ .../tools/query-tester}/builtin_queries.cc | 6 +- .../tools/query-tester}/builtin_queries.h | 6 +- .../tools/query-tester}/datasets/.gitignore | 0 .../queries/tpch1.substrait.pb.json | 0 .../tools/query-tester}/query_tester.cc | 17 +++- .../tools/query-tester}/test_runner.cc | 82 ++++++++++++------- .../tools/query-tester}/test_runner.h | 9 +- dev/qtester/.clang-tidy | 22 ----- 12 files changed, 96 insertions(+), 84 deletions(-) rename {dev/qtester => cpp/tools/query-tester}/.gitignore (100%) rename {dev/qtester => cpp/tools/query-tester}/CMakeLists.txt (65%) rename {dev/qtester => cpp/tools/query-tester}/builtin_queries.cc (98%) rename {dev/qtester => cpp/tools/query-tester}/builtin_queries.h (82%) rename {dev/qtester => cpp/tools/query-tester}/datasets/.gitignore (100%) rename {dev/qtester => cpp/tools/query-tester}/queries/tpch1.substrait.pb.json (100%) rename {dev/qtester => cpp/tools/query-tester}/query_tester.cc (79%) rename {dev/qtester => cpp/tools/query-tester}/test_runner.cc (72%) rename {dev/qtester => cpp/tools/query-tester}/test_runner.h (96%) delete mode 100644 dev/qtester/.clang-tidy diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c350787bfe9..5c5bf96fa52 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -330,6 +330,10 @@ if(ARROW_BUILD_BENCHMARKS set(ARROW_TESTING ON) endif() +if(ARROW_BUILD_QUERY_TESTER) + set(ARROW_ENGINE ON) +endif() + if(ARROW_GANDIVA) set(ARROW_WITH_RE2 ON) endif() @@ -967,6 +971,10 @@ if(ARROW_SKYHOOK) add_subdirectory(src/skyhook) endif() +if(ARROW_BUILD_QUERY_TESTER) + add_subdirectory(tools/query-tester) +endif() + if(ARROW_BUILD_EXAMPLES) add_custom_target(runexample ctest -L example) add_subdirectory(examples/arrow) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 05fc14bbc72..bf3d778dde0 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -163,6 +163,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_BUILD_BENCHMARKS "Build the Arrow micro benchmarks" OFF) + define_option(ARROW_BUILD_QUERY_TESTER "Build the Arrow engine query testing tool" OFF) + # Reference benchmarks are used to compare to naive implementation, or # discover various hardware limits. 
define_option(ARROW_BUILD_BENCHMARKS_REFERENCE diff --git a/dev/qtester/.gitignore b/cpp/tools/query-tester/.gitignore similarity index 100% rename from dev/qtester/.gitignore rename to cpp/tools/query-tester/.gitignore diff --git a/dev/qtester/CMakeLists.txt b/cpp/tools/query-tester/CMakeLists.txt similarity index 65% rename from dev/qtester/CMakeLists.txt rename to cpp/tools/query-tester/CMakeLists.txt index b35260f4134..93a2abb51bd 100644 --- a/dev/qtester/CMakeLists.txt +++ b/cpp/tools/query-tester/CMakeLists.txt @@ -1,16 +1,5 @@ -cmake_minimum_required(VERSION 3.19) -project(arrow-query-tester) - -set(CMAKE_CXX_STANDARD 17) -if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++") -endif() -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - include(ExternalProject) -# Add Arrow -find_package(Arrow REQUIRED COMPONENTS dataset parquet engine) # Argparse is a modern library for interpreting CLI args set(ARGPARSE_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/argparse_ep-install") set(ARGPARSE_CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${ARGPARSE_PREFIX}") @@ -21,10 +10,6 @@ externalproject_add(argparse URL https://github.com/p-ranav/argparse/archive/refs/tags/v2.2.tar.gz URL_HASH "SHA256=f0fc6ab7e70ac24856c160f44ebb0dd79dc1f7f4a614ee2810d42bb73799872b") -if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(CMAKE_CXX_CLANG_TIDY "clang-tidy-12") -endif() - function(ADD_PROGRAM TARGET) set(options) set(one_value_args) @@ -39,21 +24,16 @@ function(ADD_PROGRAM TARGET) ${TARGET}.cc ${ARG_EXTRA_SOURCES} ) + add_dependencies(${TARGET} arrow) add_dependencies(${TARGET} argparse) target_include_directories(${TARGET} SYSTEM PRIVATE "${ARGPARSE_INCLUDE_DIR}") target_link_libraries( ${TARGET} arrow_shared - arrow_dataset - arrow_engine - parquet + arrow_engine_shared + parquet_shared ) - if (MSVC) - target_compile_options(${TARGET} PRIVATE /W4 /WX) - else () - target_compile_options(${TARGET} PRIVATE -Wall -Wextra -Wpedantic -Werror) - endif () - + set_property(TARGET ${TARGET} PROPERTY CXX_STANDARD 17) endfunction() add_program(query_tester EXTRA_SOURCES builtin_queries.cc test_runner.cc) diff --git a/dev/qtester/builtin_queries.cc b/cpp/tools/query-tester/builtin_queries.cc similarity index 98% rename from dev/qtester/builtin_queries.cc rename to cpp/tools/query-tester/builtin_queries.cc index d4a6da5f404..0c11e29d5c5 100644 --- a/dev/qtester/builtin_queries.cc +++ b/cpp/tools/query-tester/builtin_queries.cc @@ -6,7 +6,8 @@ namespace cp = arrow::compute; -namespace arrow::qtest { +namespace arrow { +namespace qtest { namespace { @@ -98,4 +99,5 @@ const std::unordered_map& GetBuiltinQueries() { return builtin_queries_map; } -} // namespace arrow::qtest \ No newline at end of file +} // namespace qtest +} // namespace arrow diff --git a/dev/qtester/builtin_queries.h b/cpp/tools/query-tester/builtin_queries.h similarity index 82% rename from dev/qtester/builtin_queries.h rename to cpp/tools/query-tester/builtin_queries.h index b84e8c98f0e..450644a9575 100644 --- a/dev/qtester/builtin_queries.h +++ b/cpp/tools/query-tester/builtin_queries.h @@ -7,11 +7,13 @@ #include #include -namespace arrow::qtest { +namespace arrow { +namespace qtest { using QueryPlanFactory = std::function>( std::shared_ptr)>; const std::unordered_map& GetBuiltinQueries(); -} // namespace arrow::qtest \ No newline at end of file +} // namespace qtest +} // namespace arrow diff --git a/dev/qtester/datasets/.gitignore b/cpp/tools/query-tester/datasets/.gitignore similarity index 100% rename from 
dev/qtester/datasets/.gitignore rename to cpp/tools/query-tester/datasets/.gitignore diff --git a/dev/qtester/queries/tpch1.substrait.pb.json b/cpp/tools/query-tester/queries/tpch1.substrait.pb.json similarity index 100% rename from dev/qtester/queries/tpch1.substrait.pb.json rename to cpp/tools/query-tester/queries/tpch1.substrait.pb.json diff --git a/dev/qtester/query_tester.cc b/cpp/tools/query-tester/query_tester.cc similarity index 79% rename from dev/qtester/query_tester.cc rename to cpp/tools/query-tester/query_tester.cc index f9d924532a3..914ef0799c1 100644 --- a/dev/qtester/query_tester.cc +++ b/cpp/tools/query-tester/query_tester.cc @@ -1,7 +1,18 @@ #include +#include +#include + #include "test_runner.h" +template +arrow::util::optional ToArrow(std::optional std_opt) { + if (std_opt) { + return *std_opt; + } + return arrow::util::nullopt; +} + int main(int argc, char* argv[]) { argparse::ArgumentParser program("query_tester"); @@ -28,11 +39,11 @@ int main(int argc, char* argv[]) { arrow::qtest::QueryTestOptions options; options.query_name = program.get("query"); - options.cpu_threads = program.present("--cpu-threads"); - options.io_threads = program.present("--io-threads"); + options.cpu_threads = ToArrow(program.present("--cpu-threads")); + options.io_threads = ToArrow(program.present("--io-threads")); options.validate = program.get("--validate"); options.num_iterations = program.get("--num-iterations"); - options.executable_path = argv[0]; + options.executable_path = std::filesystem::absolute(argv[0]); arrow::Result result = arrow::qtest::RunQueryTest(options); diff --git a/dev/qtester/test_runner.cc b/cpp/tools/query-tester/test_runner.cc similarity index 72% rename from dev/qtester/test_runner.cc rename to cpp/tools/query-tester/test_runner.cc index 72f3eaf1a23..c0475dce9a0 100644 --- a/dev/qtester/test_runner.cc +++ b/cpp/tools/query-tester/test_runner.cc @@ -2,15 +2,16 @@ #include "builtin_queries.h" #include +#include #include #include #include -namespace std_fs = std::filesystem; namespace cp = arrow::compute; -namespace arrow::qtest { +namespace arrow { +namespace qtest { Status ValidateOptions(const QueryTestOptions& options) { if (options.cpu_threads && *options.cpu_threads <= 0) { @@ -29,20 +30,38 @@ Status ValidateOptions(const QueryTestOptions& options) { } namespace { + +fs::LocalFileSystem* local_fs() { + static std::unique_ptr local_fs = + std::unique_ptr(new fs::LocalFileSystem()); + return local_fs.get(); +} + +bool IsDirectory(const std::string& path) { + Result maybe_file_info = local_fs()->GetFileInfo(path); + if (!maybe_file_info.ok()) { + return false; + } + return maybe_file_info->IsDirectory(); +} + Result DoGetRootDirectory(const std::string& executable_path) { - std_fs::path path = std_fs::absolute(std_fs::path(executable_path)); + std::string path = executable_path; while (true) { - if (std_fs::is_directory(path / "queries") && - std_fs::is_directory(path / "datasets")) { - return path; + std::string potential_root = fs::internal::JoinAbstractPath( + std::vector{path, "tools", "query-tester"}); + if (IsDirectory(fs::internal::JoinAbstractPath( + std::vector{potential_root, "queries"}))) { + return potential_root; } - if (path.has_parent_path() && path != path.parent_path()) { - path = path.parent_path(); - } else { + std::pair parent_info = + fs::internal::GetAbstractPathParent(path); + if (parent_info.first.empty()) { return Status::Invalid( - "Could not locate the root directory. 
Did you perhaps move or copy the " - "query_tester executable outside of the project directory?"); + "Could not locate the tools/query-tester directory. Did you perhaps move or " + "copy the query_tester executable outside of the project directory?"); } + path = parent_info.first; } } @@ -51,7 +70,7 @@ Result GetRootDirectory(const std::string& executable) { return cached_root_directory; } -Result> PathToBuffer(const std_fs::path& path) { +Result> PathToBuffer(const std::string& path) { fs::LocalFileSystem local_fs; ARROW_ASSIGN_OR_RAISE(fs::FileInfo file_info, local_fs.GetFileInfo(path)); ARROW_ASSIGN_OR_RAISE(std::shared_ptr in_stream, @@ -69,7 +88,7 @@ Result> DeclsToPlan( } Result> LoadQueryFromSubstraitJson( - const std_fs::path& path, const engine::ConsumerFactory& consumer_factory) { + const std::string& path, const engine::ConsumerFactory& consumer_factory) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr json_bytes, PathToBuffer(path)); ARROW_ASSIGN_OR_RAISE( std::shared_ptr plan_bytes, @@ -80,7 +99,7 @@ Result> LoadQueryFromSubstraitJson( } Result> LoadQueryFromSubstraitBinary( - const std_fs::path& path, const engine::ConsumerFactory& consumer_factory) { + const std::string& path, const engine::ConsumerFactory& consumer_factory) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan_bytes, PathToBuffer(path)); ARROW_ASSIGN_OR_RAISE(std::vector decls, engine::DeserializePlan(*plan_bytes, consumer_factory)); @@ -88,7 +107,7 @@ Result> LoadQueryFromSubstraitBinary( } Result> LoadQueryFromPath( - const std_fs::path& path, const std::string& extension, + const std::string& path, const std::string& extension, const engine::ConsumerFactory& consumer_factory) { if (extension == "substrait.pb.json") { return LoadQueryFromSubstraitJson(path, consumer_factory); @@ -129,30 +148,36 @@ class QueryResultUpdatingConsumer : public cp::SinkNodeConsumer { std::size_t iteration_ = 0; }; -Result>> LoadQueryFromFiles( +Result>> LoadQueryFromFiles( const std::string& root_path, const std::string& query_name, const engine::ConsumerFactory& consumer_factory) { - for (const auto& entry : - std_fs::directory_iterator(std_fs::path(root_path) / "queries")) { - auto entry_path_str = entry.path().filename().string(); - auto first_dot_idx = entry_path_str.find('.'); + std::string queries_path = + fs::internal::JoinAbstractPath(std::vector{root_path, "queries"}); + fs::FileSelector selector; + selector.base_dir = queries_path; + selector.recursive = false; + ARROW_ASSIGN_OR_RAISE(std::vector query_files, + local_fs()->GetFileInfo(selector)); + for (const auto& query_file : query_files) { + auto query_file_str = query_file.base_name(); + auto first_dot_idx = query_file_str.find('.'); if (first_dot_idx != std::string::npos) { - auto stem = entry_path_str.substr(0, first_dot_idx); + auto stem = query_file_str.substr(0, first_dot_idx); if (stem == query_name) { - auto extension = entry_path_str.substr(first_dot_idx + 1); - return LoadQueryFromPath(entry.path(), extension, consumer_factory); + auto extension = query_file_str.substr(first_dot_idx + 1); + return LoadQueryFromPath(query_file.path(), extension, consumer_factory); } } } - return std::nullopt; + return util::nullopt; } -Result>> LoadQueryFromBuiltin( +Result>> LoadQueryFromBuiltin( const std::string& query_name, const engine::ConsumerFactory& consumer_factory) { const auto& builtin_queries_map = GetBuiltinQueries(); const auto& query = builtin_queries_map.find(query_name); if (query == builtin_queries_map.end()) { - return std::nullopt; + return util::nullopt; } 
std::shared_ptr consumer = consumer_factory(); ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, query->second(consumer)); @@ -175,7 +200,7 @@ Status InitializeArrow(const QueryTestOptions& options) { Result> LoadQuery( const std::string& root_path, const std::string& query_name, const engine::ConsumerFactory& consumer_factory) { - ARROW_ASSIGN_OR_RAISE(std::optional> maybe_query, + ARROW_ASSIGN_OR_RAISE(util::optional> maybe_query, LoadQueryFromFiles(root_path, query_name, consumer_factory)); if (maybe_query) { return *maybe_query; @@ -216,4 +241,5 @@ Status ReportResult(const QueryTestResult& result) { return Status::OK(); } -} // namespace arrow::qtest \ No newline at end of file +} // namespace qtest +} // namespace arrow diff --git a/dev/qtester/test_runner.h b/cpp/tools/query-tester/test_runner.h similarity index 96% rename from dev/qtester/test_runner.h rename to cpp/tools/query-tester/test_runner.h index 74685cd471e..d90b5f75b5f 100644 --- a/dev/qtester/test_runner.h +++ b/cpp/tools/query-tester/test_runner.h @@ -5,8 +5,11 @@ #include #include -#include +#include +#include +#include #include +#include namespace arrow { namespace qtest { @@ -16,9 +19,9 @@ struct QueryTestOptions { std::string query_name; /// Number of CPU threads to initialize Arrow with. By default Arrow will base this /// on std::thread::hardware_concurrency - std::optional cpu_threads; + util::optional cpu_threads; /// Number of I/O threads to initialize Arrow with. By default Arrow will use 8 - std::optional io_threads; + util::optional io_threads; /// Number of iterations of the query to run, defaults to a single run int num_iterations = 1; /// If true, validate the query results, if possible diff --git a/dev/qtester/.clang-tidy b/dev/qtester/.clang-tidy deleted file mode 100644 index bcdacd174be..00000000000 --- a/dev/qtester/.clang-tidy +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
---- -Checks: '*,-llvmlibc*,-cert-err58-cpp,-modernize-use-trailing-return-type,-fuchsia-*,-cppcoreguidelines-*, - -readability-magic-numbers,-clang-analyzer-cplusplus.NewDelete,-clang-analyzer-cplusplus.NewDeleteLeaks, - -readability-function-cognitive-complexity, -hicpp-special-member-functions, -bugprone-exception-escape' -WarningsAsErrors: '*' -FormatStyle: 'file' From a3b4362b0b4302d46e966fe7d115f45ff363a41d Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 8 Mar 2022 17:58:43 -1000 Subject: [PATCH 11/11] ARROW-15877: ExecContext was not using the thread pool --- cpp/tools/query-tester/builtin_queries.cc | 5 +++-- cpp/tools/query-tester/builtin_queries.h | 2 +- cpp/tools/query-tester/test_runner.cc | 17 +++++++++++------ cpp/tools/query-tester/test_runner.h | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cpp/tools/query-tester/builtin_queries.cc b/cpp/tools/query-tester/builtin_queries.cc index 0c11e29d5c5..7ec799a357a 100644 --- a/cpp/tools/query-tester/builtin_queries.cc +++ b/cpp/tools/query-tester/builtin_queries.cc @@ -12,8 +12,9 @@ namespace qtest { namespace { Result> Tpch1( - std::shared_ptr consumer) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, cp::ExecPlan::Make()); + std::shared_ptr consumer, cp::ExecContext* exec_context) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, + cp::ExecPlan::Make(exec_context)); ARROW_ASSIGN_OR_RAISE(cp::TpchGen gen, cp::TpchGen::Make(plan.get(), 1)); ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/tools/query-tester/builtin_queries.h b/cpp/tools/query-tester/builtin_queries.h index 450644a9575..1584d00d5a6 100644 --- a/cpp/tools/query-tester/builtin_queries.h +++ b/cpp/tools/query-tester/builtin_queries.h @@ -11,7 +11,7 @@ namespace arrow { namespace qtest { using QueryPlanFactory = std::function>( - std::shared_ptr)>; + std::shared_ptr, compute::ExecContext*)>; const std::unordered_map& GetBuiltinQueries(); diff --git a/cpp/tools/query-tester/test_runner.cc b/cpp/tools/query-tester/test_runner.cc index c0475dce9a0..ad64e32fa46 100644 --- a/cpp/tools/query-tester/test_runner.cc +++ b/cpp/tools/query-tester/test_runner.cc @@ -173,14 +173,16 @@ Result>> LoadQueryFromFiles( } Result>> LoadQueryFromBuiltin( - const std::string& query_name, const engine::ConsumerFactory& consumer_factory) { + const std::string& query_name, const engine::ConsumerFactory& consumer_factory, + cp::ExecContext* exec_context) { const auto& builtin_queries_map = GetBuiltinQueries(); const auto& query = builtin_queries_map.find(query_name); if (query == builtin_queries_map.end()) { return util::nullopt; } std::shared_ptr consumer = consumer_factory(); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, query->second(consumer)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, + query->second(consumer, exec_context)); return plan; } @@ -199,14 +201,15 @@ Status InitializeArrow(const QueryTestOptions& options) { Result> LoadQuery( const std::string& root_path, const std::string& query_name, - const engine::ConsumerFactory& consumer_factory) { + const engine::ConsumerFactory& consumer_factory, cp::ExecContext* exec_context) { ARROW_ASSIGN_OR_RAISE(util::optional> maybe_query, LoadQueryFromFiles(root_path, query_name, consumer_factory)); if (maybe_query) { return *maybe_query; } - ARROW_ASSIGN_OR_RAISE(maybe_query, LoadQueryFromBuiltin(query_name, consumer_factory)); + ARROW_ASSIGN_OR_RAISE(maybe_query, + LoadQueryFromBuiltin(query_name, consumer_factory, exec_context)); if (maybe_query) { return *maybe_query; } @@ -219,13 +222,15 @@ Result RunQueryTest(const 
QueryTestOptions& options) { ARROW_ASSIGN_OR_RAISE(auto root_path, GetRootDirectory(options.executable_path)); ARROW_RETURN_NOT_OK(ValidateOptions(options)); ARROW_RETURN_NOT_OK(InitializeArrow(options)); + cp::ExecContext exec_context(default_memory_pool(), internal::GetCpuThreadPool()); QueryTestResult result; auto consumer = std::make_shared(&result); auto consumer_factory = [consumer] { return consumer; }; for (int i = 0; i < options.num_iterations; i++) { consumer->Start(i); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - LoadQuery(root_path, options.query_name, consumer_factory)); + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr plan, + LoadQuery(root_path, options.query_name, consumer_factory, &exec_context)); ARROW_RETURN_NOT_OK(plan->StartProducing()); ARROW_RETURN_NOT_OK(plan->finished().status()); } diff --git a/cpp/tools/query-tester/test_runner.h b/cpp/tools/query-tester/test_runner.h index d90b5f75b5f..8eb4789986f 100644 --- a/cpp/tools/query-tester/test_runner.h +++ b/cpp/tools/query-tester/test_runner.h @@ -100,7 +100,7 @@ struct QueryTestResult { /// .substrait.pb - Loads a Substrait plan using the binary protobuf format Result> LoadQuery( const std::string& root_path, const std::string& query_name, - const engine::ConsumerFactory& consumer_factory); + const engine::ConsumerFactory& consumer_factory, compute::ExecContext* exec_context); /// Validate the options (will be run automatically by RunQueryTest) Status ValidateOptions(const QueryTestOptions& options); /// Run a query test.