From ef9337b95f3267605332ded73f926d38eb8f1a5f Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 22 Feb 2022 21:00:05 -0800 Subject: [PATCH 01/34] Add TPC-H Generator --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/compute/exec/CMakeLists.txt | 2 + cpp/src/arrow/compute/exec/tpch_benchmark.cc | 175 + cpp/src/arrow/compute/exec/tpch_node.cc | 3704 ++++++++++++++++++ cpp/src/arrow/compute/exec/tpch_node.h | 69 + cpp/src/arrow/compute/kernels/vector_sort.cc | 5 +- 6 files changed, 3954 insertions(+), 2 deletions(-) create mode 100644 cpp/src/arrow/compute/exec/tpch_benchmark.cc create mode 100644 cpp/src/arrow/compute/exec/tpch_node.cc create mode 100644 cpp/src/arrow/compute/exec/tpch_node.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fb5d603d6e8..57483266757 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -399,6 +399,7 @@ if(ARROW_COMPUTE) compute/exec/sink_node.cc compute/exec/source_node.cc compute/exec/task_util.cc + compute/exec/tpch_node.cc compute/exec/union_node.cc compute/exec/util.cc compute/function.cc diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index 53ba77b6088..bca6ec2c6e2 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -37,6 +37,8 @@ add_arrow_compute_test(util_test PREFIX "arrow-compute") add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(tpch_benchmark PREFIX "arrow-compute") + if(ARROW_BUILD_OPENMP_BENCHMARKS) find_package(OpenMP REQUIRED) add_arrow_benchmark(hash_join_benchmark diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc new file mode 100644 index 00000000000..963782333cf --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
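+
+// Exercises the TPC-H generator by building query 1 (the pricing summary
+// report): lineitem rows from the TpchGen node are filtered on L_SHIPDATE,
+// projected into the Q1 expressions, hash-aggregated by
+// (L_RETURNFLAG, L_LINESTATUS), and sorted on the same two keys.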
+ +#include "benchmark/benchmark.h" + +#include "arrow/testing/future_util.h" +#include "arrow/compute/exec/test_util.h" +#include "arrow/compute/exec/tpch_node.h" +#include "arrow/util/make_unique.h" +#include "arrow/compute/cast.h" + +namespace arrow +{ +namespace compute +{ + +std::shared_ptr Plan_Q1(AsyncGenerator> &sink_gen, int scale_factor) +{ + ExecContext *ctx = default_exec_context(); + *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(ctx); + TpchGen gen = *TpchGen::Make(plan.get(), scale_factor); + + ExecNode *lineitem = *gen.Lineitem( + { + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_TAX", + "L_DISCOUNT", + "L_SHIPDATE", + "L_RETURNFLAG", + "L_LINESTATUS" + }); + + std::shared_ptr sept_2_1998 = std::make_shared(10471); // September 2, 1998 is 10471 days after January 1, 1970 + Expression filter = less_equal(field_ref("L_SHIPDATE"), literal(std::move(sept_2_1998))); + FilterNodeOptions filter_opts(filter); + + Expression l_returnflag = field_ref("L_RETURNFLAG"); + Expression l_linestatus = field_ref("L_LINESTATUS"); + Expression quantity = field_ref("L_QUANTITY"); + Expression base_price = field_ref("L_EXTENDEDPRICE"); + + std::shared_ptr decimal_1 = std::make_shared(Decimal128{0, 100}, decimal(12, 2)); + Expression discount_multiplier = call("subtract", { literal(decimal_1), field_ref("L_DISCOUNT") }); + Expression tax_multiplier = call("add", { literal(decimal_1), field_ref("L_TAX") }); + Expression disc_price = call("multiply", { field_ref("L_EXTENDEDPRICE"), discount_multiplier }); + Expression charge = call("multiply", + { + call("cast", + { + call("multiply", { field_ref("L_EXTENDEDPRICE"), discount_multiplier }) + }, compute::CastOptions::Unsafe(decimal(12, 2))), + tax_multiplier + }); + Expression discount = field_ref("L_DISCOUNT"); + + std::vector projection_list = + { + l_returnflag, + l_linestatus, + quantity, + base_price, + disc_price, + charge, + quantity, + base_price, + discount + }; + std::vector project_names = + { + "l_returnflag", + "l_linestatus", + "sum_qty", + "sum_base_price", + "sum_disc_price", + "sum_charge", + "avg_qty", + "avg_price", + "avg_disc" + }; + ProjectNodeOptions project_opts(std::move(projection_list)); + + ScalarAggregateOptions sum_opts = ScalarAggregateOptions::Defaults(); + CountOptions count_opts(CountOptions::CountMode::ALL); + std::vector aggs = + { + { "hash_sum", &sum_opts }, + { "hash_sum", &sum_opts }, + { "hash_sum", &sum_opts }, + { "hash_sum", &sum_opts }, + { "hash_mean", &sum_opts }, + { "hash_mean", &sum_opts }, + { "hash_mean", &sum_opts }, + { "hash_count", &count_opts } + }; + + std::vector cols = + { + 2, 3, 4, 5, 6, 7, 8, 2 + }; + + std::vector names = + { + "sum_qty", + "sum_base_price", + "sum_disc_price", + "sum_charge", + "avg_qty", + "avg_price", + "avg_disc", + "count_order" + }; + + std::vector keys = { "L_RETURNFLAG", "L_LINESTATUS" }; + AggregateNodeOptions agg_opts(aggs, cols, names, keys); + + SortKey l_returnflag_key("L_RETURNFLAG"); + SortKey l_linestatus_key("L_LINESTATUS"); + SortOptions sort_opts({ l_returnflag_key, l_linestatus_key }); + OrderBySinkNodeOptions order_by_opts(sort_opts, &sink_gen); + + Declaration filter_decl("filter", { Declaration::Input(lineitem) }, filter_opts); + Declaration project_decl("project", project_opts); + Declaration aggregate_decl("aggregate", agg_opts); + Declaration orderby_decl("order_by_sink", order_by_opts); + + Declaration q1 = Declaration::Sequence( + { + filter_decl, + project_decl, + 
aggregate_decl, + orderby_decl + }); + std::ignore = *q1.AddToPlan(plan.get()); + return plan; +} + +static void BM_Tpch_Q1(benchmark::State &st) +{ + for(auto _ : st) + { + st.PauseTiming(); + AsyncGenerator> sink_gen; + std::shared_ptr plan = Plan_Q1(sink_gen, st.range(0)); + st.ResumeTiming(); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); +#ifndef NDEBUG + st.PauseTiming(); + for(auto &batch : res) + std::cout << batch.ToString() << std::endl; + st.ResumeTiming(); +#endif + } +} + +//BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 1000)->ArgNames({ "SF" }); +BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 10)->ArgNames({ "SF" }); +} +} diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc new file mode 100644 index 00000000000..842bf828574 --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -0,0 +1,3704 @@ +#include "arrow/compute/exec/tpch_node.h" +#include "arrow/util/make_unique.h" +#include "arrow/util/future.h" +#include "arrow/util/unreachable.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace arrow +{ + using internal::checked_cast; + + namespace compute + { + class TpchText + { + public: + Status Init(); + Result GenerateComments( + size_t num_comments, + size_t min_length, + size_t max_length, + random::pcg32_fast &rng); + + private: + void GenerateWord(size_t &offset, const char **words, size_t num_choices); + void GenerateNoun(size_t &offset); + void GenerateVerb(size_t &offset); + void GenerateAdjective(size_t &offset); + void GenerateAdverb(size_t &offset); + void GeneratePreposition(size_t &offset); + void GenerateAuxiliary(size_t &offset); + void GenerateTerminator(size_t &offset); + + void GenerateNounPhrase(size_t &offset); + void GenerateVerbPhrase(size_t &offset); + void GeneratePrepositionalPhrase(size_t &offset); + + void GenerateSentence(size_t &offset); + + std::unique_ptr text_; + random::pcg32_fast rng_; + static constexpr size_t kTextBytes = 300 * 1024 * 1024; // 300 MB + }; + + class TpchTableGenerator + { + public: + using OutputBatchCallback = std::function; + using FinishedCallback = std::function; + using GenerateFn = std::function; + using ScheduleCallback = std::function; + using AbortCallback = std::function; + + virtual Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) = 0; + + virtual Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) = 0; + + void Abort(AbortCallback abort_callback) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + abort_callback(); + } + } + + virtual std::shared_ptr schema() const = 0; + + virtual ~TpchTableGenerator() = default; + + protected: + std::atomic done_ = { false }; + std::atomic batches_generated_ = { 0 }; + }; + + int GetNumDigits(int64_t x) + { + // This if statement chain is for MAXIMUM SPEED + /* + ., + . _,'f----.._ + |\ ,-'"/ | ,' + |,_ ,--. / + /,-. ,'`. (_ + f o| o|__ "`-. + ,-._.,--'_ `. 
_.,-` + `"' ___.,'` j,-' + `-.__.,--' + */ + // Source: https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c + ARROW_DCHECK(x >= 0); + if(x < 10ll) return 1; + if(x < 100ll) return 2; + if(x < 1000ll) return 3; + if(x < 10000ll) return 4; + if(x < 100000ll) return 5; + if(x < 1000000ll) return 6; + if(x < 10000000ll) return 7; + if(x < 100000000ll) return 8; + if(x < 1000000000ll) return 9; + if(x < 10000000000ll) return 10; + if(x < 100000000000ll) return 11; + if(x < 1000000000000ll) return 12; + if(x < 10000000000000ll) return 13; + if(x < 100000000000000ll) return 14; + if(x < 1000000000000000ll) return 15; + if(x < 10000000000000000ll) return 16; + if(x < 100000000000000000ll) return 17; + if(x < 1000000000000000000ll) return 18; + return -1; + } + + void AppendNumberPaddedToNineDigits(char *out, int64_t x) + { + // We do all of this to avoid calling snprintf, which does a lot of crazy + // locale stuff. On Windows and MacOS this can get suuuuper slow + int num_digits = GetNumDigits(x); + int num_padding_zeros = std::max(9 - num_digits, 0); + std::memset(out, '0', static_cast(num_padding_zeros)); + while(x > 0) + { + *(out + num_padding_zeros + num_digits - 1) = ('0' + x % 10); + num_digits -= 1; + x /= 10; + } + } + + Result> SetOutputColumns( + const std::vector &columns, + const std::vector> &types, + const std::unordered_map &name_map, + std::vector &gen_list) + { + gen_list.clear(); + std::vector> fields; + if(columns.empty()) + { + for(auto pair : name_map) + { + int col_idx = pair.second; + fields.push_back(field(pair.first, types[col_idx])); + gen_list.push_back(col_idx); + } + return schema(std::move(fields)); + } + else + { + for(const std::string &col : columns) + { + auto entry = name_map.find(col); + if(entry == name_map.end()) + return Status::Invalid("Not a valid column name"); + int col_idx = static_cast(entry->second); + fields.push_back(field(col, types[col_idx])); + gen_list.push_back(col_idx); + } + return schema(std::move(fields)); + } + } + + static TpchText g_text; + + Status TpchText::Init() + { + ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); + size_t offset = 0; + while(offset < kTextBytes) + GenerateSentence(offset); + return Status::OK(); + } + + Result TpchText::GenerateComments( + size_t num_comments, + size_t min_length, + size_t max_length, + random::pcg32_fast &rng) + { + std::uniform_int_distribution length_dist(min_length, max_length); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); + int32_t *offsets = reinterpret_cast(offset_buffer->mutable_data()); + offsets[0] = 0; + for(size_t i = 1; i <= num_comments; i++) + offsets[i] = offsets[i - 1] + length_dist(rng); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr comment_buffer, AllocateBuffer(offsets[num_comments])); + char *comments = reinterpret_cast(comment_buffer->mutable_data()); + for(size_t i = 0; i < num_comments; i++) + { + size_t length = offsets[i + 1] - offsets[i]; + std::uniform_int_distribution offset_dist(0, kTextBytes - length); + size_t offset_in_text = offset_dist(rng); + std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); + } + ArrayData ad(utf8(), num_comments, { nullptr, std::move(comment_buffer), std::move(offset_buffer) }); + return std::move(ad); + } + + Result RandomVString( + random::pcg32_fast &rng, + int64_t num_rows, + int32_t min_length, + int32_t max_length) + { + std::uniform_int_distribution length_dist(min_length, max_length); + 
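+            // Two-pass construction of a utf8 column: first draw a random length per
+            // row to fill the int32 offsets buffer (the final offset is the total
+            // number of value bytes to allocate), then fill the value buffer with
+            // characters drawn from the 64-symbol alphabet below.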
ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((num_rows + 1) * sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for(int64_t i = 1; i <= num_rows; i++) + offsets[i] = offsets[i - 1] + length_dist(rng); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[num_rows])); + char *str = reinterpret_cast(str_buff->mutable_data()); + + // Spec says to pick random alphanumeric characters from a set of at least + // 64 symbols. Now, let's think critically here: 26 letters in the alphabet, + // so 52 total for upper and lower case, and 10 possible digits gives 62 + // characters... + // dbgen solves this by including a space and a comma as well, so we'll + // copy that. + const char alpha_numerics[65] = + "0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,"; + std::uniform_int_distribution char_dist(0, 63); + for(int32_t i = 0; i < offsets[num_rows]; i++) + str[i] = alpha_numerics[char_dist(rng)]; + + ArrayData ad(utf8(), num_rows, { nullptr, std::move(str_buff), std::move(offset_buff) }); + return std::move(ad); + } + + void AppendNumber(char *&out, int num_digits, int32_t x) + { + out += (num_digits - 1); + while(x > 0) + { + *out-- = x % 10; + x /= 10; + } + x += num_digits; + } + + void GeneratePhoneNumber( + char *out, + random::pcg32_fast &rng, + int32_t country) + { + std::uniform_int_distribution three_digit(100, 999); + std::uniform_int_distribution four_digit(1000, 9999); + + int32_t country_code = country + 10; + int32_t l1 = three_digit(rng); + int32_t l2 = three_digit(rng); + int32_t l3 = four_digit(rng); + AppendNumber(out, 2, country_code); + *out++ = '-'; + AppendNumber(out, 3, l1); + *out++ = '-'; + AppendNumber(out, 3, l2); + *out++ = '-'; + AppendNumber(out, 4, l3); + } + + static constexpr uint32_t STARTDATE = 8035; // January 1, 1992 is 8035 days after January 1, 1970 + static constexpr uint32_t CURRENTDATE = 9298; // June 17, 1995 is 9298 days after January 1, 1970 + static constexpr uint32_t ENDDATE = 10591; // December 12, 1998 is 10591 days after January 1, 1970 + + const char *NameParts[] = + { + "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", "blanched", "blue", + "blush", "brown", "burlywood", "burnished", "chartreuse", "chiffon", "chocolate", "coral", + "cornflower", "cornsilk", "cream", "cyan", "dark", "deep", "dim", "dodger", "drab", "firebrick", + "floral", "forest", "frosted", "gainsboro", "ghost", "goldenrod", "green", "grey", "honeydew", + "hot", "indian", "ivory", "khaki", "lace", "lavender", "lawn", "lemon", "light", "lime", "linen", + "magenta", "maroon", "medium", "metallic", "midnight", "mint", "misty", "moccasin", "navajo", + "navy", "olive", "orange", "orchid", "pale", "papaya", "peach", "peru", "pink", "plum", "powder", + "puff", "purple", "red", "rose", "rosy", "royal", "saddle", "salmon", "sandy", "seashell", "sienna", + "sky", "slate", "smoke", "snow", "spring", "steel", "tan", "thistle", "tomato", "turquoise", "violet", + "wheat", "white", "yellow", + }; + static constexpr size_t kNumNameParts = sizeof(NameParts) / sizeof(NameParts[0]); + + const char *Types_1[] = + { + "STANDARD ", "SMALL ", "MEDIUM ", "LARGE ", "ECONOMY ", "PROMO ", + }; + static constexpr size_t kNumTypes_1 = sizeof(Types_1) / sizeof(Types_1[0]); + + const char *Types_2[] = + { + "ANODIZED ", "BURNISHED ", "PLATED ", "POLISHED ", "BRUSHED ", + }; + static constexpr size_t kNumTypes_2 = sizeof(Types_2) / sizeof(Types_2[0]); + + const char *Types_3[] = + 
{ + "TIN", "NICKEL", "BRASS", "STEEL", "COPPER", + }; + static constexpr size_t kNumTypes_3 = sizeof(Types_3) / sizeof(Types_3[0]); + + const char *Containers_1[] = + { + "SM ", "LG ", "MD ", "JUMBO ", "WRAP ", + }; + static constexpr size_t kNumContainers_1 = sizeof(Containers_1) / sizeof(Containers_1[0]); + + const char *Containers_2[] = + { + "CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM", + }; + static constexpr size_t kNumContainers_2 = sizeof(Containers_2) / sizeof(Containers_2[0]); + + const char *Segments[] = + { + "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD", + }; + static constexpr size_t kNumSegments = sizeof(Segments) / sizeof(Segments[0]); + + const char *Priorities[] = + { + "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW", + }; + static constexpr size_t kNumPriorities = sizeof(Priorities) / sizeof(Priorities[0]); + + const char *Instructions[] = + { + "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN", + }; + static constexpr size_t kNumInstructions = sizeof(Instructions) / sizeof(Instructions[0]); + + const char *Modes[] = + { + "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB", + }; + static constexpr size_t kNumModes = sizeof(Modes) / sizeof(Modes[0]); + + const char *Nouns[] = + { + "foxes ", "ideas ", "theodolites ", "pinto beans ", "instructions ", "dependencies ", "excuses ", + "platelets ", "asymptotes ", "courts ", "dolphins ", "multipliers ", "sautemes ", "warthogs ", "frets ", + "dinos ", "attainments ", "somas ", "Tiresias '", "patterns ", "forges ", "braids ", "hockey players ", "frays ", + "warhorses ", "dugouts ", "notomis ", "epitaphs ", "pearls ", "tithes ", "waters ", "orbits ", "gifts ", "sheaves ", + "depths ", "sentiments ", "decoys ", "realms ", "pains ", "grouches ", "escapades ", + }; + static constexpr size_t kNumNouns = sizeof(Nouns) / sizeof(Nouns[0]); + + const char *Verbs[] = + { + "sleep ", "wake ", "are ", "cajole ", "haggle ", "nag ", "use ", "boost ", "affix ", "detect ", "integrate ", + "maintain ", "nod ", "was ", "lose ", "sublate ", "solve ", "thrash ", "promise ", "engage ", "hinder ", + "print ", "x-ray ", "breach ", "eat ", "grow ", "impress ", "mold ", "poach ", "serve ", "run ", "dazzle ", + "snooze ", "doze ", "unwind ", "kindle ", "play ", "hang ", "believe ", "doubt ", + }; + static constexpr size_t kNumVerbs = sizeof(Verbs) / sizeof(Verbs[0]); + + const char *Adjectives[] = + { + "furious ", "sly ", "careful ", "blithe ", "quick ", "fluffy ", "slow ", "quiet ", "ruthless ", "thin ", + "close ", "dogged ", "daring ", "brave ", "stealthy ", "permanent ", "enticing ", "idle ", "busy ", + "regular ", "final ", "ironic ", "even ", "bold ", "silent ", + }; + static constexpr size_t kNumAdjectives = sizeof(Adjectives) / sizeof(Adjectives[0]); + + const char *Adverbs[] = + { + "sometimes ", "always ", "never ", "furiously ", "slyly ", "carefully ", "blithely ", "quickly ", "fluffily ", + "slowly ", "quietly ", "ruthlessly ", "thinly ", "closely ", "doggedly ", "daringly ", "bravely ", "stealthily ", + "permanently ", "enticingly ", "idly ", "busily ", "regularly ", "finally ", "ironically ", "evenly ", "boldly ", + "silently ", + }; + static constexpr size_t kNumAdverbs = sizeof(Adverbs) / sizeof(Adverbs[0]); + + const char *Prepositions[] = + { + "about ", "above ", "according to ", "across ", "after ", "against ", "along ", "alongside of ", "among ", + "around ", "at ", "atop ", "before ", "behind ", "beneath ", "beside ", "besides ", "between ", "beyond ", + "beyond ", "by 
", "despite ", "during ", "except ", "for ", "from ", "in place of ", "inside ", "instead of ", + "into ", "near ", "of ", "on ", "outside ", "over ", "past ", "since ", "through ", "throughout ", "to ", + "toward ", "under ", "until ", "up ", "upon ", "without ", "with ", "within ", + }; + static constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); + + const char *Auxiliaries[] = + { + "do ", "may ", "might ", "shall ", "will ", "would ", "can ", "could ", "should ", "ought to ", "must ", + "will have to ", "shall have to ", "could have to ", "should have to ", "must have to ", "need to ", "try to ", + }; + static constexpr size_t kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); + + const char *Terminators[] = + { + ".", ";", ":", "?", "!", "--", + }; + static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); + + void TpchText::GenerateWord(size_t &offset, const char **words, size_t num_choices) + { + std::uniform_int_distribution dist(0, num_choices - 1); + const char *word = words[dist(rng_)]; + size_t bytes_left = kTextBytes - offset; + size_t length = std::strlen(word); + size_t bytes_to_copy = std::min(bytes_left, length); + std::memcpy(text_->mutable_data() + offset, word, bytes_to_copy); + offset += bytes_to_copy; + } + + void TpchText::GenerateNoun(size_t &offset) + { + GenerateWord(offset, Nouns, kNumNouns); + } + + void TpchText::GenerateVerb(size_t &offset) + { + GenerateWord(offset, Verbs, kNumVerbs); + } + + void TpchText::GenerateAdjective(size_t &offset) + { + GenerateWord(offset, Adjectives, kNumAdjectives); + } + + void TpchText::GenerateAdverb(size_t &offset) + { + GenerateWord(offset, Adverbs, kNumAdverbs); + } + + void TpchText::GeneratePreposition(size_t &offset) + { + GenerateWord(offset, Prepositions, kNumPrepositions); + } + + void TpchText::GenerateAuxiliary(size_t &offset) + { + GenerateWord(offset, Auxiliaries, kNumAuxiliaries); + } + + void TpchText::GenerateTerminator(size_t &offset) + { + GenerateWord(offset, Terminators, kNumTerminators); + } + + void TpchText::GenerateNounPhrase(size_t &offset) + { + std::uniform_int_distribution dist(0, 3); + const char *comma_space = ", "; + switch(dist(rng_)) + { + case 0: + GenerateNoun(offset); + break; + case 1: + GenerateAdjective(offset); + GenerateNoun(offset); + break; + case 2: + GenerateAdjective(offset); + GenerateWord(offset, &comma_space, 1); + GenerateAdjective(offset); + GenerateNoun(offset); + break; + case 3: + GenerateAdverb(offset); + GenerateAdjective(offset); + GenerateNoun(offset); + break; + default: + Unreachable("Random number should be between 0 and 3 inclusive"); + break; + } + } + + void TpchText::GenerateVerbPhrase(size_t &offset) + { + std::uniform_int_distribution dist(0, 3); + switch(dist(rng_)) + { + case 0: + GenerateVerb(offset); + break; + case 1: + GenerateAuxiliary(offset); + GenerateVerb(offset); + break; + case 2: + GenerateVerb(offset); + GenerateAdverb(offset); + break; + case 3: + GenerateAuxiliary(offset); + GenerateVerb(offset); + GenerateAdverb(offset); + break; + default: + Unreachable("Random number should be between 0 and 3 inclusive"); + break; + } + } + + void TpchText::GeneratePrepositionalPhrase(size_t &offset) + { + const char *the_space = "the "; + GeneratePreposition(offset); + GenerateWord(offset, &the_space, 1); + GenerateNounPhrase(offset); + } + + void TpchText::GenerateSentence(size_t &offset) + { + std::uniform_int_distribution dist(0, 4); + switch(dist(rng_)) + { + case 0: + 
GenerateNounPhrase(offset); + GenerateVerbPhrase(offset); + GenerateTerminator(offset); + break; + case 1: + GenerateNounPhrase(offset); + GenerateVerbPhrase(offset); + GeneratePrepositionalPhrase(offset); + GenerateTerminator(offset); + break; + case 2: + GenerateNounPhrase(offset); + GenerateVerbPhrase(offset); + GenerateNounPhrase(offset); + GenerateTerminator(offset); + break; + case 3: + GenerateNounPhrase(offset); + GenerateVerbPhrase(offset); + GenerateNounPhrase(offset); + GenerateTerminator(offset); + break; + case 4: + GenerateNounPhrase(offset); + GeneratePrepositionalPhrase(offset); + GenerateVerbPhrase(offset); + GenerateNounPhrase(offset); + GenerateTerminator(offset); + break; + case 5: + GenerateNounPhrase(offset); + GeneratePrepositionalPhrase(offset); + GenerateVerbPhrase(offset); + GeneratePrepositionalPhrase(offset); + GenerateTerminator(offset); + break; + default: + Unreachable("Random number should be between 0 and 5 inclusive"); + break; + } + } + + using GenerateColumnFn = std::function; + class PartAndPartSupplierGenerator + { + public: + Status Init( + size_t num_threads, + int64_t batch_size, + int scale_factor) + { + if(!inited_) + { + inited_ = true; + batch_size_ = batch_size; + scale_factor_ = scale_factor; + + thread_local_data_.resize(num_threads); + for(ThreadLocalData &tld : thread_local_data_) + { + // 5 is the maximum number of different strings we need to concatenate + tld.string_indices.resize(5 * batch_size_); + } + part_rows_to_generate_ = scale_factor_ * 200000; + } + return Status::OK(); + } + + Result> SetPartOutputColumns(const std::vector &cols) + { + return SetOutputColumns(cols, part_types_, part_name_map_, part_cols_); + } + + Result> SetPartSuppOutputColumns(const std::vector &cols) + { + return SetOutputColumns(cols, partsupp_types_, partsupp_name_map_, partsupp_cols_); + } + + Result> NextPartBatch(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(part_output_queue_mutex_); + if(!part_output_queue_.empty()) + { + ExecBatch batch = std::move(part_output_queue_.front()); + part_output_queue_.pop(); + return std::move(batch); + } + else if(part_rows_generated_ == part_rows_to_generate_) + { + return util::nullopt; + } + else + { + tld.partkey_start = part_rows_generated_; + tld.part_to_generate = std::min( + batch_size_, + part_rows_to_generate_ - part_rows_generated_); + part_rows_generated_ += tld.part_to_generate; + ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); + } + } + tld.part.clear(); + tld.part.resize(PART::kNumCols); + RETURN_NOT_OK(InitPartsupp(thread_index)); + + for(int col : part_cols_) + RETURN_NOT_OK(part_generators_[col](thread_index)); + for(int col : partsupp_cols_) + RETURN_NOT_OK(partsupp_generators_[col](thread_index)); + + std::vector part_result(part_cols_.size()); + for(size_t i = 0; i < part_cols_.size(); i++) + { + int col_idx = part_cols_[i]; + part_result[i] = tld.part[col_idx]; + } + if(!partsupp_cols_.empty()) + { + std::vector partsupp_results; + for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) + { + std::vector partsupp_result(partsupp_cols_.size()); + for(size_t icol = 0; icol < partsupp_cols_.size(); icol++) + { + int col_idx = partsupp_cols_[icol]; + partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); + partsupp_results.emplace_back(std::move(eb)); + } + { + std::lock_guard guard(partsupp_output_queue_mutex_); + for(ExecBatch 
&eb : partsupp_results) + { + partsupp_output_queue_.emplace(std::move(eb)); + } + } + } + return ExecBatch::Make(std::move(part_result)); + } + + Result> NextPartSuppBatch(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(part_output_queue_mutex_); + if(!part_output_queue_.empty()) + { + ExecBatch batch = std::move(part_output_queue_.front()); + part_output_queue_.pop(); + return std::move(batch); + } + else if(part_rows_generated_ == part_rows_to_generate_) + { + return util::nullopt; + } + else + { + tld.partkey_start = part_rows_generated_; + tld.part_to_generate = std::min( + batch_size_, + part_rows_to_generate_ - part_rows_generated_); + part_rows_generated_ += tld.part_to_generate; + ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); + } + } + tld.part.clear(); + tld.part.resize(PART::kNumCols); + RETURN_NOT_OK(InitPartsupp(thread_index)); + + for(int col : part_cols_) + RETURN_NOT_OK(part_generators_[col](thread_index)); + for(int col : partsupp_cols_) + RETURN_NOT_OK(partsupp_generators_[col](thread_index)); + if(!part_cols_.empty()) + { + std::vector part_result(part_cols_.size()); + for(size_t i = 0; i < part_cols_.size(); i++) + { + int col_idx = part_cols_[i]; + part_result[i] = tld.part[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch part_batch, ExecBatch::Make(std::move(part_result))); + { + std::lock_guard lock(part_output_queue_mutex_); + part_output_queue_.emplace(std::move(part_batch)); + } + } + std::vector partsupp_results; + for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) + { + std::vector partsupp_result(partsupp_cols_.size()); + for(size_t icol = 0; icol < partsupp_cols_.size(); icol++) + { + int col_idx = partsupp_cols_[icol]; + partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); + partsupp_results.emplace_back(std::move(eb)); + } + // Return the first batch, enqueue the rest. 
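+                // Each part expands into kPartSuppRowsPerPart partsupp rows, so one
+                // generation pass can produce several partsupp batches; the extras
+                // are queued and drained by later calls.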
+ { + std::lock_guard lock(partsupp_output_queue_mutex_); + for(size_t i = 1; i < partsupp_results.size(); i++) + partsupp_output_queue_.emplace(std::move(partsupp_results[i])); + } + return std::move(partsupp_results[0]); + } + + private: +#define FOR_EACH_PART_COLUMN(F) \ + F(P_PARTKEY) \ + F(P_NAME) \ + F(P_MFGR) \ + F(P_BRAND) \ + F(P_TYPE) \ + F(P_SIZE) \ + F(P_CONTAINER) \ + F(P_RETAILPRICE) \ + F(P_COMMENT) + +#define FOR_EACH_PARTSUPP_COLUMN(F) \ + F(PS_PARTKEY) \ + F(PS_SUPPKEY) \ + F(PS_AVAILQTY) \ + F(PS_SUPPLYCOST) \ + F(PS_COMMENT) \ + +#define MAKE_ENUM(col) col, + struct PART + { + enum + { + FOR_EACH_PART_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; + struct PARTSUPP + { + enum + { + FOR_EACH_PARTSUPP_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; + +#define MAKE_STRING_MAP(col) \ + { #col, PART::col }, + const std::unordered_map part_name_map_ = + { + FOR_EACH_PART_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_STRING_MAP(col) \ + { #col, PARTSUPP::col }, + const std::unordered_map partsupp_name_map_ = + { + FOR_EACH_PARTSUPP_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector part_generators_ = + { + FOR_EACH_PART_COLUMN(MAKE_FN_ARRAY) + }; + std::vector partsupp_generators_ = + { + FOR_EACH_PARTSUPP_COLUMN(MAKE_FN_ARRAY) + }; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_LINEITEM_COLUMN +#undef FOR_EACH_ORDERS_COLUMN + + const std::vector> part_types_ = + { + int32(), + utf8(), + fixed_size_binary(25), + fixed_size_binary(10), + utf8(), + int32(), + fixed_size_binary(10), + decimal(12, 2), + utf8(), + }; + + const std::vector> partsupp_types_ = + { + int32(), + int32(), + int32(), + decimal(12, 2), + utf8(), + }; + + Status AllocatePartBatch(size_t thread_index, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.part[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.part_to_generate * byte_width)); + ArrayData ad(part_types_[column], tld.part_to_generate, { nullptr, std::move(buff) }); + tld.part[column] = std::move(ad); + return Status::OK(); + } + + Status P_PARTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_PARTKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_PARTKEY)); + int32_t *p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.part_to_generate; i++) + { + p_partkey[i] = (tld.partkey_start + i + 1); + ARROW_DCHECK(1 <= p_partkey[i] && p_partkey[i] <= part_rows_to_generate_); + } + } + return Status::OK(); + } + + Status P_NAME(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_NAME].kind() == Datum::NONE) + { + std::uniform_int_distribution dist(0, static_cast(kNumNameParts - 1)); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + size_t string_length = 0; + for(int ipart = 0; ipart < 5; ipart++) + { + uint8_t name_part_index = dist(tld.rng); + tld.string_indices[irow * 5 + ipart] = name_part_index; + string_length += 
std::strlen(NameParts[name_part_index]); + } + // Add 4 because there is a space between each word (i.e. four spaces) + offsets[irow + 1] = offsets[irow] + string_length + 4; + } + // Add an extra byte for the space after in the very last string. + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate] + 1)); + char *strings = reinterpret_cast(string_buffer->mutable_data()); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + char *row = strings + offsets[irow]; + for(int ipart = 0; ipart < 5; ipart++) + { + uint8_t name_part_index = tld.string_indices[irow * 5 + ipart]; + const char *part = NameParts[name_part_index]; + size_t length = std::strlen(part); + std::memcpy(row, part, length); + row += length; + *row++ = ' '; + } + } + ArrayData ad(part_types_[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(string_buffer), std::move(offset_buff) }); + Datum datum(ad); + tld.part[PART::P_NAME] = std::move(datum); + } + return Status::OK(); + } + + Status P_MFGR(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_MFGR].kind() == Datum::NONE) + { + std::uniform_int_distribution dist(1, 5); + const char *manufacturer = "Manufacturer#"; + const size_t manufacturer_length = std::strlen(manufacturer); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_MFGR)); + char *p_mfgr = reinterpret_cast(tld.part[PART::P_MFGR].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_MFGR]); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + std::strncpy(p_mfgr + byte_width * irow, manufacturer, byte_width); + char mfgr_id = '0' + dist(tld.rng); + *(p_mfgr + byte_width * irow + manufacturer_length) = mfgr_id; + } + } + return Status::OK(); + } + + Status P_BRAND(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_MFGR].kind() == Datum::NONE) + { + RETURN_NOT_OK(P_MFGR(thread_index)); + std::uniform_int_distribution dist(1, 5); + const char *brand = "Brand#"; + const size_t brand_length = std::strlen(brand); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_BRAND)); + const char *p_mfgr = reinterpret_cast( + tld.part[PART::P_MFGR].array()->buffers[1]->data()); + char *p_brand = reinterpret_cast(tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_BRAND]); + int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_MFGR]); + const size_t mfgr_id_offset = std::strlen("Manufacturer#"); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + char mfgr_id = *(p_mfgr + irow * mfgr_byte_width + mfgr_id_offset); + char brand_id = '0' + dist(tld.rng); + std::strncpy(p_brand + byte_width * irow, brand, byte_width); + *(p_brand + byte_width * irow + brand_length) = mfgr_id; + *(p_brand + byte_width * irow + brand_length + 1) = brand_id; + } + } + return Status::OK(); + } + + Status P_TYPE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_TYPE].kind() == Datum::NONE) + { + using D = std::uniform_int_distribution; + D dists[] = + { + D{ 0, static_cast(kNumTypes_1 - 1) }, + D{ 0, static_cast(kNumTypes_2 - 1) }, + D{ 0, static_cast(kNumTypes_3 - 1) }, + }; + + const char **types[] = { Types_1, Types_2, Types_3 }; + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.part_to_generate + 1) * 
sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + size_t string_length = 0; + for(int ipart = 0; ipart < 3; ipart++) + { + uint8_t name_part_index = dists[ipart](tld.rng); + tld.string_indices[irow * 3 + ipart] = name_part_index; + string_length += std::strlen(types[ipart][name_part_index]); + } + // Add 4 because there is a space between each word (i.e. 2 spaces) + offsets[irow + 1] = offsets[irow] + string_length + 2; + } + // Add an extra byte for the space after in the very last string. + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate] + 1)); + char *strings = reinterpret_cast(string_buffer->mutable_data()); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + char *row = strings + offsets[irow]; + for(int ipart = 0; ipart < 3; ipart++) + { + uint8_t name_part_index = tld.string_indices[irow * 3 + ipart]; + const char *part = types[ipart][name_part_index]; + size_t length = std::strlen(part); + std::memcpy(row, part, length); + row += length; + *row++ = ' '; + } + } + ArrayData ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(string_buffer), std::move(offset_buff) }); + Datum datum(ad); + tld.part[PART::P_TYPE] = std::move(datum); + } + return Status::OK(); + } + + Status P_SIZE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_SIZE].kind() == Datum::NONE) + { + std::uniform_int_distribution dist(1, 50); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_SIZE)); + int32_t *p_size = reinterpret_cast( + tld.part[PART::P_SIZE].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.part_to_generate; i++) + p_size[i] = dist(tld.rng); + } + return Status::OK(); + } + + Status P_CONTAINER(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_CONTAINER].kind() == Datum::NONE) + { + std::uniform_int_distribution dist1(0, static_cast(kNumContainers_1 - 1)); + std::uniform_int_distribution dist2(0, static_cast(kNumContainers_2 - 1)); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_CONTAINER)); + char *p_container = reinterpret_cast( + tld.part[PART::P_CONTAINER].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_CONTAINER]); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + int container1_idx = dist1(tld.rng); + int container2_idx = dist2(tld.rng); + const char *container1 = Containers_1[container1_idx]; + const char *container2 = Containers_2[container2_idx]; + size_t container1_length = std::strlen(container1); + size_t container2_length = std::strlen(container2); + + char *row = p_container + byte_width * irow; + // Abuse strncpy to zero out the rest of the array + std::strncpy(row, container1, byte_width); + row[container1_length] = ' '; + std::memcpy(row + container1_length + 1, container2, container2_length); + } + } + return Status::OK(); + } + + Status P_RETAILPRICE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_RETAILPRICE].kind() == Datum::NONE) + { + RETURN_NOT_OK(P_PARTKEY(thread_index)); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_RETAILPRICE)); + const int32_t *p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + Decimal128 *p_retailprice = reinterpret_cast( + 
tld.part[PART::P_RETAILPRICE].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.part_to_generate; irow++) + { + int32_t partkey = p_partkey[irow]; + int64_t retail_price = (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); + p_retailprice[irow] = { retail_price }; + } + } + return Status::OK(); + } + + Status P_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PART::P_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.part[PART::P_COMMENT], g_text.GenerateComments(batch_size_, 5, 22, tld.rng)); + } + return Status::OK(); + } + + Status InitPartsupp(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + tld.generated_partsupp.reset(); + tld.partsupp.clear(); + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; + tld.partsupp.resize(num_batches); + for(std::vector &batch : tld.partsupp) + { + batch.clear(); + batch.resize(PARTSUPP::kNumCols); + } + return Status::OK(); + } + + Status AllocatePartSuppBatch(size_t thread_index, size_t ibatch, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + int32_t byte_width = arrow::internal::GetByteWidth(*partsupp_types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); + ArrayData ad(partsupp_types_[column], batch_size_, { nullptr, std::move(buff) }); + tld.partsupp[ibatch][column] = std::move(ad); + return Status::OK(); + } + + Status PS_PARTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_partsupp[PARTSUPP::PS_PARTKEY]) + { + tld.generated_partsupp[PARTSUPP::PS_PARTKEY] = true; + RETURN_NOT_OK(P_PARTKEY(thread_index)); + const int32_t *p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + int64_t ipartsupp = 0; + int64_t ipart = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + for(int64_t irow = 0; irow < ps_to_generate; ibatch++) + { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_PARTKEY)); + int32_t *ps_partkey = reinterpret_cast( + tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->buffers[1]->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + + int64_t batch_offset = 0; + for(int64_t irun = 0; irun < next_run;) + { + for(; ipartsupp < kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) + ps_partkey[batch_offset++] = p_partkey[ipart]; + if(ipartsupp == kPartSuppRowsPerPart) + { + ipartsupp = 0; + ipart++; + } + } + irow += next_run; + tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->length = batch_offset; + } + } + return Status::OK(); + } + + Status PS_SUPPKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_partsupp[PARTSUPP::PS_SUPPKEY]) + { + tld.generated_partsupp[PARTSUPP::PS_SUPPKEY] = true; + RETURN_NOT_OK(P_PARTKEY(thread_index)); + const int32_t *p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + int64_t ipartsupp = 0; + int64_t ipart = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + const int32_t S = scale_factor_ * 10000; + for(int64_t irow = 0; irow < ps_to_generate; ibatch++) + { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPKEY)); + int32_t 
*ps_suppkey = reinterpret_cast( + tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->buffers[1]->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + + int64_t batch_offset = 0; + for(int64_t irun = 0; irun < next_run;) + { + for(; ipartsupp < kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) + { + int32_t supplier = static_cast(ipartsupp); + int32_t partkey = p_partkey[ipart]; + ps_suppkey[batch_offset++] = (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; + } + if(ipartsupp == kPartSuppRowsPerPart) + { + ipartsupp = 0; + ipart++; + } + } + irow += next_run; + tld.partsupp[ibatch][PARTSUPP::PS_SUPPKEY].array()->length = batch_offset; + } + } + return Status::OK(); + } + + Status PS_AVAILQTY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_partsupp[PARTSUPP::PS_AVAILQTY]) + { + tld.generated_partsupp[PARTSUPP::PS_AVAILQTY] = true; + std::uniform_int_distribution dist(1, 9999); + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t ibatch = 0; + for(int64_t irow = 0; irow < ps_to_generate; ibatch++) + { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_AVAILQTY)); + int32_t *ps_availqty = reinterpret_cast( + tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->buffers[1]->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + for(int64_t irun = 0; irun < next_run; irun++) + ps_availqty[irun] = dist(tld.rng); + + tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run; + irow += next_run; + } + } + return Status::OK(); + } + + Status PS_SUPPLYCOST(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_partsupp[PARTSUPP::PS_SUPPLYCOST]) + { + tld.generated_partsupp[PARTSUPP::PS_SUPPLYCOST] = true; + std::uniform_int_distribution dist(100, 100000); + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t ibatch = 0; + for(int64_t irow = 0; irow < ps_to_generate; ibatch++) + { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPLYCOST)); + Decimal128 *ps_supplycost = reinterpret_cast( + tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST].array()->buffers[1]->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + for(int64_t irun = 0; irun < next_run; irun++) + ps_supplycost[irun] = { dist(tld.rng) }; + + tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run; + irow += next_run; + } + } + return Status::OK(); + } + + Status PS_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.part[PARTSUPP::PS_COMMENT].kind() == Datum::NONE) + { + int64_t irow = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) + { + int64_t num_rows = std::min(batch_size_, ps_to_generate - irow); + ARROW_ASSIGN_OR_RAISE( + tld.partsupp[ibatch][PARTSUPP::PS_COMMENT], g_text.GenerateComments(num_rows, 49, 198, tld.rng)); + irow += num_rows; + } + } + return Status::OK(); + } + + struct ThreadLocalData + { + std::vector part; + std::vector string_indices; + int64_t part_to_generate; + int64_t partkey_start; + + std::vector> partsupp; + std::bitset generated_partsupp; + random::pcg32_fast rng; + }; + std::vector thread_local_data_; + + bool inited_ = false; + std::mutex part_output_queue_mutex_; + std::mutex partsupp_output_queue_mutex_; + 
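+            // PART and PARTSUPP are generated in the same pass, so batches built for
+            // the table that was not requested are parked in these queues until the
+            // other generator method is called.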
std::queue part_output_queue_; + std::queue partsupp_output_queue_; + int64_t batch_size_; + int scale_factor_; + int64_t part_rows_to_generate_; + int64_t part_rows_generated_; + std::vector part_cols_; + std::vector partsupp_cols_; + + static constexpr int64_t kPartSuppRowsPerPart = 4; + }; + + class OrdersAndLineItemGenerator + { + public: + Status Init( + size_t num_threads, + int64_t batch_size, + int scale_factor) + { + if(!inited_) + { + inited_ = true; + batch_size_ = batch_size; + scale_factor_ = scale_factor; + + thread_local_data_.resize(num_threads); + for(ThreadLocalData &tld : thread_local_data_) + { + tld.items_per_order.resize(batch_size_); + } + orders_rows_to_generate_ = scale_factor_ * 150000 * 10; + } + return Status::OK(); + } + + Result> SetOrdersOutputColumns(const std::vector &cols) + { + return SetOutputColumns(cols, orders_types_, orders_name_map_, orders_cols_); + } + + Result> SetLineItemOutputColumns(const std::vector &cols) + { + return SetOutputColumns(cols, lineitem_types_, lineitem_name_map_, lineitem_cols_); + } + + Result> NextOrdersBatch(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(orders_output_queue_mutex_); + if(!orders_output_queue_.empty()) + { + ExecBatch batch = std::move(orders_output_queue_.front()); + orders_output_queue_.pop(); + return std::move(batch); + } + else if(orders_rows_generated_ == orders_rows_to_generate_) + { + return util::nullopt; + } + else + { + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = std::min( + batch_size_, + orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); + } + } + tld.orders.clear(); + tld.orders.resize(ORDERS::kNumCols); + RETURN_NOT_OK(GenerateRowCounts(thread_index)); + tld.first_batch_offset = 0; + tld.generated_lineitem.reset(); + + for(int col : orders_cols_) + RETURN_NOT_OK(orders_generators_[col](thread_index)); + for(int col : lineitem_cols_) + RETURN_NOT_OK(lineitem_generators_[col](thread_index)); + + std::vector orders_result(orders_cols_.size()); + for(size_t i = 0; i < orders_cols_.size(); i++) + { + int col_idx = orders_cols_[i]; + orders_result[i] = tld.orders[col_idx]; + } + if(!lineitem_cols_.empty()) + { + std::vector lineitem_results; + for(size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) + { + std::vector lineitem_result(lineitem_cols_.size()); + for(size_t icol = 0; icol < lineitem_cols_.size(); icol++) + { + int col_idx = lineitem_cols_[icol]; + lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); + lineitem_results.emplace_back(std::move(eb)); + } + { + std::lock_guard guard(lineitem_output_queue_mutex_); + for(ExecBatch &eb : lineitem_results) + { + lineitem_output_queue_.emplace(std::move(eb)); + } + } + } + return ExecBatch::Make(std::move(orders_result)); + } + + Result> NextLineItemBatch(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ExecBatch queued; + bool from_queue = false; + { + std::lock_guard lock(lineitem_output_queue_mutex_); + if(!lineitem_output_queue_.empty()) + { + queued = std::move(lineitem_output_queue_.front()); + lineitem_output_queue_.pop(); + from_queue = true; + } + } + tld.first_batch_offset = 0; + if(from_queue) + { + ARROW_DCHECK(queued.length <= batch_size_); + tld.first_batch_offset = queued.length; + 
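+                    // A queued batch that is already full is returned as-is below; a
+                    // partial one becomes the front of the first batch of the next
+                    // generation pass and is topped up to batch_size_ rows.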
if(queued.length == batch_size_) + return std::move(queued); + } + { + std::lock_guard lock(orders_output_queue_mutex_); + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = std::min( + batch_size_, + orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); + if(orders_rows_generated_ == orders_rows_to_generate_) + { + if(from_queue) + return std::move(queued); + return util::nullopt; + } + } + tld.orders.clear(); + tld.orders.resize(ORDERS::kNumCols); + RETURN_NOT_OK(GenerateRowCounts(thread_index)); + tld.generated_lineitem.reset(); + if(from_queue) + { + for(size_t i = 0; i < lineitem_cols_.size(); i++) + if(tld.lineitem[0][lineitem_cols_[i]].kind() == Datum::NONE) + tld.lineitem[0][lineitem_cols_[i]] = std::move(queued[i]); + } + + for(int col : orders_cols_) + RETURN_NOT_OK(orders_generators_[col](thread_index)); + for(int col : lineitem_cols_) + RETURN_NOT_OK(lineitem_generators_[col](thread_index)); + + if(!orders_cols_.empty()) + { + std::vector orders_result(orders_cols_.size()); + for(size_t i = 0; i < orders_cols_.size(); i++) + { + int col_idx = orders_cols_[i]; + orders_result[i] = tld.orders[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch orders_batch, ExecBatch::Make(std::move(orders_result))); + { + std::lock_guard lock(orders_output_queue_mutex_); + orders_output_queue_.emplace(std::move(orders_batch)); + } + } + std::vector lineitem_results; + for(size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) + { + std::vector lineitem_result(lineitem_cols_.size()); + for(size_t icol = 0; icol < lineitem_cols_.size(); icol++) + { + int col_idx = lineitem_cols_[icol]; + lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); + lineitem_results.emplace_back(std::move(eb)); + } + // Return the first batch, enqueue the rest. 
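+                // Every order owns several lineitem rows, so one pass over a batch of
+                // orders usually yields more than one lineitem batch; return the first
+                // and queue the remainder for subsequent calls.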
+ { + std::lock_guard lock(lineitem_output_queue_mutex_); + for(size_t i = 1; i < lineitem_results.size(); i++) + lineitem_output_queue_.emplace(std::move(lineitem_results[i])); + } + return std::move(lineitem_results[0]); + } + + private: +#define FOR_EACH_ORDERS_COLUMN(F) \ + F(O_ORDERKEY) \ + F(O_CUSTKEY) \ + F(O_ORDERSTATUS) \ + F(O_TOTALPRICE) \ + F(O_ORDERDATE) \ + F(O_ORDERPRIORITY) \ + F(O_CLERK) \ + F(O_SHIPPRIORITY) \ + F(O_COMMENT) + +#define FOR_EACH_LINEITEM_COLUMN(F) \ + F(L_ORDERKEY) \ + F(L_PARTKEY) \ + F(L_SUPPKEY) \ + F(L_LINENUMBER) \ + F(L_QUANTITY) \ + F(L_EXTENDEDPRICE) \ + F(L_DISCOUNT) \ + F(L_TAX) \ + F(L_RETURNFLAG) \ + F(L_LINESTATUS) \ + F(L_SHIPDATE) \ + F(L_COMMITDATE) \ + F(L_RECEIPTDATE) \ + F(L_SHIPINSTRUCT) \ + F(L_SHIPMODE) \ + F(L_COMMENT) + +#define MAKE_ENUM(col) col, + struct ORDERS + { + enum + { + FOR_EACH_ORDERS_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; + struct LINEITEM + { + enum + { + FOR_EACH_LINEITEM_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; + +#define MAKE_STRING_MAP(col) \ + { #col, ORDERS::col }, + const std::unordered_map orders_name_map_ = + { + FOR_EACH_ORDERS_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_STRING_MAP(col) \ + { #col, LINEITEM::col }, + const std::unordered_map lineitem_name_map_ = + { + FOR_EACH_LINEITEM_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector orders_generators_ = + { + FOR_EACH_ORDERS_COLUMN(MAKE_FN_ARRAY) + }; + std::vector lineitem_generators_ = + { + FOR_EACH_LINEITEM_COLUMN(MAKE_FN_ARRAY) + }; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_LINEITEM_COLUMN +#undef FOR_EACH_ORDERS_COLUMN + + const std::vector> orders_types_ = + { + int32(), + int32(), + fixed_size_binary(1), + decimal(12, 2), + date32(), + fixed_size_binary(15), + fixed_size_binary(15), + int32(), + utf8() + }; + + const std::vector> lineitem_types_ = + { + int32(), + int32(), + int32(), + int32(), + decimal(12, 2), + decimal(12, 2), + decimal(12, 2), + decimal(12, 2), + fixed_size_binary(1), + fixed_size_binary(1), + date32(), + date32(), + date32(), + fixed_size_binary(25), + fixed_size_binary(10), + utf8(), + }; + + Status AllocateOrdersBatch(size_t thread_index, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.orders[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.orders_to_generate * byte_width)); + ArrayData ad(orders_types_[column], tld.orders_to_generate, { nullptr, std::move(buff) }); + tld.orders[column] = std::move(ad); + return Status::OK(); + } + + Status O_ORDERKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_ORDERKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERKEY)); + int32_t *o_orderkey = reinterpret_cast( + tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + { + o_orderkey[i] = (tld.orderkey_start + i + 1); + ARROW_DCHECK(1 <= o_orderkey[i] && o_orderkey[i] <= orders_rows_to_generate_); + } + } + return Status::OK(); + } + + Status O_CUSTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_CUSTKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, 
ORDERS::O_CUSTKEY)); + + // Spec says it must be a random number between 1 and SF*150000 that is not + // divisible by 3. Rather than repeatedly generating numbers until we get to + // a non-divisible-by-3 number, we just generate a number between + // 0 and SF * 50000 - 1, multiply by 3, and then add either 1 or 2. + std::uniform_int_distribution base_dist(0, scale_factor_ * 50000 - 1); + std::uniform_int_distribution offset_dist(1, 2); + int32_t *o_custkey = reinterpret_cast( + tld.orders[ORDERS::O_CUSTKEY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + o_custkey[i] = 3 * base_dist(tld.rng) + offset_dist(tld.rng); + } + return Status::OK(); + } + + Status O_ORDERSTATUS(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_ORDERSTATUS].kind() == Datum::NONE) + { + RETURN_NOT_OK(L_LINESTATUS(thread_index)); + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERSTATUS)); + + char *o_orderstatus = reinterpret_cast( + tld.orders[ORDERS::O_ORDERSTATUS].array()->buffers[1]->mutable_data()); + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + bool all_f = true; + bool all_o = true; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + const char *l_linestatus = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->buffers[1]->data()); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++, batch_offset++) + { + all_f &= l_linestatus[batch_offset] == 'F'; + all_o &= l_linestatus[batch_offset] == 'O'; + } + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + ARROW_DCHECK(!(all_f && all_o)); + if(all_f) + o_orderstatus[iorder] = 'F'; + else if(all_o) + o_orderstatus[iorder] = 'O'; + else + o_orderstatus[iorder] = 'P'; + iorder++; + } + } + irow += next_run; + batch_offset = 0; + } + } + return Status::OK(); + } + + Status O_TOTALPRICE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_TOTALPRICE].kind() == Datum::NONE) + { + RETURN_NOT_OK(L_EXTENDEDPRICE(thread_index)); + RETURN_NOT_OK(L_TAX(thread_index)); + RETURN_NOT_OK(L_DISCOUNT(thread_index)); + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_TOTALPRICE)); + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + int64_t sum = 0; + Decimal128 *o_totalprice = reinterpret_cast( + tld.orders[ORDERS::O_TOTALPRICE].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + + const Decimal128 *l_extendedprice = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->buffers[1]->data()); + const Decimal128 *l_tax = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->data()); + const Decimal128 *l_discount = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->buffers[1]->data()); + + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++, 
batch_offset++) + { + int64_t eprice = static_cast(l_extendedprice[batch_offset]); + int64_t tax = static_cast(l_tax[batch_offset]); + int64_t discount = static_cast(l_discount[batch_offset]); + sum += (eprice * (100 + tax) * (100 - discount)); + } + if(iline == tld.items_per_order[iorder]) + { + sum /= 100 * 100; + o_totalprice[iorder] = { sum }; + iline = 0; + iorder++; + } + } + irow += next_run; + batch_offset = 0; + } + } + return Status::OK(); + } + + Status O_ORDERDATE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_ORDERDATE].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERDATE)); + + std::uniform_int_distribution dist(STARTDATE, ENDDATE - 151); + uint32_t *o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + o_orderdate[i] = dist(tld.rng); + } + return Status::OK(); + } + + Status O_ORDERPRIORITY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_ORDERPRIORITY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERPRIORITY)); + int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[ORDERS::O_ORDERPRIORITY]); + std::uniform_int_distribution dist(0, kNumPriorities - 1); + char *o_orderpriority = reinterpret_cast( + tld.orders[ORDERS::O_ORDERPRIORITY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + { + const char *str = Priorities[dist(tld.rng)]; + std::strncpy(o_orderpriority + i * byte_width, str, byte_width); + } + } + return Status::OK(); + } + + Status O_CLERK(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_CLERK].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); + int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[ORDERS::O_CLERK]); + std::uniform_int_distribution dist(1, scale_factor_ * 1000); + char *o_clerk = reinterpret_cast( + tld.orders[ORDERS::O_CLERK].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + { + const char *clerk = "Clerk#"; + const size_t clerk_length = std::strlen(clerk); + int64_t clerk_number = dist(tld.rng); + char *output = o_clerk + i * byte_width; + std::strncpy(output, clerk, byte_width); + AppendNumberPaddedToNineDigits(output + clerk_length, clerk_number); + } + } + return Status::OK(); + } + + Status O_SHIPPRIORITY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_SHIPPRIORITY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_SHIPPRIORITY)); + int32_t *o_shippriority = reinterpret_cast( + tld.orders[ORDERS::O_SHIPPRIORITY].array()->buffers[1]->mutable_data()); + std::memset(o_shippriority, 0, tld.orders_to_generate * sizeof(int32_t)); + } + return Status::OK(); + } + + Status O_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.orders[ORDERS::O_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.orders[ORDERS::O_COMMENT], g_text.GenerateComments(batch_size_, 19, 78, tld.rng)); + } + return Status::OK(); + } + + Status GenerateRowCounts(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + std::uniform_int_distribution length_dist(1, 7); 
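+ // Per the TPC-H spec every order owns between 1 and 7 lineitem rows, so this
+ // thread's total lineitem row count (and with it the number of output batches
+ // to pre-size, after accounting for the leftover offset in the first,
+ // partially filled batch) is only known once each order's row count has been
+ // drawn below.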
+ tld.lineitem_to_generate = 0; + tld.items_per_order.clear(); + for(int64_t i = 0; i < tld.orders_to_generate; i++) + { + int64_t length = length_dist(tld.rng); + tld.items_per_order.push_back(length); + tld.lineitem_to_generate += length; + } + size_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; + tld.lineitem.clear(); + tld.lineitem.resize(num_batches); + for(std::vector &batch : tld.lineitem) + { + batch.clear(); + batch.resize(LINEITEM::kNumCols); + } + return Status::OK(); + } + + Status AllocateLineItemBufferIfNeeded(size_t thread_index, size_t ibatch, int column, size_t &out_batch_offset) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.lineitem[ibatch][column].kind() == Datum::NONE) + { + int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); + ArrayData ad(lineitem_types_[column], batch_size_, { nullptr, std::move(buff) }); + tld.lineitem[ibatch][column] = std::move(ad); + out_batch_offset = 0; + } + if(ibatch == 0) + out_batch_offset = tld.first_batch_offset; + return Status::OK(); + } + + Status L_ORDERKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_ORDERKEY]) + { + tld.generated_lineitem[LINEITEM::L_ORDERKEY] = true; + RETURN_NOT_OK(O_ORDERKEY(thread_index)); + const int32_t *o_orderkey = reinterpret_cast( + tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_ORDERKEY, batch_offset)); + int32_t *l_linenumber = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_ORDERKEY].array()->buffers[1]->mutable_data()); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_linenumber[batch_offset++] = o_orderkey[iorder]; + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_ORDERKEY].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_PARTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_PARTKEY]) + { + tld.generated_lineitem[LINEITEM::L_PARTKEY] = true; + + size_t ibatch = 0; + std::uniform_int_distribution dist(1, scale_factor_ * 200000); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_PARTKEY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t *l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < next_run; i++, batch_offset++) + l_partkey[batch_offset] = dist(tld.rng); + + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + 
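+ // Lineitem columns are generated lazily: each L_* generator first runs the
+ // generators of the columns it derives from (L_SUPPKEY below needs L_PARTKEY),
+ // and the generated_lineitem bitset keeps a column from being regenerated when
+ // several requested columns share a dependency. L_SUPPKEY itself follows the
+ // spec formula
+ //   suppkey = (partkey + i * (S/4 + (partkey - 1)/S)) % S + 1, with i in [0, 3]
+ // and S = SF * 10000, which ties each part to four distinct suppliers.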
+ Status L_SUPPKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_SUPPKEY]) + { + tld.generated_lineitem[LINEITEM::L_SUPPKEY] = true; + RETURN_NOT_OK(L_PARTKEY(thread_index)); + + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 3); + const int32_t S = scale_factor_ * 10000; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset = 0; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SUPPKEY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t *l_suppkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SUPPKEY].array()->buffers[1]->mutable_data()); + const int32_t *l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); + for(int64_t i = 0; i < next_run; i++) + { + int32_t supplier = dist(tld.rng); + int32_t partkey = l_partkey[batch_offset]; + // Fun fact: the parentheses for this expression are unbalanced in the TPC-H spec. + l_suppkey[batch_offset++] = (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SUPPKEY].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_LINENUMBER(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_LINENUMBER]) + { + tld.generated_lineitem[LINEITEM::L_LINENUMBER] = true; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_LINENUMBER, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t *l_linenumber = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_LINENUMBER].array()->buffers[1]->mutable_data()); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + { + l_linenumber[batch_offset++] = (iline + 1); + ARROW_DCHECK(1 <= (iline + 1) && (iline + 1) <= 7); + } + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_LINENUMBER].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_QUANTITY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_QUANTITY]) + { + tld.generated_lineitem[LINEITEM::L_QUANTITY] = true; + + size_t ibatch = 0; + std::uniform_int_distribution dist(1, 50); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_QUANTITY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128 *l_quantity = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->buffers[1]->mutable_data()); + for(int64_t i = 0; i < next_run; i++) + { + // Multiply by 100 because the type is decimal(12, 2), so the decimal goes after 
two digits + int64_t quantity = dist(tld.rng) * 100; + l_quantity[batch_offset++] = { quantity }; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_EXTENDEDPRICE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE]) + { + tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE] = true; + RETURN_NOT_OK(L_PARTKEY(thread_index)); + RETURN_NOT_OK(L_QUANTITY(thread_index)); + size_t ibatch = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_EXTENDEDPRICE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + const int32_t *l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); + const Decimal128 *l_quantity = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->buffers[1]->data()); + Decimal128 *l_extendedprice = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + int64_t partkey = static_cast(l_partkey[batch_offset]); + // Divide by 100 to recover the integer representation (not Decimal). + int64_t quantity = static_cast(l_quantity[batch_offset]) / 100; + + // Spec says to divide by 100, but that happens automatically due to this being stored + // to two decimal points. + int64_t retail_price = (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); + int64_t extended_price = retail_price * quantity; + l_extendedprice[batch_offset] = { extended_price }; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_DISCOUNT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_DISCOUNT]) + { + tld.generated_lineitem[LINEITEM::L_DISCOUNT] = true; + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 10); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_DISCOUNT, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128 *l_discount = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + l_discount[batch_offset] = { dist(tld.rng) }; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_TAX(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_TAX]) + { + tld.generated_lineitem[LINEITEM::L_TAX] = true; + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 8); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_TAX, batch_offset)); + int64_t 
remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128 *l_tax = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + l_tax[batch_offset] = { dist(tld.rng) }; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_RETURNFLAG(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_RETURNFLAG]) + { + tld.generated_lineitem[LINEITEM::L_RETURNFLAG] = true; + RETURN_NOT_OK(L_RECEIPTDATE(thread_index)); + size_t ibatch = 0; + std::uniform_int_distribution dist; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_RETURNFLAG, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char *l_returnflag = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG].array()->buffers[1]->mutable_data()); + const uint32_t *l_receiptdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + if(l_receiptdate[batch_offset] <= CURRENTDATE) + { + uint32_t r = dist(tld.rng); + l_returnflag[batch_offset] = (r % 2 == 1) ? 'R' : 'A'; + } + else + { + l_returnflag[batch_offset] = 'N'; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_LINESTATUS(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_LINESTATUS]) + { + tld.generated_lineitem[LINEITEM::L_LINESTATUS] = true; + RETURN_NOT_OK(L_SHIPDATE(thread_index)); + size_t ibatch = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_LINESTATUS, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char *l_linestatus = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->buffers[1]->mutable_data()); + const uint32_t *l_shipdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + if(l_shipdate[batch_offset] > CURRENTDATE) + l_linestatus[batch_offset] = 'O'; + else + l_linestatus[batch_offset] = 'F'; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPDATE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_SHIPDATE]) + { + tld.generated_lineitem[LINEITEM::L_SHIPDATE] = true; + RETURN_NOT_OK(O_ORDERDATE(thread_index)); + const int32_t *o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); + std::uniform_int_distribution dist(1, 121); + size_t ibatch = 0; + 
size_t iorder = 0; + int32_t iline = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPDATE, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t *l_shipdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_shipdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_COMMITDATE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_COMMITDATE]) + { + tld.generated_lineitem[LINEITEM::L_COMMITDATE] = true; + const int32_t *o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); + std::uniform_int_distribution dist(30, 90); + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_COMMITDATE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t *l_commitdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_COMMITDATE].array()->buffers[1]->mutable_data()); + for(int64_t irun = 0; irun < next_run;) + { + for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_commitdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); + if(iline == tld.items_per_order[iorder]) + { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_COMMITDATE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_RECEIPTDATE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_RECEIPTDATE]) + { + tld.generated_lineitem[LINEITEM::L_RECEIPTDATE] = true; + RETURN_NOT_OK(L_SHIPDATE(thread_index)); + size_t ibatch = 0; + std::uniform_int_distribution dist(1, 30); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_RECEIPTDATE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t *l_receiptdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->buffers[1]->mutable_data()); + const uint32_t *l_shipdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + l_receiptdate[batch_offset] = l_shipdate[batch_offset] + dist(tld.rng); + + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + 
Status L_SHIPINSTRUCT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT]) + { + tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT] = true; + int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[LINEITEM::L_SHIPINSTRUCT]); + size_t ibatch = 0; + std::uniform_int_distribution dist(0, kNumInstructions - 1); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPINSTRUCT, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char *l_shipinstruct = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + const char *str = Instructions[dist(tld.rng)]; + // Note that we don't have to memset the buffer to 0 because strncpy pads each string + // with 0's anyway + std::strncpy(l_shipinstruct + batch_offset * byte_width, str, byte_width); + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPMODE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_SHIPMODE]) + { + tld.generated_lineitem[LINEITEM::L_SHIPMODE] = true; + int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[LINEITEM::L_SHIPMODE]); + size_t ibatch = 0; + std::uniform_int_distribution dist(0, kNumModes - 1); + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPMODE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char *l_shipmode = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_SHIPMODE].array()->buffers[1]->mutable_data()); + + for(int64_t i = 0; i < next_run; i++, batch_offset++) + { + const char *str = Modes[dist(tld.rng)]; + std::strncpy(l_shipmode + batch_offset * byte_width, str, byte_width); + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPMODE].array()->length = static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(!tld.generated_lineitem[LINEITEM::L_COMMENT]) + { + tld.generated_lineitem[LINEITEM::L_COMMENT] = true; + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) + { + // Comments are kind of sneaky: we always generate the full batch and then just bump the length + if(tld.lineitem[ibatch][LINEITEM::L_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.lineitem[ibatch][LINEITEM::L_COMMENT], g_text.GenerateComments(batch_size_, 10, 43, tld.rng)); + batch_offset = 0; + } + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + + batch_offset += next_run; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_COMMENT].array()->length = batch_offset; + } + } + return Status::OK(); + } + + struct 
ThreadLocalData + { + std::vector orders; + int64_t orders_to_generate; + int64_t orderkey_start; + + std::vector> lineitem; + std::vector items_per_order; + int64_t lineitem_to_generate; + int64_t first_batch_offset; + std::bitset generated_lineitem; + random::pcg32_fast rng; + }; + std::vector thread_local_data_; + + bool inited_ = false; + std::mutex orders_output_queue_mutex_; + std::mutex lineitem_output_queue_mutex_; + std::queue orders_output_queue_; + std::queue lineitem_output_queue_; + int64_t batch_size_; + int scale_factor_; + int64_t orders_rows_to_generate_; + int64_t orders_rows_generated_; + std::vector orders_cols_; + std::vector lineitem_cols_; + }; + + class SupplierGenerator : public TpchTableGenerator + { + public: + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + rows_to_generate_ = scale_factor_ * 10000; + rows_generated_.store(0); + ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( + columns, + types_, + name_map_, + gen_list_)); + + random::pcg32_fast rng; + std::uniform_int_distribution dist(0, rows_to_generate_ - 1); + size_t num_special_rows = static_cast(5 * scale_factor_); + std::unordered_set good_rows_set; + while(good_rows_set.size() < num_special_rows) + { + good_rows_set.insert(dist(rng)); + } + std::unordered_set bad_rows_set; + while(bad_rows_set.size() < num_special_rows) + { + int64_t bad_row; + do + { + bad_row = dist(rng); + } while(good_rows_set.find(bad_row) != good_rows_set.end()); + } + good_rows_.clear(); + bad_rows_.clear(); + good_rows_.insert(good_rows_.end(), good_rows_set.begin(), good_rows_set.end()); + bad_rows_.insert(bad_rows_.end(), bad_rows_set.begin(), bad_rows_set.end()); + std::sort(good_rows_.begin(), good_rows_.end()); + std::sort(bad_rows_.begin(), bad_rows_.end()); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + thread_local_data_.resize(num_threads); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: +#define FOR_EACH_COLUMN(F) \ + F(S_SUPPKEY) \ + F(S_NAME) \ + F(S_ADDRESS) \ + F(S_NATIONKEY) \ + F(S_PHONE) \ + F(S_ACCTBAL) \ + F(S_COMMENT) + +#define MAKE_ENUM(col) col, + struct SUPPLIER + { + enum + { + FOR_EACH_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; +#undef MAKE_ENUM +#define MAKE_STRING_MAP(col) \ + { #col, SUPPLIER::col }, + const std::unordered_map name_map_ = + { + FOR_EACH_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector generators_ = + { + FOR_EACH_COLUMN(MAKE_FN_ARRAY) + }; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_COLUMN + + std::vector> types_ = + { + int32(), + fixed_size_binary(25), + utf8(), + int32(), + fixed_size_binary(15), + decimal(12, 2), + utf8(), + }; + + Status ProduceCallback(size_t thread_index) + { + if(done_.load()) + return Status::OK(); + ThreadLocalData &tld = thread_local_data_[thread_index]; + tld.suppkey_start = rows_generated_.fetch_add(batch_size_); + if(tld.suppkey_start >= rows_to_generate_) + return Status::OK(); + + 
tld.to_generate = std::min(batch_size_, + rows_to_generate_ - tld.suppkey_start); + bool is_last_batch = tld.to_generate < batch_size_; + + tld.batch.clear(); + tld.batch.resize(SUPPLIER::kNumCols); + for(int col : gen_list_) + RETURN_NOT_OK(generators_[col](thread_index)); + + std::vector result(gen_list_.size()); + for(size_t i = 0; i < gen_list_.size(); i++) + { + int col_idx = gen_list_[i]; + result[i] = tld.batch[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); + batches_generated_++; + output_callback_(std::move(eb)); + if(is_last_batch) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + Status AllocateColumn(size_t thread_index, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); + ArrayData ad(types_[column], tld.to_generate, { nullptr, std::move(buff) }); + tld.batch[column] = std::move(ad); + return Status::OK(); + } + + Status S_SUPPKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_SUPPKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_SUPPKEY)); + int32_t *s_suppkey = reinterpret_cast( + tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + s_suppkey[irow] = (tld.suppkey_start + irow + 1); + } + } + return Status::OK(); + } + + Status S_NAME(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_NAME].kind() == Datum::NONE) + { + RETURN_NOT_OK(S_SUPPKEY(thread_index)); + const int32_t *s_suppkey = reinterpret_cast( + tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->data()); + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NAME)); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[SUPPLIER::S_NAME]); + char *s_name = reinterpret_cast( + tld.batch[SUPPLIER::S_NAME].array()->buffers[1]->mutable_data()); + // Look man, I'm just following the spec ok? 
Section 4.2.3 as of March 1 2022 + const char *supplier = "Supplie#r"; + const size_t supplier_length = std::strlen(supplier); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + char *out = s_name + byte_width * irow; + std::memcpy(out, supplier, supplier_length); + AppendNumberPaddedToNineDigits(out + supplier_length, s_suppkey[irow]); + } + } + return Status::OK(); + } + + Status S_ADDRESS(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_ADDRESS].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE( + tld.batch[SUPPLIER::S_ADDRESS], + RandomVString(tld.rng, tld.to_generate, 10, 40)); + } + return Status::OK(); + } + + Status S_NATIONKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_NATIONKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NATIONKEY)); + std::uniform_int_distribution dist(0, 24); + int32_t *s_nationkey = reinterpret_cast( + tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + s_nationkey[irow] = dist(tld.rng); + } + return Status::OK(); + } + + Status S_PHONE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_PHONE].kind() == Datum::NONE) + { + RETURN_NOT_OK(S_NATIONKEY(thread_index)); + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_PHONE)); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[SUPPLIER::S_PHONE]); + const int32_t *s_nationkey = reinterpret_cast( + tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->data()); + char *s_phone = reinterpret_cast( + tld.batch[SUPPLIER::S_PHONE].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + GeneratePhoneNumber( + s_phone + irow * byte_width, + tld.rng, + s_nationkey[irow]); + } + } + return Status::OK(); + } + + Status S_ACCTBAL(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_ACCTBAL].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_ACCTBAL)); + Decimal128 *s_acctbal = reinterpret_cast( + tld.batch[SUPPLIER::S_ACCTBAL].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(-99999, 999999); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + s_acctbal[irow] = { dist(tld.rng) }; + } + return Status::OK(); + } + + Status S_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[SUPPLIER::S_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], g_text.GenerateComments(batch_size_, 25, 100, tld.rng)); + ModifyComments(thread_index, "Recommends", good_rows_); + ModifyComments(thread_index, "Complaints", bad_rows_); + } + return Status::OK(); + } + + void ModifyComments( + size_t thread_index, + const char *review, + const std::vector &indices) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + const int32_t *offsets = reinterpret_cast( + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->data()); + char *str = reinterpret_cast( + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->mutable_data()); + const char *customer = "Customer"; + const size_t customer_length = std::strlen(customer); + const size_t review_length = std::strlen(review); + + auto it = std::lower_bound(indices.begin(), indices.end(), tld.suppkey_start); + for(; it != 
indices.end() && *it < tld.suppkey_start + tld.to_generate; it++) + { + int64_t idx_in_batch = *it - tld.suppkey_start; + char *out = str + offsets[idx_in_batch]; + int32_t str_length = offsets[idx_in_batch + 1] - offsets[idx_in_batch]; + std::uniform_int_distribution gap_dist(0, str_length - customer_length - review_length); + int32_t gap = gap_dist(tld.rng); + int32_t total_length = customer_length + gap + review_length; + std::uniform_int_distribution start_dist(0, str_length - total_length); + int32_t start = start_dist(tld.rng); + std::memcpy(out + start, customer, customer_length); + std::memcpy(out + start + gap, review, review_length); + } + } + + struct ThreadLocalData + { + random::pcg32_fast rng; + int64_t suppkey_start; + int64_t to_generate; + std::vector batch; + }; + std::vector thread_local_data_; + std::vector good_rows_; + std::vector bad_rows_; + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t rows_to_generate_; + std::atomic rows_generated_; + int scale_factor_; + int64_t batch_size_; + std::vector gen_list_; + std::shared_ptr schema_; + }; + + class PartGenerator : public TpchTableGenerator + { + public: + PartGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) + { + batches_generated_.store(0); + } + + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, + gen_->SetPartOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + Status ProduceCallback(size_t thread_index) + { + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextPartBatch(thread_index)); + if(done_.load() || !maybe_batch.has_value()) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + batches_generated_++; + output_callback_(std::move(batch)); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + int64_t scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; + }; + + class PartSuppGenerator : public TpchTableGenerator + { + public: + PartSuppGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) + { + batches_generated_.store(0); + } + + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, + gen_->SetPartSuppOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback 
schedule_callback) override + { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + Status ProduceCallback(size_t thread_index) + { + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextPartSuppBatch(thread_index)); + if(done_.load() || !maybe_batch.has_value()) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + batches_generated_++; + output_callback_(std::move(batch)); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + int64_t scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; + }; + + class CustomerGenerator : public TpchTableGenerator + { + public: + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + rows_to_generate_ = scale_factor_ * 150000; + rows_generated_.store(0); + ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( + columns, + types_, + name_map_, + gen_list_)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + thread_local_data_.resize(num_threads); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: +#define FOR_EACH_COLUMN(F) \ + F(C_CUSTKEY) \ + F(C_NAME) \ + F(C_ADDRESS) \ + F(C_NATIONKEY) \ + F(C_PHONE) \ + F(C_ACCTBAL) \ + F(C_MKTSEGMENT) \ + F(C_COMMENT) + +#define MAKE_ENUM(col) col, + struct CUSTOMER + { + enum + { + FOR_EACH_COLUMN(MAKE_ENUM) + kNumCols, + }; + }; +#undef MAKE_ENUM +#define MAKE_STRING_MAP(col) \ + { #col, CUSTOMER::col }, + const std::unordered_map name_map_ = + { + FOR_EACH_COLUMN(MAKE_STRING_MAP) + }; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector generators_ = + { + FOR_EACH_COLUMN(MAKE_FN_ARRAY) + }; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_COLUMN + + std::vector> types_ = + { + int32(), + utf8(), + utf8(), + int32(), + fixed_size_binary(15), + decimal(12, 2), + fixed_size_binary(10), + utf8(), + }; + + Status ProduceCallback(size_t thread_index) + { + if(done_.load()) + return Status::OK(); + ThreadLocalData &tld = thread_local_data_[thread_index]; + tld.custkey_start = rows_generated_.fetch_add(batch_size_); + if(tld.custkey_start >= rows_to_generate_) + return Status::OK(); + + tld.to_generate = std::min(batch_size_, + rows_to_generate_ - tld.custkey_start); + bool is_last_batch = tld.to_generate < batch_size_; + + tld.batch.clear(); + 
tld.batch.resize(CUSTOMER::kNumCols); + for(int col : gen_list_) + RETURN_NOT_OK(generators_[col](thread_index)); + + std::vector result(gen_list_.size()); + for(size_t i = 0; i < gen_list_.size(); i++) + { + int col_idx = gen_list_[i]; + result[i] = tld.batch[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); + batches_generated_++; + output_callback_(std::move(eb)); + if(is_last_batch) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + Status AllocateColumn(size_t thread_index, int column) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); + ArrayData ad(types_[column], tld.to_generate, { nullptr, std::move(buff) }); + tld.batch[column] = std::move(ad); + return Status::OK(); + } + + Status C_CUSTKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_CUSTKEY].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_CUSTKEY)); + int32_t *c_custkey = reinterpret_cast( + tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + c_custkey[irow] = (tld.custkey_start + irow + 1); + } + } + return Status::OK(); + } + + Status C_NAME(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_NAME].kind() == Datum::NONE) + { + RETURN_NOT_OK(C_CUSTKEY(thread_index)); + const int32_t *c_custkey = reinterpret_cast( + tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.to_generate + 1) * sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + const char *customer = "Customer#"; + const size_t customer_length = std::strlen(customer); + offsets[0] = 0; + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + int num_digits = GetNumDigits(c_custkey[irow]); + int num_chars = std::max(num_digits, 9); + offsets[irow + 1] = offsets[irow] + num_chars + customer_length; + } + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[tld.to_generate])); + char *str = reinterpret_cast(str_buff->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + char *out = str + offsets[irow]; + std::memcpy(out, customer, customer_length); + AppendNumberPaddedToNineDigits(out + customer_length, c_custkey[irow]); + } + ArrayData ad(utf8(), tld.to_generate, { nullptr, std::move(str_buff), std::move(offset_buff) }); + tld.batch[CUSTOMER::C_NAME] = std::move(ad); + } + return Status::OK(); + } + + Status C_ADDRESS(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_ADDRESS].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE( + tld.batch[CUSTOMER::C_ADDRESS], + RandomVString(tld.rng, tld.to_generate, 10, 40)); + } + return Status::OK(); + } + + Status C_NATIONKEY(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_NATIONKEY].kind() == Datum::NONE) + { + 
RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_NATIONKEY)); + std::uniform_int_distribution dist(0, 24); + int32_t *c_nationkey = reinterpret_cast( + tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + c_nationkey[irow] = dist(tld.rng); + } + return Status::OK(); + } + + Status C_PHONE(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_PHONE].kind() == Datum::NONE) + { + RETURN_NOT_OK(C_NATIONKEY(thread_index)); + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_PHONE)); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[CUSTOMER::C_PHONE]); + const int32_t *c_nationkey = reinterpret_cast( + tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->data()); + char *c_phone = reinterpret_cast( + tld.batch[CUSTOMER::C_PHONE].array()->buffers[1]->mutable_data()); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + GeneratePhoneNumber( + c_phone + irow * byte_width, + tld.rng, + c_nationkey[irow]); + } + } + return Status::OK(); + } + + Status C_ACCTBAL(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_ACCTBAL].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_ACCTBAL)); + Decimal128 *c_acctbal = reinterpret_cast( + tld.batch[CUSTOMER::C_ACCTBAL].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(-99999, 999999); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + c_acctbal[irow] = { dist(tld.rng) }; + } + return Status::OK(); + } + + Status C_MKTSEGMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_MKTSEGMENT].kind() == Datum::NONE) + { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_MKTSEGMENT)); + int32_t byte_width = arrow::internal::GetByteWidth(*types_[CUSTOMER::C_MKTSEGMENT]); + char *c_mktsegment = reinterpret_cast( + tld.batch[CUSTOMER::C_MKTSEGMENT].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(0, kNumSegments - 1); + for(int64_t irow = 0; irow < tld.to_generate; irow++) + { + char *out = c_mktsegment + irow * byte_width; + int str_idx = dist(tld.rng); + std::strncpy(out, Segments[str_idx], byte_width); + } + } + return Status::OK(); + } + + Status C_COMMENT(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + if(tld.batch[CUSTOMER::C_COMMENT].kind() == Datum::NONE) + { + ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], g_text.GenerateComments(batch_size_, 29, 116, tld.rng)); + } + return Status::OK(); + } + + struct ThreadLocalData + { + random::pcg32_fast rng; + int64_t custkey_start; + int64_t to_generate; + std::vector batch; + }; + std::vector thread_local_data_; + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t rows_to_generate_; + std::atomic rows_generated_; + int scale_factor_; + int64_t batch_size_; + std::vector gen_list_; + std::shared_ptr schema_; + }; + + class OrdersGenerator : public TpchTableGenerator + { + public: + OrdersGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) + { + batches_generated_.store(0); + } + + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, + gen_->SetOrdersOutputColumns(columns)); + return Status::OK(); 
+ } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + Status ProduceCallback(size_t thread_index) + { + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextOrdersBatch(thread_index)); + if(done_.load() || !maybe_batch.has_value()) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + batches_generated_++; + output_callback_(std::move(batch)); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + int64_t scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; + }; + + class LineitemGenerator : public TpchTableGenerator + { + public: + LineitemGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) + {} + + Status Init( + std::vector columns, + int scale_factor, + int64_t batch_size) override + { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, + gen_->SetLineItemOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override + { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + Status ProduceCallback(size_t thread_index) + { + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextLineItemBatch(thread_index)); + if(!maybe_batch.has_value()) + { + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + { + finished_callback_(batches_generated_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + batches_generated_++; + output_callback_(std::move(batch)); + return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + int64_t scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; + }; + + class NationGenerator : public TpchTableGenerator + { + public: + Status Init( + std::vector columns, + int /*scale_factor*/, + int64_t /*batch_size*/) override + { + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns( + columns, + types_, + name_map_, + column_indices_)); + return Status::OK(); + } + + Status StartProducing( + size_t /*num_threads*/, + OutputBatchCallback 
output_callback, + FinishedCallback finished_callback, + ScheduleCallback /*schedule_task_callback*/) override + { + std::shared_ptr N_NATIONKEY_buffer = Buffer::Wrap(N_NATIONKEY, sizeof(N_NATIONKEY)); + ArrayData N_NATIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(N_NATIONKEY_buffer) }); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr N_NAME_buffer, AllocateBuffer(kRowCount * kNameByteWidth)); + char *N_NAME = reinterpret_cast(N_NAME_buffer->mutable_data()); + for(size_t i = 0; i < kRowCount; i++) + std::strncpy(N_NAME + kNameByteWidth * i, country_names_[i], kNameByteWidth); + ArrayData N_NAME_arraydata(fixed_size_binary(kNameByteWidth), kRowCount, { nullptr, std::move(N_NAME_buffer) }); + + std::shared_ptr N_REGIONKEY_buffer = Buffer::Wrap(N_REGIONKEY, sizeof(N_REGIONKEY)); + ArrayData N_REGIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(N_REGIONKEY_buffer) }); + + ARROW_ASSIGN_OR_RAISE(Datum N_COMMENT_datum, g_text.GenerateComments(kRowCount, 31, 114, rng_)); + + std::vector fields = + { + std::move(N_NATIONKEY_arraydata), + std::move(N_NAME_arraydata), + std::move(N_REGIONKEY_arraydata), + std::move(N_COMMENT_datum) + }; + + std::vector result; + for(const int &col : column_indices_) + result.push_back(fields[col]); + ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); + output_callback(std::move(batch)); + finished_callback(static_cast(1)); + return Status::OK(); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + private: + random::pcg32_fast rng_; + + static constexpr size_t kRowCount = 25; + static constexpr int32_t kNameByteWidth = 25; + const int32_t N_NATIONKEY[kRowCount] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; + const char *country_names_[kRowCount] = + { + "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", "FRANCE", "GERMANY", + "INDONESIA", "IRAQ", "IRAN", "JAPAN", "JORDAN", "KENYA", "MOROCCO", "MOZAMBIQUE", "PERU", + "CHINA", "ROMANIA", "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES" + }; + const int32_t N_REGIONKEY[kRowCount] = { 0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1 }; + + struct NATION + { + enum + { + N_NATIONKEY, + N_NAME, + N_REGIONKEY, + N_COMMENT, + }; + }; + + const std::unordered_map name_map_ = + { + { "N_NATIONKEY", NATION::N_NATIONKEY }, + { "N_NAME", NATION::N_NAME }, + { "N_REGIONKEY", NATION::N_REGIONKEY }, + { "N_COMMENT", NATION::N_COMMENT }, + }; + + std::vector> types_ = + { + int32(), + fixed_size_binary(kNameByteWidth), + int32(), + utf8(), + }; + + std::shared_ptr schema_; + std::vector column_indices_; + }; + + class RegionGenerator : public TpchTableGenerator + { + public: + Status Init( + std::vector columns, + int /*scale_factor*/, + int64_t /*batch_size*/) override + { + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns( + columns, + types_, + name_map_, + column_indices_)); + return Status::OK(); + } + + Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback /*schedule_task_callback*/) override + { + std::shared_ptr R_REGIONKEY_buffer = Buffer::Wrap(R_REGIONKEY, sizeof(R_REGIONKEY)); + ArrayData R_REGIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(R_REGIONKEY_buffer) }); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr R_NAME_buffer, AllocateBuffer(kRowCount * kNameByteWidth)); + char *R_NAME_data = reinterpret_cast(R_NAME_buffer->mutable_data()); + for(size_t i = 0; 
i < kRowCount; i++) + std::strncpy(R_NAME_data + kNameByteWidth * i, region_names_[i], kNameByteWidth); + ArrayData R_NAME_arraydata(types_[static_cast(REGION::R_NAME)], kRowCount, { nullptr, std::move(R_NAME_buffer) }); + + ARROW_ASSIGN_OR_RAISE(Datum R_COMMENT_datum, g_text.GenerateComments(kRowCount, 31, 115, rng_)); + + std::vector fields = { std::move(R_REGIONKEY_arraydata), std::move(R_NAME_arraydata), std::move(R_COMMENT_datum) }; + std::vector result; + for(const int &col : column_indices_) + result.push_back(fields[col]); + ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); + output_callback(std::move(batch)); + finished_callback(static_cast(1)); + return Status::OK(); + } + + std::shared_ptr schema() const override + { + return schema_; + } + + random::pcg32_fast rng_; + + static constexpr size_t kRowCount = 5; + static constexpr int32_t kNameByteWidth = 25; + const int32_t R_REGIONKEY[kRowCount] = { 0, 1, 2, 3, 4 }; + const char *region_names_[kRowCount] = + { + "AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST" + }; + + struct REGION + { + enum + { + R_REGIONKEY, + R_NAME, + R_COMMENT, + kNumColumns, + }; + }; + + const std::unordered_map name_map_ = + { + { "R_REGIONKEY", REGION::R_REGIONKEY }, + { "R_NAME", REGION::R_NAME }, + { "R_COMMENT", REGION::R_COMMENT }, + }; + + const std::vector> types_ = + { + int32(), + fixed_size_binary(kNameByteWidth), + utf8(), + }; + + std::shared_ptr schema_; + std::vector column_indices_; + }; + + class TpchNode : public ExecNode + { + public: + TpchNode(ExecPlan *plan, + std::unique_ptr generator) + : ExecNode(plan, {}, {}, generator->schema(), /*num_outputs=*/1), + generator_(std::move(generator)) + { + } + + const char *kind_name() const override + { + return "TpchNode"; + } + + [[noreturn]] + static void NoInputs() + { + Unreachable("TPC-H node should never have any inputs"); + } + + [[noreturn]] + void InputReceived(ExecNode *, ExecBatch) override + { + NoInputs(); + } + + [[noreturn]] + void ErrorReceived(ExecNode *, Status) override + { + NoInputs(); + } + + [[noreturn]] + void InputFinished(ExecNode *, int) override + { + NoInputs(); + } + + Status StartProducing() override + { + finished_ = Future<>::Make(); + return generator_->StartProducing( + thread_indexer_.Capacity(), + [this](ExecBatch batch) { this->OutputBatchCallback(std::move(batch)); }, + [this](int64_t num_batches) { this->FinishedCallback(num_batches); }, + [this](std::function func) -> Status { return this->ScheduleTaskCallback(std::move(func)); } + ); + } + + void PauseProducing(ExecNode *output) override {} + void ResumeProducing(ExecNode *output) override {} + + void StopProducing(ExecNode *output) override + { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); + } + + void StopProducing() override + { + generator_->Abort([this]() { this->finished_.MarkFinished(); }); + } + + Future<> finished() override + { + return finished_; + } + + private: + void OutputBatchCallback(ExecBatch batch) + { + outputs_[0]->InputReceived(this, std::move(batch)); + } + + void FinishedCallback(int64_t total_num_batches) + { + outputs_[0]->InputFinished(this, static_cast(total_num_batches)); + finished_.MarkFinished(); + } + + Status ScheduleTaskCallback(std::function func) + { + auto executor = plan_->exec_context()->executor(); + if (executor) + { + RETURN_NOT_OK(executor->Spawn([this, func] + { + size_t thread_index = thread_indexer_(); + Status status = func(thread_index); + if (!status.ok()) + { + StopProducing(); + ErrorIfNotOk(status); + return; + } 
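+ // On success there is nothing left to do here: the generator task
+ // re-schedules its own follow-up work and reports completion through
+ // the FinishedCallback.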
+ })); + } + else + { + return func(0); + } + return Status::OK(); + } + + std::unique_ptr generator_; + + Future<> finished_ = Future<>::MakeFinished(); + ThreadIndexer thread_indexer_; + }; + + Result TpchGen::Make(ExecPlan *plan, int scale_factor, int64_t batch_size) + { + static bool has_inited_text = false; + if(!has_inited_text) + { + RETURN_NOT_OK(g_text.Init()); + has_inited_text = true; + } + TpchGen result(plan, scale_factor, batch_size); + return result; + } + + template + Result TpchGen::CreateNode(std::vector columns) + { + std::unique_ptr generator = arrow::internal::make_unique(); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::Supplier(std::vector columns) + { + return CreateNode(std::move(columns)); + } + + Result TpchGen::Part(std::vector columns) + { + if(!part_and_part_supp_generator_) + { + part_and_part_supp_generator_ = std::make_shared(); + } + std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::PartSupp(std::vector columns) + { + if(!part_and_part_supp_generator_) + { + part_and_part_supp_generator_ = std::make_shared(); + } + std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::Customer(std::vector columns) + { + return CreateNode(std::move(columns)); + } + + Result TpchGen::Orders(std::vector columns) + { + if(!orders_and_line_item_generator_) + { + orders_and_line_item_generator_ = std::make_shared(); + } + std::unique_ptr generator = arrow::internal::make_unique(orders_and_line_item_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::Lineitem(std::vector columns) + { + if(!orders_and_line_item_generator_) + { + orders_and_line_item_generator_ = std::make_shared(); + } + std::unique_ptr generator = arrow::internal::make_unique(orders_and_line_item_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + return plan_->EmplaceNode(plan_, std::move(generator)); + } + + Result TpchGen::Nation(std::vector columns) + { + return CreateNode(std::move(columns)); + } + + Result TpchGen::Region(std::vector columns) + { + return CreateNode(std::move(columns)); + } + } +} diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h new file mode 100644 index 00000000000..dc282aae981 --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/pcg_random.h" +#include +#include + +namespace arrow +{ + namespace compute + { + class OrdersAndLineItemGenerator; + class PartAndPartSupplierGenerator; + + class TpchGen + { + public: + static Result Make(ExecPlan *plan, int scale_factor = 1, int64_t batch_size = 4096); + + Result Supplier(std::vector columns = {}); + Result Part(std::vector columns = {}); + Result PartSupp(std::vector columns = {}); + Result Customer(std::vector columns = {}); + Result Orders(std::vector columns = {}); + Result Lineitem(std::vector columns = {}); + Result Nation(std::vector columns = {}); + Result Region(std::vector columns = {}); + + private: + TpchGen(ExecPlan *plan, int scale_factor, int64_t batch_size) + : plan_(plan), + scale_factor_(scale_factor), + batch_size_(batch_size), + orders_and_line_item_generator_(nullptr) + {} + + template + Result CreateNode(std::vector columns); + + ExecPlan *plan_; + int scale_factor_; + int64_t batch_size_; + + std::shared_ptr part_and_part_supp_generator_; + std::shared_ptr orders_and_line_item_generator_; + }; + } +} diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 7b2968253b3..ffa3b30a5d4 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -890,7 +890,8 @@ class TableSorter { TableSorter(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const Table& table, const SortOptions& options) - : ctx_(ctx), + : status_(), + ctx_(ctx), table_(table), batches_(MakeBatches(table, &status_)), options_(options), @@ -1131,6 +1132,7 @@ class TableSorter { MergeNullsOnly(range_begin, range_middle, range_end, temp_indices, null_count); } + Status status_; ExecContext* ctx_; const Table& table_; const RecordBatchVector batches_; @@ -1141,7 +1143,6 @@ class TableSorter { uint64_t* indices_begin_; uint64_t* indices_end_; Comparator comparator_; - Status status_; }; // ---------------------------------------------------------------------- From a1514d876c8e43dc1639ee466b0c8e410b88bc8a Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 2 Mar 2022 15:08:17 -0600 Subject: [PATCH 02/34] Draft of R bindings --- r/DESCRIPTION | 1 + r/NAMESPACE | 1 + r/R/arrowExports.R | 4 +++ r/R/tpch.R | 36 ++++++++++++++++++++++ r/man/tpch_dbgen.Rd | 20 ++++++++++++ r/src/arrowExports.cpp | 17 +++++++++++ r/src/compute-exec.cpp | 55 +++++++++++++++++++++++++++++++++ r/tests/testthat/test-tpch.R | 59 ++++++++++++++++++++++++++++++++++++ 8 files changed, 193 insertions(+) create mode 100644 r/R/tpch.R create mode 100644 r/man/tpch_dbgen.Rd create mode 100644 r/tests/testthat/test-tpch.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 36a55c05b26..ecbbfb79ac2 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -126,4 +126,5 @@ Collate: 'reexports-bit64.R' 'reexports-tidyselect.R' 'schema.R' + 'tpch.R' 'util.R' diff --git a/r/NAMESPACE b/r/NAMESPACE 
index ae06e8e03aa..b24cad1fdb4 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -290,6 +290,7 @@ export(time64) export(timestamp) export(to_arrow) export(to_duckdb) +export(tpch_dbgen) export(type) export(uint16) export(uint32) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index e56e157413e..94f05f1482b 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -444,6 +444,10 @@ ExecNode_TableSourceNode <- function(plan, table) { .Call(`_arrow_ExecNode_TableSourceNode`, plan, table) } +Tpch_Dbgen <- function(plan, scale_factor, table_name) { + .Call(`_arrow_Tpch_Dbgen`, plan, scale_factor, table_name) +} + RecordBatch__cast <- function(batch, schema, options) { .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } diff --git a/r/R/tpch.R b/r/R/tpch.R new file mode 100644 index 00000000000..78c2d112584 --- /dev/null +++ b/r/R/tpch.R @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +tpch_tables <- c("customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier") + + +#' Generate a RecordBatchReader with TPC-H data in it +#' +#' @param table the table to generate +#' @param scale_factor the scale factor to generate +#' +#' @return a RecordBatchReader that will contain the generated data +#' @export +#' +#' @keywords internal +tpch_dbgen <- function(table = tpch_tables, scale_factor) { + table <- match.arg(table) + + Tpch_Dbgen(arrow:::ExecPlan$create(), scale_factor, table) +} + + diff --git a/r/man/tpch_dbgen.Rd b/r/man/tpch_dbgen.Rd new file mode 100644 index 00000000000..88cc1cf1857 --- /dev/null +++ b/r/man/tpch_dbgen.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tpch.R +\name{tpch_dbgen} +\alias{tpch_dbgen} +\title{Generate a RecordBatchReader with TPC-H data in it} +\usage{ +tpch_dbgen(table = tpch_tables, scale_factor) +} +\arguments{ +\item{table}{the table to generate} + +\item{scale_factor}{the scale factor to generate} +} +\value{ +a RecordBatchReader that will contain the generated data +} +\description{ +Generate a RecordBatchReader with TPC-H data in it +} +\keyword{internal} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 8508b601703..1ec4c6f3ea6 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1749,6 +1749,23 @@ extern "C" SEXP _arrow_ExecNode_TableSourceNode(SEXP plan_sexp, SEXP table_sexp) } #endif +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr Tpch_Dbgen(const std::shared_ptr& plan, int scale_factor, std::string table_name); +extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type plan(plan_sexp); + arrow::r::Input::type scale_factor(scale_factor_sexp); + arrow::r::Input::type 
table_name(table_name_sexp); + return cpp11::as_sexp(Tpch_Dbgen(plan, scale_factor, table_name)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ + Rf_error("Cannot call Tpch_Dbgen(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // compute.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, cpp11::list options); diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index f46c3cefb36..0625be981bb 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -291,4 +292,58 @@ std::shared_ptr ExecNode_TableSourceNode( return MakeExecNodeOrStop("table_source", plan.get(), {}, options); } +std::shared_ptr Tpch_Dbgen( + const std::shared_ptr& plan, + int scale_factor, + std::string table_name + ) { + + auto gen = ValueOrStop(arrow::compute::TpchGen::Make(plan.get(), scale_factor)); + + compute::ExecNode *table; + if (table_name == "part") { + table = ValueOrStop(gen.Part()); + } else if (table_name == "supplier") { + table = ValueOrStop(gen.Supplier()); + } else if (table_name == "partsupp") { + table = ValueOrStop(gen.PartSupp()); + } else if (table_name == "customer") { + table = ValueOrStop(gen.Customer()); + } else if (table_name == "nation") { + table = ValueOrStop(gen.Nation()); + } else if (table_name == "lineitem") { + table = ValueOrStop(gen.Lineitem()); + } else if (table_name == "region") { + table = ValueOrStop(gen.Region()); + } else if (table_name == "orders") { + table = ValueOrStop(gen.Orders()); + } else { + cpp11::stop("That's not a valid table name"); + } + + arrow::AsyncGenerator> sink_gen; + + MakeExecNodeOrStop("sink", plan.get(), {table}, + compute::SinkNodeOptions{&sink_gen}); + + StopIfNotOk(plan->Validate()); + StopIfNotOk(plan->StartProducing()); + + // If the generator is destroyed before being completely drained, inform plan + std::shared_ptr stop_producing{nullptr, [plan](...) { + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); + + if (not_finished_yet) { + plan->StopProducing(); + } + }}; + + return compute::MakeGeneratorReader( + table->output_schema(), + [stop_producing, plan, sink_gen] { return sink_gen(); }, gc_memory_pool()); +} + #endif diff --git a/r/tests/testthat/test-tpch.R b/r/tests/testthat/test-tpch.R new file mode 100644 index 00000000000..8077f76e4fd --- /dev/null +++ b/r/tests/testthat/test-tpch.R @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
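For readers skimming the patch, here is a minimal usage sketch of the tpch_dbgen() binding added in R/tpch.R above. It mirrors the tests that follow; the choice of the "region" table at scale factor 1 is illustrative only and not part of the patch.

# Illustrative only: drain the returned RecordBatchReader into a Table, then inspect it.
region_reader <- tpch_dbgen("region", scale_factor = 1)
region_table <- region_reader$read_table()
region_table$schema   # R_REGIONKEY, R_NAME, R_COMMENT
dim(region_table)     # 5 rows and 3 columns, independent of scale factor

The region table is fixed at 5 rows by the generator, so it makes a cheap smoke test before generating the larger tables.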
+ +test_that("tpch_dbgen()", { + lineitem_rbr <- tpch_dbgen("lineitem", 1) + lineitem_tab <- lineitem_rbr$read_table() + expect_identical(ncol(lineitem_tab), 16L) + + # and check a handful of types + expect_type_equal(lineitem_tab[["L_ORDERKEY"]], int32()) + expect_type_equal(lineitem_tab[["L_RECEIPTDATE"]], date32()) + + region_rbr <- tpch_dbgen("region", 1) + region_tab <- region_rbr$read_table() + expect_identical(dim(region_tab), c(5L, 3L)) + + # and check a handful of types + expect_type_equal(region_tab[["R_REGIONKEY"]], int32()) + expect_type_equal(region_tab[["R_COMMENT"]], string()) + + part_rbr <- tpch_dbgen("part", 1) + part_tab <- part_rbr$read_table() + expect_identical(dim(part_tab), c(200000L, 9L)) + + # and check a handful of types + expect_type_equal(part_tab[["R_PARTKEY"]], int32()) +}) + +# these two are tested above +tpch_tables_up <- setdiff(tpch_tables, c("lineitem", "region")) + +# nation segfaults +# supplier hangs +tpch_tables_up <- setdiff(tpch_tables_up, c("nation", "supplier")) + +# all of the rest below have an error with: +# Invalid: Arrays used to construct an ExecBatch must have equal length + +for (table_name in tpch_tables_up) { + test_that(paste0("Generating table: ", table_name), { + rbr <- tpch_dbgen(table_name, 1) + tab <- rbr$read_table() + expect_r6_class(tab, "Table") + }) +} From bd27962122bd07a07adc3d10215c570dfe52dc75 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 2 Mar 2022 18:08:41 -0800 Subject: [PATCH 03/34] Fix bugs, parallel text generation, rudimentary tests --- cpp/src/arrow/compute/exec/CMakeLists.txt | 2 + cpp/src/arrow/compute/exec/tpch_node.cc | 302 +++++++++++-------- cpp/src/arrow/compute/exec/tpch_node_test.cc | 203 +++++++++++++ 3 files changed, 383 insertions(+), 124 deletions(-) create mode 100644 cpp/src/arrow/compute/exec/tpch_node_test.cc diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index bca6ec2c6e2..1292213dc45 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -32,6 +32,8 @@ add_arrow_compute_test(hash_join_node_test hash_join_node_test.cc bloom_filter_test.cc key_hash_test.cc) +add_arrow_compute_test(hash_join_node_test PREFIX "arrow-compute") +add_arrow_compute_test(tpch_node_test PREFIX "arrow-compute") add_arrow_compute_test(union_node_test PREFIX "arrow-compute") add_arrow_compute_test(util_test PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 842bf828574..445df7d08b9 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -22,7 +22,7 @@ namespace arrow class TpchText { public: - Status Init(); + Status InitIfNeeded(random::pcg32_fast &rng); Result GenerateComments( size_t num_comments, size_t min_length, @@ -30,24 +30,28 @@ namespace arrow random::pcg32_fast &rng); private: - void GenerateWord(size_t &offset, const char **words, size_t num_choices); - void GenerateNoun(size_t &offset); - void GenerateVerb(size_t &offset); - void GenerateAdjective(size_t &offset); - void GenerateAdverb(size_t &offset); - void GeneratePreposition(size_t &offset); - void GenerateAuxiliary(size_t &offset); - void GenerateTerminator(size_t &offset); + bool GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices); + bool GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char 
*arr); + bool GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr); - void GenerateNounPhrase(size_t &offset); - void GenerateVerbPhrase(size_t &offset); - void GeneratePrepositionalPhrase(size_t &offset); + bool GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); - void GenerateSentence(size_t &offset); + bool GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr); + std::atomic done_ = { false }; + int64_t generated_offset_ = 0; + std::mutex text_guard_; std::unique_ptr text_; random::pcg32_fast rng_; - static constexpr size_t kTextBytes = 300 * 1024 * 1024; // 300 MB + static constexpr int64_t kChunkSize = 8192; + static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB }; class TpchTableGenerator @@ -150,11 +154,13 @@ namespace arrow std::vector> fields; if(columns.empty()) { + fields.resize(name_map.size()); + gen_list.resize(name_map.size()); for(auto pair : name_map) { int col_idx = pair.second; - fields.push_back(field(pair.first, types[col_idx])); - gen_list.push_back(col_idx); + fields[col_idx] = field(pair.first, types[col_idx]); + gen_list[col_idx] = col_idx; } return schema(std::move(fields)); } @@ -175,12 +181,39 @@ namespace arrow static TpchText g_text; - Status TpchText::Init() + Status TpchText::InitIfNeeded(random::pcg32_fast &rng) { - ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); - size_t offset = 0; - while(offset < kTextBytes) - GenerateSentence(offset); + if(done_.load()) + return Status::OK(); + + { + std::lock_guard lock(text_guard_); + if(!text_) + { + ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); + } + } + char *out = reinterpret_cast(text_->mutable_data()); + char temp_buff[kChunkSize]; + while(done_.load() == false) + { + int64_t current_offset = 0; + int64_t offset = 0; + while(GenerateSentence(offset, rng, temp_buff)) + current_offset = offset; + + { + std::lock_guard lock(text_guard_); + if(done_.load()) + return Status::OK(); + int64_t bytes_remaining = kTextBytes - generated_offset_; + int64_t memcpy_size = std::min(offset, bytes_remaining); + std::memcpy(out + generated_offset_, temp_buff, memcpy_size); + generated_offset_ += memcpy_size; + if(generated_offset_ == kTextBytes) + done_.store(true); + } + } return Status::OK(); } @@ -190,6 +223,7 @@ namespace arrow size_t max_length, random::pcg32_fast &rng) { + RETURN_NOT_OK(InitIfNeeded(rng)); std::uniform_int_distribution length_dist(min_length, max_length); ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); int32_t *offsets = reinterpret_cast(offset_buffer->mutable_data()); @@ -206,7 +240,7 @@ namespace arrow size_t offset_in_text = offset_dist(rng); std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); } - ArrayData ad(utf8(), num_comments, { nullptr, std::move(comment_buffer), std::move(offset_buffer) }); + ArrayData ad(utf8(), num_comments, { nullptr, std::move(offset_buffer), std::move(comment_buffer) }); return std::move(ad); } @@ -237,7 +271,7 @@ namespace 
arrow for(int32_t i = 0; i < offsets[num_rows]; i++) str[i] = alpha_numerics[char_dist(rng)]; - ArrayData ad(utf8(), num_rows, { nullptr, std::move(str_buff), std::move(offset_buff) }); + ArrayData ad(utf8(), num_rows, { nullptr, std::move(offset_buff), std::move(str_buff) }); return std::move(ad); } @@ -246,10 +280,10 @@ namespace arrow out += (num_digits - 1); while(x > 0) { - *out-- = x % 10; + *out-- = '0' + (x % 10); x /= 10; } - x += num_digits; + out += num_digits; } void GeneratePhoneNumber( @@ -405,163 +439,176 @@ namespace arrow }; static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); - void TpchText::GenerateWord(size_t &offset, const char **words, size_t num_choices) + bool TpchText::GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices) { std::uniform_int_distribution dist(0, num_choices - 1); - const char *word = words[dist(rng_)]; - size_t bytes_left = kTextBytes - offset; + const char *word = words[dist(rng)]; size_t length = std::strlen(word); - size_t bytes_to_copy = std::min(bytes_left, length); - std::memcpy(text_->mutable_data() + offset, word, bytes_to_copy); - offset += bytes_to_copy; + if(offset + length > kChunkSize) + return false; + std::memcpy(arr + offset, word, length); + offset += length; + return true; } - void TpchText::GenerateNoun(size_t &offset) + bool TpchText::GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Nouns, kNumNouns); + return GenerateWord(offset, rng, arr, Nouns, kNumNouns); } - void TpchText::GenerateVerb(size_t &offset) + bool TpchText::GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Verbs, kNumVerbs); + return GenerateWord(offset, rng, arr, Verbs, kNumVerbs); } - void TpchText::GenerateAdjective(size_t &offset) + bool TpchText::GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Adjectives, kNumAdjectives); + return GenerateWord(offset, rng, arr, Adjectives, kNumAdjectives); } - void TpchText::GenerateAdverb(size_t &offset) + bool TpchText::GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Adverbs, kNumAdverbs); + return GenerateWord(offset, rng, arr, Adverbs, kNumAdverbs); } - void TpchText::GeneratePreposition(size_t &offset) + bool TpchText::GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Prepositions, kNumPrepositions); + return GenerateWord(offset, rng, arr, Prepositions, kNumPrepositions); } - void TpchText::GenerateAuxiliary(size_t &offset) + bool TpchText::GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Auxiliaries, kNumAuxiliaries); + return GenerateWord(offset, rng, arr, Auxiliaries, kNumAuxiliaries); } - void TpchText::GenerateTerminator(size_t &offset) + bool TpchText::GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr) { - GenerateWord(offset, Terminators, kNumTerminators); + bool result = GenerateWord(offset, rng, arr, Terminators, kNumTerminators); + // Swap the space with the terminator + if(result) + std::swap(*(arr + offset - 2), *(arr + offset - 1)); + return result; } - void TpchText::GenerateNounPhrase(size_t &offset) + bool TpchText::GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 3); const char *comma_space = ", "; + bool success = true; switch(dist(rng_)) { case 0: - 
GenerateNoun(offset); + success &= GenerateNoun(offset, rng, arr); break; case 1: - GenerateAdjective(offset); - GenerateNoun(offset); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); break; case 2: - GenerateAdjective(offset); - GenerateWord(offset, &comma_space, 1); - GenerateAdjective(offset); - GenerateNoun(offset); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateWord(offset, rng, arr, &comma_space, 1); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); break; case 3: - GenerateAdverb(offset); - GenerateAdjective(offset); - GenerateNoun(offset); + GenerateAdverb(offset, rng, arr); + GenerateAdjective(offset, rng, arr); + GenerateNoun(offset, rng, arr); break; default: Unreachable("Random number should be between 0 and 3 inclusive"); break; } + return success; } - void TpchText::GenerateVerbPhrase(size_t &offset) + bool TpchText::GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 3); + bool success = true; switch(dist(rng_)) { case 0: - GenerateVerb(offset); + success &= GenerateVerb(offset, rng, arr); break; case 1: - GenerateAuxiliary(offset); - GenerateVerb(offset); + success &= GenerateAuxiliary(offset, rng, arr); + success &= GenerateVerb(offset, rng, arr); break; case 2: - GenerateVerb(offset); - GenerateAdverb(offset); + success &= GenerateVerb(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); break; case 3: - GenerateAuxiliary(offset); - GenerateVerb(offset); - GenerateAdverb(offset); + success &= GenerateAuxiliary(offset, rng, arr); + success &= GenerateVerb(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); break; default: Unreachable("Random number should be between 0 and 3 inclusive"); break; } + return success; } - void TpchText::GeneratePrepositionalPhrase(size_t &offset) + bool TpchText::GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { const char *the_space = "the "; - GeneratePreposition(offset); - GenerateWord(offset, &the_space, 1); - GenerateNounPhrase(offset); + bool success = true; + success &= GeneratePreposition(offset, rng, arr); + success &= GenerateWord(offset, rng, arr, &the_space, 1); + success &= GenerateNounPhrase(offset, rng, arr); + return success; } - void TpchText::GenerateSentence(size_t &offset) + bool TpchText::GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 4); + bool success = true; switch(dist(rng_)) { case 0: - GenerateNounPhrase(offset); - GenerateVerbPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 1: - GenerateNounPhrase(offset); - GenerateVerbPhrase(offset); - GeneratePrepositionalPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 2: - GenerateNounPhrase(offset); - GenerateVerbPhrase(offset); - GenerateNounPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 3: - 
GenerateNounPhrase(offset); - GenerateVerbPhrase(offset); - GenerateNounPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 4: - GenerateNounPhrase(offset); - GeneratePrepositionalPhrase(offset); - GenerateVerbPhrase(offset); - GenerateNounPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; case 5: - GenerateNounPhrase(offset); - GeneratePrepositionalPhrase(offset); - GenerateVerbPhrase(offset); - GeneratePrepositionalPhrase(offset); - GenerateTerminator(offset); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); break; default: Unreachable("Random number should be between 0 and 5 inclusive"); break; } + return success; } using GenerateColumnFn = std::function; @@ -669,14 +716,17 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; { - std::lock_guard lock(part_output_queue_mutex_); - if(!part_output_queue_.empty()) + std::lock_guard lock(partsupp_output_queue_mutex_); + if(!partsupp_output_queue_.empty()) { - ExecBatch batch = std::move(part_output_queue_.front()); - part_output_queue_.pop(); - return std::move(batch); + ExecBatch result = std::move(partsupp_output_queue_.front()); + partsupp_output_queue_.pop(); + return std::move(result); } - else if(part_rows_generated_ == part_rows_to_generate_) + } + { + std::lock_guard lock(part_output_queue_mutex_); + if(part_rows_generated_ == part_rows_to_generate_) { return util::nullopt; } @@ -885,7 +935,7 @@ namespace arrow *row++ = ' '; } } - ArrayData ad(part_types_[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(string_buffer), std::move(offset_buff) }); + ArrayData ad(part_types_[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); Datum datum(ad); tld.part[PART::P_NAME] = std::move(datum); } @@ -916,7 +966,7 @@ namespace arrow Status P_BRAND(size_t thread_index) { ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_MFGR].kind() == Datum::NONE) + if(tld.part[PART::P_BRAND].kind() == Datum::NONE) { RETURN_NOT_OK(P_MFGR(thread_index)); std::uniform_int_distribution dist(1, 5); @@ -987,7 +1037,7 @@ namespace arrow *row++ = ' '; } } - ArrayData ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(string_buffer), std::move(offset_buff) }); + ArrayData ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); Datum datum(ad); tld.part[PART::P_TYPE] = std::move(datum); } @@ -1065,7 +1115,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.part[PART::P_COMMENT].kind() == Datum::NONE) { - ARROW_ASSIGN_OR_RAISE(tld.part[PART::P_COMMENT], g_text.GenerateComments(batch_size_, 5, 22, tld.rng)); + ARROW_ASSIGN_OR_RAISE(tld.part[PART::P_COMMENT], g_text.GenerateComments(tld.part_to_generate, 5, 22, tld.rng)); } return Status::OK(); } @@ -1222,7 +1272,7 
@@ namespace arrow for(int64_t irun = 0; irun < next_run; irun++) ps_supplycost[irun] = { dist(tld.rng) }; - tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run; + tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST].array()->length = next_run; irow += next_run; } } @@ -1594,8 +1644,11 @@ namespace arrow tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->mutable_data()); for(int64_t i = 0; i < tld.orders_to_generate; i++) { - o_orderkey[i] = (tld.orderkey_start + i + 1); - ARROW_DCHECK(1 <= o_orderkey[i] && o_orderkey[i] <= orders_rows_to_generate_); + int32_t orderkey_index = tld.orderkey_start + i; + int32_t index_of_run = orderkey_index / 8; + int32_t index_in_run = orderkey_index % 8; + o_orderkey[i] = (index_of_run * 32 + index_in_run + 1); + ARROW_DCHECK(1 <= o_orderkey[i] && o_orderkey[i] <= 4 * orders_rows_to_generate_); } } return Status::OK(); @@ -1802,7 +1855,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.orders[ORDERS::O_COMMENT].kind() == Datum::NONE) { - ARROW_ASSIGN_OR_RAISE(tld.orders[ORDERS::O_COMMENT], g_text.GenerateComments(batch_size_, 19, 78, tld.rng)); + ARROW_ASSIGN_OR_RAISE(tld.orders[ORDERS::O_COMMENT], g_text.GenerateComments(tld.orders_to_generate, 19, 78, tld.rng)); } return Status::OK(); } @@ -2444,6 +2497,7 @@ namespace arrow { bad_row = dist(rng); } while(good_rows_set.find(bad_row) != good_rows_set.end()); + bad_rows_set.insert(bad_row); } good_rows_.clear(); bad_rows_.clear(); @@ -2680,7 +2734,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.batch[SUPPLIER::S_COMMENT].kind() == Datum::NONE) { - ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], g_text.GenerateComments(batch_size_, 25, 100, tld.rng)); + ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], g_text.GenerateComments(tld.to_generate, 25, 100, tld.rng)); ModifyComments(thread_index, "Recommends", good_rows_); ModifyComments(thread_index, "Complaints", bad_rows_); } @@ -2694,9 +2748,9 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; const int32_t *offsets = reinterpret_cast( - tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->data()); + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->data()); char *str = reinterpret_cast( - tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->mutable_data()); + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->mutable_data()); const char *customer = "Customer"; const size_t customer_length = std::strlen(customer); const size_t review_length = std::strlen(review); @@ -3057,7 +3111,7 @@ namespace arrow std::memcpy(out, customer, customer_length); AppendNumberPaddedToNineDigits(out + customer_length, c_custkey[irow]); } - ArrayData ad(utf8(), tld.to_generate, { nullptr, std::move(str_buff), std::move(offset_buff) }); + ArrayData ad(utf8(), tld.to_generate, { nullptr, std::move(offset_buff), std::move(str_buff) }); tld.batch[CUSTOMER::C_NAME] = std::move(ad); } return Status::OK(); @@ -3153,7 +3207,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.batch[CUSTOMER::C_COMMENT].kind() == Datum::NONE) { - ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], g_text.GenerateComments(batch_size_, 29, 116, tld.rng)); + ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], g_text.GenerateComments(tld.to_generate, 29, 116, tld.rng)); } return Status::OK(); } @@ -3381,9 +3435,15 @@ namespace arrow const int32_t N_NATIONKEY[kRowCount] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 
23, 24 }; const char *country_names_[kRowCount] = { - "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", "FRANCE", "GERMANY", - "INDONESIA", "IRAQ", "IRAN", "JAPAN", "JORDAN", "KENYA", "MOROCCO", "MOZAMBIQUE", "PERU", - "CHINA", "ROMANIA", "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES" + "ALGERIA", "ARGENTINA", "BRAZIL", + "CANADA", "EGYPT", "ETHIOPIA", + "FRANCE", "GERMANY", "INDIA", + "INDONESIA", "IRAN", "IRAQ", + "JAPAN", "JORDAN", "KENYA", + "MOROCCO", "MOZAMBIQUE", "PERU", + "CHINA", "ROMANIA", "SAUDI ARABIA", + "VIETNAM", "RUSSIA", "UNITED KINGDOM", + "UNITED STATES" }; const int32_t N_REGIONKEY[kRowCount] = { 0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1 }; @@ -3619,12 +3679,6 @@ namespace arrow Result TpchGen::Make(ExecPlan *plan, int scale_factor, int64_t batch_size) { - static bool has_inited_text = false; - if(!has_inited_text) - { - RETURN_NOT_OK(g_text.Init()); - has_inited_text = true; - } TpchGen result(plan, scale_factor, batch_size); return result; } @@ -3659,7 +3713,7 @@ namespace arrow { part_and_part_supp_generator_ = std::make_shared(); } - std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); + std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); return plan_->EmplaceNode(plan_, std::move(generator)); } diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc new file mode 100644 index 00000000000..c844d7e88c1 --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
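A side note on the O_ORDERKEY change earlier in this patch: order keys are now assigned sparsely, using only the first 8 positions of every 32-key block, which is why the DCHECK upper bound becomes 4 * orders_rows_to_generate_. The standalone sketch below only evaluates that mapping for the first few indices; the helper name and the small driver are mine and are not part of the patch.

#include <cstdint>
#include <cstdio>

// Mirrors the arithmetic in OrdersAndLineItemGenerator: only 8 of every 32
// consecutive keys are handed out, and keys are 1-based.
int64_t SparseOrderKey(int64_t orderkey_index) {
  int64_t index_of_run = orderkey_index / 8;    // which 32-key block
  int64_t index_in_run = orderkey_index % 8;    // slot within that block
  return index_of_run * 32 + index_in_run + 1;
}

int main() {
  for (int64_t i = 0; i < 10; ++i) {
    std::printf("%lld ", static_cast<long long>(SparseOrderKey(i)));
  }
  std::printf("\n");  // prints: 1 2 3 4 5 6 7 8 33 34
  return 0;
}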
+ +#include + +#include "arrow/api.h" +#include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/test_util.h" +#include "arrow/compute/exec/util.h" +#include "arrow/compute/kernels/row_encoder.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/exec/tpch_node.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" +#include "arrow/testing/random.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/make_unique.h" +#include "arrow/util/pcg_random.h" +#include "arrow/util/thread_pool.h" +#include "arrow/array/validate.h" + +namespace arrow +{ + namespace compute + { + void ValidateBatch(const ExecBatch &batch) + { + for(const Datum &d : batch.values) + ASSERT_OK(arrow::internal::ValidateArray(*d.array())); + } + + TEST(TpchNode, Supplier) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Supplier(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 10000); + } + + TEST(TpchNode, Part) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Part(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 200000); + } + + TEST(TpchNode, PartSupp) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.PartSupp(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 800000); + } + + TEST(TpchNode, Customer) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Customer(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 150000); + } + + TEST(TpchNode, Orders) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Orders(); + AsyncGenerator> 
sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 1500000); + } + + TEST(TpchNode, Lineitem) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Lineitem(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + for(auto &batch : res) + { + ValidateBatch(batch); + } + } + + TEST(TpchNode, Nation) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Nation(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 25); + } + + TEST(TpchNode, Region) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode *table = *gen.Region(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + int64_t num_rows = 0; + for(auto &batch : res) + { + ValidateBatch(batch); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 5); + } + } +} From 8543f5115b5b5c98f90eda2ddbff5d58324a54b7 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 3 Mar 2022 08:14:37 -0600 Subject: [PATCH 04/34] Uncommenting R tests, and a first stab at the filewriter C++ --- r/R/arrowExports.R | 4 ++ r/R/tpch.R | 20 ++++++++- r/src/arrowExports.cpp | 15 ++++--- r/src/compute-exec.cpp | 85 ++++++++++++++++++++++++++++++++++++ r/tests/testthat/test-tpch.R | 14 ++---- 5 files changed, 122 insertions(+), 16 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 94f05f1482b..01b73a71a96 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -448,6 +448,10 @@ Tpch_Dbgen <- function(plan, scale_factor, table_name) { .Call(`_arrow_Tpch_Dbgen`, plan, scale_factor, table_name) } +Tpch_Dbgen_Write <- function(plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions) { + invisible(.Call(`_arrow_Tpch_Dbgen_Write`, plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions)) +} + RecordBatch__cast <- function(batch, schema, options) { .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } diff --git a/r/R/tpch.R b/r/R/tpch.R index 78c2d112584..ef0e002a6e5 100644 --- a/r/R/tpch.R +++ b/r/R/tpch.R @@ -30,7 +30,25 @@ tpch_tables <- c("customer", "lineitem", "nation", "orders", "part", "partsupp", tpch_dbgen <- function(table = tpch_tables, 
scale_factor) { table <- match.arg(table) - Tpch_Dbgen(arrow:::ExecPlan$create(), scale_factor, table) + Tpch_Dbgen(ExecPlan$create(), scale_factor, table) } +tpch_dbgen_write <- function(table = tpch_tables, scale_factor, path, ...) { + table <- match.arg(table) + + path_and_fs <- get_path_and_filesystem(path) + + existing_data_behavior <- 0L + max_partitions <- 1024L + + Tpch_Dbgen_Write( + ExecPlan$create(), + scale_factor, + table, + path_and_fs$fs, + path_and_fs$path, + existing_data_behavior, + max_partitions + ) +} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 1ec4c6f3ea6..8b60ae7bd00 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1751,18 +1751,23 @@ extern "C" SEXP _arrow_ExecNode_TableSourceNode(SEXP plan_sexp, SEXP table_sexp) // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr Tpch_Dbgen(const std::shared_ptr& plan, int scale_factor, std::string table_name); -extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ +void Tpch_Dbgen_Write(const std::shared_ptr& plan, int scale_factor, std::string table_name, const std::shared_ptr& filesystem, std::string base_dir, arrow::dataset::ExistingDataBehavior existing_data_behavior, int max_partitions); +extern "C" SEXP _arrow_Tpch_Dbgen_Write(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP existing_data_behavior_sexp, SEXP max_partitions_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); arrow::r::Input::type scale_factor(scale_factor_sexp); arrow::r::Input::type table_name(table_name_sexp); - return cpp11::as_sexp(Tpch_Dbgen(plan, scale_factor, table_name)); + arrow::r::Input&>::type filesystem(filesystem_sexp); + arrow::r::Input::type base_dir(base_dir_sexp); + arrow::r::Input::type existing_data_behavior(existing_data_behavior_sexp); + arrow::r::Input::type max_partitions(max_partitions_sexp); + Tpch_Dbgen_Write(plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions); + return R_NilValue; END_CPP11 } #else -extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ - Rf_error("Cannot call Tpch_Dbgen(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +extern "C" SEXP _arrow_Tpch_Dbgen_Write(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP existing_data_behavior_sexp, SEXP max_partitions_sexp){ + Rf_error("Cannot call Tpch_Dbgen_Write(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); } #endif diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 0625be981bb..b00162f36e9 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -24,6 +24,11 @@ #include #include #include +// TODO: We probably don't want to add dataset + filesystem here, so instead we'll probably +// want to move the definition of Tpch_Dbgen_Write if it works +#include +#include +#include #include #include #include @@ -33,6 +38,10 @@ #include namespace compute = ::arrow::compute; +// TODO: We probably don't want to add dataset + fs here, so instead we'll probably +// want to move the definition of Tpch_Dbgen_Write if it works +namespace ds = ::arrow::dataset; +namespace fs = ::arrow::fs; std::shared_ptr make_compute_options(std::string func_name, cpp11::list options); @@ -346,4 +355,80 @@ std::shared_ptr Tpch_Dbgen( [stop_producing, plan, sink_gen] { return sink_gen(); }, gc_memory_pool()); } +// [[arrow::export]] +void Tpch_Dbgen_Write( + const std::shared_ptr& plan, + int scale_factor, + std::string table_name, + const std::shared_ptr& filesystem, std::string base_dir, + arrow::dataset::ExistingDataBehavior existing_data_behavior, int max_partitions +) { + auto gen = ValueOrStop(arrow::compute::TpchGen::Make(plan.get(), scale_factor)); + + compute::ExecNode *table; + if (table_name == "part") { + table = ValueOrStop(gen.Part()); + } else if (table_name == "supplier") { + table = ValueOrStop(gen.Supplier()); + } else if (table_name == "partsupp") { + table = ValueOrStop(gen.PartSupp()); + } else if (table_name == "customer") { + table = ValueOrStop(gen.Customer()); + } else if (table_name == "nation") { + table = ValueOrStop(gen.Nation()); + } else if (table_name == "lineitem") { + table = ValueOrStop(gen.Lineitem()); + } else if (table_name == "region") { + table = ValueOrStop(gen.Region()); + } else if (table_name == "orders") { + table = ValueOrStop(gen.Orders()); + } else { + cpp11::stop("That's not a valid table name"); + } + + // TODO: unhardcode this once it's working + auto base_path = base_dir + "/parquet_dataset"; + filesystem->CreateDir(base_path); + + auto format = std::make_shared(); + + ds::FileSystemDatasetWriteOptions write_options; + write_options.file_write_options = format->DefaultWriteOptions(); + write_options.existing_data_behavior = ds::ExistingDataBehavior::kDeleteMatchingPartitions; + write_options.filesystem = filesystem; + write_options.base_dir = base_path; + write_options.partitioning = arrow::dataset::Partitioning::Default(); + write_options.basename_template = "part{i}.parquet"; + write_options.max_partitions = 1024; + + // TODO: this had a checked_cast in front of it in the code I adapted it from + // but I ran into namespace issues when doing it so I took it out to see if it + // worked, but maybe that's what's causing the sefault? + const ds::WriteNodeOptions options = + ds::WriteNodeOptions{write_options, table->output_schema()}; + + + MakeExecNodeOrStop("consuming_sink", plan.get(), {table}, options); + + cpp11::message("Just after consume"); + + StopIfNotOk(plan->Validate()); + + cpp11::message("Just after validate"); + + StopIfNotOk(plan->StartProducing()); + + // If the generator is destroyed before being completely drained, inform plan + std::shared_ptr stop_producing{nullptr, [plan](...) 
{ + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); + + if (not_finished_yet) { + plan->StopProducing(); + } + }}; +} + #endif diff --git a/r/tests/testthat/test-tpch.R b/r/tests/testthat/test-tpch.R index 8077f76e4fd..eedf8954807 100644 --- a/r/tests/testthat/test-tpch.R +++ b/r/tests/testthat/test-tpch.R @@ -37,18 +37,12 @@ test_that("tpch_dbgen()", { expect_identical(dim(part_tab), c(200000L, 9L)) # and check a handful of types - expect_type_equal(part_tab[["R_PARTKEY"]], int32()) + expect_type_equal(part_tab[["P_PARTKEY"]], int32()) + expect_type_equal(part_tab[["P_NAME"]], string()) }) -# these two are tested above -tpch_tables_up <- setdiff(tpch_tables, c("lineitem", "region")) - -# nation segfaults -# supplier hangs -tpch_tables_up <- setdiff(tpch_tables_up, c("nation", "supplier")) - -# all of the rest below have an error with: -# Invalid: Arrays used to construct an ExecBatch must have equal length +# these three are tested above, but test that we can get tables for all the rest +tpch_tables_up <- setdiff(tpch_tables, c("lineitem", "region", "part")) for (table_name in tpch_tables_up) { test_that(paste0("Generating table: ", table_name), { From 9d54e611a033c1aff5076333072857be38616942 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 4 Mar 2022 23:55:24 -0800 Subject: [PATCH 05/34] Make it actually multithreaded --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 3 +- cpp/src/arrow/compute/exec/tpch_node.cc | 221 +++++++++++++------ cpp/src/arrow/compute/exec/tpch_node_test.cc | 1 + 3 files changed, 153 insertions(+), 72 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 963782333cf..9b4fad177e4 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -170,6 +170,7 @@ static void BM_Tpch_Q1(benchmark::State &st) } //BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 1000)->ArgNames({ "SF" }); -BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 10)->ArgNames({ "SF" }); +//BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 10)->ArgNames({ "SF" }); +BENCHMARK(BM_Tpch_Q1)->Args({1})->ArgNames({ "SF" }); } } diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 445df7d08b9..f9367b1131a 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -89,7 +89,7 @@ namespace arrow protected: std::atomic done_ = { false }; - std::atomic batches_generated_ = { 0 }; + std::atomic batches_outputted_ = { 0 }; }; int GetNumDigits(int64_t x) @@ -197,17 +197,17 @@ namespace arrow char temp_buff[kChunkSize]; while(done_.load() == false) { - int64_t current_offset = 0; - int64_t offset = 0; - while(GenerateSentence(offset, rng, temp_buff)) - current_offset = offset; + int64_t known_valid_offset = 0; + int64_t try_offset = 0; + while(GenerateSentence(try_offset, rng, temp_buff)) + known_valid_offset = try_offset; { std::lock_guard lock(text_guard_); if(done_.load()) return Status::OK(); int64_t bytes_remaining = kTextBytes - generated_offset_; - int64_t memcpy_size = std::min(offset, bytes_remaining); + int64_t memcpy_size = std::min(known_valid_offset, bytes_remaining); std::memcpy(out + generated_offset_, temp_buff, memcpy_size); generated_offset_ += memcpy_size; if(generated_offset_ == kTextBytes) @@ -283,7 +283,7 @@ namespace arrow *out-- = '0' + (x % 10); x /= 10; } - out += num_digits; + out += 
(num_digits + 1); } void GeneratePhoneNumber( @@ -506,7 +506,7 @@ namespace arrow break; case 2: success &= GenerateAdjective(offset, rng, arr); - success &= GenerateWord(offset, rng, arr, &comma_space, 1); + success &= GenerateWord(--offset, rng, arr, &comma_space, 1); success &= GenerateAdjective(offset, rng, arr); success &= GenerateNoun(offset, rng, arr); break; @@ -637,6 +637,16 @@ namespace arrow return Status::OK(); } + int64_t part_batches_generated() const + { + return part_batches_generated_.load(); + } + + int64_t partsupp_batches_generated() const + { + return partsupp_batches_generated_.load(); + } + Result> SetPartOutputColumns(const std::vector &cols) { return SetOutputColumns(cols, part_types_, part_name_map_, part_cols_); @@ -647,18 +657,20 @@ namespace arrow return SetOutputColumns(cols, partsupp_types_, partsupp_name_map_, partsupp_cols_); } - Result> NextPartBatch(size_t thread_index) + Result> NextPartBatch() { + size_t thread_index = thread_indexer_(); ThreadLocalData &tld = thread_local_data_[thread_index]; { std::lock_guard lock(part_output_queue_mutex_); + bool all_generated = part_rows_generated_ == part_rows_to_generate_; if(!part_output_queue_.empty()) { ExecBatch batch = std::move(part_output_queue_.front()); part_output_queue_.pop(); return std::move(batch); } - else if(part_rows_generated_ == part_rows_to_generate_) + else if(all_generated) { return util::nullopt; } @@ -669,6 +681,10 @@ namespace arrow batch_size_, part_rows_to_generate_ - part_rows_generated_); part_rows_generated_ += tld.part_to_generate; + + int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index); + part_batches_generated_.fetch_add(1); + partsupp_batches_generated_.fetch_add(num_ps_batches); ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); } } @@ -712,8 +728,9 @@ namespace arrow return ExecBatch::Make(std::move(part_result)); } - Result> NextPartSuppBatch(size_t thread_index) + Result> NextPartSuppBatch() { + size_t thread_index = thread_indexer_(); ThreadLocalData &tld = thread_local_data_[thread_index]; { std::lock_guard lock(partsupp_output_queue_mutex_); @@ -737,6 +754,9 @@ namespace arrow batch_size_, part_rows_to_generate_ - part_rows_generated_); part_rows_generated_ += tld.part_to_generate; + int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index); + part_batches_generated_.fetch_add(1); + partsupp_batches_generated_.fetch_add(num_ps_batches); ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); } } @@ -1120,13 +1140,20 @@ namespace arrow return Status::OK(); } + int64_t PartsuppBatchesToGenerate(size_t thread_index) + { + ThreadLocalData &tld = thread_local_data_[thread_index]; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; + return num_batches; + } + Status InitPartsupp(size_t thread_index) { ThreadLocalData &tld = thread_local_data_[thread_index]; tld.generated_partsupp.reset(); tld.partsupp.clear(); - int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; + int64_t num_batches = PartsuppBatchesToGenerate(thread_index); tld.partsupp.resize(num_batches); for(std::vector &batch : tld.partsupp) { @@ -1321,7 +1348,10 @@ namespace arrow int64_t part_rows_generated_; std::vector part_cols_; std::vector partsupp_cols_; - + ThreadIndexer thread_indexer_; + + std::atomic part_batches_generated_ = { 0 }; + std::atomic partsupp_batches_generated_ = { 0 }; static 
constexpr int64_t kPartSuppRowsPerPart = 4; }; @@ -1349,6 +1379,16 @@ namespace arrow return Status::OK(); } + int64_t orders_batches_generated() const + { + return orders_batches_generated_.load(); + } + + int64_t lineitem_batches_generated() const + { + return lineitem_batches_generated_.load(); + } + Result> SetOrdersOutputColumns(const std::vector &cols) { return SetOutputColumns(cols, orders_types_, orders_name_map_, orders_cols_); @@ -1359,8 +1399,9 @@ namespace arrow return SetOutputColumns(cols, lineitem_types_, lineitem_name_map_, lineitem_cols_); } - Result> NextOrdersBatch(size_t thread_index) + Result> NextOrdersBatch() { + size_t thread_index = thread_indexer_(); ThreadLocalData &tld = thread_local_data_[thread_index]; { std::lock_guard lock(orders_output_queue_mutex_); @@ -1381,6 +1422,7 @@ namespace arrow batch_size_, orders_rows_to_generate_ - orders_rows_generated_); orders_rows_generated_ += tld.orders_to_generate; + orders_batches_generated_.fetch_add(1); ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); } } @@ -1426,8 +1468,9 @@ namespace arrow return ExecBatch::Make(std::move(orders_result)); } - Result> NextLineItemBatch(size_t thread_index) + Result> NextLineItemBatch() { + size_t thread_index = thread_indexer_(); ThreadLocalData &tld = thread_local_data_[thread_index]; ExecBatch queued; bool from_queue = false; @@ -1450,18 +1493,20 @@ namespace arrow } { std::lock_guard lock(orders_output_queue_mutex_); - tld.orderkey_start = orders_rows_generated_; - tld.orders_to_generate = std::min( - batch_size_, - orders_rows_to_generate_ - orders_rows_generated_); - orders_rows_generated_ += tld.orders_to_generate; - ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); if(orders_rows_generated_ == orders_rows_to_generate_) { if(from_queue) return std::move(queued); return util::nullopt; } + + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = std::min( + batch_size_, + orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + orders_batches_generated_.fetch_add(1ll); + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); } tld.orders.clear(); tld.orders.resize(ORDERS::kNumCols); @@ -1469,6 +1514,7 @@ namespace arrow tld.generated_lineitem.reset(); if(from_queue) { + lineitem_batches_generated_.fetch_sub(1); for(size_t i = 0; i < lineitem_cols_.size(); i++) if(tld.lineitem[0][lineitem_cols_[i]].kind() == Datum::NONE) tld.lineitem[0][lineitem_cols_[i]] = std::move(queued[i]); @@ -1505,6 +1551,7 @@ namespace arrow ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); lineitem_results.emplace_back(std::move(eb)); } + lineitem_batches_generated_.fetch_add(static_cast(lineitem_results.size())); // Return the first batch, enqueue the rest. 
{ std::lock_guard lock(lineitem_output_queue_mutex_); @@ -1872,7 +1919,7 @@ namespace arrow tld.items_per_order.push_back(length); tld.lineitem_to_generate += length; } - size_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; + int64_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; tld.lineitem.clear(); tld.lineitem.resize(num_batches); for(std::vector &batch : tld.lineitem) @@ -1889,13 +1936,17 @@ namespace arrow if(tld.lineitem[ibatch][column].kind() == Datum::NONE) { int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[column]); + std::printf("Thread %lu, byte size %d\n", thread_index, byte_width); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); ArrayData ad(lineitem_types_[column], batch_size_, { nullptr, std::move(buff) }); tld.lineitem[ibatch][column] = std::move(ad); out_batch_offset = 0; } - if(ibatch == 0) + else + { + ARROW_DCHECK(ibatch == 0); out_batch_offset = tld.first_batch_offset; + } return Status::OK(); } @@ -2461,6 +2512,10 @@ namespace arrow int64_t orders_rows_generated_; std::vector orders_cols_; std::vector lineitem_cols_; + ThreadIndexer thread_indexer_; + + std::atomic orders_batches_generated_ = { 0 }; + std::atomic lineitem_batches_generated_ = { 0 }; }; class SupplierGenerator : public TpchTableGenerator @@ -2518,7 +2573,9 @@ namespace arrow output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -2584,7 +2641,6 @@ namespace arrow tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.suppkey_start); - bool is_last_batch = tld.to_generate < batch_size_; tld.batch.clear(); tld.batch.resize(SUPPLIER::kNumCols); @@ -2598,15 +2654,14 @@ namespace arrow result[i] = tld.batch[col_idx]; } ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); - batches_generated_++; + int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; + int64_t batches_outputted_before_this_one = batches_outputted_.fetch_add(1); + bool is_last_batch = batches_outputted_before_this_one == (batches_to_generate - 1); output_callback_(std::move(eb)); if(is_last_batch) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) - { - finished_callback_(batches_generated_.load()); - } + done_.store(true); + finished_callback_(batches_outputted_.load()); return Status::OK(); } return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); @@ -2657,7 +2712,7 @@ namespace arrow for(int64_t irow = 0; irow < tld.to_generate; irow++) { char *out = s_name + byte_width * irow; - std::memcpy(out, supplier, supplier_length); + std::strncpy(out, supplier, byte_width); AppendNumberPaddedToNineDigits(out + supplier_length, s_suppkey[irow]); } } @@ -2799,7 +2854,6 @@ namespace arrow PartGenerator(std::shared_ptr gen) : gen_(std::move(gen)) { - batches_generated_.store(0); } Status Init( @@ -2825,7 +2879,9 @@ namespace arrow finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return 
schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -2834,22 +2890,26 @@ namespace arrow } private: - Status ProduceCallback(size_t thread_index) + Status ProduceCallback(size_t) { + if(done_.load()) + return Status::OK(); ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextPartBatch(thread_index)); - if(done_.load() || !maybe_batch.has_value()) + gen_->NextPartBatch()); + if(!maybe_batch.has_value()) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) + int64_t batches_generated = gen_->part_batches_generated(); + if(batches_generated == batches_outputted_.load()) { - finished_callback_(batches_generated_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); } return Status::OK(); } ExecBatch batch = std::move(*maybe_batch); - batches_generated_++; output_callback_(std::move(batch)); + batches_outputted_++; return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); } @@ -2868,7 +2928,6 @@ namespace arrow PartSuppGenerator(std::shared_ptr gen) : gen_(std::move(gen)) { - batches_generated_.store(0); } Status Init( @@ -2894,7 +2953,9 @@ namespace arrow finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -2903,22 +2964,26 @@ namespace arrow } private: - Status ProduceCallback(size_t thread_index) + Status ProduceCallback(size_t) { + if(done_.load()) + return Status::OK(); ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextPartSuppBatch(thread_index)); - if(done_.load() || !maybe_batch.has_value()) + gen_->NextPartSuppBatch()); + if(!maybe_batch.has_value()) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) + int64_t batches_generated = gen_->partsupp_batches_generated(); + if(batches_generated == batches_outputted_.load()) { - finished_callback_(batches_generated_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); } return Status::OK(); } ExecBatch batch = std::move(*maybe_batch); - batches_generated_++; output_callback_(std::move(batch)); + batches_outputted_++; return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); } @@ -2961,7 +3026,9 @@ namespace arrow output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -3029,7 +3096,6 @@ namespace arrow tld.to_generate = std::min(batch_size_, rows_to_generate_ - 
tld.custkey_start); - bool is_last_batch = tld.to_generate < batch_size_; tld.batch.clear(); tld.batch.resize(CUSTOMER::kNumCols); @@ -3043,14 +3109,16 @@ namespace arrow result[i] = tld.batch[col_idx]; } ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); - batches_generated_++; + int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; + int64_t batches_generated_before_this_one = batches_outputted_.fetch_add(1); + bool is_last_batch = batches_generated_before_this_one == (batches_to_generate - 1); output_callback_(std::move(eb)); if(is_last_batch) { bool expected = false; if(done_.compare_exchange_strong(expected, true)) { - finished_callback_(batches_generated_.load()); + finished_callback_(batches_outputted_.load()); } return Status::OK(); } @@ -3238,7 +3306,6 @@ namespace arrow OrdersGenerator(std::shared_ptr gen) : gen_(std::move(gen)) { - batches_generated_.store(0); } Status Init( @@ -3264,7 +3331,9 @@ namespace arrow finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -3273,22 +3342,26 @@ namespace arrow } private: - Status ProduceCallback(size_t thread_index) + Status ProduceCallback(size_t) { + if(done_.load()) + return Status::OK(); ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextOrdersBatch(thread_index)); - if(done_.load() || !maybe_batch.has_value()) + gen_->NextOrdersBatch()); + if(!maybe_batch.has_value()) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) + int64_t batches_generated = gen_->orders_batches_generated(); + if(batches_generated == batches_outputted_.load()) { - finished_callback_(batches_generated_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); } return Status::OK(); } ExecBatch batch = std::move(*maybe_batch); - batches_generated_++; output_callback_(std::move(batch)); + batches_outputted_++; return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); } @@ -3331,7 +3404,9 @@ namespace arrow finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + for(size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); } std::shared_ptr schema() const override @@ -3340,22 +3415,26 @@ namespace arrow } private: - Status ProduceCallback(size_t thread_index) + Status ProduceCallback(size_t) { + if(done_.load()) + return Status::OK(); ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextLineItemBatch(thread_index)); + gen_->NextLineItemBatch()); if(!maybe_batch.has_value()) { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) + int64_t batches_generated = gen_->lineitem_batches_generated(); + if(batches_generated == batches_outputted_.load()) { - finished_callback_(batches_generated_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + 
finished_callback_(batches_outputted_.load()); } return Status::OK(); } ExecBatch batch = std::move(*maybe_batch); - batches_generated_++; output_callback_(std::move(batch)); + batches_outputted_++; return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); } diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index c844d7e88c1..4273e18d4eb 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -58,6 +58,7 @@ namespace arrow for(auto &batch : res) { ValidateBatch(batch); + std::cout << batch.ToString() << std::endl; num_rows += batch.length; } ASSERT_EQ(num_rows, 10000); From 46987cd68b857c731e3e7cc79bc77770657c83bb Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Sat, 5 Mar 2022 12:47:39 -0800 Subject: [PATCH 06/34] Fill new arrays with empty Datums explicitly --- cpp/src/arrow/compute/exec/tpch_node.cc | 25 ++++++++------------ cpp/src/arrow/compute/exec/tpch_node_test.cc | 1 - 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index f9367b1131a..877fc85ab63 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -688,8 +688,8 @@ namespace arrow ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); } } - tld.part.clear(); tld.part.resize(PART::kNumCols); + std::fill(tld.part.begin(), tld.part.end(), Datum()); RETURN_NOT_OK(InitPartsupp(thread_index)); for(int col : part_cols_) @@ -760,8 +760,8 @@ namespace arrow ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); } } - tld.part.clear(); tld.part.resize(PART::kNumCols); + std::fill(tld.part.begin(), tld.part.end(), Datum()); RETURN_NOT_OK(InitPartsupp(thread_index)); for(int col : part_cols_) @@ -1152,13 +1152,12 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; tld.generated_partsupp.reset(); - tld.partsupp.clear(); int64_t num_batches = PartsuppBatchesToGenerate(thread_index); tld.partsupp.resize(num_batches); for(std::vector &batch : tld.partsupp) { - batch.clear(); batch.resize(PARTSUPP::kNumCols); + std::fill(batch.begin(), batch.end(), Datum()); } return Status::OK(); } @@ -1426,8 +1425,8 @@ namespace arrow ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); } } - tld.orders.clear(); tld.orders.resize(ORDERS::kNumCols); + std::fill(tld.orders.begin(), tld.orders.end(), Datum()); RETURN_NOT_OK(GenerateRowCounts(thread_index)); tld.first_batch_offset = 0; tld.generated_lineitem.reset(); @@ -1508,8 +1507,8 @@ namespace arrow orders_batches_generated_.fetch_add(1ll); ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); } - tld.orders.clear(); tld.orders.resize(ORDERS::kNumCols); + std::fill(tld.orders.begin(), tld.orders.end(), Datum()); RETURN_NOT_OK(GenerateRowCounts(thread_index)); tld.generated_lineitem.reset(); if(from_queue) @@ -1920,12 +1919,11 @@ namespace arrow tld.lineitem_to_generate += length; } int64_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; - tld.lineitem.clear(); tld.lineitem.resize(num_batches); for(std::vector &batch : tld.lineitem) { - batch.clear(); batch.resize(LINEITEM::kNumCols); + std::fill(batch.begin(), batch.end(), Datum()); } return Status::OK(); } @@ -1936,17 +1934,14 @@ namespace arrow if(tld.lineitem[ibatch][column].kind() == Datum::NONE) { int32_t byte_width = 
arrow::internal::GetByteWidth(*lineitem_types_[column]); - std::printf("Thread %lu, byte size %d\n", thread_index, byte_width); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); ArrayData ad(lineitem_types_[column], batch_size_, { nullptr, std::move(buff) }); tld.lineitem[ibatch][column] = std::move(ad); out_batch_offset = 0; } - else - { - ARROW_DCHECK(ibatch == 0); + if(ibatch == 0) out_batch_offset = tld.first_batch_offset; - } + return Status::OK(); } @@ -2642,8 +2637,8 @@ namespace arrow tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.suppkey_start); - tld.batch.clear(); tld.batch.resize(SUPPLIER::kNumCols); + std::fill(tld.batch.begin(), tld.batch.end(), Datum()); for(int col : gen_list_) RETURN_NOT_OK(generators_[col](thread_index)); @@ -3097,8 +3092,8 @@ namespace arrow tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.custkey_start); - tld.batch.clear(); tld.batch.resize(CUSTOMER::kNumCols); + std::fill(tld.batch.begin(), tld.batch.end(), Datum()); for(int col : gen_list_) RETURN_NOT_OK(generators_[col](thread_index)); diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index 4273e18d4eb..c844d7e88c1 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -58,7 +58,6 @@ namespace arrow for(auto &batch : res) { ValidateBatch(batch); - std::cout << batch.ToString() << std::endl; num_rows += batch.length; } ASSERT_EQ(num_rows, 10000); From b90d134f462bd4d757029cc4c04850ac945fcf23 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 8 Mar 2022 12:01:15 -0800 Subject: [PATCH 07/34] Add some tests, fix some bugs --- cpp/src/arrow/compute/exec/tpch_node.cc | 95 +++--- cpp/src/arrow/compute/exec/tpch_node.h | 6 +- cpp/src/arrow/compute/exec/tpch_node_test.cc | 288 ++++++++++++++++++- 3 files changed, 338 insertions(+), 51 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 877fc85ab63..496b44a1dc0 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -49,7 +49,6 @@ namespace arrow int64_t generated_offset_ = 0; std::mutex text_guard_; std::unique_ptr text_; - random::pcg32_fast rng_; static constexpr int64_t kChunkSize = 8192; static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB }; @@ -65,7 +64,7 @@ namespace arrow virtual Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) = 0; virtual Status StartProducing( @@ -495,7 +494,7 @@ namespace arrow std::uniform_int_distribution dist(0, 3); const char *comma_space = ", "; bool success = true; - switch(dist(rng_)) + switch(dist(rng)) { case 0: success &= GenerateNoun(offset, rng, arr); @@ -526,7 +525,7 @@ namespace arrow { std::uniform_int_distribution dist(0, 3); bool success = true; - switch(dist(rng_)) + switch(dist(rng)) { case 0: success &= GenerateVerb(offset, rng, arr); @@ -565,7 +564,7 @@ namespace arrow { std::uniform_int_distribution dist(0, 4); bool success = true; - switch(dist(rng_)) + switch(dist(rng)) { case 0: success &= GenerateNounPhrase(offset, rng, arr); @@ -618,7 +617,7 @@ namespace arrow Status Init( size_t num_threads, int64_t batch_size, - int scale_factor) + float scale_factor) { if(!inited_) { @@ -632,7 +631,7 @@ namespace arrow // 5 is the maximum number of different strings we need to concatenate tld.string_indices.resize(5 * batch_size_); } - part_rows_to_generate_ = 
scale_factor_ * 200000; + part_rows_to_generate_ = static_cast(scale_factor_ * 200000); } return Status::OK(); } @@ -693,7 +692,9 @@ namespace arrow RETURN_NOT_OK(InitPartsupp(thread_index)); for(int col : part_cols_) + { RETURN_NOT_OK(part_generators_[col](thread_index)); + } for(int col : partsupp_cols_) RETURN_NOT_OK(partsupp_generators_[col](thread_index)); @@ -995,17 +996,20 @@ namespace arrow RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_BRAND)); const char *p_mfgr = reinterpret_cast( tld.part[PART::P_MFGR].array()->buffers[1]->data()); - char *p_brand = reinterpret_cast(tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); + char *p_brand = reinterpret_cast( + tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_BRAND]); int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_MFGR]); const size_t mfgr_id_offset = std::strlen("Manufacturer#"); for(int64_t irow = 0; irow < tld.part_to_generate; irow++) { + char *row = p_brand + byte_width * irow; char mfgr_id = *(p_mfgr + irow * mfgr_byte_width + mfgr_id_offset); char brand_id = '0' + dist(tld.rng); - std::strncpy(p_brand + byte_width * irow, brand, byte_width); - *(p_brand + byte_width * irow + brand_length) = mfgr_id; - *(p_brand + byte_width * irow + brand_length + 1) = brand_id; + std::strncpy(row, brand, byte_width); + *(row + brand_length) = mfgr_id; + *(row + brand_length + 1) = brand_id; + irow += 0; } } return Status::OK(); @@ -1038,11 +1042,9 @@ namespace arrow tld.string_indices[irow * 3 + ipart] = name_part_index; string_length += std::strlen(types[ipart][name_part_index]); } - // Add 4 because there is a space between each word (i.e. 2 spaces) - offsets[irow + 1] = offsets[irow] + string_length + 2; + offsets[irow + 1] = offsets[irow] + string_length; } - // Add an extra byte for the space after in the very last string. 
- ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate] + 1)); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate])); char *strings = reinterpret_cast(string_buffer->mutable_data()); for(int64_t irow = 0; irow < tld.part_to_generate; irow++) { @@ -1054,7 +1056,6 @@ namespace arrow size_t length = std::strlen(part); std::memcpy(row, part, length); row += length; - *row++ = ' '; } } ArrayData ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); @@ -1100,10 +1101,8 @@ namespace arrow size_t container2_length = std::strlen(container2); char *row = p_container + byte_width * irow; - // Abuse strncpy to zero out the rest of the array std::strncpy(row, container1, byte_width); - row[container1_length] = ' '; - std::memcpy(row + container1_length + 1, container2, container2_length); + std::memcpy(row + container1_length, container2, container2_length); } } return Status::OK(); @@ -1225,7 +1224,7 @@ namespace arrow int64_t ipartsupp = 0; int64_t ipart = 0; int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - const int32_t S = scale_factor_ * 10000; + const int32_t S = static_cast(scale_factor_ * 10000); for(int64_t irow = 0; irow < ps_to_generate; ibatch++) { RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPKEY)); @@ -1342,7 +1341,7 @@ namespace arrow std::queue part_output_queue_; std::queue partsupp_output_queue_; int64_t batch_size_; - int scale_factor_; + float scale_factor_; int64_t part_rows_to_generate_; int64_t part_rows_generated_; std::vector part_cols_; @@ -1360,7 +1359,7 @@ namespace arrow Status Init( size_t num_threads, int64_t batch_size, - int scale_factor) + float scale_factor) { if(!inited_) { @@ -1373,7 +1372,7 @@ namespace arrow { tld.items_per_order.resize(batch_size_); } - orders_rows_to_generate_ = scale_factor_ * 150000 * 10; + orders_rows_to_generate_ = static_cast(scale_factor_ * 150000 * 10); } return Status::OK(); } @@ -1711,7 +1710,8 @@ namespace arrow // divisible by 3. Rather than repeatedly generating numbers until we get to // a non-divisible-by-3 number, we just generate a number between // 0 and SF * 50000 - 1, multiply by 3, and then add either 1 or 2. 
- std::uniform_int_distribution base_dist(0, scale_factor_ * 50000 - 1); + int32_t sf_50k = static_cast(scale_factor_ * 50000); + std::uniform_int_distribution base_dist(0, sf_50k - 1); std::uniform_int_distribution offset_dist(1, 2); int32_t *o_custkey = reinterpret_cast( tld.orders[ORDERS::O_CUSTKEY].array()->buffers[1]->mutable_data()); @@ -1867,7 +1867,8 @@ namespace arrow { RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[ORDERS::O_CLERK]); - std::uniform_int_distribution dist(1, scale_factor_ * 1000); + int64_t max_clerk_id = static_cast(scale_factor_ * 1000); + std::uniform_int_distribution dist(1, max_clerk_id); char *o_clerk = reinterpret_cast( tld.orders[ORDERS::O_CLERK].array()->buffers[1]->mutable_data()); for(int64_t i = 0; i < tld.orders_to_generate; i++) @@ -1991,7 +1992,8 @@ namespace arrow tld.generated_lineitem[LINEITEM::L_PARTKEY] = true; size_t ibatch = 0; - std::uniform_int_distribution dist(1, scale_factor_ * 200000); + int32_t max_partkey = static_cast(scale_factor_ * 200000); + std::uniform_int_distribution dist(1, max_partkey); for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { size_t batch_offset; @@ -2020,7 +2022,7 @@ namespace arrow size_t ibatch = 0; std::uniform_int_distribution dist(0, 3); - const int32_t S = scale_factor_ * 10000; + const int32_t S = static_cast(scale_factor_ * 10000); for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { size_t batch_offset = 0; @@ -2502,7 +2504,7 @@ namespace arrow std::queue orders_output_queue_; std::queue lineitem_output_queue_; int64_t batch_size_; - int scale_factor_; + float scale_factor_; int64_t orders_rows_to_generate_; int64_t orders_rows_generated_; std::vector orders_cols_; @@ -2518,12 +2520,12 @@ namespace arrow public: Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; batch_size_ = batch_size; - rows_to_generate_ = scale_factor_ * 10000; + rows_to_generate_ = static_cast(scale_factor_ * 10000); rows_generated_.store(0); ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( columns, @@ -2537,7 +2539,8 @@ namespace arrow std::unordered_set good_rows_set; while(good_rows_set.size() < num_special_rows) { - good_rows_set.insert(dist(rng)); + int64_t row = dist(rng); + good_rows_set.insert(row); } std::unordered_set bad_rows_set; while(bad_rows_set.size() < num_special_rows) @@ -2817,7 +2820,7 @@ namespace arrow std::uniform_int_distribution start_dist(0, str_length - total_length); int32_t start = start_dist(tld.rng); std::memcpy(out + start, customer, customer_length); - std::memcpy(out + start + gap, review, review_length); + std::memcpy(out + start + customer_length + gap, review, review_length); } } @@ -2837,7 +2840,7 @@ namespace arrow ScheduleCallback schedule_callback_; int64_t rows_to_generate_; std::atomic rows_generated_; - int scale_factor_; + float scale_factor_; int64_t batch_size_; std::vector gen_list_; std::shared_ptr schema_; @@ -2853,7 +2856,7 @@ namespace arrow Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -2912,7 +2915,7 @@ namespace arrow FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - int64_t scale_factor_; + float scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -2927,7 +2930,7 @@ namespace arrow Status Init( std::vector columns, - int 
scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -2986,7 +2989,7 @@ namespace arrow FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - int64_t scale_factor_; + float scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -2996,7 +2999,7 @@ namespace arrow public: Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -3289,7 +3292,7 @@ namespace arrow ScheduleCallback schedule_callback_; int64_t rows_to_generate_; std::atomic rows_generated_; - int scale_factor_; + float scale_factor_; int64_t batch_size_; std::vector gen_list_; std::shared_ptr schema_; @@ -3305,7 +3308,7 @@ namespace arrow Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -3364,7 +3367,7 @@ namespace arrow FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - int64_t scale_factor_; + float scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -3378,7 +3381,7 @@ namespace arrow Status Init( std::vector columns, - int scale_factor, + float scale_factor, int64_t batch_size) override { scale_factor_ = scale_factor; @@ -3437,7 +3440,7 @@ namespace arrow FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - int64_t scale_factor_; + float scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -3447,7 +3450,7 @@ namespace arrow public: Status Init( std::vector columns, - int /*scale_factor*/, + float /*scale_factor*/, int64_t /*batch_size*/) override { ARROW_ASSIGN_OR_RAISE(schema_, @@ -3557,7 +3560,7 @@ namespace arrow public: Status Init( std::vector columns, - int /*scale_factor*/, + float /*scale_factor*/, int64_t /*batch_size*/) override { ARROW_ASSIGN_OR_RAISE(schema_, @@ -3751,7 +3754,7 @@ namespace arrow ThreadIndexer thread_indexer_; }; - Result TpchGen::Make(ExecPlan *plan, int scale_factor, int64_t batch_size) + Result TpchGen::Make(ExecPlan *plan, float scale_factor, int64_t batch_size) { TpchGen result(plan, scale_factor, batch_size); return result; diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index dc282aae981..1d904a2b5f0 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -36,7 +36,7 @@ namespace arrow class TpchGen { public: - static Result Make(ExecPlan *plan, int scale_factor = 1, int64_t batch_size = 4096); + static Result Make(ExecPlan *plan, float scale_factor = 1.0f, int64_t batch_size = 4096); Result Supplier(std::vector columns = {}); Result Part(std::vector columns = {}); @@ -48,7 +48,7 @@ namespace arrow Result Region(std::vector columns = {}); private: - TpchGen(ExecPlan *plan, int scale_factor, int64_t batch_size) + TpchGen(ExecPlan *plan, float scale_factor, int64_t batch_size) : plan_(plan), scale_factor_(scale_factor), batch_size_(batch_size), @@ -59,7 +59,7 @@ namespace arrow Result CreateNode(std::vector columns); ExecPlan *plan_; - int scale_factor_; + float scale_factor_; int64_t batch_size_; std::shared_ptr part_and_part_supp_generator_; diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index c844d7e88c1..6253075b85f 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -33,6 +33,8 @@ #include 
"arrow/util/thread_pool.h" #include "arrow/array/validate.h" +#include + namespace arrow { namespace compute @@ -43,6 +45,227 @@ namespace arrow ASSERT_OK(arrow::internal::ValidateArray(*d.array())); } + void VerifyUniqueKey( + std::unordered_set &seen, + const Datum &d, + int32_t min, + int32_t max) + { + const int32_t *keys = reinterpret_cast(d.array()->buffers[1]->data()); + int64_t num_keys = d.length(); + for(int64_t i = 0; i < num_keys; i++) + { + ASSERT_TRUE(seen.find(keys[i]) == seen.end()); + ASSERT_LE(keys[i], max); + ASSERT_GE(keys[i], min); + seen.insert(keys[i]); + } + } + + void VerifyStringAndNumber_FixedWidth( + const Datum &strings, + const Datum &numbers, + int byte_width, + const char *prefix, + bool verify_padding = true) + { + int64_t length = strings.length(); + const char *str = reinterpret_cast( + strings.array()->buffers[1]->data()); + + const int32_t *nums = nullptr; + if(numbers.kind() != Datum::NONE) + { + ASSERT_EQ(length, numbers.length()); + nums = reinterpret_cast( + numbers.array()->buffers[1]->data()); + } + + size_t num_offset = std::strlen(prefix); + for(int64_t i = 0; i < length; i++) + { + const char *row = str + i * byte_width; + ASSERT_EQ(std::memcmp(row, prefix, num_offset), 0) << row << ", prefix=" << prefix << ", i=" << i; + const char *num_str = row + num_offset; + int64_t num = 0; + int ibyte = static_cast(num_offset); + for(; *num_str && ibyte < byte_width; ibyte++) + { + num *= 10; + ASSERT_TRUE(std::isdigit(*num_str)); + num += *num_str++ - '0'; + } + if(nums) + { + ASSERT_EQ(static_cast(num), nums[i]); + } + if(verify_padding) + { + int num_chars = ibyte - num_offset; + ASSERT_GE(num_chars, 9); + } + } + } + + void VerifyVString(const Datum &d, int min_length, int max_length) + { + int64_t length = d.length(); + const int32_t *off = reinterpret_cast( + d.array()->buffers[1]->data()); + const char *str = reinterpret_cast( + d.array()->buffers[2]->data()); + for(int64_t i = 0; i < length; i++) + { + int32_t start = off[i]; + int32_t end = off[i + 1]; + int32_t length = end - start; + ASSERT_LE(length, max_length); + ASSERT_GE(length, min_length); + for(int32_t i = start; i < end; i++) + { + bool is_valid = std::isdigit(str[i]) || std::isalpha(str[i]) || str[i] == ',' || str[i] == ' '; + ASSERT_TRUE(is_valid) << "Character " << str[i] << " is not a digit, a letter, a comma, or a space"; + } + } + } + + void VerifyAllBetween(const Datum &d, int32_t min, int32_t max) + { + int64_t length = d.length(); + const int32_t *n = reinterpret_cast(d.array()->buffers[1]->data()); + for(int64_t i = 0; i < length; i++) + { + ASSERT_GE(n[i], min) << "Value must be between " << min << " and " << max << ", got " << n[i]; + ASSERT_LE(n[i], max) << "Value must be between " << min << " and " << max << ", got " << n[i]; + } + } + + void VerifyNationKey(const Datum &d) + { + VerifyAllBetween(d, 0, 24); + } + + void VerifyPhone(const Datum &d) + { + int64_t length = d.length(); + const char *phones = reinterpret_cast(d.array()->buffers[1]->data()); + constexpr int kByteWidth = 15; // This is common for all PHONE columns + for(int64_t i = 0; i < length; i++) + { + const char *row = phones + i * kByteWidth; + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, '-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, '-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, 
'-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + } + } + + void VerifyDecimalsBetween(const Datum &d, int64_t min, int64_t max) + { + int64_t length = d.length(); + const Decimal128 *decs = reinterpret_cast( + d.array()->buffers[1]->data()); + for(int64_t i = 0; i < length; i++) + { + int64_t val = static_cast(decs[i]); + ASSERT_LE(val, max); + ASSERT_GE(val, min); + } + } + + void VerifyCorrectNumberOfWords_Varlen(const Datum &d, int num_words) + { + int expected_num_spaces = num_words - 1; + int64_t length = d.length(); + const int32_t *offsets = reinterpret_cast( + d.array()->buffers[1]->data()); + const char *str = reinterpret_cast( + d.array()->buffers[2]->data()); + + for(int64_t i = 0; i < length; i++) + { + int actual_num_spaces = 0; + + int32_t start = offsets[i]; + int32_t end = offsets[i + 1]; + int32_t str_len = end - start; + char tmp_str[256] = {}; + std::memcpy(tmp_str, str + start, str_len); + bool is_only_alphas_or_spaces = true; + for(int32_t j = offsets[i]; j < offsets[i + 1]; j++) + { + bool is_space = str[j] == ' '; + actual_num_spaces += is_space; + is_only_alphas_or_spaces &= (is_space || std::isalpha(str[j])); + } + ASSERT_TRUE(is_only_alphas_or_spaces) << "Words must be composed only of letters, got " << tmp_str; + ASSERT_EQ(actual_num_spaces, expected_num_spaces) << "Wrong number of spaces in " << tmp_str; + } + } + + void VerifyCorrectNumberOfWords_FixedWidth(const Datum &d, int num_words, int byte_width) + { + int expected_num_spaces = num_words - 1; + int64_t length = d.length(); + const char *str = reinterpret_cast( + d.array()->buffers[1]->data()); + + for(int64_t i = 0; i < length; i++) + { + int actual_num_spaces = 0; + const char *row = str + i * byte_width; + bool is_only_alphas_or_spaces = true; + for(int32_t j = 0; j < byte_width && row[j]; j++) + { + bool is_space = row[j] == ' '; + actual_num_spaces += is_space; + is_only_alphas_or_spaces &= (is_space || std::isalpha(row[j])); + } + ASSERT_TRUE(is_only_alphas_or_spaces) << "Words must be composed only of letters, got " << row; + ASSERT_EQ(actual_num_spaces, expected_num_spaces) << "Wrong number of spaces in " << row; + } + } + + void CountModifiedComments(const Datum &d, int &good_count, int &bad_count) + { + int64_t length = d.length(); + const int32_t *offsets = reinterpret_cast( + d.array()->buffers[1]->data()); + const char *str = reinterpret_cast( + d.array()->buffers[2]->data()); + // Length of S_COMMENT is at most 100 + char tmp_string[101]; + for(int64_t i = 0; i < length; i++) + { + const char *row = str + offsets[i]; + int32_t row_length = offsets[i + 1] - offsets[i]; + std::memset(tmp_string, 0, sizeof(tmp_string)); + std::memcpy(tmp_string, row, row_length); + char *customer = std::strstr(tmp_string, "Customer"); + char *recommends = std::strstr(tmp_string, "Recommends"); + char *complaints = std::strstr(tmp_string, "Complaints"); + if(customer) + { + ASSERT_TRUE((recommends != nullptr) ^ (complaints != nullptr)); + if(recommends) + good_count++; + if(complaints) + bad_count++; + } + } + } + TEST(TpchNode, Supplier) { ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); @@ -54,13 +277,34 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + int64_t kExpectedRows = 10000; int64_t num_rows = 0; + + std::unordered_set seen_suppkey; + int good_count = 0; + int bad_count = 0; 
for(auto &batch : res) { ValidateBatch(batch); + VerifyUniqueKey( + seen_suppkey, + batch[0], + /*min=*/1, + /*max=*/static_cast(kExpectedRows)); + VerifyStringAndNumber_FixedWidth(batch[1], batch[0], /*byte_width=*/25, "Supplie#r"); + VerifyVString(batch[2], /*min_length=*/10, /*max_length=*/40); + VerifyNationKey(batch[3]); + VerifyPhone(batch[4]); + VerifyDecimalsBetween(batch[5], -99999, 999999); + CountModifiedComments(batch[6], good_count, bad_count); num_rows += batch.length; } - ASSERT_EQ(num_rows, 10000); + ASSERT_EQ(seen_suppkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + ASSERT_EQ(good_count, 5); + ASSERT_EQ(bad_count, 5); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Part) @@ -74,13 +318,47 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + int64_t kExpectedRows = 200000; int64_t num_rows = 0; + + std::unordered_set seen_partkey; for(auto &batch : res) { ValidateBatch(batch); + VerifyUniqueKey( + seen_partkey, + batch[0], + /*min=*/1, + /*max=*/static_cast(kExpectedRows)); + VerifyCorrectNumberOfWords_Varlen( + batch[1], + /*num_words*=*/5); + VerifyStringAndNumber_FixedWidth( + batch[2], + Datum(), + /*byte_width=*/25, + "Manufacturer#", + /*verify_padding=*/false); + VerifyStringAndNumber_FixedWidth( + batch[3], + Datum(), + /*byte_width=*/10, + "Brand#", + /*verify_padding=*/false); + VerifyCorrectNumberOfWords_Varlen( + batch[4], + /*num_words=*/3); + VerifyAllBetween(batch[5], /*min=*/1, /*max=*/50); + VerifyCorrectNumberOfWords_FixedWidth( + batch[6], + /*num_words=*/2, + /*byte_width=*/10); num_rows += batch.length; } - ASSERT_EQ(num_rows, 200000); + ASSERT_EQ(seen_partkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, PartSupp) @@ -101,6 +379,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 800000); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Customer) @@ -121,6 +400,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 150000); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Orders) @@ -141,6 +421,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 1500000); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Lineitem) @@ -158,6 +439,7 @@ namespace arrow { ValidateBatch(batch); } + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Nation) @@ -178,6 +460,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 25); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Region) @@ -198,6 +481,7 @@ namespace arrow num_rows += batch.length; } ASSERT_EQ(num_rows, 5); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } } } From 426156360b1eea2bde815dbedcb2874211c6074b Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 9 Mar 2022 22:46:30 -0800 Subject: [PATCH 08/34] Finish writing basic sanity checks of generated data --- cpp/src/arrow/compute/exec/tpch_node.cc | 3 +- cpp/src/arrow/compute/exec/tpch_node_test.cc | 272 +++++++++++++++++-- 2 files changed, 249 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 496b44a1dc0..cdd7377975f 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -1197,6 +1197,7 @@ namespace arrow { for(; ipartsupp < 
kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) ps_partkey[batch_offset++] = p_partkey[ipart]; + if(ipartsupp == kPartSuppRowsPerPart) { ipartsupp = 0; @@ -1229,7 +1230,7 @@ namespace arrow { RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPKEY)); int32_t *ps_suppkey = reinterpret_cast( - tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->buffers[1]->mutable_data()); + tld.partsupp[ibatch][PARTSUPP::PS_SUPPKEY].array()->buffers[1]->mutable_data()); int64_t next_run = std::min(batch_size_, ps_to_generate - irow); int64_t batch_offset = 0; diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index 6253075b85f..2be2bafda8d 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -34,11 +34,16 @@ #include "arrow/array/validate.h" #include +#include namespace arrow { namespace compute { + static constexpr uint32_t STARTDATE = 8035; // January 1, 1992 is 8035 days after January 1, 1970 + static constexpr uint32_t CURRENTDATE = 9298; // June 17, 1995 is 9298 days after January 1, 1970 + static constexpr uint32_t ENDDATE = 10591; // December 12, 1998 is 10591 days after January 1, 1970 + void ValidateBatch(const ExecBatch &batch) { for(const Datum &d : batch.values) @@ -62,6 +67,36 @@ namespace arrow } } + void VerifyStringAndNumber_Single( + const char *row, + const char *prefix, + const int64_t i, + const int32_t *nums, + int byte_width, + bool verify_padding) + { + size_t num_offset = std::strlen(prefix); + ASSERT_EQ(std::memcmp(row, prefix, num_offset), 0) << row << ", prefix=" << prefix << ", i=" << i; + const char *num_str = row + num_offset; + int64_t num = 0; + int ibyte = static_cast(num_offset); + for(; *num_str && ibyte < byte_width; ibyte++) + { + num *= 10; + ASSERT_TRUE(std::isdigit(*num_str)); + num += *num_str++ - '0'; + } + if(nums) + { + ASSERT_EQ(static_cast(num), nums[i]); + } + if(verify_padding) + { + int num_chars = ibyte - num_offset; + ASSERT_GE(num_chars, 9); + } + } + void VerifyStringAndNumber_FixedWidth( const Datum &strings, const Datum &numbers, @@ -81,29 +116,46 @@ namespace arrow numbers.array()->buffers[1]->data()); } - size_t num_offset = std::strlen(prefix); for(int64_t i = 0; i < length; i++) { const char *row = str + i * byte_width; - ASSERT_EQ(std::memcmp(row, prefix, num_offset), 0) << row << ", prefix=" << prefix << ", i=" << i; - const char *num_str = row + num_offset; - int64_t num = 0; - int ibyte = static_cast(num_offset); - for(; *num_str && ibyte < byte_width; ibyte++) - { - num *= 10; - ASSERT_TRUE(std::isdigit(*num_str)); - num += *num_str++ - '0'; - } - if(nums) - { - ASSERT_EQ(static_cast(num), nums[i]); - } - if(verify_padding) - { - int num_chars = ibyte - num_offset; - ASSERT_GE(num_chars, 9); - } + VerifyStringAndNumber_Single(row, prefix, i, nums, byte_width, verify_padding); + } + } + + void VerifyStringAndNumber_Varlen( + const Datum &strings, + const Datum &numbers, + const char *prefix, + bool verify_padding = true) + { + int64_t length = strings.length(); + const int32_t *offsets = reinterpret_cast( + strings.array()->buffers[1]->data()); + const char *str = reinterpret_cast( + strings.array()->buffers[2]->data()); + + const int32_t *nums = nullptr; + if(numbers.kind() != Datum::NONE) + { + ASSERT_EQ(length, numbers.length()); + nums = reinterpret_cast( + numbers.array()->buffers[1]->data()); + } + + for(int64_t i = 0; i < length; i++) + { + char tmp_str[256] = {}; + int32_t start = 
offsets[i]; + int32_t str_len = offsets[i + 1] - offsets[i]; + std::memcpy(tmp_str, str + start, str_len); + VerifyStringAndNumber_Single( + tmp_str, + prefix, + i, + nums, + sizeof(tmp_str), + verify_padding); } } @@ -129,6 +181,18 @@ namespace arrow } } + void VerifyModuloBetween(const Datum &d, int32_t min, int32_t max, int32_t mod) + { + int64_t length = d.length(); + const int32_t *n = reinterpret_cast(d.array()->buffers[1]->data()); + for(int64_t i = 0; i < length; i++) + { + int32_t m = n[i] % mod; + ASSERT_GE(m, min) << "Value must be between " << min << " and " << max << " mod " << mod << ", " << n[i] << " % " << mod << " = " << m; + ASSERT_LE(m, max) << "Value must be between " << min << " and " << max << " mod " << mod << ", " << n[i] << " % " << mod << " = " << m; + } + } + void VerifyAllBetween(const Datum &d, int32_t min, int32_t max) { int64_t length = d.length(); @@ -214,7 +278,10 @@ namespace arrow } } - void VerifyCorrectNumberOfWords_FixedWidth(const Datum &d, int num_words, int byte_width) + void VerifyCorrectNumberOfWords_FixedWidth( + const Datum &d, + int num_words, + int byte_width) { int expected_num_spaces = num_words - 1; int64_t length = d.length(); @@ -237,6 +304,41 @@ namespace arrow } } + void VerifyOneOf(const Datum &d, const std::unordered_set &possibilities) + { + int64_t length = d.length(); + const char *col = reinterpret_cast( + d.array()->buffers[1]->data()); + for(int64_t i = 0; i < length; i++) + ASSERT_TRUE(possibilities.find(col[i]) != possibilities.end()); + } + + void VerifyOneOf( + const Datum &d, + int32_t byte_width, + const std::unordered_set &possibilities) + { + int64_t length = d.length(); + const char *col = reinterpret_cast( + d.array()->buffers[1]->data()); + for(int64_t i = 0; i < length; i++) + { + const char *row = col + i * byte_width; + char tmp_str[256] = {}; + std::memcpy(tmp_str, row, byte_width); + ASSERT_TRUE(possibilities.find(tmp_str) != possibilities.end()) << tmp_str << " is not a valid string."; + } + } + + void CountInstances(std::unordered_map &counts, const Datum &d) + { + int64_t length = d.length(); + const int32_t *nums = reinterpret_cast( + d.array()->buffers[1]->data()); + for(int64_t i = 0; i < length; i++) + counts[nums[i]]++; + } + void CountModifiedComments(const Datum &d, int &good_count, int &bad_count) { int64_t length = d.length(); @@ -372,13 +474,24 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + constexpr int64_t kExpectedRows = 800000; int64_t num_rows = 0; + + std::unordered_map counts; for(auto &batch : res) { ValidateBatch(batch); + CountInstances(counts, batch[0]); + VerifyAllBetween(batch[2], 1, 9999); + VerifyDecimalsBetween(batch[3], 100, 100000); num_rows += batch.length; } - ASSERT_EQ(num_rows, 800000); + for(auto &partkey : counts) + ASSERT_EQ(partkey.second, 4) << "Key " << partkey.first << " has count " << partkey.second; + ASSERT_EQ(counts.size(), kExpectedRows / 4); + + ASSERT_EQ(num_rows, kExpectedRows); arrow::internal::GetCpuThreadPool()->WaitForIdle(); } @@ -393,13 +506,35 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + const int64_t kExpectedRows = 150000; int64_t num_rows = 0; + + std::unordered_set seen_custkey; for(auto &batch : res) { ValidateBatch(batch); + VerifyUniqueKey( + seen_custkey, + batch[0], + /*min=*/1, + /*max=*/static_cast(kExpectedRows)); + VerifyStringAndNumber_Varlen( + 
batch[1], + batch[0], + "Customer#"); + VerifyVString(batch[2], /*min=*/10, /*max=*/40); + VerifyNationKey(batch[3]); + VerifyPhone(batch[4]); + VerifyDecimalsBetween(batch[5], -99999, 999999); + VerifyCorrectNumberOfWords_FixedWidth( + batch[6], + /*num_words=*/1, + /*byte_width=*/10); num_rows += batch.length; } - ASSERT_EQ(num_rows, 150000); + ASSERT_EQ(seen_custkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); arrow::internal::GetCpuThreadPool()->WaitForIdle(); } @@ -414,13 +549,39 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + constexpr int64_t kExpectedRows = 1500000; int64_t num_rows = 0; + + std::unordered_set seen_orderkey; for(auto &batch : res) { ValidateBatch(batch); + VerifyUniqueKey( + seen_orderkey, + batch[0], + /*min=*/1, + /*max=*/static_cast(4 * kExpectedRows)); + VerifyAllBetween(batch[1], /*min=*/1, /*max=*/static_cast(kExpectedRows)); + VerifyModuloBetween(batch[1], /*min=*/1, /*max=*/2, /*mod=*/3); + VerifyOneOf(batch[2], { 'F', 'O', 'P' }); + VerifyAllBetween(batch[4], STARTDATE, ENDDATE - 151); + VerifyOneOf(batch[5], + /*byte_width=*/15, + { + "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW", + }); + VerifyStringAndNumber_FixedWidth( + batch[6], + Datum(), + /*byte_width=*/15, + "Clerk#", + /*verify_padding=*/true); + VerifyAllBetween(batch[7], /*min=*/0, /*max=*/0); num_rows += batch.length; } - ASSERT_EQ(num_rows, 1500000); + ASSERT_EQ(seen_orderkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); arrow::internal::GetCpuThreadPool()->WaitForIdle(); } @@ -435,9 +596,38 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + std::unordered_map counts; for(auto &batch : res) { ValidateBatch(batch); + CountInstances(counts, batch[0]); + VerifyAllBetween(batch[1], /*min=*/1, /*max=*/200000); + VerifyAllBetween(batch[3], /*min=*/1, /*max=*/7); + VerifyDecimalsBetween(batch[4], /*min=*/100, /*max=*/5000); + VerifyDecimalsBetween(batch[6], /*min=*/0, /*max=*/10); + VerifyDecimalsBetween(batch[7], /*min=*/0, /*max=*/8); + VerifyOneOf(batch[8], { 'R', 'A', 'N' }); + VerifyOneOf(batch[9], { 'O', 'F' }); + VerifyAllBetween(batch[10], STARTDATE + 1, ENDDATE - 151 + 121); + VerifyAllBetween(batch[11], STARTDATE + 30, ENDDATE - 151 + 90); + VerifyAllBetween(batch[12], STARTDATE + 2, ENDDATE - 151 + 121 + 30); + VerifyOneOf( + batch[13], + /*byte_width=*/25, + { + "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN", + }); + VerifyOneOf( + batch[14], + /*byte_width=*/10, + { + "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB", + }); + } + for(auto &count : counts) + { + ASSERT_GE(count.second, 1); + ASSERT_LE(count.second, 7); } arrow::internal::GetCpuThreadPool()->WaitForIdle(); } @@ -453,13 +643,33 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + constexpr int64_t kExpectedRows = 25; int64_t num_rows = 0; + + std::unordered_set seen_nationkey; for(auto &batch : res) { ValidateBatch(batch); + VerifyUniqueKey(seen_nationkey, batch[0], 0, kExpectedRows - 1); + VerifyOneOf( + batch[1], + /*byte_width=*/25, + { + "ALGERIA", "ARGENTINA", "BRAZIL", + "CANADA", "EGYPT", "ETHIOPIA", + "FRANCE", "GERMANY", "INDIA", + "INDONESIA", "IRAN", "IRAQ", + "JAPAN", "JORDAN", "KENYA", + "MOROCCO", "MOZAMBIQUE", "PERU", + "CHINA", "ROMANIA", "SAUDI ARABIA", + 
"VIETNAM", "RUSSIA", "UNITED KINGDOM", + "UNITED STATES" + }); + VerifyAllBetween(batch[2], 0, 4); num_rows += batch.length; } - ASSERT_EQ(num_rows, 25); + ASSERT_EQ(num_rows, kExpectedRows); arrow::internal::GetCpuThreadPool()->WaitForIdle(); } @@ -474,10 +684,22 @@ namespace arrow std::ignore = *sink.AddToPlan(plan.get()); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); + + constexpr int64_t kExpectedRows = 5; int64_t num_rows = 0; + + std::unordered_set seen_regionkey; for(auto &batch : res) { ValidateBatch(batch); + VerifyUniqueKey(seen_regionkey, batch[0], 0, kExpectedRows - 1); + VerifyOneOf( + batch[1], + /*byte_width=*/25, + { + "AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST" + }); + num_rows += batch.length; } ASSERT_EQ(num_rows, 5); From fb1252897cab1ef7f9c04c06364ae504c210a220 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 10 Mar 2022 10:05:09 -0600 Subject: [PATCH 09/34] ass pcg subdir to cmake --- cpp/src/arrow/vendored/CMakeLists.txt | 1 + cpp/src/arrow/vendored/pcg/CMakeLists.txt | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 cpp/src/arrow/vendored/pcg/CMakeLists.txt diff --git a/cpp/src/arrow/vendored/CMakeLists.txt b/cpp/src/arrow/vendored/CMakeLists.txt index 8d4c323d28a..0fdabc49f7c 100644 --- a/cpp/src/arrow/vendored/CMakeLists.txt +++ b/cpp/src/arrow/vendored/CMakeLists.txt @@ -19,3 +19,4 @@ arrow_install_all_headers("arrow/vendored") add_subdirectory(datetime) add_subdirectory(double-conversion) +add_subdirectory(pcg) diff --git a/cpp/src/arrow/vendored/pcg/CMakeLists.txt b/cpp/src/arrow/vendored/pcg/CMakeLists.txt new file mode 100644 index 00000000000..b4f0aec561b --- /dev/null +++ b/cpp/src/arrow/vendored/pcg/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +arrow_install_all_headers("arrow/vendored/pcg") From 5dd308921cd1a172adb1701945a8bbcfd2b321b8 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 11 Mar 2022 07:37:33 -0600 Subject: [PATCH 10/34] Weston's suggestions for writing --- r/src/compute-exec.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index b00162f36e9..a2bc5b5a342 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -418,17 +418,9 @@ void Tpch_Dbgen_Write( StopIfNotOk(plan->StartProducing()); - // If the generator is destroyed before being completely drained, inform plan - std::shared_ptr stop_producing{nullptr, [plan](...) 
{ - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); + cpp11::message("Just after start"); - if (not_finished_yet) { - plan->StopProducing(); - } - }}; + StopIfNotOk(plan->finished().status()); } #endif From 004d1d4698e0a7afd503d82d3883312cd35b9fff Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 24 Mar 2022 13:28:33 -0700 Subject: [PATCH 11/34] oops, need to export --- r/src/arrowExports.cpp | 17 +++++++++++++++++ r/src/compute-exec.cpp | 1 + 2 files changed, 18 insertions(+) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 8b60ae7bd00..8a781cf0bea 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1749,6 +1749,23 @@ extern "C" SEXP _arrow_ExecNode_TableSourceNode(SEXP plan_sexp, SEXP table_sexp) } #endif +// compute-exec.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr Tpch_Dbgen(const std::shared_ptr& plan, int scale_factor, std::string table_name); +extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type plan(plan_sexp); + arrow::r::Input::type scale_factor(scale_factor_sexp); + arrow::r::Input::type table_name(table_name_sexp); + return cpp11::as_sexp(Tpch_Dbgen(plan, scale_factor, table_name)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ + Rf_error("Cannot call Tpch_Dbgen(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) void Tpch_Dbgen_Write(const std::shared_ptr& plan, int scale_factor, std::string table_name, const std::shared_ptr& filesystem, std::string base_dir, arrow::dataset::ExistingDataBehavior existing_data_behavior, int max_partitions); diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index a2bc5b5a342..9d3ccc73b52 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -301,6 +301,7 @@ std::shared_ptr ExecNode_TableSourceNode( return MakeExecNodeOrStop("table_source", plan.get(), {}, options); } +// [[arrow::export]] std::shared_ptr Tpch_Dbgen( const std::shared_ptr& plan, int scale_factor, From 0b8ff2c1739e1b9af69593830909071d1010418f Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 11 Mar 2022 11:44:18 -0800 Subject: [PATCH 12/34] Respond to Weston comments (except the task scheduler thing) --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 15 +- cpp/src/arrow/compute/exec/tpch_node.cc | 847 +++++++++---------- cpp/src/arrow/compute/exec/tpch_node.h | 9 + cpp/src/arrow/compute/exec/tpch_node_test.cc | 42 +- 4 files changed, 457 insertions(+), 456 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 9b4fad177e4..e015bcf2abd 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -28,7 +28,7 @@ namespace arrow namespace compute { -std::shared_ptr Plan_Q1(AsyncGenerator> &sink_gen, int scale_factor) +std::shared_ptr Plan_Q1(AsyncGenerator> *sink_gen, int scale_factor) { ExecContext *ctx = default_exec_context(); *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); @@ -132,7 +132,7 @@ std::shared_ptr Plan_Q1(AsyncGenerator> &sin SortKey l_returnflag_key("L_RETURNFLAG"); SortKey l_linestatus_key("L_LINESTATUS"); SortOptions sort_opts({ l_returnflag_key, l_linestatus_key }); - OrderBySinkNodeOptions 
order_by_opts(sort_opts, &sink_gen); + OrderBySinkNodeOptions order_by_opts(sort_opts, sink_gen); Declaration filter_decl("filter", { Declaration::Input(lineitem) }, filter_opts); Declaration project_decl("project", project_opts); @@ -156,21 +156,14 @@ static void BM_Tpch_Q1(benchmark::State &st) { st.PauseTiming(); AsyncGenerator> sink_gen; - std::shared_ptr plan = Plan_Q1(sink_gen, st.range(0)); + std::shared_ptr plan = Plan_Q1(&sink_gen, st.range(0)); st.ResumeTiming(); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); -#ifndef NDEBUG - st.PauseTiming(); - for(auto &batch : res) - std::cout << batch.ToString() << std::endl; - st.ResumeTiming(); -#endif } } -//BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 1000)->ArgNames({ "SF" }); -//BENCHMARK(BM_Tpch_Q1)->RangeMultiplier(10)->Range(1, 10)->ArgNames({ "SF" }); BENCHMARK(BM_Tpch_Q1)->Args({1})->ArgNames({ "SF" }); + } } diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index cdd7377975f..22de9bbad92 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -19,297 +19,6 @@ namespace arrow namespace compute { - class TpchText - { - public: - Status InitIfNeeded(random::pcg32_fast &rng); - Result GenerateComments( - size_t num_comments, - size_t min_length, - size_t max_length, - random::pcg32_fast &rng); - - private: - bool GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices); - bool GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr); - - bool GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); - - bool GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr); - - std::atomic done_ = { false }; - int64_t generated_offset_ = 0; - std::mutex text_guard_; - std::unique_ptr text_; - static constexpr int64_t kChunkSize = 8192; - static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB - }; - - class TpchTableGenerator - { - public: - using OutputBatchCallback = std::function; - using FinishedCallback = std::function; - using GenerateFn = std::function; - using ScheduleCallback = std::function; - using AbortCallback = std::function; - - virtual Status Init( - std::vector columns, - float scale_factor, - int64_t batch_size) = 0; - - virtual Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback schedule_callback) = 0; - - void Abort(AbortCallback abort_callback) - { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) - { - abort_callback(); - } - } - - virtual std::shared_ptr schema() const = 0; - - virtual ~TpchTableGenerator() = default; - - protected: - std::atomic done_ = { false }; - std::atomic batches_outputted_ = { 0 }; - }; - - int GetNumDigits(int64_t x) - { - // This 
if statement chain is for MAXIMUM SPEED - /* - ., - . _,'f----.._ - |\ ,-'"/ | ,' - |,_ ,--. / - /,-. ,'`. (_ - f o| o|__ "`-. - ,-._.,--'_ `. _.,-` - `"' ___.,'` j,-' - `-.__.,--' - */ - // Source: https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c - ARROW_DCHECK(x >= 0); - if(x < 10ll) return 1; - if(x < 100ll) return 2; - if(x < 1000ll) return 3; - if(x < 10000ll) return 4; - if(x < 100000ll) return 5; - if(x < 1000000ll) return 6; - if(x < 10000000ll) return 7; - if(x < 100000000ll) return 8; - if(x < 1000000000ll) return 9; - if(x < 10000000000ll) return 10; - if(x < 100000000000ll) return 11; - if(x < 1000000000000ll) return 12; - if(x < 10000000000000ll) return 13; - if(x < 100000000000000ll) return 14; - if(x < 1000000000000000ll) return 15; - if(x < 10000000000000000ll) return 16; - if(x < 100000000000000000ll) return 17; - if(x < 1000000000000000000ll) return 18; - return -1; - } - - void AppendNumberPaddedToNineDigits(char *out, int64_t x) - { - // We do all of this to avoid calling snprintf, which does a lot of crazy - // locale stuff. On Windows and MacOS this can get suuuuper slow - int num_digits = GetNumDigits(x); - int num_padding_zeros = std::max(9 - num_digits, 0); - std::memset(out, '0', static_cast(num_padding_zeros)); - while(x > 0) - { - *(out + num_padding_zeros + num_digits - 1) = ('0' + x % 10); - num_digits -= 1; - x /= 10; - } - } - - Result> SetOutputColumns( - const std::vector &columns, - const std::vector> &types, - const std::unordered_map &name_map, - std::vector &gen_list) - { - gen_list.clear(); - std::vector> fields; - if(columns.empty()) - { - fields.resize(name_map.size()); - gen_list.resize(name_map.size()); - for(auto pair : name_map) - { - int col_idx = pair.second; - fields[col_idx] = field(pair.first, types[col_idx]); - gen_list[col_idx] = col_idx; - } - return schema(std::move(fields)); - } - else - { - for(const std::string &col : columns) - { - auto entry = name_map.find(col); - if(entry == name_map.end()) - return Status::Invalid("Not a valid column name"); - int col_idx = static_cast(entry->second); - fields.push_back(field(col, types[col_idx])); - gen_list.push_back(col_idx); - } - return schema(std::move(fields)); - } - } - - static TpchText g_text; - - Status TpchText::InitIfNeeded(random::pcg32_fast &rng) - { - if(done_.load()) - return Status::OK(); - - { - std::lock_guard lock(text_guard_); - if(!text_) - { - ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); - } - } - char *out = reinterpret_cast(text_->mutable_data()); - char temp_buff[kChunkSize]; - while(done_.load() == false) - { - int64_t known_valid_offset = 0; - int64_t try_offset = 0; - while(GenerateSentence(try_offset, rng, temp_buff)) - known_valid_offset = try_offset; - - { - std::lock_guard lock(text_guard_); - if(done_.load()) - return Status::OK(); - int64_t bytes_remaining = kTextBytes - generated_offset_; - int64_t memcpy_size = std::min(known_valid_offset, bytes_remaining); - std::memcpy(out + generated_offset_, temp_buff, memcpy_size); - generated_offset_ += memcpy_size; - if(generated_offset_ == kTextBytes) - done_.store(true); - } - } - return Status::OK(); - } - - Result TpchText::GenerateComments( - size_t num_comments, - size_t min_length, - size_t max_length, - random::pcg32_fast &rng) - { - RETURN_NOT_OK(InitIfNeeded(rng)); - std::uniform_int_distribution length_dist(min_length, max_length); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); - 
int32_t *offsets = reinterpret_cast(offset_buffer->mutable_data()); - offsets[0] = 0; - for(size_t i = 1; i <= num_comments; i++) - offsets[i] = offsets[i - 1] + length_dist(rng); - - ARROW_ASSIGN_OR_RAISE(std::unique_ptr comment_buffer, AllocateBuffer(offsets[num_comments])); - char *comments = reinterpret_cast(comment_buffer->mutable_data()); - for(size_t i = 0; i < num_comments; i++) - { - size_t length = offsets[i + 1] - offsets[i]; - std::uniform_int_distribution offset_dist(0, kTextBytes - length); - size_t offset_in_text = offset_dist(rng); - std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); - } - ArrayData ad(utf8(), num_comments, { nullptr, std::move(offset_buffer), std::move(comment_buffer) }); - return std::move(ad); - } - - Result RandomVString( - random::pcg32_fast &rng, - int64_t num_rows, - int32_t min_length, - int32_t max_length) - { - std::uniform_int_distribution length_dist(min_length, max_length); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((num_rows + 1) * sizeof(int32_t))); - int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); - offsets[0] = 0; - for(int64_t i = 1; i <= num_rows; i++) - offsets[i] = offsets[i - 1] + length_dist(rng); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[num_rows])); - char *str = reinterpret_cast(str_buff->mutable_data()); - - // Spec says to pick random alphanumeric characters from a set of at least - // 64 symbols. Now, let's think critically here: 26 letters in the alphabet, - // so 52 total for upper and lower case, and 10 possible digits gives 62 - // characters... - // dbgen solves this by including a space and a comma as well, so we'll - // copy that. - const char alpha_numerics[65] = - "0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,"; - std::uniform_int_distribution char_dist(0, 63); - for(int32_t i = 0; i < offsets[num_rows]; i++) - str[i] = alpha_numerics[char_dist(rng)]; - - ArrayData ad(utf8(), num_rows, { nullptr, std::move(offset_buff), std::move(str_buff) }); - return std::move(ad); - } - - void AppendNumber(char *&out, int num_digits, int32_t x) - { - out += (num_digits - 1); - while(x > 0) - { - *out-- = '0' + (x % 10); - x /= 10; - } - out += (num_digits + 1); - } - - void GeneratePhoneNumber( - char *out, - random::pcg32_fast &rng, - int32_t country) - { - std::uniform_int_distribution three_digit(100, 999); - std::uniform_int_distribution four_digit(1000, 9999); - - int32_t country_code = country + 10; - int32_t l1 = three_digit(rng); - int32_t l2 = three_digit(rng); - int32_t l3 = four_digit(rng); - AppendNumber(out, 2, country_code); - *out++ = '-'; - AppendNumber(out, 3, l1); - *out++ = '-'; - AppendNumber(out, 3, l2); - *out++ = '-'; - AppendNumber(out, 4, l3); - } - - static constexpr uint32_t STARTDATE = 8035; // January 1, 1992 is 8035 days after January 1, 1970 - static constexpr uint32_t CURRENTDATE = 9298; // June 17, 1995 is 9298 days after January 1, 1970 - static constexpr uint32_t ENDDATE = 10591; // December 12, 1998 is 10591 days after January 1, 1970 - const char *NameParts[] = { "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", "blanched", "blue", @@ -415,30 +124,132 @@ namespace arrow }; static constexpr size_t kNumAdverbs = sizeof(Adverbs) / sizeof(Adverbs[0]); - const char *Prepositions[] = - { - "about ", "above ", "according to ", "across ", "after ", "against ", "along ", "alongside of ", "among ", - "around ", "at ", "atop ", "before ", "behind ", "beneath ", "beside 
", "besides ", "between ", "beyond ", - "beyond ", "by ", "despite ", "during ", "except ", "for ", "from ", "in place of ", "inside ", "instead of ", - "into ", "near ", "of ", "on ", "outside ", "over ", "past ", "since ", "through ", "throughout ", "to ", - "toward ", "under ", "until ", "up ", "upon ", "without ", "with ", "within ", + const char *Prepositions[] = + { + "about ", "above ", "according to ", "across ", "after ", "against ", "along ", "alongside of ", "among ", + "around ", "at ", "atop ", "before ", "behind ", "beneath ", "beside ", "besides ", "between ", "beyond ", + "beyond ", "by ", "despite ", "during ", "except ", "for ", "from ", "in place of ", "inside ", "instead of ", + "into ", "near ", "of ", "on ", "outside ", "over ", "past ", "since ", "through ", "throughout ", "to ", + "toward ", "under ", "until ", "up ", "upon ", "without ", "with ", "within ", + }; + static constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); + + const char *Auxiliaries[] = + { + "do ", "may ", "might ", "shall ", "will ", "would ", "can ", "could ", "should ", "ought to ", "must ", + "will have to ", "shall have to ", "could have to ", "should have to ", "must have to ", "need to ", "try to ", + }; + static constexpr size_t kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); + + const char *Terminators[] = + { + ".", ";", ":", "?", "!", "--", + }; + static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); + + // The spec says to generate a 300 MB string according to a grammar. This is a + // concurrent implementation of the generator. Each thread generates the text in + // (up to) 8KB chunks of text. The generator maintains a cursor into the + // 300 MB buffer. After generating the chunk, the cursor is incremented + // to reserve space, and the chunk is memcpy-d in. + // This text is used to generate the COMMENT columns. To generate a comment, the spec + // says to pick a random length and a random offset into the 300 MB buffer (it does + // not specify it should be word/sentence aligned), and that slice of text becomes + // the comment. 
+ class TpchPseudotext + { + public: + Status EnsureInitialized(random::pcg32_fast &rng); + Result GenerateComments( + size_t num_comments, + size_t min_length, + size_t max_length, + random::pcg32_fast &rng); + + private: + bool GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices); + bool GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr); + + bool GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); + bool GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); + + bool GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr); + + std::atomic generated_offset_ = { 0 }; + std::mutex text_guard_; + std::unique_ptr text_; + static constexpr int64_t kChunkSize = 8192; + static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB }; - static constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); - const char *Auxiliaries[] = + static TpchPseudotext g_text; + + Status TpchPseudotext::EnsureInitialized(random::pcg32_fast &rng) { - "do ", "may ", "might ", "shall ", "will ", "would ", "can ", "could ", "should ", "ought to ", "must ", - "will have to ", "shall have to ", "could have to ", "should have to ", "must have to ", "need to ", "try to ", - }; - static constexpr size_t kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); + if(generated_offset_.load() >= kTextBytes) + return Status::OK(); - const char *Terminators[] = + { + std::lock_guard lock(text_guard_); + if(!text_) + { + ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); + } + } + char *out = reinterpret_cast(text_->mutable_data()); + char temp_buff[kChunkSize]; + while(generated_offset_.load() < kTextBytes) + { + int64_t known_valid_offset = 0; + int64_t try_offset = 0; + while(GenerateSentence(try_offset, rng, temp_buff)) + known_valid_offset = try_offset; + + int64_t offset = generated_offset_.fetch_add(known_valid_offset); + if(offset >= kTextBytes) + return Status::OK(); + int64_t bytes_remaining = kTextBytes - offset; + int64_t memcpy_size = std::min(known_valid_offset, bytes_remaining); + std::memcpy(out + offset, temp_buff, memcpy_size); + } + return Status::OK(); + } + + Result TpchPseudotext::GenerateComments( + size_t num_comments, + size_t min_length, + size_t max_length, + random::pcg32_fast &rng) { - ".", ";", ":", "?", "!", "--", - }; - static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); + RETURN_NOT_OK(EnsureInitialized(rng)); + std::uniform_int_distribution length_dist(min_length, max_length); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); + int32_t *offsets = reinterpret_cast(offset_buffer->mutable_data()); + offsets[0] = 0; + for(size_t i = 1; i <= num_comments; i++) + offsets[i] = offsets[i - 1] + length_dist(rng); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr comment_buffer, 
AllocateBuffer(offsets[num_comments])); + char *comments = reinterpret_cast(comment_buffer->mutable_data()); + for(size_t i = 0; i < num_comments; i++) + { + size_t length = offsets[i + 1] - offsets[i]; + std::uniform_int_distribution offset_dist(0, kTextBytes - length); + size_t offset_in_text = offset_dist(rng); + std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); + } + ArrayData ad(utf8(), num_comments, { nullptr, std::move(offset_buffer), std::move(comment_buffer) }); + return std::move(ad); + } - bool TpchText::GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices) + bool TpchPseudotext::GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices) { std::uniform_int_distribution dist(0, num_choices - 1); const char *word = words[dist(rng)]; @@ -450,37 +261,37 @@ namespace arrow return true; } - bool TpchText::GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr) { return GenerateWord(offset, rng, arr, Nouns, kNumNouns); } - bool TpchText::GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr) { return GenerateWord(offset, rng, arr, Verbs, kNumVerbs); } - bool TpchText::GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr) { return GenerateWord(offset, rng, arr, Adjectives, kNumAdjectives); } - bool TpchText::GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr) { return GenerateWord(offset, rng, arr, Adverbs, kNumAdverbs); } - bool TpchText::GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr) { return GenerateWord(offset, rng, arr, Prepositions, kNumPrepositions); } - bool TpchText::GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr) { return GenerateWord(offset, rng, arr, Auxiliaries, kNumAuxiliaries); } - bool TpchText::GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr) { bool result = GenerateWord(offset, rng, arr, Terminators, kNumTerminators); // Swap the space with the terminator @@ -489,7 +300,7 @@ namespace arrow return result; } - bool TpchText::GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 3); const char *comma_space = ", "; @@ -510,9 +321,9 @@ namespace arrow success &= GenerateNoun(offset, rng, arr); break; case 3: - GenerateAdverb(offset, rng, arr); - GenerateAdjective(offset, rng, arr); - GenerateNoun(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); break; default: Unreachable("Random number should be between 0 and 3 inclusive"); @@ -521,7 +332,7 @@ namespace arrow return success; } - bool TpchText::GenerateVerbPhrase(int64_t &offset, 
random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 3); bool success = true; @@ -550,7 +361,7 @@ namespace arrow return success; } - bool TpchText::GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) { const char *the_space = "the "; bool success = true; @@ -560,7 +371,7 @@ namespace arrow return success; } - bool TpchText::GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr) + bool TpchPseudotext::GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr) { std::uniform_int_distribution dist(0, 4); bool success = true; @@ -584,19 +395,13 @@ namespace arrow success &= GenerateTerminator(offset, rng, arr); break; case 3: - success &= GenerateNounPhrase(offset, rng, arr); - success &= GenerateVerbPhrase(offset, rng, arr); - success &= GenerateNounPhrase(offset, rng, arr); - success &= GenerateTerminator(offset, rng, arr); - break; - case 4: success &= GenerateNounPhrase(offset, rng, arr); success &= GeneratePrepositionalPhrase(offset, rng, arr); success &= GenerateVerbPhrase(offset, rng, arr); success &= GenerateNounPhrase(offset, rng, arr); success &= GenerateTerminator(offset, rng, arr); break; - case 5: + case 4: success &= GenerateNounPhrase(offset, rng, arr); success &= GeneratePrepositionalPhrase(offset, rng, arr); success &= GenerateVerbPhrase(offset, rng, arr); @@ -610,6 +415,184 @@ namespace arrow return success; } + class TpchTableGenerator + { + public: + using OutputBatchCallback = std::function; + using FinishedCallback = std::function; + using GenerateFn = std::function; + using ScheduleCallback = std::function; + using AbortCallback = std::function; + + virtual Status Init( + std::vector columns, + float scale_factor, + int64_t batch_size) = 0; + + virtual Status StartProducing( + size_t num_threads, + OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) = 0; + + bool Abort() + { + bool expected = false; + return done_.compare_exchange_strong(expected, true); + } + + virtual std::shared_ptr schema() const = 0; + + virtual ~TpchTableGenerator() = default; + + protected: + std::atomic done_ = { false }; + std::atomic batches_outputted_ = { 0 }; + }; + + int GetNumDigits(int64_t x) + { + // This if statement chain is for MAXIMUM SPEED + // Source: https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c + ARROW_DCHECK(x >= 0); + if(x < 10ll) return 1; + if(x < 100ll) return 2; + if(x < 1000ll) return 3; + if(x < 10000ll) return 4; + if(x < 100000ll) return 5; + if(x < 1000000ll) return 6; + if(x < 10000000ll) return 7; + if(x < 100000000ll) return 8; + if(x < 1000000000ll) return 9; + if(x < 10000000000ll) return 10; + if(x < 100000000000ll) return 11; + if(x < 1000000000000ll) return 12; + if(x < 10000000000000ll) return 13; + if(x < 100000000000000ll) return 14; + if(x < 1000000000000000ll) return 15; + if(x < 10000000000000000ll) return 16; + if(x < 100000000000000000ll) return 17; + if(x < 1000000000000000000ll) return 18; + return -1; + } + + void AppendNumberPaddedToNineDigits(char *out, int64_t x) + { + // We do all of this to avoid calling snprintf, which needs to handle locale, + // which can be slow, especially on Mac and Windows. 
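  // For reference, the call this loop replaces is roughly the following (a
  // sketch, not used by the patch; it also writes a trailing NUL that this
  // routine does not):
  //
  //   char buf[16];
  //   std::snprintf(buf, sizeof(buf), "%09" PRId64, x);  // requires <cinttypes>
  //
  // i.e. the value zero-padded to nine digits, e.g. 42 becomes "000000042".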
+ int num_digits = GetNumDigits(x); + int num_padding_zeros = std::max(9 - num_digits, 0); + std::memset(out, '0', static_cast(num_padding_zeros)); + while(x > 0) + { + *(out + num_padding_zeros + num_digits - 1) = ('0' + x % 10); + num_digits -= 1; + x /= 10; + } + } + + Result> SetOutputColumns( + const std::vector &columns, + const std::vector> &types, + const std::unordered_map &name_map, + std::vector &gen_list) + { + gen_list.clear(); + std::vector> fields; + if(columns.empty()) + { + fields.resize(name_map.size()); + gen_list.resize(name_map.size()); + for(auto pair : name_map) + { + int col_idx = pair.second; + fields[col_idx] = field(pair.first, types[col_idx]); + gen_list[col_idx] = col_idx; + } + return schema(std::move(fields)); + } + else + { + for(const std::string &col : columns) + { + auto entry = name_map.find(col); + if(entry == name_map.end()) + return Status::Invalid("Not a valid column name"); + int col_idx = static_cast(entry->second); + fields.push_back(field(col, types[col_idx])); + gen_list.push_back(col_idx); + } + return schema(std::move(fields)); + } + } + + Result RandomVString( + random::pcg32_fast &rng, + int64_t num_rows, + int32_t min_length, + int32_t max_length) + { + std::uniform_int_distribution length_dist(min_length, max_length); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((num_rows + 1) * sizeof(int32_t))); + int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for(int64_t i = 1; i <= num_rows; i++) + offsets[i] = offsets[i - 1] + length_dist(rng); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[num_rows])); + char *str = reinterpret_cast(str_buff->mutable_data()); + + // Spec says to pick random alphanumeric characters from a set of at least + // 64 symbols. Now, let's think critically here: 26 letters in the alphabet, + // so 52 total for upper and lower case, and 10 possible digits gives 62 + // characters... + // dbgen solves this by including a space and a comma as well, so we'll + // copy that. 
+ const char alpha_numerics[65] = + "0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,"; + std::uniform_int_distribution char_dist(0, 63); + for(int32_t i = 0; i < offsets[num_rows]; i++) + str[i] = alpha_numerics[char_dist(rng)]; + + ArrayData ad(utf8(), num_rows, { nullptr, std::move(offset_buff), std::move(str_buff) }); + return std::move(ad); + } + + void AppendNumber(char *&out, int num_digits, int32_t x) + { + out += (num_digits - 1); + while(x > 0) + { + *out-- = '0' + (x % 10); + x /= 10; + } + out += (num_digits + 1); + } + + void GeneratePhoneNumber( + char *out, + random::pcg32_fast &rng, + int32_t country) + { + std::uniform_int_distribution three_digit(100, 999); + std::uniform_int_distribution four_digit(1000, 9999); + + int32_t country_code = country + 10; + int32_t l1 = three_digit(rng); + int32_t l2 = three_digit(rng); + int32_t l3 = four_digit(rng); + AppendNumber(out, 2, country_code); + *out++ = '-'; + AppendNumber(out, 3, l1); + *out++ = '-'; + AppendNumber(out, 3, l2); + *out++ = '-'; + AppendNumber(out, 4, l3); + } + + static constexpr uint32_t kStartDate = 8035; // January 1, 1992 is 8035 days after January 1, 1970 + static constexpr uint32_t kCurrentDate = 9298; // June 17, 1995 is 9298 days after January 1, 1970 + static constexpr uint32_t kEndDate = 10591; // December 12, 1998 is 10591 days after January 1, 1970 + using GenerateColumnFn = std::function; class PartAndPartSupplierGenerator { @@ -628,8 +611,8 @@ namespace arrow thread_local_data_.resize(num_threads); for(ThreadLocalData &tld : thread_local_data_) { - // 5 is the maximum number of different strings we need to concatenate - tld.string_indices.resize(5 * batch_size_); + constexpr int kMaxNumDistinctStrings = 5; + tld.string_indices.resize(kMaxNumDistinctStrings * batch_size_); } part_rows_to_generate_ = static_cast(scale_factor_ * 200000); } @@ -648,12 +631,12 @@ namespace arrow Result> SetPartOutputColumns(const std::vector &cols) { - return SetOutputColumns(cols, part_types_, part_name_map_, part_cols_); + return SetOutputColumns(cols, kPartTypes, kPartNameMap, part_cols_); } Result> SetPartSuppOutputColumns(const std::vector &cols) { - return SetOutputColumns(cols, partsupp_types_, partsupp_name_map_, partsupp_cols_); + return SetOutputColumns(cols, kPartsuppTypes, kPartsuppNameMap, partsupp_cols_); } Result> NextPartBatch() @@ -662,14 +645,13 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; { std::lock_guard lock(part_output_queue_mutex_); - bool all_generated = part_rows_generated_ == part_rows_to_generate_; if(!part_output_queue_.empty()) { ExecBatch batch = std::move(part_output_queue_.front()); part_output_queue_.pop(); return std::move(batch); } - else if(all_generated) + else if(part_rows_generated_ == part_rows_to_generate_) { return util::nullopt; } @@ -692,11 +674,9 @@ namespace arrow RETURN_NOT_OK(InitPartsupp(thread_index)); for(int col : part_cols_) - { - RETURN_NOT_OK(part_generators_[col](thread_index)); - } + RETURN_NOT_OK(kPartGenerators[col](thread_index)); for(int col : partsupp_cols_) - RETURN_NOT_OK(partsupp_generators_[col](thread_index)); + RETURN_NOT_OK(kPartsuppGenerators[col](thread_index)); std::vector part_result(part_cols_.size()); for(size_t i = 0; i < part_cols_.size(); i++) @@ -766,9 +746,9 @@ namespace arrow RETURN_NOT_OK(InitPartsupp(thread_index)); for(int col : part_cols_) - RETURN_NOT_OK(part_generators_[col](thread_index)); + RETURN_NOT_OK(kPartGenerators[col](thread_index)); for(int col : partsupp_cols_) - 
RETURN_NOT_OK(partsupp_generators_[col](thread_index)); + RETURN_NOT_OK(kPartsuppGenerators[col](thread_index)); if(!part_cols_.empty()) { std::vector part_result(part_cols_.size()); @@ -843,25 +823,25 @@ namespace arrow #define MAKE_STRING_MAP(col) \ { #col, PART::col }, - const std::unordered_map part_name_map_ = + const std::unordered_map kPartNameMap = { FOR_EACH_PART_COLUMN(MAKE_STRING_MAP) }; #undef MAKE_STRING_MAP #define MAKE_STRING_MAP(col) \ { #col, PARTSUPP::col }, - const std::unordered_map partsupp_name_map_ = + const std::unordered_map kPartsuppNameMap = { FOR_EACH_PARTSUPP_COLUMN(MAKE_STRING_MAP) }; #undef MAKE_STRING_MAP #define MAKE_FN_ARRAY(col) \ [this](size_t thread_index) { return this->col(thread_index); }, - std::vector part_generators_ = + std::vector kPartGenerators = { FOR_EACH_PART_COLUMN(MAKE_FN_ARRAY) }; - std::vector partsupp_generators_ = + std::vector kPartsuppGenerators = { FOR_EACH_PARTSUPP_COLUMN(MAKE_FN_ARRAY) }; @@ -869,7 +849,7 @@ namespace arrow #undef FOR_EACH_LINEITEM_COLUMN #undef FOR_EACH_ORDERS_COLUMN - const std::vector> part_types_ = + const std::vector> kPartTypes = { int32(), utf8(), @@ -882,7 +862,7 @@ namespace arrow utf8(), }; - const std::vector> partsupp_types_ = + const std::vector> kPartsuppTypes = { int32(), int32(), @@ -895,9 +875,9 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; ARROW_DCHECK(tld.part[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[column]); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[column]); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.part_to_generate * byte_width)); - ArrayData ad(part_types_[column], tld.part_to_generate, { nullptr, std::move(buff) }); + ArrayData ad(kPartTypes[column], tld.part_to_generate, { nullptr, std::move(buff) }); tld.part[column] = std::move(ad); return Status::OK(); } @@ -956,7 +936,7 @@ namespace arrow *row++ = ' '; } } - ArrayData ad(part_types_[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); + ArrayData ad(kPartTypes[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); Datum datum(ad); tld.part[PART::P_NAME] = std::move(datum); } @@ -973,7 +953,7 @@ namespace arrow const size_t manufacturer_length = std::strlen(manufacturer); RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_MFGR)); char *p_mfgr = reinterpret_cast(tld.part[PART::P_MFGR].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_MFGR]); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); for(int64_t irow = 0; irow < tld.part_to_generate; irow++) { std::strncpy(p_mfgr + byte_width * irow, manufacturer, byte_width); @@ -998,8 +978,8 @@ namespace arrow tld.part[PART::P_MFGR].array()->buffers[1]->data()); char *p_brand = reinterpret_cast( tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_BRAND]); - int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_MFGR]); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_BRAND]); + int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); const size_t mfgr_id_offset = std::strlen("Manufacturer#"); for(int64_t irow = 0; irow < tld.part_to_generate; irow++) { @@ -1058,7 +1038,7 @@ namespace arrow row += length; } } - ArrayData 
ad(part_types_[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); + ArrayData ad(kPartTypes[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); Datum datum(ad); tld.part[PART::P_TYPE] = std::move(datum); } @@ -1090,7 +1070,7 @@ namespace arrow RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_CONTAINER)); char *p_container = reinterpret_cast( tld.part[PART::P_CONTAINER].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*part_types_[PART::P_CONTAINER]); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_CONTAINER]); for(int64_t irow = 0; irow < tld.part_to_generate; irow++) { int container1_idx = dist1(tld.rng); @@ -1164,9 +1144,9 @@ namespace arrow Status AllocatePartSuppBatch(size_t thread_index, size_t ibatch, int column) { ThreadLocalData &tld = thread_local_data_[thread_index]; - int32_t byte_width = arrow::internal::GetByteWidth(*partsupp_types_[column]); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartsuppTypes[column]); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); - ArrayData ad(partsupp_types_[column], batch_size_, { nullptr, std::move(buff) }); + ArrayData ad(kPartsuppTypes[column], batch_size_, { nullptr, std::move(buff) }); tld.partsupp[ibatch][column] = std::move(ad); return Status::OK(); } @@ -1327,8 +1307,8 @@ namespace arrow { std::vector part; std::vector string_indices; - int64_t part_to_generate; - int64_t partkey_start; + int64_t part_to_generate{0}; + int64_t partkey_start{0}; std::vector> partsupp; std::bitset generated_partsupp; @@ -1341,16 +1321,16 @@ namespace arrow std::mutex partsupp_output_queue_mutex_; std::queue part_output_queue_; std::queue partsupp_output_queue_; - int64_t batch_size_; - float scale_factor_; - int64_t part_rows_to_generate_; - int64_t part_rows_generated_; + int64_t batch_size_{0}; + float scale_factor_{0}; + int64_t part_rows_to_generate_{0}; + int64_t part_rows_generated_{0}; std::vector part_cols_; std::vector partsupp_cols_; ThreadIndexer thread_indexer_; - std::atomic part_batches_generated_ = { 0 }; - std::atomic partsupp_batches_generated_ = { 0 }; + std::atomic part_batches_generated_ = { 0 }; + std::atomic partsupp_batches_generated_ = { 0 }; static constexpr int64_t kPartSuppRowsPerPart = 4; }; @@ -1390,12 +1370,12 @@ namespace arrow Result> SetOrdersOutputColumns(const std::vector &cols) { - return SetOutputColumns(cols, orders_types_, orders_name_map_, orders_cols_); + return SetOutputColumns(cols, kOrdersTypes, kOrdersNameMap, orders_cols_); } Result> SetLineItemOutputColumns(const std::vector &cols) { - return SetOutputColumns(cols, lineitem_types_, lineitem_name_map_, lineitem_cols_); + return SetOutputColumns(cols, kLineitemTypes, kLineitemNameMap, lineitem_cols_); } Result> NextOrdersBatch() @@ -1432,9 +1412,9 @@ namespace arrow tld.generated_lineitem.reset(); for(int col : orders_cols_) - RETURN_NOT_OK(orders_generators_[col](thread_index)); + RETURN_NOT_OK(kOrdersGenerators[col](thread_index)); for(int col : lineitem_cols_) - RETURN_NOT_OK(lineitem_generators_[col](thread_index)); + RETURN_NOT_OK(kLineitemGenerators[col](thread_index)); std::vector orders_result(orders_cols_.size()); for(size_t i = 0; i < orders_cols_.size(); i++) @@ -1520,9 +1500,9 @@ namespace arrow } for(int col : orders_cols_) - RETURN_NOT_OK(orders_generators_[col](thread_index)); + RETURN_NOT_OK(kOrdersGenerators[col](thread_index)); 
for(int col : lineitem_cols_) - RETURN_NOT_OK(lineitem_generators_[col](thread_index)); + RETURN_NOT_OK(kLineitemGenerators[col](thread_index)); if(!orders_cols_.empty()) { @@ -1610,25 +1590,25 @@ namespace arrow #define MAKE_STRING_MAP(col) \ { #col, ORDERS::col }, - const std::unordered_map orders_name_map_ = + const std::unordered_map kOrdersNameMap = { FOR_EACH_ORDERS_COLUMN(MAKE_STRING_MAP) }; #undef MAKE_STRING_MAP #define MAKE_STRING_MAP(col) \ { #col, LINEITEM::col }, - const std::unordered_map lineitem_name_map_ = + const std::unordered_map kLineitemNameMap = { FOR_EACH_LINEITEM_COLUMN(MAKE_STRING_MAP) }; #undef MAKE_STRING_MAP #define MAKE_FN_ARRAY(col) \ [this](size_t thread_index) { return this->col(thread_index); }, - std::vector orders_generators_ = + const std::vector kOrdersGenerators = { FOR_EACH_ORDERS_COLUMN(MAKE_FN_ARRAY) }; - std::vector lineitem_generators_ = + const std::vector kLineitemGenerators = { FOR_EACH_LINEITEM_COLUMN(MAKE_FN_ARRAY) }; @@ -1636,7 +1616,7 @@ namespace arrow #undef FOR_EACH_LINEITEM_COLUMN #undef FOR_EACH_ORDERS_COLUMN - const std::vector> orders_types_ = + const std::vector> kOrdersTypes = { int32(), int32(), @@ -1649,7 +1629,7 @@ namespace arrow utf8() }; - const std::vector> lineitem_types_ = + const std::vector> kLineitemTypes = { int32(), int32(), @@ -1673,9 +1653,9 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; ARROW_DCHECK(tld.orders[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[column]); + int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[column]); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.orders_to_generate * byte_width)); - ArrayData ad(orders_types_[column], tld.orders_to_generate, { nullptr, std::move(buff) }); + ArrayData ad(kOrdersTypes[column], tld.orders_to_generate, { nullptr, std::move(buff) }); tld.orders[column] = std::move(ad); return Status::OK(); } @@ -1833,7 +1813,7 @@ namespace arrow { RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERDATE)); - std::uniform_int_distribution dist(STARTDATE, ENDDATE - 151); + std::uniform_int_distribution dist(kStartDate, kEndDate - 151); uint32_t *o_orderdate = reinterpret_cast( tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->mutable_data()); for(int64_t i = 0; i < tld.orders_to_generate; i++) @@ -1848,7 +1828,7 @@ namespace arrow if(tld.orders[ORDERS::O_ORDERPRIORITY].kind() == Datum::NONE) { RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERPRIORITY)); - int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[ORDERS::O_ORDERPRIORITY]); + int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_ORDERPRIORITY]); std::uniform_int_distribution dist(0, kNumPriorities - 1); char *o_orderpriority = reinterpret_cast( tld.orders[ORDERS::O_ORDERPRIORITY].array()->buffers[1]->mutable_data()); @@ -1867,7 +1847,7 @@ namespace arrow if(tld.orders[ORDERS::O_CLERK].kind() == Datum::NONE) { RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); - int32_t byte_width = arrow::internal::GetByteWidth(*orders_types_[ORDERS::O_CLERK]); + int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_CLERK]); int64_t max_clerk_id = static_cast(scale_factor_ * 1000); std::uniform_int_distribution dist(1, max_clerk_id); char *o_clerk = reinterpret_cast( @@ -1935,9 +1915,9 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.lineitem[ibatch][column].kind() == Datum::NONE) { - 
int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[column]); + int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[column]); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); - ArrayData ad(lineitem_types_[column], batch_size_, { nullptr, std::move(buff) }); + ArrayData ad(kLineitemTypes[column], batch_size_, { nullptr, std::move(buff) }); tld.lineitem[ibatch][column] = std::move(ad); out_batch_offset = 0; } @@ -2233,7 +2213,7 @@ namespace arrow for(int64_t i = 0; i < next_run; i++, batch_offset++) { - if(l_receiptdate[batch_offset] <= CURRENTDATE) + if(l_receiptdate[batch_offset] <= kCurrentDate) { uint32_t r = dist(tld.rng); l_returnflag[batch_offset] = (r % 2 == 1) ? 'R' : 'A'; @@ -2272,7 +2252,7 @@ namespace arrow for(int64_t i = 0; i < next_run; i++, batch_offset++) { - if(l_shipdate[batch_offset] > CURRENTDATE) + if(l_shipdate[batch_offset] > kCurrentDate) l_linestatus[batch_offset] = 'O'; else l_linestatus[batch_offset] = 'F'; @@ -2397,7 +2377,7 @@ namespace arrow if(!tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT]) { tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT] = true; - int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[LINEITEM::L_SHIPINSTRUCT]); + int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPINSTRUCT]); size_t ibatch = 0; std::uniform_int_distribution dist(0, kNumInstructions - 1); for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) @@ -2430,7 +2410,7 @@ namespace arrow if(!tld.generated_lineitem[LINEITEM::L_SHIPMODE]) { tld.generated_lineitem[LINEITEM::L_SHIPMODE] = true; - int32_t byte_width = arrow::internal::GetByteWidth(*lineitem_types_[LINEITEM::L_SHIPMODE]); + int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPMODE]); size_t ibatch = 0; std::uniform_int_distribution dist(0, kNumModes - 1); for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) @@ -2530,8 +2510,8 @@ namespace arrow rows_generated_.store(0); ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( columns, - types_, - name_map_, + kTypes, + kNameMap, gen_list_)); random::pcg32_fast rng; @@ -2604,21 +2584,21 @@ namespace arrow #undef MAKE_ENUM #define MAKE_STRING_MAP(col) \ { #col, SUPPLIER::col }, - const std::unordered_map name_map_ = + const std::unordered_map kNameMap = { FOR_EACH_COLUMN(MAKE_STRING_MAP) }; #undef MAKE_STRING_MAP #define MAKE_FN_ARRAY(col) \ [this](size_t thread_index) { return this->col(thread_index); }, - std::vector generators_ = + std::vector kGenerators = { FOR_EACH_COLUMN(MAKE_FN_ARRAY) }; #undef MAKE_FN_ARRAY #undef FOR_EACH_COLUMN - std::vector> types_ = + std::vector> kTypes = { int32(), fixed_size_binary(25), @@ -2644,7 +2624,7 @@ namespace arrow tld.batch.resize(SUPPLIER::kNumCols); std::fill(tld.batch.begin(), tld.batch.end(), Datum()); for(int col : gen_list_) - RETURN_NOT_OK(generators_[col](thread_index)); + RETURN_NOT_OK(kGenerators[col](thread_index)); std::vector result(gen_list_.size()); for(size_t i = 0; i < gen_list_.size(); i++) @@ -2670,9 +2650,9 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*types_[column]); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); - ArrayData ad(types_[column], tld.to_generate, { nullptr, std::move(buff) }); + 
ArrayData ad(kTypes[column], tld.to_generate, { nullptr, std::move(buff) }); tld.batch[column] = std::move(ad); return Status::OK(); } @@ -2702,7 +2682,7 @@ namespace arrow const int32_t *s_suppkey = reinterpret_cast( tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->data()); RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NAME)); - int32_t byte_width = arrow::internal::GetByteWidth(*types_[SUPPLIER::S_NAME]); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_NAME]); char *s_name = reinterpret_cast( tld.batch[SUPPLIER::S_NAME].array()->buffers[1]->mutable_data()); // Look man, I'm just following the spec ok? Section 4.2.3 as of March 1 2022 @@ -2752,7 +2732,7 @@ namespace arrow { RETURN_NOT_OK(S_NATIONKEY(thread_index)); RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_PHONE)); - int32_t byte_width = arrow::internal::GetByteWidth(*types_[SUPPLIER::S_PHONE]); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_PHONE]); const int32_t *s_nationkey = reinterpret_cast( tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->data()); char *s_phone = reinterpret_cast( @@ -3009,8 +2989,8 @@ namespace arrow rows_generated_.store(0); ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( columns, - types_, - name_map_, + kTypes, + kNameMap, gen_list_)); return Status::OK(); } @@ -3058,21 +3038,21 @@ namespace arrow #undef MAKE_ENUM #define MAKE_STRING_MAP(col) \ { #col, CUSTOMER::col }, - const std::unordered_map name_map_ = + const std::unordered_map kNameMap = { FOR_EACH_COLUMN(MAKE_STRING_MAP) }; #undef MAKE_STRING_MAP #define MAKE_FN_ARRAY(col) \ [this](size_t thread_index) { return this->col(thread_index); }, - std::vector generators_ = + std::vector kGenerators = { FOR_EACH_COLUMN(MAKE_FN_ARRAY) }; #undef MAKE_FN_ARRAY #undef FOR_EACH_COLUMN - std::vector> types_ = + std::vector> kTypes = { int32(), utf8(), @@ -3099,7 +3079,7 @@ namespace arrow tld.batch.resize(CUSTOMER::kNumCols); std::fill(tld.batch.begin(), tld.batch.end(), Datum()); for(int col : gen_list_) - RETURN_NOT_OK(generators_[col](thread_index)); + RETURN_NOT_OK(kGenerators[col](thread_index)); std::vector result(gen_list_.size()); for(size_t i = 0; i < gen_list_.size(); i++) @@ -3128,9 +3108,9 @@ namespace arrow { ThreadLocalData &tld = thread_local_data_[thread_index]; ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*types_[column]); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); - ArrayData ad(types_[column], tld.to_generate, { nullptr, std::move(buff) }); + ArrayData ad(kTypes[column], tld.to_generate, { nullptr, std::move(buff) }); tld.batch[column] = std::move(ad); return Status::OK(); } @@ -3218,7 +3198,7 @@ namespace arrow { RETURN_NOT_OK(C_NATIONKEY(thread_index)); RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_PHONE)); - int32_t byte_width = arrow::internal::GetByteWidth(*types_[CUSTOMER::C_PHONE]); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_PHONE]); const int32_t *c_nationkey = reinterpret_cast( tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->data()); char *c_phone = reinterpret_cast( @@ -3255,7 +3235,7 @@ namespace arrow if(tld.batch[CUSTOMER::C_MKTSEGMENT].kind() == Datum::NONE) { RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_MKTSEGMENT)); - int32_t byte_width = arrow::internal::GetByteWidth(*types_[CUSTOMER::C_MKTSEGMENT]); + int32_t 
byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_MKTSEGMENT]); char *c_mktsegment = reinterpret_cast( tld.batch[CUSTOMER::C_MKTSEGMENT].array()->buffers[1]->mutable_data()); std::uniform_int_distribution dist(0, kNumSegments - 1); @@ -3291,10 +3271,10 @@ namespace arrow OutputBatchCallback output_callback_; FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; - int64_t rows_to_generate_; - std::atomic rows_generated_; - float scale_factor_; - int64_t batch_size_; + int64_t rows_to_generate_{0}; + std::atomic rows_generated_ = { 0 }; + float scale_factor_{0}; + int64_t batch_size_{0}; std::vector gen_list_; std::shared_ptr schema_; }; @@ -3457,8 +3437,8 @@ namespace arrow ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( columns, - types_, - name_map_, + kTypes, + kNameMap, column_indices_)); return Status::OK(); } @@ -3469,16 +3449,16 @@ namespace arrow FinishedCallback finished_callback, ScheduleCallback /*schedule_task_callback*/) override { - std::shared_ptr N_NATIONKEY_buffer = Buffer::Wrap(N_NATIONKEY, sizeof(N_NATIONKEY)); + std::shared_ptr N_NATIONKEY_buffer = Buffer::Wrap(kNationKey, sizeof(kNationKey)); ArrayData N_NATIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(N_NATIONKEY_buffer) }); ARROW_ASSIGN_OR_RAISE(std::unique_ptr N_NAME_buffer, AllocateBuffer(kRowCount * kNameByteWidth)); char *N_NAME = reinterpret_cast(N_NAME_buffer->mutable_data()); for(size_t i = 0; i < kRowCount; i++) - std::strncpy(N_NAME + kNameByteWidth * i, country_names_[i], kNameByteWidth); + std::strncpy(N_NAME + kNameByteWidth * i, kCountryNames[i], kNameByteWidth); ArrayData N_NAME_arraydata(fixed_size_binary(kNameByteWidth), kRowCount, { nullptr, std::move(N_NAME_buffer) }); - std::shared_ptr N_REGIONKEY_buffer = Buffer::Wrap(N_REGIONKEY, sizeof(N_REGIONKEY)); + std::shared_ptr N_REGIONKEY_buffer = Buffer::Wrap(kRegionKey, sizeof(kRegionKey)); ArrayData N_REGIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(N_REGIONKEY_buffer) }); ARROW_ASSIGN_OR_RAISE(Datum N_COMMENT_datum, g_text.GenerateComments(kRowCount, 31, 114, rng_)); @@ -3510,8 +3490,8 @@ namespace arrow static constexpr size_t kRowCount = 25; static constexpr int32_t kNameByteWidth = 25; - const int32_t N_NATIONKEY[kRowCount] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; - const char *country_names_[kRowCount] = + const int32_t kNationKey[kRowCount] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; + const char *kCountryNames[kRowCount] = { "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", @@ -3523,7 +3503,7 @@ namespace arrow "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES" }; - const int32_t N_REGIONKEY[kRowCount] = { 0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1 }; + const int32_t kRegionKey[kRowCount] = { 0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1 }; struct NATION { @@ -3536,7 +3516,7 @@ namespace arrow }; }; - const std::unordered_map name_map_ = + const std::unordered_map kNameMap = { { "N_NATIONKEY", NATION::N_NATIONKEY }, { "N_NAME", NATION::N_NAME }, @@ -3544,7 +3524,7 @@ namespace arrow { "N_COMMENT", NATION::N_COMMENT }, }; - std::vector> types_ = + std::vector> kTypes = { int32(), fixed_size_binary(kNameByteWidth), @@ -3567,8 +3547,8 @@ namespace arrow ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( columns, - types_, - name_map_, + kTypes, + kNameMap, column_indices_)); return Status::OK(); } @@ -3579,14 
+3559,14 @@ namespace arrow FinishedCallback finished_callback, ScheduleCallback /*schedule_task_callback*/) override { - std::shared_ptr R_REGIONKEY_buffer = Buffer::Wrap(R_REGIONKEY, sizeof(R_REGIONKEY)); + std::shared_ptr R_REGIONKEY_buffer = Buffer::Wrap(kRegionKey, sizeof(kRegionKey)); ArrayData R_REGIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(R_REGIONKEY_buffer) }); ARROW_ASSIGN_OR_RAISE(std::unique_ptr R_NAME_buffer, AllocateBuffer(kRowCount * kNameByteWidth)); char *R_NAME_data = reinterpret_cast(R_NAME_buffer->mutable_data()); for(size_t i = 0; i < kRowCount; i++) - std::strncpy(R_NAME_data + kNameByteWidth * i, region_names_[i], kNameByteWidth); - ArrayData R_NAME_arraydata(types_[static_cast(REGION::R_NAME)], kRowCount, { nullptr, std::move(R_NAME_buffer) }); + std::strncpy(R_NAME_data + kNameByteWidth * i, kRegionNames[i], kNameByteWidth); + ArrayData R_NAME_arraydata(kTypes[static_cast(REGION::R_NAME)], kRowCount, { nullptr, std::move(R_NAME_buffer) }); ARROW_ASSIGN_OR_RAISE(Datum R_COMMENT_datum, g_text.GenerateComments(kRowCount, 31, 115, rng_)); @@ -3609,8 +3589,8 @@ namespace arrow static constexpr size_t kRowCount = 5; static constexpr int32_t kNameByteWidth = 25; - const int32_t R_REGIONKEY[kRowCount] = { 0, 1, 2, 3, 4 }; - const char *region_names_[kRowCount] = + const int32_t kRegionKey[kRowCount] = { 0, 1, 2, 3, 4 }; + const char *kRegionNames[kRowCount] = { "AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST" }; @@ -3626,14 +3606,14 @@ namespace arrow }; }; - const std::unordered_map name_map_ = + const std::unordered_map kNameMap = { { "R_REGIONKEY", REGION::R_REGIONKEY }, { "R_NAME", REGION::R_NAME }, { "R_COMMENT", REGION::R_COMMENT }, }; - const std::vector> types_ = + const std::vector> kTypes = { int32(), fixed_size_binary(kNameByteWidth), @@ -3705,7 +3685,8 @@ namespace arrow void StopProducing() override { - generator_->Abort([this]() { this->finished_.MarkFinished(); }); + if(generator_->Abort()) + this->finished_.MarkFinished(); } Future<> finished() override diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index 1d904a2b5f0..5f37489caba 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -33,9 +33,17 @@ namespace arrow class OrdersAndLineItemGenerator; class PartAndPartSupplierGenerator; + class TpchGen { public: + /* + * \brief Create a factory for nodes that generate TPC-H data + * + * Note: Individual tables will reference each other. It is important that you only create a single TpchGen + * instance for each plan and then you can create nodes for each table from that single TpchGen instance. + * Note: Every batch will be scheduled as a new task using the ExecPlan's scheduler. 
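 *
 * A minimal usage sketch (illustrative; error handling elided, and any subset
 * of columns may be requested -- an empty list selects every column):
 *
 *   TpchGen gen = *TpchGen::Make(plan, 0.25f);
 *   ExecNode *nation = *gen.Nation();
 *   ExecNode *region = *gen.Region({"R_REGIONKEY", "R_NAME"});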
+ */ static Result Make(ExecPlan *plan, float scale_factor = 1.0f, int64_t batch_size = 4096); Result Supplier(std::vector columns = {}); @@ -52,6 +60,7 @@ namespace arrow : plan_(plan), scale_factor_(scale_factor), batch_size_(batch_size), + part_and_part_supp_generator_(nullptr), orders_and_line_item_generator_(nullptr) {} diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index 2be2bafda8d..b2c29769370 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -40,9 +40,8 @@ namespace arrow { namespace compute { - static constexpr uint32_t STARTDATE = 8035; // January 1, 1992 is 8035 days after January 1, 1970 - static constexpr uint32_t CURRENTDATE = 9298; // June 17, 1995 is 9298 days after January 1, 1970 - static constexpr uint32_t ENDDATE = 10591; // December 12, 1998 is 10591 days after January 1, 1970 + static constexpr uint32_t kStartDate = 8035; // January 1, 1992 is 8035 days after January 1, 1970 + static constexpr uint32_t kEndDate = 10591; // December 12, 1998 is 10591 days after January 1, 1970 void ValidateBatch(const ExecBatch &batch) { @@ -60,10 +59,9 @@ namespace arrow int64_t num_keys = d.length(); for(int64_t i = 0; i < num_keys; i++) { - ASSERT_TRUE(seen.find(keys[i]) == seen.end()); + ASSERT_TRUE(seen.insert(keys[i]).second); ASSERT_LE(keys[i], max); ASSERT_GE(keys[i], min); - seen.insert(keys[i]); } } @@ -170,9 +168,9 @@ namespace arrow { int32_t start = off[i]; int32_t end = off[i + 1]; - int32_t length = end - start; - ASSERT_LE(length, max_length); - ASSERT_GE(length, min_length); + int32_t str_len = end - start; + ASSERT_LE(str_len, max_length); + ASSERT_GE(str_len, min_length); for(int32_t i = start; i < end; i++) { bool is_valid = std::isdigit(str[i]) || std::isalpha(str[i]) || str[i] == ',' || str[i] == ' '; @@ -368,6 +366,26 @@ namespace arrow } } + TEST(TpchNode, ScaleFactor) + { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get(), 0.25f); + ExecNode *table = *gen.Supplier(); + AsyncGenerator> sink_gen; + Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + int64_t kExpectedRows = 2500; + int64_t num_rows = 0; + for(auto &batch : res) + num_rows += batch.length; + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); + } + TEST(TpchNode, Supplier) { ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); @@ -565,7 +583,7 @@ namespace arrow VerifyAllBetween(batch[1], /*min=*/1, /*max=*/static_cast(kExpectedRows)); VerifyModuloBetween(batch[1], /*min=*/1, /*max=*/2, /*mod=*/3); VerifyOneOf(batch[2], { 'F', 'O', 'P' }); - VerifyAllBetween(batch[4], STARTDATE, ENDDATE - 151); + VerifyAllBetween(batch[4], kStartDate, kEndDate - 151); VerifyOneOf(batch[5], /*byte_width=*/15, { @@ -608,9 +626,9 @@ namespace arrow VerifyDecimalsBetween(batch[7], /*min=*/0, /*max=*/8); VerifyOneOf(batch[8], { 'R', 'A', 'N' }); VerifyOneOf(batch[9], { 'O', 'F' }); - VerifyAllBetween(batch[10], STARTDATE + 1, ENDDATE - 151 + 121); - VerifyAllBetween(batch[11], STARTDATE + 30, ENDDATE - 151 + 90); - VerifyAllBetween(batch[12], STARTDATE + 2, ENDDATE - 151 + 121 + 30); + VerifyAllBetween(batch[10], kStartDate + 1, kEndDate - 151 + 121); + 
VerifyAllBetween(batch[11], kStartDate + 30, kEndDate - 151 + 90); + VerifyAllBetween(batch[12], kStartDate + 2, kEndDate - 151 + 121 + 30); VerifyOneOf( batch[13], /*byte_width=*/25, From 5b2a26f3b3369a877f138e287125efe2f9c11e78 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 11 Mar 2022 14:57:00 -0800 Subject: [PATCH 13/34] Switch back to locks --- cpp/src/arrow/compute/exec/tpch_node.cc | 29 ++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 22de9bbad92..418211df4a8 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -147,6 +147,7 @@ namespace arrow }; static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); + // The spec says to generate a 300 MB string according to a grammar. This is a // concurrent implementation of the generator. Each thread generates the text in // (up to) 8KB chunks of text. The generator maintains a cursor into the @@ -182,7 +183,8 @@ namespace arrow bool GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr); - std::atomic generated_offset_ = { 0 }; + std::atomic done_ = { false }; + int64_t generated_offset_{0}; std::mutex text_guard_; std::unique_ptr text_; static constexpr int64_t kChunkSize = 8192; @@ -193,7 +195,7 @@ namespace arrow Status TpchPseudotext::EnsureInitialized(random::pcg32_fast &rng) { - if(generated_offset_.load() >= kTextBytes) + if(done_.load()) return Status::OK(); { @@ -205,19 +207,30 @@ namespace arrow } char *out = reinterpret_cast(text_->mutable_data()); char temp_buff[kChunkSize]; - while(generated_offset_.load() < kTextBytes) + + while(!done_.load()) { int64_t known_valid_offset = 0; int64_t try_offset = 0; while(GenerateSentence(try_offset, rng, temp_buff)) known_valid_offset = try_offset; - int64_t offset = generated_offset_.fetch_add(known_valid_offset); - if(offset >= kTextBytes) - return Status::OK(); - int64_t bytes_remaining = kTextBytes - offset; - int64_t memcpy_size = std::min(known_valid_offset, bytes_remaining); + bool last_one; + int64_t offset; + int64_t memcpy_size; + { + std::lock_guard lock(text_guard_); + if(done_.load()) + return Status::OK(); + int64_t bytes_remaining = kTextBytes - generated_offset_; + memcpy_size = std::min(known_valid_offset, bytes_remaining); + offset = generated_offset_; + generated_offset_ += memcpy_size; + last_one = generated_offset_ == kTextBytes; + } std::memcpy(out + offset, temp_buff, memcpy_size); + if(last_one) + done_.store(true); } return Status::OK(); } From 41904617b7a414e23486c64337d5e6c42fa9f055 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 11 Mar 2022 15:48:46 -0800 Subject: [PATCH 14/34] Async task group --- cpp/src/arrow/compute/exec/tpch_node.cc | 52 ++++++++++++++----------- cpp/src/arrow/compute/exec/tpch_node.h | 2 +- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 418211df4a8..bf2a4b76532 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -3641,15 +3641,17 @@ namespace arrow { public: TpchNode(ExecPlan *plan, + const char *name, std::unique_ptr generator) : ExecNode(plan, {}, {}, generator->schema(), /*num_outputs=*/1), + name_(name), generator_(std::move(generator)) { } const char *kind_name() const override { - return "TpchNode"; + return name_; } [[noreturn]] @@ -3699,12 
+3701,12 @@ namespace arrow void StopProducing() override { if(generator_->Abort()) - this->finished_.MarkFinished(); + std::ignore = task_group_.End(); } Future<> finished() override { - return finished_; + return task_group_.OnFinished(); } private: @@ -3716,7 +3718,7 @@ namespace arrow void FinishedCallback(int64_t total_num_batches) { outputs_[0]->InputFinished(this, static_cast(total_num_batches)); - finished_.MarkFinished(); + std::ignore = task_group_.End(); } Status ScheduleTaskCallback(std::function func) @@ -3724,16 +3726,19 @@ namespace arrow auto executor = plan_->exec_context()->executor(); if (executor) { - RETURN_NOT_OK(executor->Spawn([this, func] + RETURN_NOT_OK(task_group_.AddTask([&] { - size_t thread_index = thread_indexer_(); - Status status = func(thread_index); - if (!status.ok()) + return executor->Submit([this, func] { - StopProducing(); - ErrorIfNotOk(status); - return; - } + size_t thread_index = thread_indexer_(); + Status status = func(thread_index); + if (!status.ok()) + { + StopProducing(); + ErrorIfNotOk(status); + return; + } + }); })); } else @@ -3743,9 +3748,10 @@ namespace arrow return Status::OK(); } + const char *name_; std::unique_ptr generator_; - Future<> finished_ = Future<>::MakeFinished(); + util::AsyncTaskGroup task_group_; ThreadIndexer thread_indexer_; }; @@ -3756,16 +3762,16 @@ namespace arrow } template - Result TpchGen::CreateNode(std::vector columns) + Result TpchGen::CreateNode(const char *name, std::vector columns) { std::unique_ptr generator = arrow::internal::make_unique(); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); - return plan_->EmplaceNode(plan_, std::move(generator)); + return plan_->EmplaceNode(plan_, name, std::move(generator)); } Result TpchGen::Supplier(std::vector columns) { - return CreateNode(std::move(columns)); + return CreateNode("Supplier", std::move(columns)); } Result TpchGen::Part(std::vector columns) @@ -3776,7 +3782,7 @@ namespace arrow } std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); - return plan_->EmplaceNode(plan_, std::move(generator)); + return plan_->EmplaceNode(plan_, "Part", std::move(generator)); } Result TpchGen::PartSupp(std::vector columns) @@ -3787,12 +3793,12 @@ namespace arrow } std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); - return plan_->EmplaceNode(plan_, std::move(generator)); + return plan_->EmplaceNode(plan_, "PartSupp", std::move(generator)); } Result TpchGen::Customer(std::vector columns) { - return CreateNode(std::move(columns)); + return CreateNode("Customer", std::move(columns)); } Result TpchGen::Orders(std::vector columns) @@ -3803,7 +3809,7 @@ namespace arrow } std::unique_ptr generator = arrow::internal::make_unique(orders_and_line_item_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); - return plan_->EmplaceNode(plan_, std::move(generator)); + return plan_->EmplaceNode(plan_, "Orders", std::move(generator)); } Result TpchGen::Lineitem(std::vector columns) @@ -3814,17 +3820,17 @@ namespace arrow } std::unique_ptr generator = arrow::internal::make_unique(orders_and_line_item_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); - return plan_->EmplaceNode(plan_, std::move(generator)); + return plan_->EmplaceNode(plan_, 
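[PATCH 14/34] replaces the manually completed finished_ future with util::AsyncTaskGroup: tasks are registered with AddTask(), End() marks that no more tasks will be added, and OnFinished() yields a future that completes once both conditions hold. A rough standard-library analogue of that contract, written as a sketch under those assumptions rather than the actual Arrow class (it also assumes AddTask is never called after End):

#include <condition_variable>
#include <cstdint>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

class SimpleTaskGroup {
 public:
  void AddTask(std::function<void()> fn) {
    std::lock_guard<std::mutex> lock(mutex_);
    ++outstanding_;
    threads_.emplace_back([this, fn] {
      fn();
      std::lock_guard<std::mutex> inner(mutex_);
      if (--outstanding_ == 0 && ended_) cv_.notify_all();
    });
  }

  void End() {  // no more tasks will be added
    std::lock_guard<std::mutex> lock(mutex_);
    ended_ = true;
    if (outstanding_ == 0) cv_.notify_all();
  }

  void WaitForFinished() {  // stand-in for waiting on OnFinished()
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return ended_ && outstanding_ == 0; });
    lock.unlock();
    for (auto& t : threads_) t.join();
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  std::vector<std::thread> threads_;
  int64_t outstanding_ = 0;
  bool ended_ = false;
};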
"Lineitem", std::move(generator)); } Result TpchGen::Nation(std::vector columns) { - return CreateNode(std::move(columns)); + return CreateNode("Nation", std::move(columns)); } Result TpchGen::Region(std::vector columns) { - return CreateNode(std::move(columns)); + return CreateNode("Region", std::move(columns)); } } } diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index 5f37489caba..083f331c5e5 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -65,7 +65,7 @@ namespace arrow {} template - Result CreateNode(std::vector columns); + Result CreateNode(const char *name, std::vector columns); ExecPlan *plan_; float scale_factor_; From 23d82ae7d5944bb5d21dc669c7db8b3aad18a7e7 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 14 Mar 2022 11:47:38 -0700 Subject: [PATCH 15/34] Add static cast to make windows happy --- cpp/src/arrow/compute/exec/tpch_node.cc | 32 ++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index bf2a4b76532..fcd27ad780b 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -247,7 +247,7 @@ namespace arrow int32_t *offsets = reinterpret_cast(offset_buffer->mutable_data()); offsets[0] = 0; for(size_t i = 1; i <= num_comments; i++) - offsets[i] = offsets[i - 1] + length_dist(rng); + offsets[i] = offsets[i - 1] + static_cast(length_dist(rng)); ARROW_ASSIGN_OR_RAISE(std::unique_ptr comment_buffer, AllocateBuffer(offsets[num_comments])); char *comments = reinterpret_cast(comment_buffer->mutable_data()); @@ -905,7 +905,7 @@ namespace arrow tld.part[PART::P_PARTKEY].array()->buffers[1]->mutable_data()); for(int64_t i = 0; i < tld.part_to_generate; i++) { - p_partkey[i] = (tld.partkey_start + i + 1); + p_partkey[i] = static_cast(tld.partkey_start + i + 1); ARROW_DCHECK(1 <= p_partkey[i] && p_partkey[i] <= part_rows_to_generate_); } } @@ -917,7 +917,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.part[PART::P_NAME].kind() == Datum::NONE) { - std::uniform_int_distribution dist(0, static_cast(kNumNameParts - 1)); + std::uniform_int_distribution dist(0, static_cast(kNumNameParts - 1)); ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); offsets[0] = 0; @@ -926,12 +926,12 @@ namespace arrow size_t string_length = 0; for(int ipart = 0; ipart < 5; ipart++) { - uint8_t name_part_index = dist(tld.rng); + uint8_t name_part_index = static_cast(dist(tld.rng)); tld.string_indices[irow * 5 + ipart] = name_part_index; string_length += std::strlen(NameParts[name_part_index]); } // Add 4 because there is a space between each word (i.e. four spaces) - offsets[irow + 1] = offsets[irow] + string_length + 4; + offsets[irow + 1] = static_cast(offsets[irow] + string_length + 4); } // Add an extra byte for the space after in the very last string. 
ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate] + 1)); @@ -1013,7 +1013,7 @@ namespace arrow ThreadLocalData &tld = thread_local_data_[thread_index]; if(tld.part[PART::P_TYPE].kind() == Datum::NONE) { - using D = std::uniform_int_distribution; + using D = std::uniform_int_distribution; D dists[] = { D{ 0, static_cast(kNumTypes_1 - 1) }, @@ -1031,11 +1031,11 @@ namespace arrow size_t string_length = 0; for(int ipart = 0; ipart < 3; ipart++) { - uint8_t name_part_index = dists[ipart](tld.rng); + uint8_t name_part_index = static_cast(dists[ipart](tld.rng)); tld.string_indices[irow * 3 + ipart] = name_part_index; string_length += std::strlen(types[ipart][name_part_index]); } - offsets[irow + 1] = offsets[irow] + string_length; + offsets[irow + 1] = static_cast(offsets[irow] + string_length); } ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate])); char *strings = reinterpret_cast(string_buffer->mutable_data()); @@ -1683,7 +1683,7 @@ namespace arrow tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->mutable_data()); for(int64_t i = 0; i < tld.orders_to_generate; i++) { - int32_t orderkey_index = tld.orderkey_start + i; + int32_t orderkey_index = static_cast(tld.orderkey_start + i); int32_t index_of_run = orderkey_index / 8; int32_t index_in_run = orderkey_index % 8; o_orderkey[i] = (index_of_run * 32 + index_in_run + 1); @@ -1909,7 +1909,7 @@ namespace arrow tld.items_per_order.clear(); for(int64_t i = 0; i < tld.orders_to_generate; i++) { - int64_t length = length_dist(tld.rng); + int length = length_dist(tld.rng); tld.items_per_order.push_back(length); tld.lineitem_to_generate += length; } @@ -2680,7 +2680,7 @@ namespace arrow tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->mutable_data()); for(int64_t irow = 0; irow < tld.to_generate; irow++) { - s_suppkey[irow] = (tld.suppkey_start + irow + 1); + s_suppkey[irow] = static_cast(tld.suppkey_start + irow + 1); } } return Status::OK(); @@ -2799,8 +2799,8 @@ namespace arrow char *str = reinterpret_cast( tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->mutable_data()); const char *customer = "Customer"; - const size_t customer_length = std::strlen(customer); - const size_t review_length = std::strlen(review); + const int32_t customer_length = static_cast(std::strlen(customer)); + const int32_t review_length = static_cast(std::strlen(review)); auto it = std::lower_bound(indices.begin(), indices.end(), tld.suppkey_start); for(; it != indices.end() && *it < tld.suppkey_start + tld.to_generate; it++) @@ -2998,7 +2998,7 @@ namespace arrow { scale_factor_ = scale_factor; batch_size_ = batch_size; - rows_to_generate_ = scale_factor_ * 150000; + rows_to_generate_ = static_cast(scale_factor_ * 150000); rows_generated_.store(0); ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( columns, @@ -3138,7 +3138,7 @@ namespace arrow tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->mutable_data()); for(int64_t irow = 0; irow < tld.to_generate; irow++) { - c_custkey[irow] = (tld.custkey_start + irow + 1); + c_custkey[irow] = static_cast(tld.custkey_start + irow + 1); } } return Status::OK(); @@ -3161,7 +3161,7 @@ namespace arrow { int num_digits = GetNumDigits(c_custkey[irow]); int num_chars = std::max(num_digits, 9); - offsets[irow + 1] = offsets[irow] + num_chars + customer_length; + offsets[irow + 1] = static_cast(offsets[irow] + num_chars + customer_length); } ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[tld.to_generate])); char *str 
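The O_ORDERKEY loop above implements TPC-H's sparse key scheme: of every block of 32 consecutive key values only the first 8 are assigned, which is what the divide/modulo by 8 and multiply by 32 achieve. The mapping in isolation (standalone sketch, not generator code):

#include <cstdint>
#include <iostream>

int32_t SparseOrderKey(int32_t dense_index) {
  int32_t index_of_run = dense_index / 8;  // which block of 32 key values
  int32_t index_in_run = dense_index % 8;  // position within that block
  return index_of_run * 32 + index_in_run + 1;
}

int main() {
  for (int32_t i = 0; i < 10; i++) std::cout << SparseOrderKey(i) << " ";
  // Prints: 1 2 3 4 5 6 7 8 33 34
  std::cout << "\n";
  return 0;
}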
= reinterpret_cast(str_buff->mutable_data()); From 9b392a02ef78dd6718176fe24ef4ae9c5706d9c0 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 14 Mar 2022 14:48:15 -0700 Subject: [PATCH 16/34] Include cctype --- cpp/src/arrow/compute/exec/tpch_node_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index b2c29769370..686b909811a 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -33,6 +33,7 @@ #include "arrow/util/thread_pool.h" #include "arrow/array/validate.h" +#include #include #include @@ -73,7 +74,7 @@ namespace arrow int byte_width, bool verify_padding) { - size_t num_offset = std::strlen(prefix); + int num_offset = static_cast(std::strlen(prefix)); ASSERT_EQ(std::memcmp(row, prefix, num_offset), 0) << row << ", prefix=" << prefix << ", i=" << i; const char *num_str = row + num_offset; int64_t num = 0; From a23c7cb71149c0dffe7aff4bbce904827143404e Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 14 Mar 2022 16:16:01 -0700 Subject: [PATCH 17/34] Add ARROW_EXPORT --- cpp/src/arrow/compute/exec/tpch_node.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index 083f331c5e5..f286a66abe7 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -34,7 +34,7 @@ namespace arrow class PartAndPartSupplierGenerator; - class TpchGen + class ARROW_EXPORT TpchGen { public: /* From 9a9b0dda1a61ecdc992eb3152a65a350da68bb0b Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 14 Mar 2022 16:41:46 -0700 Subject: [PATCH 18/34] Seed the rngs, more static_cast --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 2 +- cpp/src/arrow/compute/exec/tpch_node.cc | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index e015bcf2abd..903346014ab 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -156,7 +156,7 @@ static void BM_Tpch_Q1(benchmark::State &st) { st.PauseTiming(); AsyncGenerator> sink_gen; - std::shared_ptr plan = Plan_Q1(&sink_gen, st.range(0)); + std::shared_ptr plan = Plan_Q1(&sink_gen, static_cast(st.range(0))); st.ResumeTiming(); auto fut = StartAndCollect(plan.get(), sink_gen); auto res = *fut.MoveResult(); diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index fcd27ad780b..25c2c0fc0d0 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -621,11 +621,13 @@ namespace arrow batch_size_ = batch_size; scale_factor_ = scale_factor; + arrow_vendored::pcg_extras::seed_seq_from seq; thread_local_data_.resize(num_threads); for(ThreadLocalData &tld : thread_local_data_) { constexpr int kMaxNumDistinctStrings = 5; tld.string_indices.resize(kMaxNumDistinctStrings * batch_size_); + tld.rng.seed(seq); } part_rows_to_generate_ = static_cast(scale_factor_ * 200000); } @@ -1361,10 +1363,12 @@ namespace arrow batch_size_ = batch_size; scale_factor_ = scale_factor; + arrow_vendored::pcg_extras::seed_seq_from seq; thread_local_data_.resize(num_threads); for(ThreadLocalData &tld : thread_local_data_) { tld.items_per_order.resize(batch_size_); + tld.rng.seed(seq); } orders_rows_to_generate_ = 
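[PATCH 18/34] seeds every thread-local generator from pcg's seed_seq_from over std::random_device, so each worker starts from independent, nondeterministic state instead of the engine's fixed default. The same idea using only the standard library (illustrative; the vendored pcg types are what the patch actually uses):

#include <random>
#include <vector>

int main() {
  std::random_device rd;
  std::vector<std::mt19937> per_thread_rngs;
  for (int i = 0; i < 4; i++) {
    // Fresh entropy per generator; each worker thread would own one engine.
    std::seed_seq seq{rd(), rd(), rd(), rd()};
    per_thread_rngs.emplace_back(seq);
  }
  std::uniform_int_distribution<int> dist(1, 100);
  int sample = dist(per_thread_rngs[0]);
  (void)sample;
  return 0;
}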
static_cast(scale_factor_ * 150000 * 10); } @@ -2561,7 +2565,11 @@ namespace arrow FinishedCallback finished_callback, ScheduleCallback schedule_callback) override { + arrow_vendored::pcg_extras::seed_seq_from seq; thread_local_data_.resize(num_threads); + for(ThreadLocalData &tld : thread_local_data_) + tld.rng.seed(seq); + output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); @@ -3014,7 +3022,11 @@ namespace arrow FinishedCallback finished_callback, ScheduleCallback schedule_callback) override { + arrow_vendored::pcg_extras::seed_seq_from seq; thread_local_data_.resize(num_threads); + for(ThreadLocalData &tld : thread_local_data_) + tld.rng.seed(seq); + output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); @@ -3453,6 +3465,7 @@ namespace arrow kTypes, kNameMap, column_indices_)); + rng_.seed(arrow_vendored::pcg_extras::seed_seq_from{}); return Status::OK(); } @@ -3563,6 +3576,7 @@ namespace arrow kTypes, kNameMap, column_indices_)); + rng_.seed(arrow_vendored::pcg_extras::seed_seq_from{}); return Status::OK(); } From 27254aa152cf8d9ebdd9bc70ca0956dd496120a9 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 14 Mar 2022 17:32:38 -0700 Subject: [PATCH 19/34] MORE STATIC CAST --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 903346014ab..43e37798d70 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -33,7 +33,7 @@ std::shared_ptr Plan_Q1(AsyncGenerator> *sin ExecContext *ctx = default_exec_context(); *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); std::shared_ptr plan = *ExecPlan::Make(ctx); - TpchGen gen = *TpchGen::Make(plan.get(), scale_factor); + TpchGen gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); ExecNode *lineitem = *gen.Lineitem( { From f3954aeadba48894aa4b009e108c4318d07d1ac1 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 14 Mar 2022 18:02:06 -0700 Subject: [PATCH 20/34] Gate finished_callback_, don't init finished_ --- cpp/src/arrow/compute/exec/tpch_node.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 25c2c0fc0d0..f7fa7aaeb15 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -2660,8 +2660,9 @@ namespace arrow output_callback_(std::move(eb)); if(is_last_batch) { - done_.store(true); - finished_callback_(batches_outputted_.load()); + bool expected = false; + if(done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); return Status::OK(); } return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); @@ -3121,9 +3122,7 @@ namespace arrow { bool expected = false; if(done_.compare_exchange_strong(expected, true)) - { finished_callback_(batches_outputted_.load()); - } return Status::OK(); } return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); @@ -3694,7 +3693,6 @@ namespace arrow Status StartProducing() override { - finished_ = Future<>::Make(); return generator_->StartProducing( thread_indexer_.Capacity(), 
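The compare_exchange_strong gate introduced in [PATCH 20/34] ensures the finished callback fires exactly once even when several worker threads reach the last-batch branch concurrently. The pattern on its own (sketch; the callback body here is a placeholder):

#include <atomic>
#include <cstdio>

std::atomic<bool> done{false};

void MaybeFinish() {
  bool expected = false;
  // Only the single thread that flips `done` from false to true wins the race
  // and runs the completion logic; every other caller sees `expected` become true.
  if (done.compare_exchange_strong(expected, true)) {
    std::printf("finished callback invoked exactly once\n");
  }
}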
[this](ExecBatch batch) { this->OutputBatchCallback(std::move(batch)); }, From c9befae2d0566cd970fcc9c56ee00146ca155ccf Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 14 Mar 2022 18:10:00 -0700 Subject: [PATCH 21/34] Make the code completely unreadable --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 233 +- cpp/src/arrow/compute/exec/tpch_node.cc | 7131 +++++++++--------- cpp/src/arrow/compute/exec/tpch_node.h | 87 +- cpp/src/arrow/compute/exec/tpch_node_test.cc | 1261 ++-- r/src/compute-exec.cpp | 59 +- 5 files changed, 4094 insertions(+), 4677 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 43e37798d70..db5161055af 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -17,153 +17,104 @@ #include "benchmark/benchmark.h" -#include "arrow/testing/future_util.h" +#include "arrow/compute/cast.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/tpch_node.h" +#include "arrow/testing/future_util.h" #include "arrow/util/make_unique.h" -#include "arrow/compute/cast.h" -namespace arrow -{ -namespace compute -{ - -std::shared_ptr Plan_Q1(AsyncGenerator> *sink_gen, int scale_factor) -{ - ExecContext *ctx = default_exec_context(); - *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(ctx); - TpchGen gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); - - ExecNode *lineitem = *gen.Lineitem( - { - "L_QUANTITY", - "L_EXTENDEDPRICE", - "L_TAX", - "L_DISCOUNT", - "L_SHIPDATE", - "L_RETURNFLAG", - "L_LINESTATUS" - }); - - std::shared_ptr sept_2_1998 = std::make_shared(10471); // September 2, 1998 is 10471 days after January 1, 1970 - Expression filter = less_equal(field_ref("L_SHIPDATE"), literal(std::move(sept_2_1998))); - FilterNodeOptions filter_opts(filter); - - Expression l_returnflag = field_ref("L_RETURNFLAG"); - Expression l_linestatus = field_ref("L_LINESTATUS"); - Expression quantity = field_ref("L_QUANTITY"); - Expression base_price = field_ref("L_EXTENDEDPRICE"); - - std::shared_ptr decimal_1 = std::make_shared(Decimal128{0, 100}, decimal(12, 2)); - Expression discount_multiplier = call("subtract", { literal(decimal_1), field_ref("L_DISCOUNT") }); - Expression tax_multiplier = call("add", { literal(decimal_1), field_ref("L_TAX") }); - Expression disc_price = call("multiply", { field_ref("L_EXTENDEDPRICE"), discount_multiplier }); - Expression charge = call("multiply", - { - call("cast", - { - call("multiply", { field_ref("L_EXTENDEDPRICE"), discount_multiplier }) - }, compute::CastOptions::Unsafe(decimal(12, 2))), - tax_multiplier - }); - Expression discount = field_ref("L_DISCOUNT"); - - std::vector projection_list = - { - l_returnflag, - l_linestatus, - quantity, - base_price, - disc_price, - charge, - quantity, - base_price, - discount - }; - std::vector project_names = - { - "l_returnflag", - "l_linestatus", - "sum_qty", - "sum_base_price", - "sum_disc_price", - "sum_charge", - "avg_qty", - "avg_price", - "avg_disc" - }; - ProjectNodeOptions project_opts(std::move(projection_list)); - - ScalarAggregateOptions sum_opts = ScalarAggregateOptions::Defaults(); - CountOptions count_opts(CountOptions::CountMode::ALL); - std::vector aggs = - { - { "hash_sum", &sum_opts }, - { "hash_sum", &sum_opts }, - { "hash_sum", &sum_opts }, - { "hash_sum", &sum_opts }, - { "hash_mean", &sum_opts }, - { "hash_mean", &sum_opts }, - { "hash_mean", &sum_opts }, - { 
"hash_count", &count_opts } - }; - - std::vector cols = - { - 2, 3, 4, 5, 6, 7, 8, 2 - }; - - std::vector names = - { - "sum_qty", - "sum_base_price", - "sum_disc_price", - "sum_charge", - "avg_qty", - "avg_price", - "avg_disc", - "count_order" - }; - - std::vector keys = { "L_RETURNFLAG", "L_LINESTATUS" }; - AggregateNodeOptions agg_opts(aggs, cols, names, keys); - - SortKey l_returnflag_key("L_RETURNFLAG"); - SortKey l_linestatus_key("L_LINESTATUS"); - SortOptions sort_opts({ l_returnflag_key, l_linestatus_key }); - OrderBySinkNodeOptions order_by_opts(sort_opts, sink_gen); - - Declaration filter_decl("filter", { Declaration::Input(lineitem) }, filter_opts); - Declaration project_decl("project", project_opts); - Declaration aggregate_decl("aggregate", agg_opts); - Declaration orderby_decl("order_by_sink", order_by_opts); - - Declaration q1 = Declaration::Sequence( - { - filter_decl, - project_decl, - aggregate_decl, - orderby_decl - }); - std::ignore = *q1.AddToPlan(plan.get()); - return plan; +namespace arrow { +namespace compute { + +std::shared_ptr Plan_Q1(AsyncGenerator>* sink_gen, + int scale_factor) { + ExecContext* ctx = default_exec_context(); + *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(ctx); + TpchGen gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); + + ExecNode* lineitem = + *gen.Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", "L_SHIPDATE", + "L_RETURNFLAG", "L_LINESTATUS"}); + + std::shared_ptr sept_2_1998 = std::make_shared( + 10471); // September 2, 1998 is 10471 days after January 1, 1970 + Expression filter = + less_equal(field_ref("L_SHIPDATE"), literal(std::move(sept_2_1998))); + FilterNodeOptions filter_opts(filter); + + Expression l_returnflag = field_ref("L_RETURNFLAG"); + Expression l_linestatus = field_ref("L_LINESTATUS"); + Expression quantity = field_ref("L_QUANTITY"); + Expression base_price = field_ref("L_EXTENDEDPRICE"); + + std::shared_ptr decimal_1 = + std::make_shared(Decimal128{0, 100}, decimal(12, 2)); + Expression discount_multiplier = + call("subtract", {literal(decimal_1), field_ref("L_DISCOUNT")}); + Expression tax_multiplier = call("add", {literal(decimal_1), field_ref("L_TAX")}); + Expression disc_price = + call("multiply", {field_ref("L_EXTENDEDPRICE"), discount_multiplier}); + Expression charge = + call("multiply", + {call("cast", + {call("multiply", {field_ref("L_EXTENDEDPRICE"), discount_multiplier})}, + compute::CastOptions::Unsafe(decimal(12, 2))), + tax_multiplier}); + Expression discount = field_ref("L_DISCOUNT"); + + std::vector projection_list = {l_returnflag, l_linestatus, quantity, + base_price, disc_price, charge, + quantity, base_price, discount}; + std::vector project_names = { + "l_returnflag", "l_linestatus", "sum_qty", "sum_base_price", "sum_disc_price", + "sum_charge", "avg_qty", "avg_price", "avg_disc"}; + ProjectNodeOptions project_opts(std::move(projection_list)); + + ScalarAggregateOptions sum_opts = ScalarAggregateOptions::Defaults(); + CountOptions count_opts(CountOptions::CountMode::ALL); + std::vector aggs = { + {"hash_sum", &sum_opts}, {"hash_sum", &sum_opts}, {"hash_sum", &sum_opts}, + {"hash_sum", &sum_opts}, {"hash_mean", &sum_opts}, {"hash_mean", &sum_opts}, + {"hash_mean", &sum_opts}, {"hash_count", &count_opts}}; + + std::vector cols = {2, 3, 4, 5, 6, 7, 8, 2}; + + std::vector names = {"sum_qty", "sum_base_price", "sum_disc_price", + "sum_charge", "avg_qty", "avg_price", + "avg_disc", "count_order"}; + + 
std::vector keys = {"L_RETURNFLAG", "L_LINESTATUS"}; + AggregateNodeOptions agg_opts(aggs, cols, names, keys); + + SortKey l_returnflag_key("L_RETURNFLAG"); + SortKey l_linestatus_key("L_LINESTATUS"); + SortOptions sort_opts({l_returnflag_key, l_linestatus_key}); + OrderBySinkNodeOptions order_by_opts(sort_opts, sink_gen); + + Declaration filter_decl("filter", {Declaration::Input(lineitem)}, filter_opts); + Declaration project_decl("project", project_opts); + Declaration aggregate_decl("aggregate", agg_opts); + Declaration orderby_decl("order_by_sink", order_by_opts); + + Declaration q1 = + Declaration::Sequence({filter_decl, project_decl, aggregate_decl, orderby_decl}); + std::ignore = *q1.AddToPlan(plan.get()); + return plan; } -static void BM_Tpch_Q1(benchmark::State &st) -{ - for(auto _ : st) - { - st.PauseTiming(); - AsyncGenerator> sink_gen; - std::shared_ptr plan = Plan_Q1(&sink_gen, static_cast(st.range(0))); - st.ResumeTiming(); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - } +static void BM_Tpch_Q1(benchmark::State& st) { + for (auto _ : st) { + st.PauseTiming(); + AsyncGenerator> sink_gen; + std::shared_ptr plan = Plan_Q1(&sink_gen, static_cast(st.range(0))); + st.ResumeTiming(); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + } } -BENCHMARK(BM_Tpch_Q1)->Args({1})->ArgNames({ "SF" }); +BENCHMARK(BM_Tpch_Q1)->Args({1})->ArgNames({"SF"}); -} -} +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index f7fa7aaeb15..1ada5d5398a 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -1,3848 +1,3431 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ #include "arrow/compute/exec/tpch_node.h" -#include "arrow/util/make_unique.h" #include "arrow/util/future.h" +#include "arrow/util/make_unique.h" #include "arrow/util/unreachable.h" #include #include #include -#include -#include #include #include #include +#include #include +#include -namespace arrow -{ - using internal::checked_cast; - - namespace compute +namespace arrow { +using internal::checked_cast; + +namespace compute { +const char* NameParts[] = { + "almond", "antique", "aquamarine", "azure", "beige", "bisque", + "black", "blanched", "blue", "blush", "brown", "burlywood", + "burnished", "chartreuse", "chiffon", "chocolate", "coral", "cornflower", + "cornsilk", "cream", "cyan", "dark", "deep", "dim", + "dodger", "drab", "firebrick", "floral", "forest", "frosted", + "gainsboro", "ghost", "goldenrod", "green", "grey", "honeydew", + "hot", "indian", "ivory", "khaki", "lace", "lavender", + "lawn", "lemon", "light", "lime", "linen", "magenta", + "maroon", "medium", "metallic", "midnight", "mint", "misty", + "moccasin", "navajo", "navy", "olive", "orange", "orchid", + "pale", "papaya", "peach", "peru", "pink", "plum", + "powder", "puff", "purple", "red", "rose", "rosy", + "royal", "saddle", "salmon", "sandy", "seashell", "sienna", + "sky", "slate", "smoke", "snow", "spring", "steel", + "tan", "thistle", "tomato", "turquoise", "violet", "wheat", + "white", "yellow", +}; +static constexpr size_t kNumNameParts = sizeof(NameParts) / sizeof(NameParts[0]); + +const char* Types_1[] = { + "STANDARD ", "SMALL ", "MEDIUM ", "LARGE ", "ECONOMY ", "PROMO ", +}; +static constexpr size_t kNumTypes_1 = sizeof(Types_1) / sizeof(Types_1[0]); + +const char* Types_2[] = { + "ANODIZED ", "BURNISHED ", "PLATED ", "POLISHED ", "BRUSHED ", +}; +static constexpr size_t kNumTypes_2 = sizeof(Types_2) / sizeof(Types_2[0]); + +const char* Types_3[] = { + "TIN", "NICKEL", "BRASS", "STEEL", "COPPER", +}; +static constexpr size_t kNumTypes_3 = sizeof(Types_3) / sizeof(Types_3[0]); + +const char* Containers_1[] = { + "SM ", "LG ", "MD ", "JUMBO ", "WRAP ", +}; +static constexpr size_t kNumContainers_1 = sizeof(Containers_1) / sizeof(Containers_1[0]); + +const char* Containers_2[] = { + "CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM", +}; +static constexpr size_t kNumContainers_2 = sizeof(Containers_2) / sizeof(Containers_2[0]); + +const char* Segments[] = { + "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD", +}; +static constexpr size_t kNumSegments = sizeof(Segments) / sizeof(Segments[0]); + +const char* Priorities[] = { + "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW", +}; +static constexpr size_t kNumPriorities = sizeof(Priorities) / sizeof(Priorities[0]); + +const char* Instructions[] = { + "DELIVER IN PERSON", + "COLLECT COD", + "NONE", + "TAKE BACK RETURN", +}; +static constexpr size_t kNumInstructions = sizeof(Instructions) / sizeof(Instructions[0]); + +const char* Modes[] = { + "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB", +}; +static constexpr size_t kNumModes = sizeof(Modes) / sizeof(Modes[0]); + +const char* Nouns[] = { + "foxes ", "ideas ", "theodolites ", "pinto beans ", "instructions ", + "dependencies ", "excuses ", "platelets ", "asymptotes ", "courts ", + "dolphins ", "multipliers ", "sautemes ", "warthogs ", "frets ", + "dinos ", "attainments ", "somas ", "Tiresias '", "patterns ", + "forges ", "braids ", "hockey players ", "frays ", "warhorses ", + "dugouts ", "notomis ", "epitaphs ", "pearls ", "tithes ", + "waters ", "orbits ", "gifts ", 
"sheaves ", "depths ", + "sentiments ", "decoys ", "realms ", "pains ", "grouches ", + "escapades ", +}; +static constexpr size_t kNumNouns = sizeof(Nouns) / sizeof(Nouns[0]); + +const char* Verbs[] = { + "sleep ", "wake ", "are ", "cajole ", "haggle ", "nag ", "use ", + "boost ", "affix ", "detect ", "integrate ", "maintain ", "nod ", "was ", + "lose ", "sublate ", "solve ", "thrash ", "promise ", "engage ", "hinder ", + "print ", "x-ray ", "breach ", "eat ", "grow ", "impress ", "mold ", + "poach ", "serve ", "run ", "dazzle ", "snooze ", "doze ", "unwind ", + "kindle ", "play ", "hang ", "believe ", "doubt ", +}; +static constexpr size_t kNumVerbs = sizeof(Verbs) / sizeof(Verbs[0]); + +const char* Adjectives[] = { + "furious ", "sly ", "careful ", "blithe ", "quick ", "fluffy ", "slow ", + "quiet ", "ruthless ", "thin ", "close ", "dogged ", "daring ", "brave ", + "stealthy ", "permanent ", "enticing ", "idle ", "busy ", "regular ", "final ", + "ironic ", "even ", "bold ", "silent ", +}; +static constexpr size_t kNumAdjectives = sizeof(Adjectives) / sizeof(Adjectives[0]); + +const char* Adverbs[] = { + "sometimes ", "always ", "never ", "furiously ", "slyly ", "carefully ", + "blithely ", "quickly ", "fluffily ", "slowly ", "quietly ", "ruthlessly ", + "thinly ", "closely ", "doggedly ", "daringly ", "bravely ", "stealthily ", + "permanently ", "enticingly ", "idly ", "busily ", "regularly ", "finally ", + "ironically ", "evenly ", "boldly ", "silently ", +}; +static constexpr size_t kNumAdverbs = sizeof(Adverbs) / sizeof(Adverbs[0]); + +const char* Prepositions[] = { + "about ", "above ", "according to ", "across ", "after ", "against ", + "along ", "alongside of ", "among ", "around ", "at ", "atop ", + "before ", "behind ", "beneath ", "beside ", "besides ", "between ", + "beyond ", "beyond ", "by ", "despite ", "during ", "except ", + "for ", "from ", "in place of ", "inside ", "instead of ", "into ", + "near ", "of ", "on ", "outside ", "over ", "past ", + "since ", "through ", "throughout ", "to ", "toward ", "under ", + "until ", "up ", "upon ", "without ", "with ", "within ", +}; +static constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); + +const char* Auxiliaries[] = { + "do ", + "may ", + "might ", + "shall ", + "will ", + "would ", + "can ", + "could ", + "should ", + "ought to ", + "must ", + "will have to ", + "shall have to ", + "could have to ", + "should have to ", + "must have to ", + "need to ", + "try to ", +}; +static constexpr size_t kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); + +const char* Terminators[] = { + ".", ";", ":", "?", "!", "--", +}; +static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); + +// The spec says to generate a 300 MB string according to a grammar. This is a +// concurrent implementation of the generator. Each thread generates the text in +// (up to) 8KB chunks of text. The generator maintains a cursor into the +// 300 MB buffer. After generating the chunk, the cursor is incremented +// to reserve space, and the chunk is memcpy-d in. +// This text is used to generate the COMMENT columns. To generate a comment, the spec +// says to pick a random length and a random offset into the 300 MB buffer (it does +// not specify it should be word/sentence aligned), and that slice of text becomes +// the comment. 
+class TpchPseudotext { + public: + Status EnsureInitialized(random::pcg32_fast& rng); + Result GenerateComments(size_t num_comments, size_t min_length, + size_t max_length, random::pcg32_fast& rng); + + private: + bool GenerateWord(int64_t& offset, random::pcg32_fast& rng, char* arr, + const char** words, size_t num_choices); + bool GenerateNoun(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateVerb(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateAdjective(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateAdverb(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GeneratePreposition(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateAuxiliary(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateTerminator(int64_t& offset, random::pcg32_fast& rng, char* arr); + + bool GenerateNounPhrase(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateVerbPhrase(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GeneratePrepositionalPhrase(int64_t& offset, random::pcg32_fast& rng, char* arr); + + bool GenerateSentence(int64_t& offset, random::pcg32_fast& rng, char* arr); + + std::atomic done_ = {false}; + int64_t generated_offset_{0}; + std::mutex text_guard_; + std::unique_ptr text_; + static constexpr int64_t kChunkSize = 8192; + static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB +}; + +static TpchPseudotext g_text; + +Status TpchPseudotext::EnsureInitialized(random::pcg32_fast& rng) { + if (done_.load()) return Status::OK(); + + { + std::lock_guard lock(text_guard_); + if (!text_) { + ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); + } + } + char* out = reinterpret_cast(text_->mutable_data()); + char temp_buff[kChunkSize]; + + while (!done_.load()) { + int64_t known_valid_offset = 0; + int64_t try_offset = 0; + while (GenerateSentence(try_offset, rng, temp_buff)) known_valid_offset = try_offset; + + bool last_one; + int64_t offset; + int64_t memcpy_size; { - const char *NameParts[] = - { - "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", "blanched", "blue", - "blush", "brown", "burlywood", "burnished", "chartreuse", "chiffon", "chocolate", "coral", - "cornflower", "cornsilk", "cream", "cyan", "dark", "deep", "dim", "dodger", "drab", "firebrick", - "floral", "forest", "frosted", "gainsboro", "ghost", "goldenrod", "green", "grey", "honeydew", - "hot", "indian", "ivory", "khaki", "lace", "lavender", "lawn", "lemon", "light", "lime", "linen", - "magenta", "maroon", "medium", "metallic", "midnight", "mint", "misty", "moccasin", "navajo", - "navy", "olive", "orange", "orchid", "pale", "papaya", "peach", "peru", "pink", "plum", "powder", - "puff", "purple", "red", "rose", "rosy", "royal", "saddle", "salmon", "sandy", "seashell", "sienna", - "sky", "slate", "smoke", "snow", "spring", "steel", "tan", "thistle", "tomato", "turquoise", "violet", - "wheat", "white", "yellow", - }; - static constexpr size_t kNumNameParts = sizeof(NameParts) / sizeof(NameParts[0]); - - const char *Types_1[] = - { - "STANDARD ", "SMALL ", "MEDIUM ", "LARGE ", "ECONOMY ", "PROMO ", - }; - static constexpr size_t kNumTypes_1 = sizeof(Types_1) / sizeof(Types_1[0]); - - const char *Types_2[] = - { - "ANODIZED ", "BURNISHED ", "PLATED ", "POLISHED ", "BRUSHED ", - }; - static constexpr size_t kNumTypes_2 = sizeof(Types_2) / sizeof(Types_2[0]); - - const char *Types_3[] = - { - "TIN", "NICKEL", "BRASS", "STEEL", "COPPER", - }; - static constexpr size_t 
kNumTypes_3 = sizeof(Types_3) / sizeof(Types_3[0]); - - const char *Containers_1[] = - { - "SM ", "LG ", "MD ", "JUMBO ", "WRAP ", - }; - static constexpr size_t kNumContainers_1 = sizeof(Containers_1) / sizeof(Containers_1[0]); - - const char *Containers_2[] = - { - "CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM", - }; - static constexpr size_t kNumContainers_2 = sizeof(Containers_2) / sizeof(Containers_2[0]); - - const char *Segments[] = - { - "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD", - }; - static constexpr size_t kNumSegments = sizeof(Segments) / sizeof(Segments[0]); - - const char *Priorities[] = - { - "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW", - }; - static constexpr size_t kNumPriorities = sizeof(Priorities) / sizeof(Priorities[0]); - - const char *Instructions[] = - { - "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN", - }; - static constexpr size_t kNumInstructions = sizeof(Instructions) / sizeof(Instructions[0]); - - const char *Modes[] = - { - "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB", - }; - static constexpr size_t kNumModes = sizeof(Modes) / sizeof(Modes[0]); - - const char *Nouns[] = - { - "foxes ", "ideas ", "theodolites ", "pinto beans ", "instructions ", "dependencies ", "excuses ", - "platelets ", "asymptotes ", "courts ", "dolphins ", "multipliers ", "sautemes ", "warthogs ", "frets ", - "dinos ", "attainments ", "somas ", "Tiresias '", "patterns ", "forges ", "braids ", "hockey players ", "frays ", - "warhorses ", "dugouts ", "notomis ", "epitaphs ", "pearls ", "tithes ", "waters ", "orbits ", "gifts ", "sheaves ", - "depths ", "sentiments ", "decoys ", "realms ", "pains ", "grouches ", "escapades ", - }; - static constexpr size_t kNumNouns = sizeof(Nouns) / sizeof(Nouns[0]); - - const char *Verbs[] = - { - "sleep ", "wake ", "are ", "cajole ", "haggle ", "nag ", "use ", "boost ", "affix ", "detect ", "integrate ", - "maintain ", "nod ", "was ", "lose ", "sublate ", "solve ", "thrash ", "promise ", "engage ", "hinder ", - "print ", "x-ray ", "breach ", "eat ", "grow ", "impress ", "mold ", "poach ", "serve ", "run ", "dazzle ", - "snooze ", "doze ", "unwind ", "kindle ", "play ", "hang ", "believe ", "doubt ", - }; - static constexpr size_t kNumVerbs = sizeof(Verbs) / sizeof(Verbs[0]); - - const char *Adjectives[] = - { - "furious ", "sly ", "careful ", "blithe ", "quick ", "fluffy ", "slow ", "quiet ", "ruthless ", "thin ", - "close ", "dogged ", "daring ", "brave ", "stealthy ", "permanent ", "enticing ", "idle ", "busy ", - "regular ", "final ", "ironic ", "even ", "bold ", "silent ", - }; - static constexpr size_t kNumAdjectives = sizeof(Adjectives) / sizeof(Adjectives[0]); - - const char *Adverbs[] = - { - "sometimes ", "always ", "never ", "furiously ", "slyly ", "carefully ", "blithely ", "quickly ", "fluffily ", - "slowly ", "quietly ", "ruthlessly ", "thinly ", "closely ", "doggedly ", "daringly ", "bravely ", "stealthily ", - "permanently ", "enticingly ", "idly ", "busily ", "regularly ", "finally ", "ironically ", "evenly ", "boldly ", - "silently ", - }; - static constexpr size_t kNumAdverbs = sizeof(Adverbs) / sizeof(Adverbs[0]); - - const char *Prepositions[] = - { - "about ", "above ", "according to ", "across ", "after ", "against ", "along ", "alongside of ", "among ", - "around ", "at ", "atop ", "before ", "behind ", "beneath ", "beside ", "besides ", "between ", "beyond ", - "beyond ", "by ", "despite ", "during ", "except ", "for ", "from ", "in place of ", "inside ", 
"instead of ", - "into ", "near ", "of ", "on ", "outside ", "over ", "past ", "since ", "through ", "throughout ", "to ", - "toward ", "under ", "until ", "up ", "upon ", "without ", "with ", "within ", - }; - static constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); - - const char *Auxiliaries[] = - { - "do ", "may ", "might ", "shall ", "will ", "would ", "can ", "could ", "should ", "ought to ", "must ", - "will have to ", "shall have to ", "could have to ", "should have to ", "must have to ", "need to ", "try to ", - }; - static constexpr size_t kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); - - const char *Terminators[] = - { - ".", ";", ":", "?", "!", "--", - }; - static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); - - - // The spec says to generate a 300 MB string according to a grammar. This is a - // concurrent implementation of the generator. Each thread generates the text in - // (up to) 8KB chunks of text. The generator maintains a cursor into the - // 300 MB buffer. After generating the chunk, the cursor is incremented - // to reserve space, and the chunk is memcpy-d in. - // This text is used to generate the COMMENT columns. To generate a comment, the spec - // says to pick a random length and a random offset into the 300 MB buffer (it does - // not specify it should be word/sentence aligned), and that slice of text becomes - // the comment. - class TpchPseudotext - { - public: - Status EnsureInitialized(random::pcg32_fast &rng); - Result GenerateComments( - size_t num_comments, - size_t min_length, - size_t max_length, - random::pcg32_fast &rng); - - private: - bool GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices); - bool GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr); - - bool GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); - bool GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr); - - bool GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr); - - std::atomic done_ = { false }; - int64_t generated_offset_{0}; - std::mutex text_guard_; - std::unique_ptr text_; - static constexpr int64_t kChunkSize = 8192; - static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB - }; - - static TpchPseudotext g_text; - - Status TpchPseudotext::EnsureInitialized(random::pcg32_fast &rng) - { - if(done_.load()) - return Status::OK(); - - { - std::lock_guard lock(text_guard_); - if(!text_) - { - ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); - } - } - char *out = reinterpret_cast(text_->mutable_data()); - char temp_buff[kChunkSize]; - - while(!done_.load()) - { - int64_t known_valid_offset = 0; - int64_t try_offset = 0; - while(GenerateSentence(try_offset, rng, temp_buff)) - known_valid_offset = try_offset; - - bool last_one; - int64_t offset; - int64_t memcpy_size; - { - std::lock_guard 
lock(text_guard_); - if(done_.load()) - return Status::OK(); - int64_t bytes_remaining = kTextBytes - generated_offset_; - memcpy_size = std::min(known_valid_offset, bytes_remaining); - offset = generated_offset_; - generated_offset_ += memcpy_size; - last_one = generated_offset_ == kTextBytes; - } - std::memcpy(out + offset, temp_buff, memcpy_size); - if(last_one) - done_.store(true); - } - return Status::OK(); - } + std::lock_guard lock(text_guard_); + if (done_.load()) return Status::OK(); + int64_t bytes_remaining = kTextBytes - generated_offset_; + memcpy_size = std::min(known_valid_offset, bytes_remaining); + offset = generated_offset_; + generated_offset_ += memcpy_size; + last_one = generated_offset_ == kTextBytes; + } + std::memcpy(out + offset, temp_buff, memcpy_size); + if (last_one) done_.store(true); + } + return Status::OK(); +} - Result TpchPseudotext::GenerateComments( - size_t num_comments, - size_t min_length, - size_t max_length, - random::pcg32_fast &rng) - { - RETURN_NOT_OK(EnsureInitialized(rng)); - std::uniform_int_distribution length_dist(min_length, max_length); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); - int32_t *offsets = reinterpret_cast(offset_buffer->mutable_data()); - offsets[0] = 0; - for(size_t i = 1; i <= num_comments; i++) - offsets[i] = offsets[i - 1] + static_cast(length_dist(rng)); - - ARROW_ASSIGN_OR_RAISE(std::unique_ptr comment_buffer, AllocateBuffer(offsets[num_comments])); - char *comments = reinterpret_cast(comment_buffer->mutable_data()); - for(size_t i = 0; i < num_comments; i++) - { - size_t length = offsets[i + 1] - offsets[i]; - std::uniform_int_distribution offset_dist(0, kTextBytes - length); - size_t offset_in_text = offset_dist(rng); - std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); - } - ArrayData ad(utf8(), num_comments, { nullptr, std::move(offset_buffer), std::move(comment_buffer) }); - return std::move(ad); - } +Result TpchPseudotext::GenerateComments(size_t num_comments, size_t min_length, + size_t max_length, + random::pcg32_fast& rng) { + RETURN_NOT_OK(EnsureInitialized(rng)); + std::uniform_int_distribution length_dist(min_length, max_length); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, + AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); + int32_t* offsets = reinterpret_cast(offset_buffer->mutable_data()); + offsets[0] = 0; + for (size_t i = 1; i <= num_comments; i++) + offsets[i] = offsets[i - 1] + static_cast(length_dist(rng)); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr comment_buffer, + AllocateBuffer(offsets[num_comments])); + char* comments = reinterpret_cast(comment_buffer->mutable_data()); + for (size_t i = 0; i < num_comments; i++) { + size_t length = offsets[i + 1] - offsets[i]; + std::uniform_int_distribution offset_dist(0, kTextBytes - length); + size_t offset_in_text = offset_dist(rng); + std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); + } + ArrayData ad(utf8(), num_comments, + {nullptr, std::move(offset_buffer), std::move(comment_buffer)}); + return std::move(ad); +} - bool TpchPseudotext::GenerateWord(int64_t &offset, random::pcg32_fast &rng, char *arr, const char **words, size_t num_choices) - { - std::uniform_int_distribution dist(0, num_choices - 1); - const char *word = words[dist(rng)]; - size_t length = std::strlen(word); - if(offset + length > kChunkSize) - return false; - std::memcpy(arr + offset, word, length); - offset += length; - return true; - } +bool 
TpchPseudotext::GenerateWord(int64_t& offset, random::pcg32_fast& rng, char* arr, + const char** words, size_t num_choices) { + std::uniform_int_distribution dist(0, num_choices - 1); + const char* word = words[dist(rng)]; + size_t length = std::strlen(word); + if (offset + length > kChunkSize) return false; + std::memcpy(arr + offset, word, length); + offset += length; + return true; +} - bool TpchPseudotext::GenerateNoun(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - return GenerateWord(offset, rng, arr, Nouns, kNumNouns); - } +bool TpchPseudotext::GenerateNoun(int64_t& offset, random::pcg32_fast& rng, char* arr) { + return GenerateWord(offset, rng, arr, Nouns, kNumNouns); +} - bool TpchPseudotext::GenerateVerb(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - return GenerateWord(offset, rng, arr, Verbs, kNumVerbs); - } +bool TpchPseudotext::GenerateVerb(int64_t& offset, random::pcg32_fast& rng, char* arr) { + return GenerateWord(offset, rng, arr, Verbs, kNumVerbs); +} - bool TpchPseudotext::GenerateAdjective(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - return GenerateWord(offset, rng, arr, Adjectives, kNumAdjectives); - } +bool TpchPseudotext::GenerateAdjective(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + return GenerateWord(offset, rng, arr, Adjectives, kNumAdjectives); +} - bool TpchPseudotext::GenerateAdverb(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - return GenerateWord(offset, rng, arr, Adverbs, kNumAdverbs); - } +bool TpchPseudotext::GenerateAdverb(int64_t& offset, random::pcg32_fast& rng, char* arr) { + return GenerateWord(offset, rng, arr, Adverbs, kNumAdverbs); +} - bool TpchPseudotext::GeneratePreposition(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - return GenerateWord(offset, rng, arr, Prepositions, kNumPrepositions); - } +bool TpchPseudotext::GeneratePreposition(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + return GenerateWord(offset, rng, arr, Prepositions, kNumPrepositions); +} - bool TpchPseudotext::GenerateAuxiliary(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - return GenerateWord(offset, rng, arr, Auxiliaries, kNumAuxiliaries); - } +bool TpchPseudotext::GenerateAuxiliary(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + return GenerateWord(offset, rng, arr, Auxiliaries, kNumAuxiliaries); +} - bool TpchPseudotext::GenerateTerminator(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - bool result = GenerateWord(offset, rng, arr, Terminators, kNumTerminators); - // Swap the space with the terminator - if(result) - std::swap(*(arr + offset - 2), *(arr + offset - 1)); - return result; - } +bool TpchPseudotext::GenerateTerminator(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + bool result = GenerateWord(offset, rng, arr, Terminators, kNumTerminators); + // Swap the space with the terminator + if (result) std::swap(*(arr + offset - 2), *(arr + offset - 1)); + return result; +} - bool TpchPseudotext::GenerateNounPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - std::uniform_int_distribution dist(0, 3); - const char *comma_space = ", "; - bool success = true; - switch(dist(rng)) - { - case 0: - success &= GenerateNoun(offset, rng, arr); - break; - case 1: - success &= GenerateAdjective(offset, rng, arr); - success &= GenerateNoun(offset, rng, arr); - break; - case 2: - success &= GenerateAdjective(offset, rng, arr); - success &= GenerateWord(--offset, rng, arr, &comma_space, 1); - success &= GenerateAdjective(offset, rng, arr); - 
success &= GenerateNoun(offset, rng, arr); - break; - case 3: - success &= GenerateAdverb(offset, rng, arr); - success &= GenerateAdjective(offset, rng, arr); - success &= GenerateNoun(offset, rng, arr); - break; - default: - Unreachable("Random number should be between 0 and 3 inclusive"); - break; - } - return success; - } +bool TpchPseudotext::GenerateNounPhrase(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + std::uniform_int_distribution dist(0, 3); + const char* comma_space = ", "; + bool success = true; + switch (dist(rng)) { + case 0: + success &= GenerateNoun(offset, rng, arr); + break; + case 1: + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); + break; + case 2: + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateWord(--offset, rng, arr, &comma_space, 1); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); + break; + case 3: + success &= GenerateAdverb(offset, rng, arr); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); + break; + default: + Unreachable("Random number should be between 0 and 3 inclusive"); + break; + } + return success; +} - bool TpchPseudotext::GenerateVerbPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - std::uniform_int_distribution dist(0, 3); - bool success = true; - switch(dist(rng)) - { - case 0: - success &= GenerateVerb(offset, rng, arr); - break; - case 1: - success &= GenerateAuxiliary(offset, rng, arr); - success &= GenerateVerb(offset, rng, arr); - break; - case 2: - success &= GenerateVerb(offset, rng, arr); - success &= GenerateAdverb(offset, rng, arr); - break; - case 3: - success &= GenerateAuxiliary(offset, rng, arr); - success &= GenerateVerb(offset, rng, arr); - success &= GenerateAdverb(offset, rng, arr); - break; - default: - Unreachable("Random number should be between 0 and 3 inclusive"); - break; - } - return success; - } +bool TpchPseudotext::GenerateVerbPhrase(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + std::uniform_int_distribution dist(0, 3); + bool success = true; + switch (dist(rng)) { + case 0: + success &= GenerateVerb(offset, rng, arr); + break; + case 1: + success &= GenerateAuxiliary(offset, rng, arr); + success &= GenerateVerb(offset, rng, arr); + break; + case 2: + success &= GenerateVerb(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); + break; + case 3: + success &= GenerateAuxiliary(offset, rng, arr); + success &= GenerateVerb(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); + break; + default: + Unreachable("Random number should be between 0 and 3 inclusive"); + break; + } + return success; +} - bool TpchPseudotext::GeneratePrepositionalPhrase(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - const char *the_space = "the "; - bool success = true; - success &= GeneratePreposition(offset, rng, arr); - success &= GenerateWord(offset, rng, arr, &the_space, 1); - success &= GenerateNounPhrase(offset, rng, arr); - return success; - } +bool TpchPseudotext::GeneratePrepositionalPhrase(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + const char* the_space = "the "; + bool success = true; + success &= GeneratePreposition(offset, rng, arr); + success &= GenerateWord(offset, rng, arr, &the_space, 1); + success &= GenerateNounPhrase(offset, rng, arr); + return success; +} - bool TpchPseudotext::GenerateSentence(int64_t &offset, random::pcg32_fast &rng, char *arr) - { - 
std::uniform_int_distribution dist(0, 4); - bool success = true; - switch(dist(rng)) - { - case 0: - success &= GenerateNounPhrase(offset, rng, arr); - success &= GenerateVerbPhrase(offset, rng, arr); - success &= GenerateTerminator(offset, rng, arr); - break; - case 1: - success &= GenerateNounPhrase(offset, rng, arr); - success &= GenerateVerbPhrase(offset, rng, arr); - success &= GeneratePrepositionalPhrase(offset, rng, arr); - success &= GenerateTerminator(offset, rng, arr); - break; - case 2: - success &= GenerateNounPhrase(offset, rng, arr); - success &= GenerateVerbPhrase(offset, rng, arr); - success &= GenerateNounPhrase(offset, rng, arr); - success &= GenerateTerminator(offset, rng, arr); - break; - case 3: - success &= GenerateNounPhrase(offset, rng, arr); - success &= GeneratePrepositionalPhrase(offset, rng, arr); - success &= GenerateVerbPhrase(offset, rng, arr); - success &= GenerateNounPhrase(offset, rng, arr); - success &= GenerateTerminator(offset, rng, arr); - break; - case 4: - success &= GenerateNounPhrase(offset, rng, arr); - success &= GeneratePrepositionalPhrase(offset, rng, arr); - success &= GenerateVerbPhrase(offset, rng, arr); - success &= GeneratePrepositionalPhrase(offset, rng, arr); - success &= GenerateTerminator(offset, rng, arr); - break; - default: - Unreachable("Random number should be between 0 and 5 inclusive"); - break; - } - return success; - } +bool TpchPseudotext::GenerateSentence(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + std::uniform_int_distribution dist(0, 4); + bool success = true; + switch (dist(rng)) { + case 0: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + case 1: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + case 2: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + case 3: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + case 4: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + default: + Unreachable("Random number should be between 0 and 5 inclusive"); + break; + } + return success; +} - class TpchTableGenerator - { - public: - using OutputBatchCallback = std::function; - using FinishedCallback = std::function; - using GenerateFn = std::function; - using ScheduleCallback = std::function; - using AbortCallback = std::function; - - virtual Status Init( - std::vector columns, - float scale_factor, - int64_t batch_size) = 0; - - virtual Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback schedule_callback) = 0; - - bool Abort() - { - bool expected = false; - return done_.compare_exchange_strong(expected, true); - } - - virtual 
std::shared_ptr schema() const = 0; - - virtual ~TpchTableGenerator() = default; - - protected: - std::atomic done_ = { false }; - std::atomic batches_outputted_ = { 0 }; - }; - - int GetNumDigits(int64_t x) - { - // This if statement chain is for MAXIMUM SPEED - // Source: https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c - ARROW_DCHECK(x >= 0); - if(x < 10ll) return 1; - if(x < 100ll) return 2; - if(x < 1000ll) return 3; - if(x < 10000ll) return 4; - if(x < 100000ll) return 5; - if(x < 1000000ll) return 6; - if(x < 10000000ll) return 7; - if(x < 100000000ll) return 8; - if(x < 1000000000ll) return 9; - if(x < 10000000000ll) return 10; - if(x < 100000000000ll) return 11; - if(x < 1000000000000ll) return 12; - if(x < 10000000000000ll) return 13; - if(x < 100000000000000ll) return 14; - if(x < 1000000000000000ll) return 15; - if(x < 10000000000000000ll) return 16; - if(x < 100000000000000000ll) return 17; - if(x < 1000000000000000000ll) return 18; - return -1; - } +class TpchTableGenerator { + public: + using OutputBatchCallback = std::function; + using FinishedCallback = std::function; + using GenerateFn = std::function; + using ScheduleCallback = std::function; + using AbortCallback = std::function; + + virtual Status Init(std::vector columns, float scale_factor, + int64_t batch_size) = 0; + + virtual Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) = 0; + + bool Abort() { + bool expected = false; + return done_.compare_exchange_strong(expected, true); + } + + virtual std::shared_ptr schema() const = 0; + + virtual ~TpchTableGenerator() = default; + + protected: + std::atomic done_ = {false}; + std::atomic batches_outputted_ = {0}; +}; + +int GetNumDigits(int64_t x) { + // This if statement chain is for MAXIMUM SPEED + // Source: + // https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c + ARROW_DCHECK(x >= 0); + if (x < 10ll) return 1; + if (x < 100ll) return 2; + if (x < 1000ll) return 3; + if (x < 10000ll) return 4; + if (x < 100000ll) return 5; + if (x < 1000000ll) return 6; + if (x < 10000000ll) return 7; + if (x < 100000000ll) return 8; + if (x < 1000000000ll) return 9; + if (x < 10000000000ll) return 10; + if (x < 100000000000ll) return 11; + if (x < 1000000000000ll) return 12; + if (x < 10000000000000ll) return 13; + if (x < 100000000000000ll) return 14; + if (x < 1000000000000000ll) return 15; + if (x < 10000000000000000ll) return 16; + if (x < 100000000000000000ll) return 17; + if (x < 1000000000000000000ll) return 18; + return -1; +} - void AppendNumberPaddedToNineDigits(char *out, int64_t x) - { - // We do all of this to avoid calling snprintf, which needs to handle locale, - // which can be slow, especially on Mac and Windows. - int num_digits = GetNumDigits(x); - int num_padding_zeros = std::max(9 - num_digits, 0); - std::memset(out, '0', static_cast(num_padding_zeros)); - while(x > 0) - { - *(out + num_padding_zeros + num_digits - 1) = ('0' + x % 10); - num_digits -= 1; - x /= 10; - } - } +void AppendNumberPaddedToNineDigits(char* out, int64_t x) { + // We do all of this to avoid calling snprintf, which needs to handle locale, + // which can be slow, especially on Mac and Windows. 
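// For example, AppendNumberPaddedToNineDigits(out, 12345): GetNumDigits(12345) is 5,
// so the first four bytes are memset to '0' and the loop then writes the digits of
// 12345 backwards from out[8] down to out[4], leaving the 9-character field
// "000012345".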
+ int num_digits = GetNumDigits(x); + int num_padding_zeros = std::max(9 - num_digits, 0); + std::memset(out, '0', static_cast(num_padding_zeros)); + while (x > 0) { + *(out + num_padding_zeros + num_digits - 1) = ('0' + x % 10); + num_digits -= 1; + x /= 10; + } +} - Result> SetOutputColumns( - const std::vector &columns, - const std::vector> &types, - const std::unordered_map &name_map, - std::vector &gen_list) - { - gen_list.clear(); - std::vector> fields; - if(columns.empty()) - { - fields.resize(name_map.size()); - gen_list.resize(name_map.size()); - for(auto pair : name_map) - { - int col_idx = pair.second; - fields[col_idx] = field(pair.first, types[col_idx]); - gen_list[col_idx] = col_idx; - } - return schema(std::move(fields)); - } - else - { - for(const std::string &col : columns) - { - auto entry = name_map.find(col); - if(entry == name_map.end()) - return Status::Invalid("Not a valid column name"); - int col_idx = static_cast(entry->second); - fields.push_back(field(col, types[col_idx])); - gen_list.push_back(col_idx); - } - return schema(std::move(fields)); - } - } +Result> SetOutputColumns( + const std::vector& columns, + const std::vector>& types, + const std::unordered_map& name_map, std::vector& gen_list) { + gen_list.clear(); + std::vector> fields; + if (columns.empty()) { + fields.resize(name_map.size()); + gen_list.resize(name_map.size()); + for (auto pair : name_map) { + int col_idx = pair.second; + fields[col_idx] = field(pair.first, types[col_idx]); + gen_list[col_idx] = col_idx; + } + return schema(std::move(fields)); + } else { + for (const std::string& col : columns) { + auto entry = name_map.find(col); + if (entry == name_map.end()) return Status::Invalid("Not a valid column name"); + int col_idx = static_cast(entry->second); + fields.push_back(field(col, types[col_idx])); + gen_list.push_back(col_idx); + } + return schema(std::move(fields)); + } +} - Result RandomVString( - random::pcg32_fast &rng, - int64_t num_rows, - int32_t min_length, - int32_t max_length) - { - std::uniform_int_distribution length_dist(min_length, max_length); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((num_rows + 1) * sizeof(int32_t))); - int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); - offsets[0] = 0; - for(int64_t i = 1; i <= num_rows; i++) - offsets[i] = offsets[i - 1] + length_dist(rng); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[num_rows])); - char *str = reinterpret_cast(str_buff->mutable_data()); - - // Spec says to pick random alphanumeric characters from a set of at least - // 64 symbols. Now, let's think critically here: 26 letters in the alphabet, - // so 52 total for upper and lower case, and 10 possible digits gives 62 - // characters... - // dbgen solves this by including a space and a comma as well, so we'll - // copy that. 
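// Behavior of SetOutputColumns above, as used by the per-table Set*OutputColumns
// helpers below: an empty `columns` list emits every column of the table in enum
// order, while an explicit list emits only the named columns in the requested order.
// For instance, {"P_PARTKEY", "P_NAME"} resolved against kPartNameMap/kPartTypes
// yields a two-field schema (int32, utf8) and sets gen_list to
// {PART::P_PARTKEY, PART::P_NAME}, the indices of the per-column generators to run.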
- const char alpha_numerics[65] = - "0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,"; - std::uniform_int_distribution char_dist(0, 63); - for(int32_t i = 0; i < offsets[num_rows]; i++) - str[i] = alpha_numerics[char_dist(rng)]; - - ArrayData ad(utf8(), num_rows, { nullptr, std::move(offset_buff), std::move(str_buff) }); - return std::move(ad); - } +Result RandomVString(random::pcg32_fast& rng, int64_t num_rows, int32_t min_length, + int32_t max_length) { + std::uniform_int_distribution length_dist(min_length, max_length); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, + AllocateBuffer((num_rows + 1) * sizeof(int32_t))); + int32_t* offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for (int64_t i = 1; i <= num_rows; i++) offsets[i] = offsets[i - 1] + length_dist(rng); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, + AllocateBuffer(offsets[num_rows])); + char* str = reinterpret_cast(str_buff->mutable_data()); + + // Spec says to pick random alphanumeric characters from a set of at least + // 64 symbols. Now, let's think critically here: 26 letters in the alphabet, + // so 52 total for upper and lower case, and 10 possible digits gives 62 + // characters... + // dbgen solves this by including a space and a comma as well, so we'll + // copy that. + const char alpha_numerics[65] = + "0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,"; + std::uniform_int_distribution char_dist(0, 63); + for (int32_t i = 0; i < offsets[num_rows]; i++) str[i] = alpha_numerics[char_dist(rng)]; + + ArrayData ad(utf8(), num_rows, {nullptr, std::move(offset_buff), std::move(str_buff)}); + return std::move(ad); +} - void AppendNumber(char *&out, int num_digits, int32_t x) - { - out += (num_digits - 1); - while(x > 0) - { - *out-- = '0' + (x % 10); - x /= 10; - } - out += (num_digits + 1); - } +void AppendNumber(char*& out, int num_digits, int32_t x) { + out += (num_digits - 1); + while (x > 0) { + *out-- = '0' + (x % 10); + x /= 10; + } + out += (num_digits + 1); +} + +void GeneratePhoneNumber(char* out, random::pcg32_fast& rng, int32_t country) { + std::uniform_int_distribution three_digit(100, 999); + std::uniform_int_distribution four_digit(1000, 9999); + + int32_t country_code = country + 10; + int32_t l1 = three_digit(rng); + int32_t l2 = three_digit(rng); + int32_t l3 = four_digit(rng); + AppendNumber(out, 2, country_code); + *out++ = '-'; + AppendNumber(out, 3, l1); + *out++ = '-'; + AppendNumber(out, 3, l2); + *out++ = '-'; + AppendNumber(out, 4, l3); +} + +static constexpr uint32_t kStartDate = + 8035; // January 1, 1992 is 8035 days after January 1, 1970 +static constexpr uint32_t kCurrentDate = + 9298; // June 17, 1995 is 9298 days after January 1, 1970 +static constexpr uint32_t kEndDate = + 10591; // December 12, 1998 is 10591 days after January 1, 1970 + +using GenerateColumnFn = std::function; +class PartAndPartSupplierGenerator { + public: + Status Init(size_t num_threads, int64_t batch_size, float scale_factor) { + if (!inited_) { + inited_ = true; + batch_size_ = batch_size; + scale_factor_ = scale_factor; + + arrow_vendored::pcg_extras::seed_seq_from seq; + thread_local_data_.resize(num_threads); + for (ThreadLocalData& tld : thread_local_data_) { + constexpr int kMaxNumDistinctStrings = 5; + tld.string_indices.resize(kMaxNumDistinctStrings * batch_size_); + tld.rng.seed(seq); + } + part_rows_to_generate_ = static_cast(scale_factor_ * 200000); + } + return Status::OK(); + } - void GeneratePhoneNumber( - char *out, - 
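// GeneratePhoneNumber above writes a fixed 15-character field of the form
// "CC-LLL-LLL-LLLL": the country code is the nation index plus 10 (nation 7 gives
// "17"), followed by two random three-digit segments and one random four-digit
// segment, e.g. "17-918-335-4196" for one possible draw. AppendNumber advances the
// pointer it takes by reference past exactly the digits it wrote, which keeps the
// dashes aligned. The date constants are day counts since 1970-01-01; for kStartDate,
// 22 years including 5 leap days gives 22 * 365 + 5 = 8035, i.e. 1992-01-01.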
random::pcg32_fast &rng, - int32_t country) - { - std::uniform_int_distribution three_digit(100, 999); - std::uniform_int_distribution four_digit(1000, 9999); - - int32_t country_code = country + 10; - int32_t l1 = three_digit(rng); - int32_t l2 = three_digit(rng); - int32_t l3 = four_digit(rng); - AppendNumber(out, 2, country_code); - *out++ = '-'; - AppendNumber(out, 3, l1); - *out++ = '-'; - AppendNumber(out, 3, l2); - *out++ = '-'; - AppendNumber(out, 4, l3); + int64_t part_batches_generated() const { return part_batches_generated_.load(); } + + int64_t partsupp_batches_generated() const { + return partsupp_batches_generated_.load(); + } + + Result> SetPartOutputColumns( + const std::vector& cols) { + return SetOutputColumns(cols, kPartTypes, kPartNameMap, part_cols_); + } + + Result> SetPartSuppOutputColumns( + const std::vector& cols) { + return SetOutputColumns(cols, kPartsuppTypes, kPartsuppNameMap, partsupp_cols_); + } + + Result> NextPartBatch() { + size_t thread_index = thread_indexer_(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(part_output_queue_mutex_); + if (!part_output_queue_.empty()) { + ExecBatch batch = std::move(part_output_queue_.front()); + part_output_queue_.pop(); + return std::move(batch); + } else if (part_rows_generated_ == part_rows_to_generate_) { + return util::nullopt; + } else { + tld.partkey_start = part_rows_generated_; + tld.part_to_generate = + std::min(batch_size_, part_rows_to_generate_ - part_rows_generated_); + part_rows_generated_ += tld.part_to_generate; + + int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index); + part_batches_generated_.fetch_add(1); + partsupp_batches_generated_.fetch_add(num_ps_batches); + ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); + } + } + tld.part.resize(PART::kNumCols); + std::fill(tld.part.begin(), tld.part.end(), Datum()); + RETURN_NOT_OK(InitPartsupp(thread_index)); + + for (int col : part_cols_) RETURN_NOT_OK(kPartGenerators[col](thread_index)); + for (int col : partsupp_cols_) RETURN_NOT_OK(kPartsuppGenerators[col](thread_index)); + + std::vector part_result(part_cols_.size()); + for (size_t i = 0; i < part_cols_.size(); i++) { + int col_idx = part_cols_[i]; + part_result[i] = tld.part[col_idx]; + } + if (!partsupp_cols_.empty()) { + std::vector partsupp_results; + for (size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) { + std::vector partsupp_result(partsupp_cols_.size()); + for (size_t icol = 0; icol < partsupp_cols_.size(); icol++) { + int col_idx = partsupp_cols_[icol]; + partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); + partsupp_results.emplace_back(std::move(eb)); + } + { + std::lock_guard guard(partsupp_output_queue_mutex_); + for (ExecBatch& eb : partsupp_results) { + partsupp_output_queue_.emplace(std::move(eb)); + } + } + } + return ExecBatch::Make(std::move(part_result)); + } - static constexpr uint32_t kStartDate = 8035; // January 1, 1992 is 8035 days after January 1, 1970 - static constexpr uint32_t kCurrentDate = 9298; // June 17, 1995 is 9298 days after January 1, 1970 - static constexpr uint32_t kEndDate = 10591; // December 12, 1998 is 10591 days after January 1, 1970 - - using GenerateColumnFn = std::function; - class PartAndPartSupplierGenerator - { - public: - Status Init( - size_t num_threads, - int64_t batch_size, - float scale_factor) - { - if(!inited_) - { - inited_ = true; - batch_size_ = batch_size; - scale_factor_ = 
scale_factor; - - arrow_vendored::pcg_extras::seed_seq_from seq; - thread_local_data_.resize(num_threads); - for(ThreadLocalData &tld : thread_local_data_) - { - constexpr int kMaxNumDistinctStrings = 5; - tld.string_indices.resize(kMaxNumDistinctStrings * batch_size_); - tld.rng.seed(seq); - } - part_rows_to_generate_ = static_cast(scale_factor_ * 200000); - } - return Status::OK(); - } - - int64_t part_batches_generated() const - { - return part_batches_generated_.load(); - } - - int64_t partsupp_batches_generated() const - { - return partsupp_batches_generated_.load(); - } - - Result> SetPartOutputColumns(const std::vector &cols) - { - return SetOutputColumns(cols, kPartTypes, kPartNameMap, part_cols_); - } - - Result> SetPartSuppOutputColumns(const std::vector &cols) - { - return SetOutputColumns(cols, kPartsuppTypes, kPartsuppNameMap, partsupp_cols_); - } - - Result> NextPartBatch() - { - size_t thread_index = thread_indexer_(); - ThreadLocalData &tld = thread_local_data_[thread_index]; - { - std::lock_guard lock(part_output_queue_mutex_); - if(!part_output_queue_.empty()) - { - ExecBatch batch = std::move(part_output_queue_.front()); - part_output_queue_.pop(); - return std::move(batch); - } - else if(part_rows_generated_ == part_rows_to_generate_) - { - return util::nullopt; - } - else - { - tld.partkey_start = part_rows_generated_; - tld.part_to_generate = std::min( - batch_size_, - part_rows_to_generate_ - part_rows_generated_); - part_rows_generated_ += tld.part_to_generate; - - int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index); - part_batches_generated_.fetch_add(1); - partsupp_batches_generated_.fetch_add(num_ps_batches); - ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); - } - } - tld.part.resize(PART::kNumCols); - std::fill(tld.part.begin(), tld.part.end(), Datum()); - RETURN_NOT_OK(InitPartsupp(thread_index)); - - for(int col : part_cols_) - RETURN_NOT_OK(kPartGenerators[col](thread_index)); - for(int col : partsupp_cols_) - RETURN_NOT_OK(kPartsuppGenerators[col](thread_index)); - - std::vector part_result(part_cols_.size()); - for(size_t i = 0; i < part_cols_.size(); i++) - { - int col_idx = part_cols_[i]; - part_result[i] = tld.part[col_idx]; - } - if(!partsupp_cols_.empty()) - { - std::vector partsupp_results; - for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) - { - std::vector partsupp_result(partsupp_cols_.size()); - for(size_t icol = 0; icol < partsupp_cols_.size(); icol++) - { - int col_idx = partsupp_cols_[icol]; - partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; - } - ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); - partsupp_results.emplace_back(std::move(eb)); - } - { - std::lock_guard guard(partsupp_output_queue_mutex_); - for(ExecBatch &eb : partsupp_results) - { - partsupp_output_queue_.emplace(std::move(eb)); - } - } - } - return ExecBatch::Make(std::move(part_result)); - } - - Result> NextPartSuppBatch() - { - size_t thread_index = thread_indexer_(); - ThreadLocalData &tld = thread_local_data_[thread_index]; - { - std::lock_guard lock(partsupp_output_queue_mutex_); - if(!partsupp_output_queue_.empty()) - { - ExecBatch result = std::move(partsupp_output_queue_.front()); - partsupp_output_queue_.pop(); - return std::move(result); - } - } - { - std::lock_guard lock(part_output_queue_mutex_); - if(part_rows_generated_ == part_rows_to_generate_) - { - return util::nullopt; - } - else - { - tld.partkey_start = part_rows_generated_; - tld.part_to_generate = std::min( - batch_size_, - 
part_rows_to_generate_ - part_rows_generated_); - part_rows_generated_ += tld.part_to_generate; - int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index); - part_batches_generated_.fetch_add(1); - partsupp_batches_generated_.fetch_add(num_ps_batches); - ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); - } - } - tld.part.resize(PART::kNumCols); - std::fill(tld.part.begin(), tld.part.end(), Datum()); - RETURN_NOT_OK(InitPartsupp(thread_index)); - - for(int col : part_cols_) - RETURN_NOT_OK(kPartGenerators[col](thread_index)); - for(int col : partsupp_cols_) - RETURN_NOT_OK(kPartsuppGenerators[col](thread_index)); - if(!part_cols_.empty()) - { - std::vector part_result(part_cols_.size()); - for(size_t i = 0; i < part_cols_.size(); i++) - { - int col_idx = part_cols_[i]; - part_result[i] = tld.part[col_idx]; - } - ARROW_ASSIGN_OR_RAISE(ExecBatch part_batch, ExecBatch::Make(std::move(part_result))); - { - std::lock_guard lock(part_output_queue_mutex_); - part_output_queue_.emplace(std::move(part_batch)); - } - } - std::vector partsupp_results; - for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) - { - std::vector partsupp_result(partsupp_cols_.size()); - for(size_t icol = 0; icol < partsupp_cols_.size(); icol++) - { - int col_idx = partsupp_cols_[icol]; - partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; - } - ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); - partsupp_results.emplace_back(std::move(eb)); - } - // Return the first batch, enqueue the rest. - { - std::lock_guard lock(partsupp_output_queue_mutex_); - for(size_t i = 1; i < partsupp_results.size(); i++) - partsupp_output_queue_.emplace(std::move(partsupp_results[i])); - } - return std::move(partsupp_results[0]); - } - - private: -#define FOR_EACH_PART_COLUMN(F) \ - F(P_PARTKEY) \ - F(P_NAME) \ - F(P_MFGR) \ - F(P_BRAND) \ - F(P_TYPE) \ - F(P_SIZE) \ - F(P_CONTAINER) \ - F(P_RETAILPRICE) \ - F(P_COMMENT) - -#define FOR_EACH_PARTSUPP_COLUMN(F) \ - F(PS_PARTKEY) \ - F(PS_SUPPKEY) \ - F(PS_AVAILQTY) \ - F(PS_SUPPLYCOST) \ - F(PS_COMMENT) \ + Result> NextPartSuppBatch() { + size_t thread_index = thread_indexer_(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(partsupp_output_queue_mutex_); + if (!partsupp_output_queue_.empty()) { + ExecBatch result = std::move(partsupp_output_queue_.front()); + partsupp_output_queue_.pop(); + return std::move(result); + } + } + { + std::lock_guard lock(part_output_queue_mutex_); + if (part_rows_generated_ == part_rows_to_generate_) { + return util::nullopt; + } else { + tld.partkey_start = part_rows_generated_; + tld.part_to_generate = + std::min(batch_size_, part_rows_to_generate_ - part_rows_generated_); + part_rows_generated_ += tld.part_to_generate; + int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index); + part_batches_generated_.fetch_add(1); + partsupp_batches_generated_.fetch_add(num_ps_batches); + ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_); + } + } + tld.part.resize(PART::kNumCols); + std::fill(tld.part.begin(), tld.part.end(), Datum()); + RETURN_NOT_OK(InitPartsupp(thread_index)); + + for (int col : part_cols_) RETURN_NOT_OK(kPartGenerators[col](thread_index)); + for (int col : partsupp_cols_) RETURN_NOT_OK(kPartsuppGenerators[col](thread_index)); + if (!part_cols_.empty()) { + std::vector part_result(part_cols_.size()); + for (size_t i = 0; i < part_cols_.size(); i++) { + int col_idx = part_cols_[i]; + part_result[i] = tld.part[col_idx]; + } + 
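      // PART and PARTSUPP rows are generated together, so the PART batch assembled
      // here as a by-product is queued on part_output_queue_ below; a later
      // NextPartBatch() call can then return it without regenerating the rows.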
ARROW_ASSIGN_OR_RAISE(ExecBatch part_batch, + ExecBatch::Make(std::move(part_result))); + { + std::lock_guard lock(part_output_queue_mutex_); + part_output_queue_.emplace(std::move(part_batch)); + } + } + std::vector partsupp_results; + for (size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) { + std::vector partsupp_result(partsupp_cols_.size()); + for (size_t icol = 0; icol < partsupp_cols_.size(); icol++) { + int col_idx = partsupp_cols_[icol]; + partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); + partsupp_results.emplace_back(std::move(eb)); + } + // Return the first batch, enqueue the rest. + { + std::lock_guard lock(partsupp_output_queue_mutex_); + for (size_t i = 1; i < partsupp_results.size(); i++) + partsupp_output_queue_.emplace(std::move(partsupp_results[i])); + } + return std::move(partsupp_results[0]); + } + + private: +#define FOR_EACH_PART_COLUMN(F) \ + F(P_PARTKEY) \ + F(P_NAME) \ + F(P_MFGR) \ + F(P_BRAND) \ + F(P_TYPE) \ + F(P_SIZE) \ + F(P_CONTAINER) \ + F(P_RETAILPRICE) \ + F(P_COMMENT) + +#define FOR_EACH_PARTSUPP_COLUMN(F) \ + F(PS_PARTKEY) \ + F(PS_SUPPKEY) \ + F(PS_AVAILQTY) \ + F(PS_SUPPLYCOST) \ + F(PS_COMMENT) #define MAKE_ENUM(col) col, - struct PART - { - enum - { - FOR_EACH_PART_COLUMN(MAKE_ENUM) - kNumCols, - }; - }; - struct PARTSUPP - { - enum - { - FOR_EACH_PARTSUPP_COLUMN(MAKE_ENUM) - kNumCols, - }; - }; - -#define MAKE_STRING_MAP(col) \ - { #col, PART::col }, - const std::unordered_map kPartNameMap = - { - FOR_EACH_PART_COLUMN(MAKE_STRING_MAP) - }; + struct PART { + enum { + FOR_EACH_PART_COLUMN(MAKE_ENUM) kNumCols, + }; + }; + struct PARTSUPP { + enum { + FOR_EACH_PARTSUPP_COLUMN(MAKE_ENUM) kNumCols, + }; + }; + +#define MAKE_STRING_MAP(col) {#col, PART::col}, + const std::unordered_map kPartNameMap = { + FOR_EACH_PART_COLUMN(MAKE_STRING_MAP)}; #undef MAKE_STRING_MAP -#define MAKE_STRING_MAP(col) \ - { #col, PARTSUPP::col }, - const std::unordered_map kPartsuppNameMap = - { - FOR_EACH_PARTSUPP_COLUMN(MAKE_STRING_MAP) - }; +#define MAKE_STRING_MAP(col) {#col, PARTSUPP::col}, + const std::unordered_map kPartsuppNameMap = { + FOR_EACH_PARTSUPP_COLUMN(MAKE_STRING_MAP)}; #undef MAKE_STRING_MAP -#define MAKE_FN_ARRAY(col) \ - [this](size_t thread_index) { return this->col(thread_index); }, - std::vector kPartGenerators = - { - FOR_EACH_PART_COLUMN(MAKE_FN_ARRAY) - }; - std::vector kPartsuppGenerators = - { - FOR_EACH_PARTSUPP_COLUMN(MAKE_FN_ARRAY) - }; +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector kPartGenerators = {FOR_EACH_PART_COLUMN(MAKE_FN_ARRAY)}; + std::vector kPartsuppGenerators = { + FOR_EACH_PARTSUPP_COLUMN(MAKE_FN_ARRAY)}; #undef MAKE_FN_ARRAY #undef FOR_EACH_LINEITEM_COLUMN #undef FOR_EACH_ORDERS_COLUMN - const std::vector> kPartTypes = - { - int32(), - utf8(), - fixed_size_binary(25), - fixed_size_binary(10), - utf8(), - int32(), - fixed_size_binary(10), - decimal(12, 2), - utf8(), - }; - - const std::vector> kPartsuppTypes = - { - int32(), - int32(), - int32(), - decimal(12, 2), - utf8(), - }; - - Status AllocatePartBatch(size_t thread_index, int column) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - ARROW_DCHECK(tld.part[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[column]); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.part_to_generate * byte_width)); - ArrayData ad(kPartTypes[column], 
tld.part_to_generate, { nullptr, std::move(buff) }); - tld.part[column] = std::move(ad); - return Status::OK(); - } - - Status P_PARTKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_PARTKEY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_PARTKEY)); - int32_t *p_partkey = reinterpret_cast( - tld.part[PART::P_PARTKEY].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < tld.part_to_generate; i++) - { - p_partkey[i] = static_cast(tld.partkey_start + i + 1); - ARROW_DCHECK(1 <= p_partkey[i] && p_partkey[i] <= part_rows_to_generate_); - } - } - return Status::OK(); - } - - Status P_NAME(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_NAME].kind() == Datum::NONE) - { - std::uniform_int_distribution dist(0, static_cast(kNumNameParts - 1)); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); - int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); - offsets[0] = 0; - for(int64_t irow = 0; irow < tld.part_to_generate; irow++) - { - size_t string_length = 0; - for(int ipart = 0; ipart < 5; ipart++) - { - uint8_t name_part_index = static_cast(dist(tld.rng)); - tld.string_indices[irow * 5 + ipart] = name_part_index; - string_length += std::strlen(NameParts[name_part_index]); - } - // Add 4 because there is a space between each word (i.e. four spaces) - offsets[irow + 1] = static_cast(offsets[irow] + string_length + 4); - } - // Add an extra byte for the space after in the very last string. - ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate] + 1)); - char *strings = reinterpret_cast(string_buffer->mutable_data()); - for(int64_t irow = 0; irow < tld.part_to_generate; irow++) - { - char *row = strings + offsets[irow]; - for(int ipart = 0; ipart < 5; ipart++) - { - uint8_t name_part_index = tld.string_indices[irow * 5 + ipart]; - const char *part = NameParts[name_part_index]; - size_t length = std::strlen(part); - std::memcpy(row, part, length); - row += length; - *row++ = ' '; - } - } - ArrayData ad(kPartTypes[PART::P_NAME], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); - Datum datum(ad); - tld.part[PART::P_NAME] = std::move(datum); - } - return Status::OK(); - } - - Status P_MFGR(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_MFGR].kind() == Datum::NONE) - { - std::uniform_int_distribution dist(1, 5); - const char *manufacturer = "Manufacturer#"; - const size_t manufacturer_length = std::strlen(manufacturer); - RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_MFGR)); - char *p_mfgr = reinterpret_cast(tld.part[PART::P_MFGR].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); - for(int64_t irow = 0; irow < tld.part_to_generate; irow++) - { - std::strncpy(p_mfgr + byte_width * irow, manufacturer, byte_width); - char mfgr_id = '0' + dist(tld.rng); - *(p_mfgr + byte_width * irow + manufacturer_length) = mfgr_id; - } - } - return Status::OK(); - } - - Status P_BRAND(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_BRAND].kind() == Datum::NONE) - { - RETURN_NOT_OK(P_MFGR(thread_index)); - std::uniform_int_distribution dist(1, 5); - const char *brand = "Brand#"; - const size_t brand_length = 
std::strlen(brand); - RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_BRAND)); - const char *p_mfgr = reinterpret_cast( - tld.part[PART::P_MFGR].array()->buffers[1]->data()); - char *p_brand = reinterpret_cast( - tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_BRAND]); - int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); - const size_t mfgr_id_offset = std::strlen("Manufacturer#"); - for(int64_t irow = 0; irow < tld.part_to_generate; irow++) - { - char *row = p_brand + byte_width * irow; - char mfgr_id = *(p_mfgr + irow * mfgr_byte_width + mfgr_id_offset); - char brand_id = '0' + dist(tld.rng); - std::strncpy(row, brand, byte_width); - *(row + brand_length) = mfgr_id; - *(row + brand_length + 1) = brand_id; - irow += 0; - } - } - return Status::OK(); - } - - Status P_TYPE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_TYPE].kind() == Datum::NONE) - { - using D = std::uniform_int_distribution; - D dists[] = - { - D{ 0, static_cast(kNumTypes_1 - 1) }, - D{ 0, static_cast(kNumTypes_2 - 1) }, - D{ 0, static_cast(kNumTypes_3 - 1) }, - }; - - const char **types[] = { Types_1, Types_2, Types_3 }; - - ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); - int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); - offsets[0] = 0; - for(int64_t irow = 0; irow < tld.part_to_generate; irow++) - { - size_t string_length = 0; - for(int ipart = 0; ipart < 3; ipart++) - { - uint8_t name_part_index = static_cast(dists[ipart](tld.rng)); - tld.string_indices[irow * 3 + ipart] = name_part_index; - string_length += std::strlen(types[ipart][name_part_index]); - } - offsets[irow + 1] = static_cast(offsets[irow] + string_length); - } - ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, AllocateBuffer(offsets[tld.part_to_generate])); - char *strings = reinterpret_cast(string_buffer->mutable_data()); - for(int64_t irow = 0; irow < tld.part_to_generate; irow++) - { - char *row = strings + offsets[irow]; - for(int ipart = 0; ipart < 3; ipart++) - { - uint8_t name_part_index = tld.string_indices[irow * 3 + ipart]; - const char *part = types[ipart][name_part_index]; - size_t length = std::strlen(part); - std::memcpy(row, part, length); - row += length; - } - } - ArrayData ad(kPartTypes[PART::P_TYPE], tld.part_to_generate, { nullptr, std::move(offset_buff), std::move(string_buffer) }); - Datum datum(ad); - tld.part[PART::P_TYPE] = std::move(datum); - } - return Status::OK(); - } - - Status P_SIZE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_SIZE].kind() == Datum::NONE) - { - std::uniform_int_distribution dist(1, 50); - RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_SIZE)); - int32_t *p_size = reinterpret_cast( - tld.part[PART::P_SIZE].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < tld.part_to_generate; i++) - p_size[i] = dist(tld.rng); - } - return Status::OK(); - } - - Status P_CONTAINER(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_CONTAINER].kind() == Datum::NONE) - { - std::uniform_int_distribution dist1(0, static_cast(kNumContainers_1 - 1)); - std::uniform_int_distribution dist2(0, static_cast(kNumContainers_2 - 1)); - RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_CONTAINER)); - char *p_container = 
reinterpret_cast( - tld.part[PART::P_CONTAINER].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_CONTAINER]); - for(int64_t irow = 0; irow < tld.part_to_generate; irow++) - { - int container1_idx = dist1(tld.rng); - int container2_idx = dist2(tld.rng); - const char *container1 = Containers_1[container1_idx]; - const char *container2 = Containers_2[container2_idx]; - size_t container1_length = std::strlen(container1); - size_t container2_length = std::strlen(container2); - - char *row = p_container + byte_width * irow; - std::strncpy(row, container1, byte_width); - std::memcpy(row + container1_length, container2, container2_length); - } - } - return Status::OK(); - } - - Status P_RETAILPRICE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_RETAILPRICE].kind() == Datum::NONE) - { - RETURN_NOT_OK(P_PARTKEY(thread_index)); - RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_RETAILPRICE)); - const int32_t *p_partkey = reinterpret_cast( - tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); - Decimal128 *p_retailprice = reinterpret_cast( - tld.part[PART::P_RETAILPRICE].array()->buffers[1]->mutable_data()); - for(int64_t irow = 0; irow < tld.part_to_generate; irow++) - { - int32_t partkey = p_partkey[irow]; - int64_t retail_price = (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); - p_retailprice[irow] = { retail_price }; - } - } - return Status::OK(); - } - - Status P_COMMENT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PART::P_COMMENT].kind() == Datum::NONE) - { - ARROW_ASSIGN_OR_RAISE(tld.part[PART::P_COMMENT], g_text.GenerateComments(tld.part_to_generate, 5, 22, tld.rng)); - } - return Status::OK(); - } - - int64_t PartsuppBatchesToGenerate(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; - return num_batches; - } - - Status InitPartsupp(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - tld.generated_partsupp.reset(); - int64_t num_batches = PartsuppBatchesToGenerate(thread_index); - tld.partsupp.resize(num_batches); - for(std::vector &batch : tld.partsupp) - { - batch.resize(PARTSUPP::kNumCols); - std::fill(batch.begin(), batch.end(), Datum()); - } - return Status::OK(); - } - - Status AllocatePartSuppBatch(size_t thread_index, size_t ibatch, int column) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - int32_t byte_width = arrow::internal::GetByteWidth(*kPartsuppTypes[column]); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); - ArrayData ad(kPartsuppTypes[column], batch_size_, { nullptr, std::move(buff) }); - tld.partsupp[ibatch][column] = std::move(ad); - return Status::OK(); - } - - Status PS_PARTKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_partsupp[PARTSUPP::PS_PARTKEY]) - { - tld.generated_partsupp[PARTSUPP::PS_PARTKEY] = true; - RETURN_NOT_OK(P_PARTKEY(thread_index)); - const int32_t *p_partkey = reinterpret_cast( - tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); - - size_t ibatch = 0; - int64_t ipartsupp = 0; - int64_t ipart = 0; - int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - for(int64_t irow = 0; irow < ps_to_generate; ibatch++) - { - 
RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_PARTKEY)); - int32_t *ps_partkey = reinterpret_cast( - tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->buffers[1]->mutable_data()); - int64_t next_run = std::min(batch_size_, ps_to_generate - irow); - - int64_t batch_offset = 0; - for(int64_t irun = 0; irun < next_run;) - { - for(; ipartsupp < kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) - ps_partkey[batch_offset++] = p_partkey[ipart]; - - if(ipartsupp == kPartSuppRowsPerPart) - { - ipartsupp = 0; - ipart++; - } - } - irow += next_run; - tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->length = batch_offset; - } - } - return Status::OK(); - } - - Status PS_SUPPKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_partsupp[PARTSUPP::PS_SUPPKEY]) - { - tld.generated_partsupp[PARTSUPP::PS_SUPPKEY] = true; - RETURN_NOT_OK(P_PARTKEY(thread_index)); - const int32_t *p_partkey = reinterpret_cast( - tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); - - size_t ibatch = 0; - int64_t ipartsupp = 0; - int64_t ipart = 0; - int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - const int32_t S = static_cast(scale_factor_ * 10000); - for(int64_t irow = 0; irow < ps_to_generate; ibatch++) - { - RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPKEY)); - int32_t *ps_suppkey = reinterpret_cast( - tld.partsupp[ibatch][PARTSUPP::PS_SUPPKEY].array()->buffers[1]->mutable_data()); - int64_t next_run = std::min(batch_size_, ps_to_generate - irow); - - int64_t batch_offset = 0; - for(int64_t irun = 0; irun < next_run;) - { - for(; ipartsupp < kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) - { - int32_t supplier = static_cast(ipartsupp); - int32_t partkey = p_partkey[ipart]; - ps_suppkey[batch_offset++] = (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; - } - if(ipartsupp == kPartSuppRowsPerPart) - { - ipartsupp = 0; - ipart++; - } - } - irow += next_run; - tld.partsupp[ibatch][PARTSUPP::PS_SUPPKEY].array()->length = batch_offset; - } - } - return Status::OK(); - } - - Status PS_AVAILQTY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_partsupp[PARTSUPP::PS_AVAILQTY]) - { - tld.generated_partsupp[PARTSUPP::PS_AVAILQTY] = true; - std::uniform_int_distribution dist(1, 9999); - int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - int64_t ibatch = 0; - for(int64_t irow = 0; irow < ps_to_generate; ibatch++) - { - RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_AVAILQTY)); - int32_t *ps_availqty = reinterpret_cast( - tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->buffers[1]->mutable_data()); - int64_t next_run = std::min(batch_size_, ps_to_generate - irow); - for(int64_t irun = 0; irun < next_run; irun++) - ps_availqty[irun] = dist(tld.rng); - - tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run; - irow += next_run; - } - } - return Status::OK(); - } - - Status PS_SUPPLYCOST(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_partsupp[PARTSUPP::PS_SUPPLYCOST]) - { - tld.generated_partsupp[PARTSUPP::PS_SUPPLYCOST] = true; - std::uniform_int_distribution dist(100, 100000); - int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - int64_t ibatch = 0; - for(int64_t irow = 0; irow < ps_to_generate; ibatch++) - { - RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, 
ibatch, PARTSUPP::PS_SUPPLYCOST)); - Decimal128 *ps_supplycost = reinterpret_cast( - tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST].array()->buffers[1]->mutable_data()); - int64_t next_run = std::min(batch_size_, ps_to_generate - irow); - for(int64_t irun = 0; irun < next_run; irun++) - ps_supplycost[irun] = { dist(tld.rng) }; - - tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST].array()->length = next_run; - irow += next_run; - } - } - return Status::OK(); - } - - Status PS_COMMENT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.part[PARTSUPP::PS_COMMENT].kind() == Datum::NONE) - { - int64_t irow = 0; - int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; - for(size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) - { - int64_t num_rows = std::min(batch_size_, ps_to_generate - irow); - ARROW_ASSIGN_OR_RAISE( - tld.partsupp[ibatch][PARTSUPP::PS_COMMENT], g_text.GenerateComments(num_rows, 49, 198, tld.rng)); - irow += num_rows; - } - } - return Status::OK(); - } - - struct ThreadLocalData - { - std::vector part; - std::vector string_indices; - int64_t part_to_generate{0}; - int64_t partkey_start{0}; - - std::vector> partsupp; - std::bitset generated_partsupp; - random::pcg32_fast rng; - }; - std::vector thread_local_data_; - - bool inited_ = false; - std::mutex part_output_queue_mutex_; - std::mutex partsupp_output_queue_mutex_; - std::queue part_output_queue_; - std::queue partsupp_output_queue_; - int64_t batch_size_{0}; - float scale_factor_{0}; - int64_t part_rows_to_generate_{0}; - int64_t part_rows_generated_{0}; - std::vector part_cols_; - std::vector partsupp_cols_; - ThreadIndexer thread_indexer_; - - std::atomic part_batches_generated_ = { 0 }; - std::atomic partsupp_batches_generated_ = { 0 }; - static constexpr int64_t kPartSuppRowsPerPart = 4; - }; - - class OrdersAndLineItemGenerator - { - public: - Status Init( - size_t num_threads, - int64_t batch_size, - float scale_factor) - { - if(!inited_) - { - inited_ = true; - batch_size_ = batch_size; - scale_factor_ = scale_factor; - - arrow_vendored::pcg_extras::seed_seq_from seq; - thread_local_data_.resize(num_threads); - for(ThreadLocalData &tld : thread_local_data_) - { - tld.items_per_order.resize(batch_size_); - tld.rng.seed(seq); - } - orders_rows_to_generate_ = static_cast(scale_factor_ * 150000 * 10); - } - return Status::OK(); - } - - int64_t orders_batches_generated() const - { - return orders_batches_generated_.load(); - } - - int64_t lineitem_batches_generated() const - { - return lineitem_batches_generated_.load(); - } - - Result> SetOrdersOutputColumns(const std::vector &cols) - { - return SetOutputColumns(cols, kOrdersTypes, kOrdersNameMap, orders_cols_); - } - - Result> SetLineItemOutputColumns(const std::vector &cols) - { - return SetOutputColumns(cols, kLineitemTypes, kLineitemNameMap, lineitem_cols_); - } - - Result> NextOrdersBatch() - { - size_t thread_index = thread_indexer_(); - ThreadLocalData &tld = thread_local_data_[thread_index]; - { - std::lock_guard lock(orders_output_queue_mutex_); - if(!orders_output_queue_.empty()) - { - ExecBatch batch = std::move(orders_output_queue_.front()); - orders_output_queue_.pop(); - return std::move(batch); - } - else if(orders_rows_generated_ == orders_rows_to_generate_) - { - return util::nullopt; - } - else - { - tld.orderkey_start = orders_rows_generated_; - tld.orders_to_generate = std::min( - batch_size_, - orders_rows_to_generate_ - orders_rows_generated_); - orders_rows_generated_ += 
tld.orders_to_generate; - orders_batches_generated_.fetch_add(1); - ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); - } - } - tld.orders.resize(ORDERS::kNumCols); - std::fill(tld.orders.begin(), tld.orders.end(), Datum()); - RETURN_NOT_OK(GenerateRowCounts(thread_index)); - tld.first_batch_offset = 0; - tld.generated_lineitem.reset(); - - for(int col : orders_cols_) - RETURN_NOT_OK(kOrdersGenerators[col](thread_index)); - for(int col : lineitem_cols_) - RETURN_NOT_OK(kLineitemGenerators[col](thread_index)); - - std::vector orders_result(orders_cols_.size()); - for(size_t i = 0; i < orders_cols_.size(); i++) - { - int col_idx = orders_cols_[i]; - orders_result[i] = tld.orders[col_idx]; - } - if(!lineitem_cols_.empty()) - { - std::vector lineitem_results; - for(size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) - { - std::vector lineitem_result(lineitem_cols_.size()); - for(size_t icol = 0; icol < lineitem_cols_.size(); icol++) - { - int col_idx = lineitem_cols_[icol]; - lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; - } - ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); - lineitem_results.emplace_back(std::move(eb)); - } - { - std::lock_guard guard(lineitem_output_queue_mutex_); - for(ExecBatch &eb : lineitem_results) - { - lineitem_output_queue_.emplace(std::move(eb)); - } - } - } - return ExecBatch::Make(std::move(orders_result)); - } - - Result> NextLineItemBatch() - { - size_t thread_index = thread_indexer_(); - ThreadLocalData &tld = thread_local_data_[thread_index]; - ExecBatch queued; - bool from_queue = false; - { - std::lock_guard lock(lineitem_output_queue_mutex_); - if(!lineitem_output_queue_.empty()) - { - queued = std::move(lineitem_output_queue_.front()); - lineitem_output_queue_.pop(); - from_queue = true; - } - } - tld.first_batch_offset = 0; - if(from_queue) - { - ARROW_DCHECK(queued.length <= batch_size_); - tld.first_batch_offset = queued.length; - if(queued.length == batch_size_) - return std::move(queued); - } - { - std::lock_guard lock(orders_output_queue_mutex_); - if(orders_rows_generated_ == orders_rows_to_generate_) - { - if(from_queue) - return std::move(queued); - return util::nullopt; - } - - tld.orderkey_start = orders_rows_generated_; - tld.orders_to_generate = std::min( - batch_size_, - orders_rows_to_generate_ - orders_rows_generated_); - orders_rows_generated_ += tld.orders_to_generate; - orders_batches_generated_.fetch_add(1ll); - ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); - } - tld.orders.resize(ORDERS::kNumCols); - std::fill(tld.orders.begin(), tld.orders.end(), Datum()); - RETURN_NOT_OK(GenerateRowCounts(thread_index)); - tld.generated_lineitem.reset(); - if(from_queue) - { - lineitem_batches_generated_.fetch_sub(1); - for(size_t i = 0; i < lineitem_cols_.size(); i++) - if(tld.lineitem[0][lineitem_cols_[i]].kind() == Datum::NONE) - tld.lineitem[0][lineitem_cols_[i]] = std::move(queued[i]); - } - - for(int col : orders_cols_) - RETURN_NOT_OK(kOrdersGenerators[col](thread_index)); - for(int col : lineitem_cols_) - RETURN_NOT_OK(kLineitemGenerators[col](thread_index)); - - if(!orders_cols_.empty()) - { - std::vector orders_result(orders_cols_.size()); - for(size_t i = 0; i < orders_cols_.size(); i++) - { - int col_idx = orders_cols_[i]; - orders_result[i] = tld.orders[col_idx]; - } - ARROW_ASSIGN_OR_RAISE(ExecBatch orders_batch, ExecBatch::Make(std::move(orders_result))); - { - std::lock_guard lock(orders_output_queue_mutex_); - 
orders_output_queue_.emplace(std::move(orders_batch)); - } - } - std::vector lineitem_results; - for(size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) - { - std::vector lineitem_result(lineitem_cols_.size()); - for(size_t icol = 0; icol < lineitem_cols_.size(); icol++) - { - int col_idx = lineitem_cols_[icol]; - lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; - } - ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); - lineitem_results.emplace_back(std::move(eb)); - } - lineitem_batches_generated_.fetch_add(static_cast(lineitem_results.size())); - // Return the first batch, enqueue the rest. - { - std::lock_guard lock(lineitem_output_queue_mutex_); - for(size_t i = 1; i < lineitem_results.size(); i++) - lineitem_output_queue_.emplace(std::move(lineitem_results[i])); - } - return std::move(lineitem_results[0]); - } - - private: -#define FOR_EACH_ORDERS_COLUMN(F) \ - F(O_ORDERKEY) \ - F(O_CUSTKEY) \ - F(O_ORDERSTATUS) \ - F(O_TOTALPRICE) \ - F(O_ORDERDATE) \ - F(O_ORDERPRIORITY) \ - F(O_CLERK) \ - F(O_SHIPPRIORITY) \ - F(O_COMMENT) - -#define FOR_EACH_LINEITEM_COLUMN(F) \ - F(L_ORDERKEY) \ - F(L_PARTKEY) \ - F(L_SUPPKEY) \ - F(L_LINENUMBER) \ - F(L_QUANTITY) \ - F(L_EXTENDEDPRICE) \ - F(L_DISCOUNT) \ - F(L_TAX) \ - F(L_RETURNFLAG) \ - F(L_LINESTATUS) \ - F(L_SHIPDATE) \ - F(L_COMMITDATE) \ - F(L_RECEIPTDATE) \ - F(L_SHIPINSTRUCT) \ - F(L_SHIPMODE) \ - F(L_COMMENT) + const std::vector> kPartTypes = { + int32(), utf8(), fixed_size_binary(25), fixed_size_binary(10), + utf8(), int32(), fixed_size_binary(10), decimal(12, 2), + utf8(), + }; + + const std::vector> kPartsuppTypes = { + int32(), int32(), int32(), decimal(12, 2), utf8(), + }; + + Status AllocatePartBatch(size_t thread_index, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.part[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(tld.part_to_generate * byte_width)); + ArrayData ad(kPartTypes[column], tld.part_to_generate, {nullptr, std::move(buff)}); + tld.part[column] = std::move(ad); + return Status::OK(); + } + + Status P_PARTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_PARTKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_PARTKEY)); + int32_t* p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.part_to_generate; i++) { + p_partkey[i] = static_cast(tld.partkey_start + i + 1); + ARROW_DCHECK(1 <= p_partkey[i] && p_partkey[i] <= part_rows_to_generate_); + } + } + return Status::OK(); + } + + Status P_NAME(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_NAME].kind() == Datum::NONE) { + std::uniform_int_distribution dist( + 0, static_cast(kNumNameParts - 1)); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, + AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); + int32_t* offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + size_t string_length = 0; + for (int ipart = 0; ipart < 5; ipart++) { + uint8_t name_part_index = static_cast(dist(tld.rng)); + tld.string_indices[irow * 5 + ipart] = name_part_index; + string_length += std::strlen(NameParts[name_part_index]); + } + // Add 4 because there is a space 
between each word (i.e. four spaces) + offsets[irow + 1] = static_cast(offsets[irow] + string_length + 4); + } + // Add an extra byte for the space after in the very last string. + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, + AllocateBuffer(offsets[tld.part_to_generate] + 1)); + char* strings = reinterpret_cast(string_buffer->mutable_data()); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + char* row = strings + offsets[irow]; + for (int ipart = 0; ipart < 5; ipart++) { + uint8_t name_part_index = tld.string_indices[irow * 5 + ipart]; + const char* part = NameParts[name_part_index]; + size_t length = std::strlen(part); + std::memcpy(row, part, length); + row += length; + *row++ = ' '; + } + } + ArrayData ad(kPartTypes[PART::P_NAME], tld.part_to_generate, + {nullptr, std::move(offset_buff), std::move(string_buffer)}); + Datum datum(ad); + tld.part[PART::P_NAME] = std::move(datum); + } + return Status::OK(); + } + + Status P_MFGR(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_MFGR].kind() == Datum::NONE) { + std::uniform_int_distribution dist(1, 5); + const char* manufacturer = "Manufacturer#"; + const size_t manufacturer_length = std::strlen(manufacturer); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_MFGR)); + char* p_mfgr = reinterpret_cast( + tld.part[PART::P_MFGR].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + std::strncpy(p_mfgr + byte_width * irow, manufacturer, byte_width); + char mfgr_id = '0' + dist(tld.rng); + *(p_mfgr + byte_width * irow + manufacturer_length) = mfgr_id; + } + } + return Status::OK(); + } + + Status P_BRAND(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_BRAND].kind() == Datum::NONE) { + RETURN_NOT_OK(P_MFGR(thread_index)); + std::uniform_int_distribution dist(1, 5); + const char* brand = "Brand#"; + const size_t brand_length = std::strlen(brand); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_BRAND)); + const char* p_mfgr = reinterpret_cast( + tld.part[PART::P_MFGR].array()->buffers[1]->data()); + char* p_brand = reinterpret_cast( + tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_BRAND]); + int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); + const size_t mfgr_id_offset = std::strlen("Manufacturer#"); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + char* row = p_brand + byte_width * irow; + char mfgr_id = *(p_mfgr + irow * mfgr_byte_width + mfgr_id_offset); + char brand_id = '0' + dist(tld.rng); + std::strncpy(row, brand, byte_width); + *(row + brand_length) = mfgr_id; + *(row + brand_length + 1) = brand_id; + irow += 0; + } + } + return Status::OK(); + } + + Status P_TYPE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_TYPE].kind() == Datum::NONE) { + using D = std::uniform_int_distribution; + D dists[] = { + D{0, static_cast(kNumTypes_1 - 1)}, + D{0, static_cast(kNumTypes_2 - 1)}, + D{0, static_cast(kNumTypes_3 - 1)}, + }; + + const char** types[] = {Types_1, Types_2, Types_3}; + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, + AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); + int32_t* offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] 
= 0; + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + size_t string_length = 0; + for (int ipart = 0; ipart < 3; ipart++) { + uint8_t name_part_index = static_cast(dists[ipart](tld.rng)); + tld.string_indices[irow * 3 + ipart] = name_part_index; + string_length += std::strlen(types[ipart][name_part_index]); + } + offsets[irow + 1] = static_cast(offsets[irow] + string_length); + } + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, + AllocateBuffer(offsets[tld.part_to_generate])); + char* strings = reinterpret_cast(string_buffer->mutable_data()); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + char* row = strings + offsets[irow]; + for (int ipart = 0; ipart < 3; ipart++) { + uint8_t name_part_index = tld.string_indices[irow * 3 + ipart]; + const char* part = types[ipart][name_part_index]; + size_t length = std::strlen(part); + std::memcpy(row, part, length); + row += length; + } + } + ArrayData ad(kPartTypes[PART::P_TYPE], tld.part_to_generate, + {nullptr, std::move(offset_buff), std::move(string_buffer)}); + Datum datum(ad); + tld.part[PART::P_TYPE] = std::move(datum); + } + return Status::OK(); + } + + Status P_SIZE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_SIZE].kind() == Datum::NONE) { + std::uniform_int_distribution dist(1, 50); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_SIZE)); + int32_t* p_size = reinterpret_cast( + tld.part[PART::P_SIZE].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.part_to_generate; i++) p_size[i] = dist(tld.rng); + } + return Status::OK(); + } + + Status P_CONTAINER(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_CONTAINER].kind() == Datum::NONE) { + std::uniform_int_distribution dist1( + 0, static_cast(kNumContainers_1 - 1)); + std::uniform_int_distribution dist2( + 0, static_cast(kNumContainers_2 - 1)); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_CONTAINER)); + char* p_container = reinterpret_cast( + tld.part[PART::P_CONTAINER].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_CONTAINER]); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + int container1_idx = dist1(tld.rng); + int container2_idx = dist2(tld.rng); + const char* container1 = Containers_1[container1_idx]; + const char* container2 = Containers_2[container2_idx]; + size_t container1_length = std::strlen(container1); + size_t container2_length = std::strlen(container2); + + char* row = p_container + byte_width * irow; + std::strncpy(row, container1, byte_width); + std::memcpy(row + container1_length, container2, container2_length); + } + } + return Status::OK(); + } + + Status P_RETAILPRICE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_RETAILPRICE].kind() == Datum::NONE) { + RETURN_NOT_OK(P_PARTKEY(thread_index)); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_RETAILPRICE)); + const int32_t* p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + Decimal128* p_retailprice = reinterpret_cast( + tld.part[PART::P_RETAILPRICE].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + int32_t partkey = p_partkey[irow]; + int64_t retail_price = + (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); + p_retailprice[irow] = {retail_price}; + } + } + return Status::OK(); + } + + 
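  // P_RETAILPRICE above follows the TPC-H formula, computed in cents and stored in a
  // scale-2 Decimal128, so the stored integer 90100 reads as 901.00. Worked examples:
  //   partkey = 1    -> 90000 + (0 % 20001)   + 100 * 1   = 90100  -> 901.00
  //   partkey = 1234 -> 90000 + (123 % 20001) + 100 * 234 = 113523 -> 1135.23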
Status P_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE( + tld.part[PART::P_COMMENT], + g_text.GenerateComments(tld.part_to_generate, 5, 22, tld.rng)); + } + return Status::OK(); + } + + int64_t PartsuppBatchesToGenerate(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; + return num_batches; + } + + Status InitPartsupp(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + tld.generated_partsupp.reset(); + int64_t num_batches = PartsuppBatchesToGenerate(thread_index); + tld.partsupp.resize(num_batches); + for (std::vector& batch : tld.partsupp) { + batch.resize(PARTSUPP::kNumCols); + std::fill(batch.begin(), batch.end(), Datum()); + } + return Status::OK(); + } + + Status AllocatePartSuppBatch(size_t thread_index, size_t ibatch, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + int32_t byte_width = arrow::internal::GetByteWidth(*kPartsuppTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(batch_size_ * byte_width)); + ArrayData ad(kPartsuppTypes[column], batch_size_, {nullptr, std::move(buff)}); + tld.partsupp[ibatch][column] = std::move(ad); + return Status::OK(); + } + + Status PS_PARTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_partsupp[PARTSUPP::PS_PARTKEY]) { + tld.generated_partsupp[PARTSUPP::PS_PARTKEY] = true; + RETURN_NOT_OK(P_PARTKEY(thread_index)); + const int32_t* p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + int64_t ipartsupp = 0; + int64_t ipart = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + for (int64_t irow = 0; irow < ps_to_generate; ibatch++) { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_PARTKEY)); + int32_t* ps_partkey = + reinterpret_cast(tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY] + .array() + ->buffers[1] + ->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + + int64_t batch_offset = 0; + for (int64_t irun = 0; irun < next_run;) { + for (; ipartsupp < kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) + ps_partkey[batch_offset++] = p_partkey[ipart]; + + if (ipartsupp == kPartSuppRowsPerPart) { + ipartsupp = 0; + ipart++; + } + } + irow += next_run; + tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->length = batch_offset; + } + } + return Status::OK(); + } + + Status PS_SUPPKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_partsupp[PARTSUPP::PS_SUPPKEY]) { + tld.generated_partsupp[PARTSUPP::PS_SUPPKEY] = true; + RETURN_NOT_OK(P_PARTKEY(thread_index)); + const int32_t* p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + int64_t ipartsupp = 0; + int64_t ipart = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + const int32_t S = static_cast(scale_factor_ * 10000); + for (int64_t irow = 0; irow < ps_to_generate; ibatch++) { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPKEY)); + int32_t* ps_suppkey = + reinterpret_cast(tld.partsupp[ibatch][PARTSUPP::PS_SUPPKEY] + .array() + ->buffers[1] + 
->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + + int64_t batch_offset = 0; + for (int64_t irun = 0; irun < next_run;) { + for (; ipartsupp < kPartSuppRowsPerPart && irun < next_run; + ipartsupp++, irun++) { + int32_t supplier = static_cast(ipartsupp); + int32_t partkey = p_partkey[ipart]; + ps_suppkey[batch_offset++] = + (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; + } + if (ipartsupp == kPartSuppRowsPerPart) { + ipartsupp = 0; + ipart++; + } + } + irow += next_run; + tld.partsupp[ibatch][PARTSUPP::PS_SUPPKEY].array()->length = batch_offset; + } + } + return Status::OK(); + } + + Status PS_AVAILQTY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_partsupp[PARTSUPP::PS_AVAILQTY]) { + tld.generated_partsupp[PARTSUPP::PS_AVAILQTY] = true; + std::uniform_int_distribution dist(1, 9999); + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t ibatch = 0; + for (int64_t irow = 0; irow < ps_to_generate; ibatch++) { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_AVAILQTY)); + int32_t* ps_availqty = + reinterpret_cast(tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY] + .array() + ->buffers[1] + ->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + for (int64_t irun = 0; irun < next_run; irun++) ps_availqty[irun] = dist(tld.rng); + + tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run; + irow += next_run; + } + } + return Status::OK(); + } + + Status PS_SUPPLYCOST(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_partsupp[PARTSUPP::PS_SUPPLYCOST]) { + tld.generated_partsupp[PARTSUPP::PS_SUPPLYCOST] = true; + std::uniform_int_distribution dist(100, 100000); + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t ibatch = 0; + for (int64_t irow = 0; irow < ps_to_generate; ibatch++) { + RETURN_NOT_OK( + AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPLYCOST)); + Decimal128* ps_supplycost = + reinterpret_cast(tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST] + .array() + ->buffers[1] + ->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + for (int64_t irun = 0; irun < next_run; irun++) + ps_supplycost[irun] = {dist(tld.rng)}; + + tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST].array()->length = next_run; + irow += next_run; + } + } + return Status::OK(); + } + + Status PS_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PARTSUPP::PS_COMMENT].kind() == Datum::NONE) { + int64_t irow = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + for (size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) { + int64_t num_rows = std::min(batch_size_, ps_to_generate - irow); + ARROW_ASSIGN_OR_RAISE(tld.partsupp[ibatch][PARTSUPP::PS_COMMENT], + g_text.GenerateComments(num_rows, 49, 198, tld.rng)); + irow += num_rows; + } + } + return Status::OK(); + } + + struct ThreadLocalData { + std::vector part; + std::vector string_indices; + int64_t part_to_generate{0}; + int64_t partkey_start{0}; + + std::vector> partsupp; + std::bitset generated_partsupp; + random::pcg32_fast rng; + }; + std::vector thread_local_data_; + + bool inited_ = false; + std::mutex part_output_queue_mutex_; + std::mutex partsupp_output_queue_mutex_; + std::queue part_output_queue_; + std::queue partsupp_output_queue_; + int64_t batch_size_{0}; 
+ float scale_factor_{0}; + int64_t part_rows_to_generate_{0}; + int64_t part_rows_generated_{0}; + std::vector part_cols_; + std::vector partsupp_cols_; + ThreadIndexer thread_indexer_; + + std::atomic part_batches_generated_ = {0}; + std::atomic partsupp_batches_generated_ = {0}; + static constexpr int64_t kPartSuppRowsPerPart = 4; +}; + +class OrdersAndLineItemGenerator { + public: + Status Init(size_t num_threads, int64_t batch_size, float scale_factor) { + if (!inited_) { + inited_ = true; + batch_size_ = batch_size; + scale_factor_ = scale_factor; + + arrow_vendored::pcg_extras::seed_seq_from seq; + thread_local_data_.resize(num_threads); + for (ThreadLocalData& tld : thread_local_data_) { + tld.items_per_order.resize(batch_size_); + tld.rng.seed(seq); + } + orders_rows_to_generate_ = static_cast(scale_factor_ * 150000 * 10); + } + return Status::OK(); + } + + int64_t orders_batches_generated() const { return orders_batches_generated_.load(); } + + int64_t lineitem_batches_generated() const { + return lineitem_batches_generated_.load(); + } + + Result> SetOrdersOutputColumns( + const std::vector& cols) { + return SetOutputColumns(cols, kOrdersTypes, kOrdersNameMap, orders_cols_); + } + + Result> SetLineItemOutputColumns( + const std::vector& cols) { + return SetOutputColumns(cols, kLineitemTypes, kLineitemNameMap, lineitem_cols_); + } + + Result> NextOrdersBatch() { + size_t thread_index = thread_indexer_(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(orders_output_queue_mutex_); + if (!orders_output_queue_.empty()) { + ExecBatch batch = std::move(orders_output_queue_.front()); + orders_output_queue_.pop(); + return std::move(batch); + } else if (orders_rows_generated_ == orders_rows_to_generate_) { + return util::nullopt; + } else { + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = + std::min(batch_size_, orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + orders_batches_generated_.fetch_add(1); + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); + } + } + tld.orders.resize(ORDERS::kNumCols); + std::fill(tld.orders.begin(), tld.orders.end(), Datum()); + RETURN_NOT_OK(GenerateRowCounts(thread_index)); + tld.first_batch_offset = 0; + tld.generated_lineitem.reset(); + + for (int col : orders_cols_) RETURN_NOT_OK(kOrdersGenerators[col](thread_index)); + for (int col : lineitem_cols_) RETURN_NOT_OK(kLineitemGenerators[col](thread_index)); + + std::vector orders_result(orders_cols_.size()); + for (size_t i = 0; i < orders_cols_.size(); i++) { + int col_idx = orders_cols_[i]; + orders_result[i] = tld.orders[col_idx]; + } + if (!lineitem_cols_.empty()) { + std::vector lineitem_results; + for (size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) { + std::vector lineitem_result(lineitem_cols_.size()); + for (size_t icol = 0; icol < lineitem_cols_.size(); icol++) { + int col_idx = lineitem_cols_[icol]; + lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); + lineitem_results.emplace_back(std::move(eb)); + } + { + std::lock_guard guard(lineitem_output_queue_mutex_); + for (ExecBatch& eb : lineitem_results) { + lineitem_output_queue_.emplace(std::move(eb)); + } + } + } + return ExecBatch::Make(std::move(orders_result)); + } + + Result> NextLineItemBatch() { + size_t thread_index = thread_indexer_(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + 
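+    // Lineitem batches are produced as a by-product of generating orders, so we
+    // first try to hand out a previously queued batch; only if the queue is empty,
+    // or the dequeued batch is partially filled, do we generate a fresh block of
+    // orders below and top it up.
+ 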
ExecBatch queued; + bool from_queue = false; + { + std::lock_guard lock(lineitem_output_queue_mutex_); + if (!lineitem_output_queue_.empty()) { + queued = std::move(lineitem_output_queue_.front()); + lineitem_output_queue_.pop(); + from_queue = true; + } + } + tld.first_batch_offset = 0; + if (from_queue) { + ARROW_DCHECK(queued.length <= batch_size_); + tld.first_batch_offset = queued.length; + if (queued.length == batch_size_) return std::move(queued); + } + { + std::lock_guard lock(orders_output_queue_mutex_); + if (orders_rows_generated_ == orders_rows_to_generate_) { + if (from_queue) return std::move(queued); + return util::nullopt; + } + + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = + std::min(batch_size_, orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + orders_batches_generated_.fetch_add(1ll); + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); + } + tld.orders.resize(ORDERS::kNumCols); + std::fill(tld.orders.begin(), tld.orders.end(), Datum()); + RETURN_NOT_OK(GenerateRowCounts(thread_index)); + tld.generated_lineitem.reset(); + if (from_queue) { + lineitem_batches_generated_.fetch_sub(1); + for (size_t i = 0; i < lineitem_cols_.size(); i++) + if (tld.lineitem[0][lineitem_cols_[i]].kind() == Datum::NONE) + tld.lineitem[0][lineitem_cols_[i]] = std::move(queued[i]); + } + + for (int col : orders_cols_) RETURN_NOT_OK(kOrdersGenerators[col](thread_index)); + for (int col : lineitem_cols_) RETURN_NOT_OK(kLineitemGenerators[col](thread_index)); + + if (!orders_cols_.empty()) { + std::vector orders_result(orders_cols_.size()); + for (size_t i = 0; i < orders_cols_.size(); i++) { + int col_idx = orders_cols_[i]; + orders_result[i] = tld.orders[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch orders_batch, + ExecBatch::Make(std::move(orders_result))); + { + std::lock_guard lock(orders_output_queue_mutex_); + orders_output_queue_.emplace(std::move(orders_batch)); + } + } + std::vector lineitem_results; + for (size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) { + std::vector lineitem_result(lineitem_cols_.size()); + for (size_t icol = 0; icol < lineitem_cols_.size(); icol++) { + int col_idx = lineitem_cols_[icol]; + lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); + lineitem_results.emplace_back(std::move(eb)); + } + lineitem_batches_generated_.fetch_add(static_cast(lineitem_results.size())); + // Return the first batch, enqueue the rest. 
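+    // (lineitem_results is never empty at this point: every generated order
+    // contributes between one and seven lineitems, so at least one batch exists.)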
+ { + std::lock_guard lock(lineitem_output_queue_mutex_); + for (size_t i = 1; i < lineitem_results.size(); i++) + lineitem_output_queue_.emplace(std::move(lineitem_results[i])); + } + return std::move(lineitem_results[0]); + } + + private: +#define FOR_EACH_ORDERS_COLUMN(F) \ + F(O_ORDERKEY) \ + F(O_CUSTKEY) \ + F(O_ORDERSTATUS) \ + F(O_TOTALPRICE) \ + F(O_ORDERDATE) \ + F(O_ORDERPRIORITY) \ + F(O_CLERK) \ + F(O_SHIPPRIORITY) \ + F(O_COMMENT) + +#define FOR_EACH_LINEITEM_COLUMN(F) \ + F(L_ORDERKEY) \ + F(L_PARTKEY) \ + F(L_SUPPKEY) \ + F(L_LINENUMBER) \ + F(L_QUANTITY) \ + F(L_EXTENDEDPRICE) \ + F(L_DISCOUNT) \ + F(L_TAX) \ + F(L_RETURNFLAG) \ + F(L_LINESTATUS) \ + F(L_SHIPDATE) \ + F(L_COMMITDATE) \ + F(L_RECEIPTDATE) \ + F(L_SHIPINSTRUCT) \ + F(L_SHIPMODE) \ + F(L_COMMENT) #define MAKE_ENUM(col) col, - struct ORDERS - { - enum - { - FOR_EACH_ORDERS_COLUMN(MAKE_ENUM) - kNumCols, - }; - }; - struct LINEITEM - { - enum - { - FOR_EACH_LINEITEM_COLUMN(MAKE_ENUM) - kNumCols, - }; - }; - -#define MAKE_STRING_MAP(col) \ - { #col, ORDERS::col }, - const std::unordered_map kOrdersNameMap = - { - FOR_EACH_ORDERS_COLUMN(MAKE_STRING_MAP) - }; + struct ORDERS { + enum { + FOR_EACH_ORDERS_COLUMN(MAKE_ENUM) kNumCols, + }; + }; + struct LINEITEM { + enum { + FOR_EACH_LINEITEM_COLUMN(MAKE_ENUM) kNumCols, + }; + }; + +#define MAKE_STRING_MAP(col) {#col, ORDERS::col}, + const std::unordered_map kOrdersNameMap = { + FOR_EACH_ORDERS_COLUMN(MAKE_STRING_MAP)}; #undef MAKE_STRING_MAP -#define MAKE_STRING_MAP(col) \ - { #col, LINEITEM::col }, - const std::unordered_map kLineitemNameMap = - { - FOR_EACH_LINEITEM_COLUMN(MAKE_STRING_MAP) - }; +#define MAKE_STRING_MAP(col) {#col, LINEITEM::col}, + const std::unordered_map kLineitemNameMap = { + FOR_EACH_LINEITEM_COLUMN(MAKE_STRING_MAP)}; #undef MAKE_STRING_MAP -#define MAKE_FN_ARRAY(col) \ - [this](size_t thread_index) { return this->col(thread_index); }, - const std::vector kOrdersGenerators = - { - FOR_EACH_ORDERS_COLUMN(MAKE_FN_ARRAY) - }; - const std::vector kLineitemGenerators = - { - FOR_EACH_LINEITEM_COLUMN(MAKE_FN_ARRAY) - }; +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + const std::vector kOrdersGenerators = { + FOR_EACH_ORDERS_COLUMN(MAKE_FN_ARRAY)}; + const std::vector kLineitemGenerators = { + FOR_EACH_LINEITEM_COLUMN(MAKE_FN_ARRAY)}; #undef MAKE_FN_ARRAY #undef FOR_EACH_LINEITEM_COLUMN #undef FOR_EACH_ORDERS_COLUMN - const std::vector> kOrdersTypes = - { - int32(), - int32(), - fixed_size_binary(1), - decimal(12, 2), - date32(), - fixed_size_binary(15), - fixed_size_binary(15), - int32(), - utf8() - }; - - const std::vector> kLineitemTypes = - { - int32(), - int32(), - int32(), - int32(), - decimal(12, 2), - decimal(12, 2), - decimal(12, 2), - decimal(12, 2), - fixed_size_binary(1), - fixed_size_binary(1), - date32(), - date32(), - date32(), - fixed_size_binary(25), - fixed_size_binary(10), - utf8(), - }; - - Status AllocateOrdersBatch(size_t thread_index, int column) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - ARROW_DCHECK(tld.orders[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[column]); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.orders_to_generate * byte_width)); - ArrayData ad(kOrdersTypes[column], tld.orders_to_generate, { nullptr, std::move(buff) }); - tld.orders[column] = std::move(ad); - return Status::OK(); - } - - Status O_ORDERKEY(size_t thread_index) - { - ThreadLocalData &tld = 
thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_ORDERKEY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERKEY)); - int32_t *o_orderkey = reinterpret_cast( - tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < tld.orders_to_generate; i++) - { - int32_t orderkey_index = static_cast(tld.orderkey_start + i); - int32_t index_of_run = orderkey_index / 8; - int32_t index_in_run = orderkey_index % 8; - o_orderkey[i] = (index_of_run * 32 + index_in_run + 1); - ARROW_DCHECK(1 <= o_orderkey[i] && o_orderkey[i] <= 4 * orders_rows_to_generate_); - } - } - return Status::OK(); - } - - Status O_CUSTKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_CUSTKEY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CUSTKEY)); - - // Spec says it must be a random number between 1 and SF*150000 that is not - // divisible by 3. Rather than repeatedly generating numbers until we get to - // a non-divisible-by-3 number, we just generate a number between - // 0 and SF * 50000 - 1, multiply by 3, and then add either 1 or 2. - int32_t sf_50k = static_cast(scale_factor_ * 50000); - std::uniform_int_distribution base_dist(0, sf_50k - 1); - std::uniform_int_distribution offset_dist(1, 2); - int32_t *o_custkey = reinterpret_cast( - tld.orders[ORDERS::O_CUSTKEY].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < tld.orders_to_generate; i++) - o_custkey[i] = 3 * base_dist(tld.rng) + offset_dist(tld.rng); - } - return Status::OK(); - } - - Status O_ORDERSTATUS(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_ORDERSTATUS].kind() == Datum::NONE) - { - RETURN_NOT_OK(L_LINESTATUS(thread_index)); - RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERSTATUS)); - - char *o_orderstatus = reinterpret_cast( - tld.orders[ORDERS::O_ORDERSTATUS].array()->buffers[1]->mutable_data()); - - size_t batch_offset = tld.first_batch_offset; - size_t ibatch = 0; - size_t iorder = 0; - int32_t iline = 0; - bool all_f = true; - bool all_o = true; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - const char *l_linestatus = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->buffers[1]->data()); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - for(int64_t irun = 0; irun < next_run;) - { - for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++, batch_offset++) - { - all_f &= l_linestatus[batch_offset] == 'F'; - all_o &= l_linestatus[batch_offset] == 'O'; - } - if(iline == tld.items_per_order[iorder]) - { - iline = 0; - ARROW_DCHECK(!(all_f && all_o)); - if(all_f) - o_orderstatus[iorder] = 'F'; - else if(all_o) - o_orderstatus[iorder] = 'O'; - else - o_orderstatus[iorder] = 'P'; - iorder++; - } - } - irow += next_run; - batch_offset = 0; - } - } - return Status::OK(); - } - - Status O_TOTALPRICE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_TOTALPRICE].kind() == Datum::NONE) - { - RETURN_NOT_OK(L_EXTENDEDPRICE(thread_index)); - RETURN_NOT_OK(L_TAX(thread_index)); - RETURN_NOT_OK(L_DISCOUNT(thread_index)); - RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_TOTALPRICE)); - - size_t batch_offset = tld.first_batch_offset; - size_t 
ibatch = 0; - size_t iorder = 0; - int32_t iline = 0; - int64_t sum = 0; - Decimal128 *o_totalprice = reinterpret_cast( - tld.orders[ORDERS::O_TOTALPRICE].array()->buffers[1]->mutable_data()); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - - const Decimal128 *l_extendedprice = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->buffers[1]->data()); - const Decimal128 *l_tax = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->data()); - const Decimal128 *l_discount = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->buffers[1]->data()); - - for(int64_t irun = 0; irun < next_run;) - { - for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++, batch_offset++) - { - int64_t eprice = static_cast(l_extendedprice[batch_offset]); - int64_t tax = static_cast(l_tax[batch_offset]); - int64_t discount = static_cast(l_discount[batch_offset]); - sum += (eprice * (100 + tax) * (100 - discount)); - } - if(iline == tld.items_per_order[iorder]) - { - sum /= 100 * 100; - o_totalprice[iorder] = { sum }; - iline = 0; - iorder++; - } - } - irow += next_run; - batch_offset = 0; - } - } - return Status::OK(); - } - - Status O_ORDERDATE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_ORDERDATE].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERDATE)); - - std::uniform_int_distribution dist(kStartDate, kEndDate - 151); - uint32_t *o_orderdate = reinterpret_cast( - tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < tld.orders_to_generate; i++) - o_orderdate[i] = dist(tld.rng); - } - return Status::OK(); - } - - Status O_ORDERPRIORITY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_ORDERPRIORITY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERPRIORITY)); - int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_ORDERPRIORITY]); - std::uniform_int_distribution dist(0, kNumPriorities - 1); - char *o_orderpriority = reinterpret_cast( - tld.orders[ORDERS::O_ORDERPRIORITY].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < tld.orders_to_generate; i++) - { - const char *str = Priorities[dist(tld.rng)]; - std::strncpy(o_orderpriority + i * byte_width, str, byte_width); - } - } - return Status::OK(); - } - - Status O_CLERK(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_CLERK].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); - int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_CLERK]); - int64_t max_clerk_id = static_cast(scale_factor_ * 1000); - std::uniform_int_distribution dist(1, max_clerk_id); - char *o_clerk = reinterpret_cast( - tld.orders[ORDERS::O_CLERK].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < tld.orders_to_generate; i++) - { - const char *clerk = "Clerk#"; - const size_t clerk_length = std::strlen(clerk); - int64_t clerk_number = dist(tld.rng); - char *output = o_clerk + i * byte_width; - std::strncpy(output, clerk, byte_width); - AppendNumberPaddedToNineDigits(output + clerk_length, clerk_number); - } - } 
- return Status::OK(); - } - - Status O_SHIPPRIORITY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_SHIPPRIORITY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_SHIPPRIORITY)); - int32_t *o_shippriority = reinterpret_cast( - tld.orders[ORDERS::O_SHIPPRIORITY].array()->buffers[1]->mutable_data()); - std::memset(o_shippriority, 0, tld.orders_to_generate * sizeof(int32_t)); - } - return Status::OK(); - } - - Status O_COMMENT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.orders[ORDERS::O_COMMENT].kind() == Datum::NONE) - { - ARROW_ASSIGN_OR_RAISE(tld.orders[ORDERS::O_COMMENT], g_text.GenerateComments(tld.orders_to_generate, 19, 78, tld.rng)); - } - return Status::OK(); - } - - Status GenerateRowCounts(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - std::uniform_int_distribution length_dist(1, 7); - tld.lineitem_to_generate = 0; - tld.items_per_order.clear(); - for(int64_t i = 0; i < tld.orders_to_generate; i++) - { - int length = length_dist(tld.rng); - tld.items_per_order.push_back(length); - tld.lineitem_to_generate += length; - } - int64_t num_batches = (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / batch_size_; - tld.lineitem.resize(num_batches); - for(std::vector &batch : tld.lineitem) - { - batch.resize(LINEITEM::kNumCols); - std::fill(batch.begin(), batch.end(), Datum()); - } - return Status::OK(); - } - - Status AllocateLineItemBufferIfNeeded(size_t thread_index, size_t ibatch, int column, size_t &out_batch_offset) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.lineitem[ibatch][column].kind() == Datum::NONE) - { - int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[column]); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(batch_size_ * byte_width)); - ArrayData ad(kLineitemTypes[column], batch_size_, { nullptr, std::move(buff) }); - tld.lineitem[ibatch][column] = std::move(ad); - out_batch_offset = 0; - } - if(ibatch == 0) - out_batch_offset = tld.first_batch_offset; - - return Status::OK(); - } - - Status L_ORDERKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_ORDERKEY]) - { - tld.generated_lineitem[LINEITEM::L_ORDERKEY] = true; - RETURN_NOT_OK(O_ORDERKEY(thread_index)); - const int32_t *o_orderkey = reinterpret_cast( - tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->data()); - - size_t ibatch = 0; - size_t iorder = 0; - int32_t iline = 0; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_ORDERKEY, batch_offset)); - int32_t *l_linenumber = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_ORDERKEY].array()->buffers[1]->mutable_data()); - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - for(int64_t irun = 0; irun < next_run;) - { - for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) - l_linenumber[batch_offset++] = o_orderkey[iorder]; - if(iline == tld.items_per_order[iorder]) - { - iline = 0; - iorder++; - } - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_ORDERKEY].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_PARTKEY(size_t 
thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_PARTKEY]) - { - tld.generated_lineitem[LINEITEM::L_PARTKEY] = true; - - size_t ibatch = 0; - int32_t max_partkey = static_cast(scale_factor_ * 200000); - std::uniform_int_distribution dist(1, max_partkey); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_PARTKEY, batch_offset)); - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - int32_t *l_partkey = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < next_run; i++, batch_offset++) - l_partkey[batch_offset] = dist(tld.rng); - - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_SUPPKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_SUPPKEY]) - { - tld.generated_lineitem[LINEITEM::L_SUPPKEY] = true; - RETURN_NOT_OK(L_PARTKEY(thread_index)); - - size_t ibatch = 0; - std::uniform_int_distribution dist(0, 3); - const int32_t S = static_cast(scale_factor_ * 10000); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset = 0; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SUPPKEY, batch_offset)); - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - int32_t *l_suppkey = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_SUPPKEY].array()->buffers[1]->mutable_data()); - const int32_t *l_partkey = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); - for(int64_t i = 0; i < next_run; i++) - { - int32_t supplier = dist(tld.rng); - int32_t partkey = l_partkey[batch_offset]; - // Fun fact: the parentheses for this expression are unbalanced in the TPC-H spec. 
- l_suppkey[batch_offset++] = (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_SUPPKEY].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_LINENUMBER(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_LINENUMBER]) - { - tld.generated_lineitem[LINEITEM::L_LINENUMBER] = true; - size_t ibatch = 0; - size_t iorder = 0; - int32_t iline = 0; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_LINENUMBER, batch_offset)); - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - int32_t *l_linenumber = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_LINENUMBER].array()->buffers[1]->mutable_data()); - for(int64_t irun = 0; irun < next_run;) - { - for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) - { - l_linenumber[batch_offset++] = (iline + 1); - ARROW_DCHECK(1 <= (iline + 1) && (iline + 1) <= 7); - } - if(iline == tld.items_per_order[iorder]) - { - iline = 0; - iorder++; - } - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_LINENUMBER].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_QUANTITY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_QUANTITY]) - { - tld.generated_lineitem[LINEITEM::L_QUANTITY] = true; - - size_t ibatch = 0; - std::uniform_int_distribution dist(1, 50); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_QUANTITY, batch_offset)); - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - Decimal128 *l_quantity = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->buffers[1]->mutable_data()); - for(int64_t i = 0; i < next_run; i++) - { - // Multiply by 100 because the type is decimal(12, 2), so the decimal goes after two digits - int64_t quantity = dist(tld.rng) * 100; - l_quantity[batch_offset++] = { quantity }; - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_EXTENDEDPRICE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE]) - { - tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE] = true; - RETURN_NOT_OK(L_PARTKEY(thread_index)); - RETURN_NOT_OK(L_QUANTITY(thread_index)); - size_t ibatch = 0; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_EXTENDEDPRICE, batch_offset)); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - const int32_t *l_partkey = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); - const Decimal128 *l_quantity = reinterpret_cast( - 
tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->buffers[1]->data()); - Decimal128 *l_extendedprice = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->buffers[1]->mutable_data()); - - for(int64_t i = 0; i < next_run; i++, batch_offset++) - { - int64_t partkey = static_cast(l_partkey[batch_offset]); - // Divide by 100 to recover the integer representation (not Decimal). - int64_t quantity = static_cast(l_quantity[batch_offset]) / 100; - - // Spec says to divide by 100, but that happens automatically due to this being stored - // to two decimal points. - int64_t retail_price = (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); - int64_t extended_price = retail_price * quantity; - l_extendedprice[batch_offset] = { extended_price }; - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_DISCOUNT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_DISCOUNT]) - { - tld.generated_lineitem[LINEITEM::L_DISCOUNT] = true; - size_t ibatch = 0; - std::uniform_int_distribution dist(0, 10); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_DISCOUNT, batch_offset)); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - Decimal128 *l_discount = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->buffers[1]->mutable_data()); - - for(int64_t i = 0; i < next_run; i++, batch_offset++) - l_discount[batch_offset] = { dist(tld.rng) }; - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_TAX(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_TAX]) - { - tld.generated_lineitem[LINEITEM::L_TAX] = true; - size_t ibatch = 0; - std::uniform_int_distribution dist(0, 8); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_TAX, batch_offset)); - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - Decimal128 *l_tax = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->mutable_data()); - - for(int64_t i = 0; i < next_run; i++, batch_offset++) - l_tax[batch_offset] = { dist(tld.rng) }; - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_TAX].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_RETURNFLAG(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_RETURNFLAG]) - { - tld.generated_lineitem[LINEITEM::L_RETURNFLAG] = true; - RETURN_NOT_OK(L_RECEIPTDATE(thread_index)); - size_t ibatch = 0; - std::uniform_int_distribution dist; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_RETURNFLAG, batch_offset)); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t 
next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - char *l_returnflag = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG].array()->buffers[1]->mutable_data()); - const uint32_t *l_receiptdate = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->buffers[1]->mutable_data()); - - for(int64_t i = 0; i < next_run; i++, batch_offset++) - { - if(l_receiptdate[batch_offset] <= kCurrentDate) - { - uint32_t r = dist(tld.rng); - l_returnflag[batch_offset] = (r % 2 == 1) ? 'R' : 'A'; - } - else - { - l_returnflag[batch_offset] = 'N'; - } - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_LINESTATUS(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_LINESTATUS]) - { - tld.generated_lineitem[LINEITEM::L_LINESTATUS] = true; - RETURN_NOT_OK(L_SHIPDATE(thread_index)); - size_t ibatch = 0; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_LINESTATUS, batch_offset)); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - char *l_linestatus = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->buffers[1]->mutable_data()); - const uint32_t *l_shipdate = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); - - for(int64_t i = 0; i < next_run; i++, batch_offset++) - { - if(l_shipdate[batch_offset] > kCurrentDate) - l_linestatus[batch_offset] = 'O'; - else - l_linestatus[batch_offset] = 'F'; - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_SHIPDATE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_SHIPDATE]) - { - tld.generated_lineitem[LINEITEM::L_SHIPDATE] = true; - RETURN_NOT_OK(O_ORDERDATE(thread_index)); - const int32_t *o_orderdate = reinterpret_cast( - tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); - std::uniform_int_distribution dist(1, 121); - size_t ibatch = 0; - size_t iorder = 0; - int32_t iline = 0; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPDATE, batch_offset)); - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - uint32_t *l_shipdate = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); - for(int64_t irun = 0; irun < next_run;) - { - for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) - l_shipdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); - if(iline == tld.items_per_order[iorder]) - { - iline = 0; - iorder++; - } - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_COMMITDATE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_COMMITDATE]) - { - 
tld.generated_lineitem[LINEITEM::L_COMMITDATE] = true; - const int32_t *o_orderdate = reinterpret_cast( - tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); - std::uniform_int_distribution dist(30, 90); - size_t ibatch = 0; - size_t iorder = 0; - int32_t iline = 0; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_COMMITDATE, batch_offset)); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - uint32_t *l_commitdate = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_COMMITDATE].array()->buffers[1]->mutable_data()); - for(int64_t irun = 0; irun < next_run;) - { - for(; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) - l_commitdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); - if(iline == tld.items_per_order[iorder]) - { - iline = 0; - iorder++; - } - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_COMMITDATE].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_RECEIPTDATE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_RECEIPTDATE]) - { - tld.generated_lineitem[LINEITEM::L_RECEIPTDATE] = true; - RETURN_NOT_OK(L_SHIPDATE(thread_index)); - size_t ibatch = 0; - std::uniform_int_distribution dist(1, 30); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_RECEIPTDATE, batch_offset)); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - uint32_t *l_receiptdate = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->buffers[1]->mutable_data()); - const uint32_t *l_shipdate = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->buffers[1]->mutable_data()); - - for(int64_t i = 0; i < next_run; i++, batch_offset++) - l_receiptdate[batch_offset] = l_shipdate[batch_offset] + dist(tld.rng); - - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_SHIPINSTRUCT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT]) - { - tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT] = true; - int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPINSTRUCT]); - size_t ibatch = 0; - std::uniform_int_distribution dist(0, kNumInstructions - 1); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPINSTRUCT, batch_offset)); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - char *l_shipinstruct = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT].array()->buffers[1]->mutable_data()); - - for(int64_t i = 0; i < next_run; i++, batch_offset++) - { - const char *str = Instructions[dist(tld.rng)]; - // Note that we don't have to memset the buffer to 0 because strncpy pads each string - // with 0's 
anyway - std::strncpy(l_shipinstruct + batch_offset * byte_width, str, byte_width); - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_SHIPMODE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_SHIPMODE]) - { - tld.generated_lineitem[LINEITEM::L_SHIPMODE] = true; - int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPMODE]); - size_t ibatch = 0; - std::uniform_int_distribution dist(0, kNumModes - 1); - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - size_t batch_offset; - RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, LINEITEM::L_SHIPMODE, batch_offset)); - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - char *l_shipmode = reinterpret_cast( - tld.lineitem[ibatch][LINEITEM::L_SHIPMODE].array()->buffers[1]->mutable_data()); - - for(int64_t i = 0; i < next_run; i++, batch_offset++) - { - const char *str = Modes[dist(tld.rng)]; - std::strncpy(l_shipmode + batch_offset * byte_width, str, byte_width); - } - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_SHIPMODE].array()->length = static_cast(batch_offset); - } - } - return Status::OK(); - } - - Status L_COMMENT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(!tld.generated_lineitem[LINEITEM::L_COMMENT]) - { - tld.generated_lineitem[LINEITEM::L_COMMENT] = true; - - size_t batch_offset = tld.first_batch_offset; - size_t ibatch = 0; - for(int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) - { - // Comments are kind of sneaky: we always generate the full batch and then just bump the length - if(tld.lineitem[ibatch][LINEITEM::L_COMMENT].kind() == Datum::NONE) - { - ARROW_ASSIGN_OR_RAISE(tld.lineitem[ibatch][LINEITEM::L_COMMENT], g_text.GenerateComments(batch_size_, 10, 43, tld.rng)); - batch_offset = 0; - } - - int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); - int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); - - batch_offset += next_run; - irow += next_run; - tld.lineitem[ibatch][LINEITEM::L_COMMENT].array()->length = batch_offset; - } - } - return Status::OK(); - } - - struct ThreadLocalData - { - std::vector orders; - int64_t orders_to_generate; - int64_t orderkey_start; - - std::vector> lineitem; - std::vector items_per_order; - int64_t lineitem_to_generate; - int64_t first_batch_offset; - std::bitset generated_lineitem; - random::pcg32_fast rng; - }; - std::vector thread_local_data_; - - bool inited_ = false; - std::mutex orders_output_queue_mutex_; - std::mutex lineitem_output_queue_mutex_; - std::queue orders_output_queue_; - std::queue lineitem_output_queue_; - int64_t batch_size_; - float scale_factor_; - int64_t orders_rows_to_generate_; - int64_t orders_rows_generated_; - std::vector orders_cols_; - std::vector lineitem_cols_; - ThreadIndexer thread_indexer_; - - std::atomic orders_batches_generated_ = { 0 }; - std::atomic lineitem_batches_generated_ = { 0 }; - }; - - class SupplierGenerator : public TpchTableGenerator - { - public: - Status Init( - std::vector columns, - float scale_factor, - int64_t batch_size) override - { - scale_factor_ = scale_factor; - batch_size_ = batch_size; - rows_to_generate_ = static_cast(scale_factor_ * 10000); - 
rows_generated_.store(0); - ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( - columns, - kTypes, - kNameMap, - gen_list_)); - - random::pcg32_fast rng; - std::uniform_int_distribution dist(0, rows_to_generate_ - 1); - size_t num_special_rows = static_cast(5 * scale_factor_); - std::unordered_set good_rows_set; - while(good_rows_set.size() < num_special_rows) - { - int64_t row = dist(rng); - good_rows_set.insert(row); - } - std::unordered_set bad_rows_set; - while(bad_rows_set.size() < num_special_rows) - { - int64_t bad_row; - do - { - bad_row = dist(rng); - } while(good_rows_set.find(bad_row) != good_rows_set.end()); - bad_rows_set.insert(bad_row); - } - good_rows_.clear(); - bad_rows_.clear(); - good_rows_.insert(good_rows_.end(), good_rows_set.begin(), good_rows_set.end()); - bad_rows_.insert(bad_rows_.end(), bad_rows_set.begin(), bad_rows_set.end()); - std::sort(good_rows_.begin(), good_rows_.end()); - std::sort(bad_rows_.begin(), bad_rows_.end()); - return Status::OK(); - } - - Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback schedule_callback) override - { - arrow_vendored::pcg_extras::seed_seq_from seq; - thread_local_data_.resize(num_threads); - for(ThreadLocalData &tld : thread_local_data_) - tld.rng.seed(seq); - - output_callback_ = std::move(output_callback); - finished_callback_ = std::move(finished_callback); - schedule_callback_ = std::move(schedule_callback); - for(size_t i = 0; i < num_threads; i++) - RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); - return Status::OK(); - } - - std::shared_ptr schema() const override - { - return schema_; - } - - private: -#define FOR_EACH_COLUMN(F) \ - F(S_SUPPKEY) \ - F(S_NAME) \ - F(S_ADDRESS) \ - F(S_NATIONKEY) \ - F(S_PHONE) \ - F(S_ACCTBAL) \ - F(S_COMMENT) + const std::vector> kOrdersTypes = {int32(), + int32(), + fixed_size_binary(1), + decimal(12, 2), + date32(), + fixed_size_binary(15), + fixed_size_binary(15), + int32(), + utf8()}; + + const std::vector> kLineitemTypes = { + int32(), + int32(), + int32(), + int32(), + decimal(12, 2), + decimal(12, 2), + decimal(12, 2), + decimal(12, 2), + fixed_size_binary(1), + fixed_size_binary(1), + date32(), + date32(), + date32(), + fixed_size_binary(25), + fixed_size_binary(10), + utf8(), + }; + + Status AllocateOrdersBatch(size_t thread_index, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.orders[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(tld.orders_to_generate * byte_width)); + ArrayData ad(kOrdersTypes[column], tld.orders_to_generate, + {nullptr, std::move(buff)}); + tld.orders[column] = std::move(ad); + return Status::OK(); + } + + Status O_ORDERKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_ORDERKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERKEY)); + int32_t* o_orderkey = reinterpret_cast( + tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) { + int32_t orderkey_index = static_cast(tld.orderkey_start + i); + int32_t index_of_run = orderkey_index / 8; + int32_t index_in_run = orderkey_index % 8; + o_orderkey[i] = (index_of_run * 32 + index_in_run + 1); + ARROW_DCHECK(1 <= 
o_orderkey[i] && o_orderkey[i] <= 4 * orders_rows_to_generate_); + } + } + return Status::OK(); + } + + Status O_CUSTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_CUSTKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CUSTKEY)); + + // Spec says it must be a random number between 1 and SF*150000 that is not + // divisible by 3. Rather than repeatedly generating numbers until we get to + // a non-divisible-by-3 number, we just generate a number between + // 0 and SF * 50000 - 1, multiply by 3, and then add either 1 or 2. + int32_t sf_50k = static_cast(scale_factor_ * 50000); + std::uniform_int_distribution base_dist(0, sf_50k - 1); + std::uniform_int_distribution offset_dist(1, 2); + int32_t* o_custkey = reinterpret_cast( + tld.orders[ORDERS::O_CUSTKEY].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) + o_custkey[i] = 3 * base_dist(tld.rng) + offset_dist(tld.rng); + } + return Status::OK(); + } + + Status O_ORDERSTATUS(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_ORDERSTATUS].kind() == Datum::NONE) { + RETURN_NOT_OK(L_LINESTATUS(thread_index)); + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERSTATUS)); + + char* o_orderstatus = reinterpret_cast( + tld.orders[ORDERS::O_ORDERSTATUS].array()->buffers[1]->mutable_data()); + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + bool all_f = true; + bool all_o = true; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + const char* l_linestatus = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->buffers[1]->data()); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; + iline++, irun++, batch_offset++) { + all_f &= l_linestatus[batch_offset] == 'F'; + all_o &= l_linestatus[batch_offset] == 'O'; + } + if (iline == tld.items_per_order[iorder]) { + iline = 0; + ARROW_DCHECK(!(all_f && all_o)); + if (all_f) + o_orderstatus[iorder] = 'F'; + else if (all_o) + o_orderstatus[iorder] = 'O'; + else + o_orderstatus[iorder] = 'P'; + iorder++; + } + } + irow += next_run; + batch_offset = 0; + } + } + return Status::OK(); + } + + Status O_TOTALPRICE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_TOTALPRICE].kind() == Datum::NONE) { + RETURN_NOT_OK(L_EXTENDEDPRICE(thread_index)); + RETURN_NOT_OK(L_TAX(thread_index)); + RETURN_NOT_OK(L_DISCOUNT(thread_index)); + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_TOTALPRICE)); + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + int64_t sum = 0; + Decimal128* o_totalprice = reinterpret_cast( + tld.orders[ORDERS::O_TOTALPRICE].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + + const Decimal128* l_extendedprice = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->buffers[1]->data()); + const Decimal128* l_tax = 
reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->data()); + const Decimal128* l_discount = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->buffers[1]->data()); + + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; + iline++, irun++, batch_offset++) { + int64_t eprice = static_cast(l_extendedprice[batch_offset]); + int64_t tax = static_cast(l_tax[batch_offset]); + int64_t discount = static_cast(l_discount[batch_offset]); + sum += (eprice * (100 + tax) * (100 - discount)); + } + if (iline == tld.items_per_order[iorder]) { + sum /= 100 * 100; + o_totalprice[iorder] = {sum}; + iline = 0; + iorder++; + } + } + irow += next_run; + batch_offset = 0; + } + } + return Status::OK(); + } + + Status O_ORDERDATE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_ORDERDATE].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERDATE)); + + std::uniform_int_distribution dist(kStartDate, kEndDate - 151); + uint32_t* o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) o_orderdate[i] = dist(tld.rng); + } + return Status::OK(); + } + + Status O_ORDERPRIORITY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_ORDERPRIORITY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERPRIORITY)); + int32_t byte_width = + arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_ORDERPRIORITY]); + std::uniform_int_distribution dist(0, kNumPriorities - 1); + char* o_orderpriority = reinterpret_cast( + tld.orders[ORDERS::O_ORDERPRIORITY].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) { + const char* str = Priorities[dist(tld.rng)]; + std::strncpy(o_orderpriority + i * byte_width, str, byte_width); + } + } + return Status::OK(); + } + + Status O_CLERK(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_CLERK].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); + int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_CLERK]); + int64_t max_clerk_id = static_cast(scale_factor_ * 1000); + std::uniform_int_distribution dist(1, max_clerk_id); + char* o_clerk = reinterpret_cast( + tld.orders[ORDERS::O_CLERK].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) { + const char* clerk = "Clerk#"; + const size_t clerk_length = std::strlen(clerk); + int64_t clerk_number = dist(tld.rng); + char* output = o_clerk + i * byte_width; + std::strncpy(output, clerk, byte_width); + AppendNumberPaddedToNineDigits(output + clerk_length, clerk_number); + } + } + return Status::OK(); + } + + Status O_SHIPPRIORITY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_SHIPPRIORITY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_SHIPPRIORITY)); + int32_t* o_shippriority = reinterpret_cast( + tld.orders[ORDERS::O_SHIPPRIORITY].array()->buffers[1]->mutable_data()); + std::memset(o_shippriority, 0, tld.orders_to_generate * sizeof(int32_t)); + } + return Status::OK(); + } + + Status O_COMMENT(size_t thread_index) { + ThreadLocalData& tld = 
thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE( + tld.orders[ORDERS::O_COMMENT], + g_text.GenerateComments(tld.orders_to_generate, 19, 78, tld.rng)); + } + return Status::OK(); + } + + Status GenerateRowCounts(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + std::uniform_int_distribution length_dist(1, 7); + tld.lineitem_to_generate = 0; + tld.items_per_order.clear(); + for (int64_t i = 0; i < tld.orders_to_generate; i++) { + int length = length_dist(tld.rng); + tld.items_per_order.push_back(length); + tld.lineitem_to_generate += length; + } + int64_t num_batches = + (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / + batch_size_; + tld.lineitem.resize(num_batches); + for (std::vector& batch : tld.lineitem) { + batch.resize(LINEITEM::kNumCols); + std::fill(batch.begin(), batch.end(), Datum()); + } + return Status::OK(); + } + + Status AllocateLineItemBufferIfNeeded(size_t thread_index, size_t ibatch, int column, + size_t& out_batch_offset) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.lineitem[ibatch][column].kind() == Datum::NONE) { + int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(batch_size_ * byte_width)); + ArrayData ad(kLineitemTypes[column], batch_size_, {nullptr, std::move(buff)}); + tld.lineitem[ibatch][column] = std::move(ad); + out_batch_offset = 0; + } + if (ibatch == 0) out_batch_offset = tld.first_batch_offset; + + return Status::OK(); + } + + Status L_ORDERKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_ORDERKEY]) { + tld.generated_lineitem[LINEITEM::L_ORDERKEY] = true; + RETURN_NOT_OK(O_ORDERKEY(thread_index)); + const int32_t* o_orderkey = reinterpret_cast( + tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_ORDERKEY, batch_offset)); + int32_t* l_linenumber = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_ORDERKEY] + .array() + ->buffers[1] + ->mutable_data()); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_linenumber[batch_offset++] = o_orderkey[iorder]; + if (iline == tld.items_per_order[iorder]) { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_ORDERKEY].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_PARTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_PARTKEY]) { + tld.generated_lineitem[LINEITEM::L_PARTKEY] = true; + + size_t ibatch = 0; + int32_t max_partkey = static_cast(scale_factor_ * 200000); + std::uniform_int_distribution dist(1, max_partkey); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_PARTKEY, batch_offset)); + int64_t remaining_in_batch = 
static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t* l_partkey = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_PARTKEY] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t i = 0; i < next_run; i++, batch_offset++) + l_partkey[batch_offset] = dist(tld.rng); + + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SUPPKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_SUPPKEY]) { + tld.generated_lineitem[LINEITEM::L_SUPPKEY] = true; + RETURN_NOT_OK(L_PARTKEY(thread_index)); + + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 3); + const int32_t S = static_cast(scale_factor_ * 10000); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset = 0; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_SUPPKEY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t* l_suppkey = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SUPPKEY] + .array() + ->buffers[1] + ->mutable_data()); + const int32_t* l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); + for (int64_t i = 0; i < next_run; i++) { + int32_t supplier = dist(tld.rng); + int32_t partkey = l_partkey[batch_offset]; + // Fun fact: the parentheses for this expression are unbalanced in the TPC-H + // spec. + l_suppkey[batch_offset++] = + (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SUPPKEY].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_LINENUMBER(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_LINENUMBER]) { + tld.generated_lineitem[LINEITEM::L_LINENUMBER] = true; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_LINENUMBER, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t* l_linenumber = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_LINENUMBER] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; + iline++, irun++) { + l_linenumber[batch_offset++] = (iline + 1); + ARROW_DCHECK(1 <= (iline + 1) && (iline + 1) <= 7); + } + if (iline == tld.items_per_order[iorder]) { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_LINENUMBER].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_QUANTITY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_QUANTITY]) { + tld.generated_lineitem[LINEITEM::L_QUANTITY] = true; + + size_t ibatch = 0; + std::uniform_int_distribution dist(1, 50); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + 
size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_QUANTITY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128* l_quantity = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_QUANTITY] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t i = 0; i < next_run; i++) { + // Multiply by 100 because the type is decimal(12, 2), so the decimal goes after + // two digits + int64_t quantity = dist(tld.rng) * 100; + l_quantity[batch_offset++] = {quantity}; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_EXTENDEDPRICE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE]) { + tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE] = true; + RETURN_NOT_OK(L_PARTKEY(thread_index)); + RETURN_NOT_OK(L_QUANTITY(thread_index)); + size_t ibatch = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_EXTENDEDPRICE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + const int32_t* l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); + const Decimal128* l_quantity = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->buffers[1]->data()); + Decimal128* l_extendedprice = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + int64_t partkey = static_cast(l_partkey[batch_offset]); + // Divide by 100 to recover the integer representation (not Decimal). + int64_t quantity = static_cast(l_quantity[batch_offset]) / 100; + + // Spec says to divide by 100, but that happens automatically due to this being + // stored to two decimal points. 
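+ // Worked example of the fixed-point arithmetic below (illustration only): for
+ // partkey = 1234, retail_price = 90000 + ((1234 / 10) % 20001) + 100 * (1234 % 1000)
+ //              = 90000 + 123 + 23400 = 113523, i.e. 1135.23 as decimal(12, 2).
+ // Multiplying by the unscaled integer quantity keeps exactly two decimal places.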
+ int64_t retail_price = + (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); + int64_t extended_price = retail_price * quantity; + l_extendedprice[batch_offset] = {extended_price}; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_DISCOUNT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_DISCOUNT]) { + tld.generated_lineitem[LINEITEM::L_DISCOUNT] = true; + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 10); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_DISCOUNT, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128* l_discount = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_DISCOUNT] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) + l_discount[batch_offset] = {dist(tld.rng)}; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_TAX(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_TAX]) { + tld.generated_lineitem[LINEITEM::L_TAX] = true; + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 8); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_TAX, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128* l_tax = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) + l_tax[batch_offset] = {dist(tld.rng)}; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_RETURNFLAG(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_RETURNFLAG]) { + tld.generated_lineitem[LINEITEM::L_RETURNFLAG] = true; + RETURN_NOT_OK(L_RECEIPTDATE(thread_index)); + size_t ibatch = 0; + std::uniform_int_distribution dist; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_RETURNFLAG, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char* l_returnflag = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG] + .array() + ->buffers[1] + ->mutable_data()); + const uint32_t* l_receiptdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + if (l_receiptdate[batch_offset] <= kCurrentDate) { + uint32_t r = dist(tld.rng); + l_returnflag[batch_offset] = (r % 2 == 1) ? 
'R' : 'A'; + } else { + l_returnflag[batch_offset] = 'N'; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_LINESTATUS(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_LINESTATUS]) { + tld.generated_lineitem[LINEITEM::L_LINESTATUS] = true; + RETURN_NOT_OK(L_SHIPDATE(thread_index)); + size_t ibatch = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_LINESTATUS, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char* l_linestatus = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_LINESTATUS] + .array() + ->buffers[1] + ->mutable_data()); + const uint32_t* l_shipdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPDATE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + if (l_shipdate[batch_offset] > kCurrentDate) + l_linestatus[batch_offset] = 'O'; + else + l_linestatus[batch_offset] = 'F'; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPDATE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_SHIPDATE]) { + tld.generated_lineitem[LINEITEM::L_SHIPDATE] = true; + RETURN_NOT_OK(O_ORDERDATE(thread_index)); + const int32_t* o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); + std::uniform_int_distribution dist(1, 121); + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_SHIPDATE, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t* l_shipdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPDATE] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_shipdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); + if (iline == tld.items_per_order[iorder]) { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_COMMITDATE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_COMMITDATE]) { + tld.generated_lineitem[LINEITEM::L_COMMITDATE] = true; + const int32_t* o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); + std::uniform_int_distribution dist(30, 90); + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_COMMITDATE, batch_offset)); + + int64_t remaining_in_batch = 
static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t* l_commitdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_COMMITDATE] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_commitdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); + if (iline == tld.items_per_order[iorder]) { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_COMMITDATE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_RECEIPTDATE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_RECEIPTDATE]) { + tld.generated_lineitem[LINEITEM::L_RECEIPTDATE] = true; + RETURN_NOT_OK(L_SHIPDATE(thread_index)); + size_t ibatch = 0; + std::uniform_int_distribution dist(1, 30); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_RECEIPTDATE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t* l_receiptdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE] + .array() + ->buffers[1] + ->mutable_data()); + const uint32_t* l_shipdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPDATE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) + l_receiptdate[batch_offset] = l_shipdate[batch_offset] + dist(tld.rng); + + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPINSTRUCT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT]) { + tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT] = true; + int32_t byte_width = + arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPINSTRUCT]); + size_t ibatch = 0; + std::uniform_int_distribution dist(0, kNumInstructions - 1); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_SHIPINSTRUCT, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char* l_shipinstruct = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + const char* str = Instructions[dist(tld.rng)]; + // Note that we don't have to memset the buffer to 0 because strncpy pads each + // string with 0's anyway + std::strncpy(l_shipinstruct + batch_offset * byte_width, str, byte_width); + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPMODE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_SHIPMODE]) { + tld.generated_lineitem[LINEITEM::L_SHIPMODE] = true; + int32_t byte_width = + 
arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPMODE]); + size_t ibatch = 0; + std::uniform_int_distribution dist(0, kNumModes - 1); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_SHIPMODE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char* l_shipmode = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPMODE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + const char* str = Modes[dist(tld.rng)]; + std::strncpy(l_shipmode + batch_offset * byte_width, str, byte_width); + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPMODE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_COMMENT]) { + tld.generated_lineitem[LINEITEM::L_COMMENT] = true; + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + // Comments are kind of sneaky: we always generate the full batch and then just + // bump the length + if (tld.lineitem[ibatch][LINEITEM::L_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.lineitem[ibatch][LINEITEM::L_COMMENT], + g_text.GenerateComments(batch_size_, 10, 43, tld.rng)); + batch_offset = 0; + } + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + + batch_offset += next_run; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_COMMENT].array()->length = batch_offset; + } + } + return Status::OK(); + } + + struct ThreadLocalData { + std::vector orders; + int64_t orders_to_generate; + int64_t orderkey_start; + + std::vector> lineitem; + std::vector items_per_order; + int64_t lineitem_to_generate; + int64_t first_batch_offset; + std::bitset generated_lineitem; + random::pcg32_fast rng; + }; + std::vector thread_local_data_; + + bool inited_ = false; + std::mutex orders_output_queue_mutex_; + std::mutex lineitem_output_queue_mutex_; + std::queue orders_output_queue_; + std::queue lineitem_output_queue_; + int64_t batch_size_; + float scale_factor_; + int64_t orders_rows_to_generate_; + int64_t orders_rows_generated_; + std::vector orders_cols_; + std::vector lineitem_cols_; + ThreadIndexer thread_indexer_; + + std::atomic orders_batches_generated_ = {0}; + std::atomic lineitem_batches_generated_ = {0}; +}; + +class SupplierGenerator : public TpchTableGenerator { + public: + Status Init(std::vector columns, float scale_factor, + int64_t batch_size) override { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + rows_to_generate_ = static_cast(scale_factor_ * 10000); + rows_generated_.store(0); + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns(columns, kTypes, kNameMap, gen_list_)); + + random::pcg32_fast rng; + std::uniform_int_distribution dist(0, rows_to_generate_ - 1); + size_t num_special_rows = static_cast(5 * scale_factor_); + std::unordered_set good_rows_set; + while (good_rows_set.size() < num_special_rows) { + int64_t row = dist(rng); + good_rows_set.insert(row); + } + std::unordered_set bad_rows_set; + while (bad_rows_set.size() < num_special_rows) 
{ + int64_t bad_row; + do { + bad_row = dist(rng); + } while (good_rows_set.find(bad_row) != good_rows_set.end()); + bad_rows_set.insert(bad_row); + } + good_rows_.clear(); + bad_rows_.clear(); + good_rows_.insert(good_rows_.end(), good_rows_set.begin(), good_rows_set.end()); + bad_rows_.insert(bad_rows_.end(), bad_rows_set.begin(), bad_rows_set.end()); + std::sort(good_rows_.begin(), good_rows_.end()); + std::sort(bad_rows_.begin(), bad_rows_.end()); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + arrow_vendored::pcg_extras::seed_seq_from seq; + thread_local_data_.resize(num_threads); + for (ThreadLocalData& tld : thread_local_data_) tld.rng.seed(seq); + + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: +#define FOR_EACH_COLUMN(F) \ + F(S_SUPPKEY) \ + F(S_NAME) \ + F(S_ADDRESS) \ + F(S_NATIONKEY) \ + F(S_PHONE) \ + F(S_ACCTBAL) \ + F(S_COMMENT) #define MAKE_ENUM(col) col, - struct SUPPLIER - { - enum - { - FOR_EACH_COLUMN(MAKE_ENUM) - kNumCols, - }; - }; + struct SUPPLIER { + enum { + FOR_EACH_COLUMN(MAKE_ENUM) kNumCols, + }; + }; #undef MAKE_ENUM -#define MAKE_STRING_MAP(col) \ - { #col, SUPPLIER::col }, - const std::unordered_map kNameMap = - { - FOR_EACH_COLUMN(MAKE_STRING_MAP) - }; +#define MAKE_STRING_MAP(col) {#col, SUPPLIER::col}, + const std::unordered_map kNameMap = { + FOR_EACH_COLUMN(MAKE_STRING_MAP)}; #undef MAKE_STRING_MAP -#define MAKE_FN_ARRAY(col) \ - [this](size_t thread_index) { return this->col(thread_index); }, - std::vector kGenerators = - { - FOR_EACH_COLUMN(MAKE_FN_ARRAY) - }; +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector kGenerators = {FOR_EACH_COLUMN(MAKE_FN_ARRAY)}; #undef MAKE_FN_ARRAY #undef FOR_EACH_COLUMN - std::vector> kTypes = - { - int32(), - fixed_size_binary(25), - utf8(), - int32(), - fixed_size_binary(15), - decimal(12, 2), - utf8(), - }; - - Status ProduceCallback(size_t thread_index) - { - if(done_.load()) - return Status::OK(); - ThreadLocalData &tld = thread_local_data_[thread_index]; - tld.suppkey_start = rows_generated_.fetch_add(batch_size_); - if(tld.suppkey_start >= rows_to_generate_) - return Status::OK(); - - tld.to_generate = std::min(batch_size_, - rows_to_generate_ - tld.suppkey_start); - - tld.batch.resize(SUPPLIER::kNumCols); - std::fill(tld.batch.begin(), tld.batch.end(), Datum()); - for(int col : gen_list_) - RETURN_NOT_OK(kGenerators[col](thread_index)); - - std::vector result(gen_list_.size()); - for(size_t i = 0; i < gen_list_.size(); i++) - { - int col_idx = gen_list_[i]; - result[i] = tld.batch[col_idx]; - } - ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); - int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; - int64_t batches_outputted_before_this_one = batches_outputted_.fetch_add(1); - bool is_last_batch = batches_outputted_before_this_one == (batches_to_generate - 1); - output_callback_(std::move(eb)); - if(is_last_batch) - { - bool expected = false; - if(done_.compare_exchange_strong(expected, 
true)) - finished_callback_(batches_outputted_.load()); - return Status::OK(); - } - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); - } - - Status AllocateColumn(size_t thread_index, int column) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); - ArrayData ad(kTypes[column], tld.to_generate, { nullptr, std::move(buff) }); - tld.batch[column] = std::move(ad); - return Status::OK(); - } - - Status S_SUPPKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[SUPPLIER::S_SUPPKEY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_SUPPKEY)); - int32_t *s_suppkey = reinterpret_cast( - tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->mutable_data()); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - { - s_suppkey[irow] = static_cast(tld.suppkey_start + irow + 1); - } - } - return Status::OK(); - } - - Status S_NAME(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[SUPPLIER::S_NAME].kind() == Datum::NONE) - { - RETURN_NOT_OK(S_SUPPKEY(thread_index)); - const int32_t *s_suppkey = reinterpret_cast( - tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->data()); - RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NAME)); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_NAME]); - char *s_name = reinterpret_cast( - tld.batch[SUPPLIER::S_NAME].array()->buffers[1]->mutable_data()); - // Look man, I'm just following the spec ok? 
Section 4.2.3 as of March 1 2022 - const char *supplier = "Supplie#r"; - const size_t supplier_length = std::strlen(supplier); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - { - char *out = s_name + byte_width * irow; - std::strncpy(out, supplier, byte_width); - AppendNumberPaddedToNineDigits(out + supplier_length, s_suppkey[irow]); - } - } - return Status::OK(); - } - - Status S_ADDRESS(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[SUPPLIER::S_ADDRESS].kind() == Datum::NONE) - { - ARROW_ASSIGN_OR_RAISE( - tld.batch[SUPPLIER::S_ADDRESS], - RandomVString(tld.rng, tld.to_generate, 10, 40)); - } - return Status::OK(); - } - - Status S_NATIONKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[SUPPLIER::S_NATIONKEY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NATIONKEY)); - std::uniform_int_distribution dist(0, 24); - int32_t *s_nationkey = reinterpret_cast( - tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->mutable_data()); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - s_nationkey[irow] = dist(tld.rng); - } - return Status::OK(); - } - - Status S_PHONE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[SUPPLIER::S_PHONE].kind() == Datum::NONE) - { - RETURN_NOT_OK(S_NATIONKEY(thread_index)); - RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_PHONE)); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_PHONE]); - const int32_t *s_nationkey = reinterpret_cast( - tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->data()); - char *s_phone = reinterpret_cast( - tld.batch[SUPPLIER::S_PHONE].array()->buffers[1]->mutable_data()); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - { - GeneratePhoneNumber( - s_phone + irow * byte_width, - tld.rng, - s_nationkey[irow]); - } - } - return Status::OK(); - } - - Status S_ACCTBAL(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[SUPPLIER::S_ACCTBAL].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_ACCTBAL)); - Decimal128 *s_acctbal = reinterpret_cast( - tld.batch[SUPPLIER::S_ACCTBAL].array()->buffers[1]->mutable_data()); - std::uniform_int_distribution dist(-99999, 999999); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - s_acctbal[irow] = { dist(tld.rng) }; - } - return Status::OK(); - } - - Status S_COMMENT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[SUPPLIER::S_COMMENT].kind() == Datum::NONE) - { - ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], g_text.GenerateComments(tld.to_generate, 25, 100, tld.rng)); - ModifyComments(thread_index, "Recommends", good_rows_); - ModifyComments(thread_index, "Complaints", bad_rows_); - } - return Status::OK(); - } - - void ModifyComments( - size_t thread_index, - const char *review, - const std::vector &indices) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - const int32_t *offsets = reinterpret_cast( - tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->data()); - char *str = reinterpret_cast( - tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->mutable_data()); - const char *customer = "Customer"; - const int32_t customer_length = static_cast(std::strlen(customer)); - const int32_t review_length = static_cast(std::strlen(review)); - - auto it = std::lower_bound(indices.begin(), indices.end(), 
tld.suppkey_start); - for(; it != indices.end() && *it < tld.suppkey_start + tld.to_generate; it++) - { - int64_t idx_in_batch = *it - tld.suppkey_start; - char *out = str + offsets[idx_in_batch]; - int32_t str_length = offsets[idx_in_batch + 1] - offsets[idx_in_batch]; - std::uniform_int_distribution gap_dist(0, str_length - customer_length - review_length); - int32_t gap = gap_dist(tld.rng); - int32_t total_length = customer_length + gap + review_length; - std::uniform_int_distribution start_dist(0, str_length - total_length); - int32_t start = start_dist(tld.rng); - std::memcpy(out + start, customer, customer_length); - std::memcpy(out + start + customer_length + gap, review, review_length); - } - } - - struct ThreadLocalData - { - random::pcg32_fast rng; - int64_t suppkey_start; - int64_t to_generate; - std::vector batch; - }; - std::vector thread_local_data_; - std::vector good_rows_; - std::vector bad_rows_; - - OutputBatchCallback output_callback_; - FinishedCallback finished_callback_; - ScheduleCallback schedule_callback_; - int64_t rows_to_generate_; - std::atomic rows_generated_; - float scale_factor_; - int64_t batch_size_; - std::vector gen_list_; - std::shared_ptr schema_; - }; - - class PartGenerator : public TpchTableGenerator - { - public: - PartGenerator(std::shared_ptr gen) - : gen_(std::move(gen)) - { - } - - Status Init( - std::vector columns, - float scale_factor, - int64_t batch_size) override - { - scale_factor_ = scale_factor; - batch_size_ = batch_size; - ARROW_ASSIGN_OR_RAISE(schema_, - gen_->SetPartOutputColumns(columns)); - return Status::OK(); - } - - Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback schedule_callback) override - { - RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); - output_callback_ = std::move(output_callback); - finished_callback_ = std::move(finished_callback); - schedule_callback_ = std::move(schedule_callback); - - for(size_t i = 0; i < num_threads; i++) - RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); - return Status::OK(); - } - - std::shared_ptr schema() const override - { - return schema_; - } - - private: - Status ProduceCallback(size_t) - { - if(done_.load()) - return Status::OK(); - ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextPartBatch()); - if(!maybe_batch.has_value()) - { - int64_t batches_generated = gen_->part_batches_generated(); - if(batches_generated == batches_outputted_.load()) - { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) - finished_callback_(batches_outputted_.load()); - } - return Status::OK(); - } - ExecBatch batch = std::move(*maybe_batch); - output_callback_(std::move(batch)); - batches_outputted_++; - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); - } - - OutputBatchCallback output_callback_; - FinishedCallback finished_callback_; - ScheduleCallback schedule_callback_; - int64_t batch_size_; - float scale_factor_; - std::shared_ptr gen_; - std::shared_ptr schema_; - }; - - class PartSuppGenerator : public TpchTableGenerator - { - public: - PartSuppGenerator(std::shared_ptr gen) - : gen_(std::move(gen)) - { - } - - Status Init( - std::vector columns, - float scale_factor, - int64_t batch_size) override - { - scale_factor_ = scale_factor; - batch_size_ = batch_size; - ARROW_ASSIGN_OR_RAISE(schema_, - 
gen_->SetPartSuppOutputColumns(columns)); - return Status::OK(); - } - - Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback schedule_callback) override - { - RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); - output_callback_ = std::move(output_callback); - finished_callback_ = std::move(finished_callback); - schedule_callback_ = std::move(schedule_callback); - - for(size_t i = 0; i < num_threads; i++) - RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); - return Status::OK(); - } - - std::shared_ptr schema() const override - { - return schema_; - } - - private: - Status ProduceCallback(size_t) - { - if(done_.load()) - return Status::OK(); - ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextPartSuppBatch()); - if(!maybe_batch.has_value()) - { - int64_t batches_generated = gen_->partsupp_batches_generated(); - if(batches_generated == batches_outputted_.load()) - { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) - finished_callback_(batches_outputted_.load()); - } - return Status::OK(); - } - ExecBatch batch = std::move(*maybe_batch); - output_callback_(std::move(batch)); - batches_outputted_++; - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); - } - - OutputBatchCallback output_callback_; - FinishedCallback finished_callback_; - ScheduleCallback schedule_callback_; - int64_t batch_size_; - float scale_factor_; - std::shared_ptr gen_; - std::shared_ptr schema_; - }; - - class CustomerGenerator : public TpchTableGenerator - { - public: - Status Init( - std::vector columns, - float scale_factor, - int64_t batch_size) override - { - scale_factor_ = scale_factor; - batch_size_ = batch_size; - rows_to_generate_ = static_cast(scale_factor_ * 150000); - rows_generated_.store(0); - ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns( - columns, - kTypes, - kNameMap, - gen_list_)); - return Status::OK(); - } - - Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback schedule_callback) override - { - arrow_vendored::pcg_extras::seed_seq_from seq; - thread_local_data_.resize(num_threads); - for(ThreadLocalData &tld : thread_local_data_) - tld.rng.seed(seq); - - output_callback_ = std::move(output_callback); - finished_callback_ = std::move(finished_callback); - schedule_callback_ = std::move(schedule_callback); - for(size_t i = 0; i < num_threads; i++) - RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); - return Status::OK(); - } - - std::shared_ptr schema() const override - { - return schema_; - } - - private: -#define FOR_EACH_COLUMN(F) \ - F(C_CUSTKEY) \ - F(C_NAME) \ - F(C_ADDRESS) \ - F(C_NATIONKEY) \ - F(C_PHONE) \ - F(C_ACCTBAL) \ - F(C_MKTSEGMENT) \ - F(C_COMMENT) + std::vector> kTypes = { + int32(), fixed_size_binary(25), utf8(), + int32(), fixed_size_binary(15), decimal(12, 2), + utf8(), + }; + + Status ProduceCallback(size_t thread_index) { + if (done_.load()) return Status::OK(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + tld.suppkey_start = rows_generated_.fetch_add(batch_size_); + if (tld.suppkey_start >= rows_to_generate_) return Status::OK(); + + tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.suppkey_start); + + tld.batch.resize(SUPPLIER::kNumCols); 
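+ // Each column generator invoked below only runs if its Datum is still empty, so
+ // resetting every slot to a default-constructed Datum (kind() == Datum::NONE)
+ // marks the whole batch as "not yet generated" before the generators fire.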
+ std::fill(tld.batch.begin(), tld.batch.end(), Datum()); + for (int col : gen_list_) RETURN_NOT_OK(kGenerators[col](thread_index)); + + std::vector result(gen_list_.size()); + for (size_t i = 0; i < gen_list_.size(); i++) { + int col_idx = gen_list_[i]; + result[i] = tld.batch[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); + int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; + int64_t batches_outputted_before_this_one = batches_outputted_.fetch_add(1); + bool is_last_batch = batches_outputted_before_this_one == (batches_to_generate - 1); + output_callback_(std::move(eb)); + if (is_last_batch) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + return Status::OK(); + } + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + Status AllocateColumn(size_t thread_index, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(tld.to_generate * byte_width)); + ArrayData ad(kTypes[column], tld.to_generate, {nullptr, std::move(buff)}); + tld.batch[column] = std::move(ad); + return Status::OK(); + } + + Status S_SUPPKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_SUPPKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_SUPPKEY)); + int32_t* s_suppkey = reinterpret_cast( + tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + s_suppkey[irow] = static_cast(tld.suppkey_start + irow + 1); + } + } + return Status::OK(); + } + + Status S_NAME(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_NAME].kind() == Datum::NONE) { + RETURN_NOT_OK(S_SUPPKEY(thread_index)); + const int32_t* s_suppkey = reinterpret_cast( + tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->data()); + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NAME)); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_NAME]); + char* s_name = reinterpret_cast( + tld.batch[SUPPLIER::S_NAME].array()->buffers[1]->mutable_data()); + // Look man, I'm just following the spec ok? 
Section 4.2.3 as of March 1 2022 + const char* supplier = "Supplie#r"; + const size_t supplier_length = std::strlen(supplier); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + char* out = s_name + byte_width * irow; + std::strncpy(out, supplier, byte_width); + AppendNumberPaddedToNineDigits(out + supplier_length, s_suppkey[irow]); + } + } + return Status::OK(); + } + + Status S_ADDRESS(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_ADDRESS].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_ADDRESS], + RandomVString(tld.rng, tld.to_generate, 10, 40)); + } + return Status::OK(); + } + + Status S_NATIONKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_NATIONKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NATIONKEY)); + std::uniform_int_distribution dist(0, 24); + int32_t* s_nationkey = reinterpret_cast( + tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) + s_nationkey[irow] = dist(tld.rng); + } + return Status::OK(); + } + + Status S_PHONE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_PHONE].kind() == Datum::NONE) { + RETURN_NOT_OK(S_NATIONKEY(thread_index)); + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_PHONE)); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_PHONE]); + const int32_t* s_nationkey = reinterpret_cast( + tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->data()); + char* s_phone = reinterpret_cast( + tld.batch[SUPPLIER::S_PHONE].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + GeneratePhoneNumber(s_phone + irow * byte_width, tld.rng, s_nationkey[irow]); + } + } + return Status::OK(); + } + + Status S_ACCTBAL(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_ACCTBAL].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_ACCTBAL)); + Decimal128* s_acctbal = reinterpret_cast( + tld.batch[SUPPLIER::S_ACCTBAL].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(-99999, 999999); + for (int64_t irow = 0; irow < tld.to_generate; irow++) + s_acctbal[irow] = {dist(tld.rng)}; + } + return Status::OK(); + } + + Status S_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], + g_text.GenerateComments(tld.to_generate, 25, 100, tld.rng)); + ModifyComments(thread_index, "Recommends", good_rows_); + ModifyComments(thread_index, "Complaints", bad_rows_); + } + return Status::OK(); + } + + void ModifyComments(size_t thread_index, const char* review, + const std::vector& indices) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + const int32_t* offsets = reinterpret_cast( + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->data()); + char* str = reinterpret_cast( + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->mutable_data()); + const char* customer = "Customer"; + const int32_t customer_length = static_cast(std::strlen(customer)); + const int32_t review_length = static_cast(std::strlen(review)); + + auto it = std::lower_bound(indices.begin(), indices.end(), tld.suppkey_start); + for (; it != 
indices.end() && *it < tld.suppkey_start + tld.to_generate; it++) { + int64_t idx_in_batch = *it - tld.suppkey_start; + char* out = str + offsets[idx_in_batch]; + int32_t str_length = offsets[idx_in_batch + 1] - offsets[idx_in_batch]; + std::uniform_int_distribution gap_dist( + 0, str_length - customer_length - review_length); + int32_t gap = gap_dist(tld.rng); + int32_t total_length = customer_length + gap + review_length; + std::uniform_int_distribution start_dist(0, str_length - total_length); + int32_t start = start_dist(tld.rng); + std::memcpy(out + start, customer, customer_length); + std::memcpy(out + start + customer_length + gap, review, review_length); + } + } + + struct ThreadLocalData { + random::pcg32_fast rng; + int64_t suppkey_start; + int64_t to_generate; + std::vector batch; + }; + std::vector thread_local_data_; + std::vector good_rows_; + std::vector bad_rows_; + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t rows_to_generate_; + std::atomic rows_generated_; + float scale_factor_; + int64_t batch_size_; + std::vector gen_list_; + std::shared_ptr schema_; +}; + +class PartGenerator : public TpchTableGenerator { + public: + explicit PartGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) {} + + Status Init(std::vector columns, float scale_factor, + int64_t batch_size) override { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetPartOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + Status ProduceCallback(size_t) { + if (done_.load()) return Status::OK(); + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, gen_->NextPartBatch()); + if (!maybe_batch.has_value()) { + int64_t batches_generated = gen_->part_batches_generated(); + if (batches_generated == batches_outputted_.load()) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + output_callback_(std::move(batch)); + batches_outputted_++; + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + float scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; +}; + +class PartSuppGenerator : public TpchTableGenerator { + public: + explicit PartSuppGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) {} + + Status Init(std::vector columns, float scale_factor, + int64_t batch_size) override { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetPartSuppOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing(size_t 
num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + Status ProduceCallback(size_t) { + if (done_.load()) return Status::OK(); + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextPartSuppBatch()); + if (!maybe_batch.has_value()) { + int64_t batches_generated = gen_->partsupp_batches_generated(); + if (batches_generated == batches_outputted_.load()) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + output_callback_(std::move(batch)); + batches_outputted_++; + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + float scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; +}; + +class CustomerGenerator : public TpchTableGenerator { + public: + Status Init(std::vector columns, float scale_factor, + int64_t batch_size) override { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + rows_to_generate_ = static_cast(scale_factor_ * 150000); + rows_generated_.store(0); + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns(columns, kTypes, kNameMap, gen_list_)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + arrow_vendored::pcg_extras::seed_seq_from seq; + thread_local_data_.resize(num_threads); + for (ThreadLocalData& tld : thread_local_data_) tld.rng.seed(seq); + + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: +#define FOR_EACH_COLUMN(F) \ + F(C_CUSTKEY) \ + F(C_NAME) \ + F(C_ADDRESS) \ + F(C_NATIONKEY) \ + F(C_PHONE) \ + F(C_ACCTBAL) \ + F(C_MKTSEGMENT) \ + F(C_COMMENT) #define MAKE_ENUM(col) col, - struct CUSTOMER - { - enum - { - FOR_EACH_COLUMN(MAKE_ENUM) - kNumCols, - }; - }; + struct CUSTOMER { + enum { + FOR_EACH_COLUMN(MAKE_ENUM) kNumCols, + }; + }; #undef MAKE_ENUM -#define MAKE_STRING_MAP(col) \ - { #col, CUSTOMER::col }, - const std::unordered_map kNameMap = - { - FOR_EACH_COLUMN(MAKE_STRING_MAP) - }; +#define MAKE_STRING_MAP(col) {#col, CUSTOMER::col}, + const std::unordered_map kNameMap = { + FOR_EACH_COLUMN(MAKE_STRING_MAP)}; #undef MAKE_STRING_MAP -#define MAKE_FN_ARRAY(col) \ - [this](size_t thread_index) { return this->col(thread_index); }, - std::vector kGenerators = - { - FOR_EACH_COLUMN(MAKE_FN_ARRAY) - }; +#define 
MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector kGenerators = {FOR_EACH_COLUMN(MAKE_FN_ARRAY)}; #undef MAKE_FN_ARRAY #undef FOR_EACH_COLUMN - std::vector> kTypes = - { - int32(), - utf8(), - utf8(), - int32(), - fixed_size_binary(15), - decimal(12, 2), - fixed_size_binary(10), - utf8(), - }; - - Status ProduceCallback(size_t thread_index) - { - if(done_.load()) - return Status::OK(); - ThreadLocalData &tld = thread_local_data_[thread_index]; - tld.custkey_start = rows_generated_.fetch_add(batch_size_); - if(tld.custkey_start >= rows_to_generate_) - return Status::OK(); - - tld.to_generate = std::min(batch_size_, - rows_to_generate_ - tld.custkey_start); - - tld.batch.resize(CUSTOMER::kNumCols); - std::fill(tld.batch.begin(), tld.batch.end(), Datum()); - for(int col : gen_list_) - RETURN_NOT_OK(kGenerators[col](thread_index)); - - std::vector result(gen_list_.size()); - for(size_t i = 0; i < gen_list_.size(); i++) - { - int col_idx = gen_list_[i]; - result[i] = tld.batch[col_idx]; - } - ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); - int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; - int64_t batches_generated_before_this_one = batches_outputted_.fetch_add(1); - bool is_last_batch = batches_generated_before_this_one == (batches_to_generate - 1); - output_callback_(std::move(eb)); - if(is_last_batch) - { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) - finished_callback_(batches_outputted_.load()); - return Status::OK(); - } - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); - } - - Status AllocateColumn(size_t thread_index, int column) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); - ArrayData ad(kTypes[column], tld.to_generate, { nullptr, std::move(buff) }); - tld.batch[column] = std::move(ad); - return Status::OK(); - } - - Status C_CUSTKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[CUSTOMER::C_CUSTKEY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_CUSTKEY)); - int32_t *c_custkey = reinterpret_cast( - tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->mutable_data()); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - { - c_custkey[irow] = static_cast(tld.custkey_start + irow + 1); - } - } - return Status::OK(); - } - - Status C_NAME(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[CUSTOMER::C_NAME].kind() == Datum::NONE) - { - RETURN_NOT_OK(C_CUSTKEY(thread_index)); - const int32_t *c_custkey = reinterpret_cast( - tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->data()); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, AllocateBuffer((tld.to_generate + 1) * sizeof(int32_t))); - int32_t *offsets = reinterpret_cast(offset_buff->mutable_data()); - const char *customer = "Customer#"; - const size_t customer_length = std::strlen(customer); - offsets[0] = 0; - for(int64_t irow = 0; irow < tld.to_generate; irow++) - { - int num_digits = GetNumDigits(c_custkey[irow]); - int num_chars = std::max(num_digits, 9); - offsets[irow + 1] = static_cast(offsets[irow] + num_chars + customer_length); - } - 
ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, AllocateBuffer(offsets[tld.to_generate])); - char *str = reinterpret_cast(str_buff->mutable_data()); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - { - char *out = str + offsets[irow]; - std::memcpy(out, customer, customer_length); - AppendNumberPaddedToNineDigits(out + customer_length, c_custkey[irow]); - } - ArrayData ad(utf8(), tld.to_generate, { nullptr, std::move(offset_buff), std::move(str_buff) }); - tld.batch[CUSTOMER::C_NAME] = std::move(ad); - } - return Status::OK(); - } - - Status C_ADDRESS(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[CUSTOMER::C_ADDRESS].kind() == Datum::NONE) - { - ARROW_ASSIGN_OR_RAISE( - tld.batch[CUSTOMER::C_ADDRESS], - RandomVString(tld.rng, tld.to_generate, 10, 40)); - } - return Status::OK(); - } - - Status C_NATIONKEY(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[CUSTOMER::C_NATIONKEY].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_NATIONKEY)); - std::uniform_int_distribution dist(0, 24); - int32_t *c_nationkey = reinterpret_cast( - tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->mutable_data()); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - c_nationkey[irow] = dist(tld.rng); - } - return Status::OK(); - } - - Status C_PHONE(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[CUSTOMER::C_PHONE].kind() == Datum::NONE) - { - RETURN_NOT_OK(C_NATIONKEY(thread_index)); - RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_PHONE)); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_PHONE]); - const int32_t *c_nationkey = reinterpret_cast( - tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->data()); - char *c_phone = reinterpret_cast( - tld.batch[CUSTOMER::C_PHONE].array()->buffers[1]->mutable_data()); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - { - GeneratePhoneNumber( - c_phone + irow * byte_width, - tld.rng, - c_nationkey[irow]); - } - } - return Status::OK(); - } - - Status C_ACCTBAL(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[CUSTOMER::C_ACCTBAL].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_ACCTBAL)); - Decimal128 *c_acctbal = reinterpret_cast( - tld.batch[CUSTOMER::C_ACCTBAL].array()->buffers[1]->mutable_data()); - std::uniform_int_distribution dist(-99999, 999999); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - c_acctbal[irow] = { dist(tld.rng) }; - } - return Status::OK(); - } - - Status C_MKTSEGMENT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[CUSTOMER::C_MKTSEGMENT].kind() == Datum::NONE) - { - RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_MKTSEGMENT)); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_MKTSEGMENT]); - char *c_mktsegment = reinterpret_cast( - tld.batch[CUSTOMER::C_MKTSEGMENT].array()->buffers[1]->mutable_data()); - std::uniform_int_distribution dist(0, kNumSegments - 1); - for(int64_t irow = 0; irow < tld.to_generate; irow++) - { - char *out = c_mktsegment + irow * byte_width; - int str_idx = dist(tld.rng); - std::strncpy(out, Segments[str_idx], byte_width); - } - } - return Status::OK(); - } - - Status C_COMMENT(size_t thread_index) - { - ThreadLocalData &tld = thread_local_data_[thread_index]; - if(tld.batch[CUSTOMER::C_COMMENT].kind() == 
Datum::NONE) - { - ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], g_text.GenerateComments(tld.to_generate, 29, 116, tld.rng)); - } - return Status::OK(); - } - - struct ThreadLocalData - { - random::pcg32_fast rng; - int64_t custkey_start; - int64_t to_generate; - std::vector batch; - }; - std::vector thread_local_data_; - - OutputBatchCallback output_callback_; - FinishedCallback finished_callback_; - ScheduleCallback schedule_callback_; - int64_t rows_to_generate_{0}; - std::atomic rows_generated_ = { 0 }; - float scale_factor_{0}; - int64_t batch_size_{0}; - std::vector gen_list_; - std::shared_ptr schema_; - }; - - class OrdersGenerator : public TpchTableGenerator - { - public: - OrdersGenerator(std::shared_ptr gen) - : gen_(std::move(gen)) - { - } - - Status Init( - std::vector columns, - float scale_factor, - int64_t batch_size) override - { - scale_factor_ = scale_factor; - batch_size_ = batch_size; - ARROW_ASSIGN_OR_RAISE(schema_, - gen_->SetOrdersOutputColumns(columns)); - return Status::OK(); - } - - Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback schedule_callback) override - { - RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); - output_callback_ = std::move(output_callback); - finished_callback_ = std::move(finished_callback); - schedule_callback_ = std::move(schedule_callback); - - for(size_t i = 0; i < num_threads; i++) - RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); })); - return Status::OK(); - } - - std::shared_ptr schema() const override - { - return schema_; - } - - private: - Status ProduceCallback(size_t) - { - if(done_.load()) - return Status::OK(); - ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextOrdersBatch()); - if(!maybe_batch.has_value()) - { - int64_t batches_generated = gen_->orders_batches_generated(); - if(batches_generated == batches_outputted_.load()) - { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) - finished_callback_(batches_outputted_.load()); - } - return Status::OK(); - } - ExecBatch batch = std::move(*maybe_batch); - output_callback_(std::move(batch)); - batches_outputted_++; - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); - } - - OutputBatchCallback output_callback_; - FinishedCallback finished_callback_; - ScheduleCallback schedule_callback_; - int64_t batch_size_; - float scale_factor_; - std::shared_ptr gen_; - std::shared_ptr schema_; - }; - - class LineitemGenerator : public TpchTableGenerator - { - public: - LineitemGenerator(std::shared_ptr gen) - : gen_(std::move(gen)) - {} - - Status Init( - std::vector columns, - float scale_factor, - int64_t batch_size) override - { - scale_factor_ = scale_factor; - batch_size_ = batch_size; - ARROW_ASSIGN_OR_RAISE(schema_, - gen_->SetLineItemOutputColumns(columns)); - return Status::OK(); - } - - Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback schedule_callback) override - { - RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); - output_callback_ = std::move(output_callback); - finished_callback_ = std::move(finished_callback); - schedule_callback_ = std::move(schedule_callback); - - for(size_t i = 0; i < num_threads; i++) - RETURN_NOT_OK(schedule_callback_([this](size_t thread_index) { return 
this->ProduceCallback(thread_index); })); - return Status::OK(); - } - - std::shared_ptr schema() const override - { - return schema_; - } - - private: - Status ProduceCallback(size_t) - { - if(done_.load()) - return Status::OK(); - ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, - gen_->NextLineItemBatch()); - if(!maybe_batch.has_value()) - { - int64_t batches_generated = gen_->lineitem_batches_generated(); - if(batches_generated == batches_outputted_.load()) - { - bool expected = false; - if(done_.compare_exchange_strong(expected, true)) - finished_callback_(batches_outputted_.load()); - } - return Status::OK(); - } - ExecBatch batch = std::move(*maybe_batch); - output_callback_(std::move(batch)); - batches_outputted_++; - return schedule_callback_([this](size_t thread_index) { return this->ProduceCallback(thread_index); }); - } - - OutputBatchCallback output_callback_; - FinishedCallback finished_callback_; - ScheduleCallback schedule_callback_; - int64_t batch_size_; - float scale_factor_; - std::shared_ptr gen_; - std::shared_ptr schema_; - }; - - class NationGenerator : public TpchTableGenerator - { - public: - Status Init( - std::vector columns, - float /*scale_factor*/, - int64_t /*batch_size*/) override - { - ARROW_ASSIGN_OR_RAISE(schema_, - SetOutputColumns( - columns, - kTypes, - kNameMap, - column_indices_)); - rng_.seed(arrow_vendored::pcg_extras::seed_seq_from{}); - return Status::OK(); - } - - Status StartProducing( - size_t /*num_threads*/, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback /*schedule_task_callback*/) override - { - std::shared_ptr N_NATIONKEY_buffer = Buffer::Wrap(kNationKey, sizeof(kNationKey)); - ArrayData N_NATIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(N_NATIONKEY_buffer) }); - - ARROW_ASSIGN_OR_RAISE(std::unique_ptr N_NAME_buffer, AllocateBuffer(kRowCount * kNameByteWidth)); - char *N_NAME = reinterpret_cast(N_NAME_buffer->mutable_data()); - for(size_t i = 0; i < kRowCount; i++) - std::strncpy(N_NAME + kNameByteWidth * i, kCountryNames[i], kNameByteWidth); - ArrayData N_NAME_arraydata(fixed_size_binary(kNameByteWidth), kRowCount, { nullptr, std::move(N_NAME_buffer) }); - - std::shared_ptr N_REGIONKEY_buffer = Buffer::Wrap(kRegionKey, sizeof(kRegionKey)); - ArrayData N_REGIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(N_REGIONKEY_buffer) }); - - ARROW_ASSIGN_OR_RAISE(Datum N_COMMENT_datum, g_text.GenerateComments(kRowCount, 31, 114, rng_)); - - std::vector fields = - { - std::move(N_NATIONKEY_arraydata), - std::move(N_NAME_arraydata), - std::move(N_REGIONKEY_arraydata), - std::move(N_COMMENT_datum) - }; - - std::vector result; - for(const int &col : column_indices_) - result.push_back(fields[col]); - ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); - output_callback(std::move(batch)); - finished_callback(static_cast(1)); - return Status::OK(); - } - - std::shared_ptr schema() const override - { - return schema_; - } - - private: - random::pcg32_fast rng_; - - static constexpr size_t kRowCount = 25; - static constexpr int32_t kNameByteWidth = 25; - const int32_t kNationKey[kRowCount] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; - const char *kCountryNames[kRowCount] = - { - "ALGERIA", "ARGENTINA", "BRAZIL", - "CANADA", "EGYPT", "ETHIOPIA", - "FRANCE", "GERMANY", "INDIA", - "INDONESIA", "IRAN", "IRAQ", - "JAPAN", "JORDAN", "KENYA", - "MOROCCO", "MOZAMBIQUE", "PERU", - "CHINA", "ROMANIA", "SAUDI ARABIA", - 
"VIETNAM", "RUSSIA", "UNITED KINGDOM", - "UNITED STATES" - }; - const int32_t kRegionKey[kRowCount] = { 0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1 }; - - struct NATION - { - enum - { - N_NATIONKEY, - N_NAME, - N_REGIONKEY, - N_COMMENT, - }; - }; - - const std::unordered_map kNameMap = - { - { "N_NATIONKEY", NATION::N_NATIONKEY }, - { "N_NAME", NATION::N_NAME }, - { "N_REGIONKEY", NATION::N_REGIONKEY }, - { "N_COMMENT", NATION::N_COMMENT }, - }; - - std::vector> kTypes = - { - int32(), - fixed_size_binary(kNameByteWidth), - int32(), - utf8(), - }; - - std::shared_ptr schema_; - std::vector column_indices_; - }; - - class RegionGenerator : public TpchTableGenerator - { - public: - Status Init( - std::vector columns, - float /*scale_factor*/, - int64_t /*batch_size*/) override - { - ARROW_ASSIGN_OR_RAISE(schema_, - SetOutputColumns( - columns, - kTypes, - kNameMap, - column_indices_)); - rng_.seed(arrow_vendored::pcg_extras::seed_seq_from{}); - return Status::OK(); - } - - Status StartProducing( - size_t num_threads, - OutputBatchCallback output_callback, - FinishedCallback finished_callback, - ScheduleCallback /*schedule_task_callback*/) override - { - std::shared_ptr R_REGIONKEY_buffer = Buffer::Wrap(kRegionKey, sizeof(kRegionKey)); - ArrayData R_REGIONKEY_arraydata(int32(), kRowCount, { nullptr, std::move(R_REGIONKEY_buffer) }); - - ARROW_ASSIGN_OR_RAISE(std::unique_ptr R_NAME_buffer, AllocateBuffer(kRowCount * kNameByteWidth)); - char *R_NAME_data = reinterpret_cast(R_NAME_buffer->mutable_data()); - for(size_t i = 0; i < kRowCount; i++) - std::strncpy(R_NAME_data + kNameByteWidth * i, kRegionNames[i], kNameByteWidth); - ArrayData R_NAME_arraydata(kTypes[static_cast(REGION::R_NAME)], kRowCount, { nullptr, std::move(R_NAME_buffer) }); - - ARROW_ASSIGN_OR_RAISE(Datum R_COMMENT_datum, g_text.GenerateComments(kRowCount, 31, 115, rng_)); - - std::vector fields = { std::move(R_REGIONKEY_arraydata), std::move(R_NAME_arraydata), std::move(R_COMMENT_datum) }; - std::vector result; - for(const int &col : column_indices_) - result.push_back(fields[col]); - ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); - output_callback(std::move(batch)); - finished_callback(static_cast(1)); - return Status::OK(); - } - - std::shared_ptr schema() const override - { - return schema_; - } - - random::pcg32_fast rng_; - - static constexpr size_t kRowCount = 5; - static constexpr int32_t kNameByteWidth = 25; - const int32_t kRegionKey[kRowCount] = { 0, 1, 2, 3, 4 }; - const char *kRegionNames[kRowCount] = - { - "AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST" - }; - - struct REGION - { - enum - { - R_REGIONKEY, - R_NAME, - R_COMMENT, - kNumColumns, - }; - }; - - const std::unordered_map kNameMap = - { - { "R_REGIONKEY", REGION::R_REGIONKEY }, - { "R_NAME", REGION::R_NAME }, - { "R_COMMENT", REGION::R_COMMENT }, - }; - - const std::vector> kTypes = - { - int32(), - fixed_size_binary(kNameByteWidth), - utf8(), - }; - - std::shared_ptr schema_; - std::vector column_indices_; - }; - - class TpchNode : public ExecNode - { - public: - TpchNode(ExecPlan *plan, - const char *name, - std::unique_ptr generator) - : ExecNode(plan, {}, {}, generator->schema(), /*num_outputs=*/1), - name_(name), - generator_(std::move(generator)) - { - } - - const char *kind_name() const override - { - return name_; - } - - [[noreturn]] - static void NoInputs() - { - Unreachable("TPC-H node should never have any inputs"); - } - - [[noreturn]] - void InputReceived(ExecNode *, ExecBatch) 
override - { - NoInputs(); - } - - [[noreturn]] - void ErrorReceived(ExecNode *, Status) override - { - NoInputs(); - } - - [[noreturn]] - void InputFinished(ExecNode *, int) override - { - NoInputs(); - } - - Status StartProducing() override - { - return generator_->StartProducing( - thread_indexer_.Capacity(), - [this](ExecBatch batch) { this->OutputBatchCallback(std::move(batch)); }, - [this](int64_t num_batches) { this->FinishedCallback(num_batches); }, - [this](std::function func) -> Status { return this->ScheduleTaskCallback(std::move(func)); } - ); - } - - void PauseProducing(ExecNode *output) override {} - void ResumeProducing(ExecNode *output) override {} - - void StopProducing(ExecNode *output) override - { - DCHECK_EQ(output, outputs_[0]); - StopProducing(); - } - - void StopProducing() override - { - if(generator_->Abort()) - std::ignore = task_group_.End(); - } - - Future<> finished() override - { - return task_group_.OnFinished(); - } - - private: - void OutputBatchCallback(ExecBatch batch) - { - outputs_[0]->InputReceived(this, std::move(batch)); - } - - void FinishedCallback(int64_t total_num_batches) - { - outputs_[0]->InputFinished(this, static_cast(total_num_batches)); - std::ignore = task_group_.End(); - } - - Status ScheduleTaskCallback(std::function func) - { - auto executor = plan_->exec_context()->executor(); - if (executor) - { - RETURN_NOT_OK(task_group_.AddTask([&] - { - return executor->Submit([this, func] - { - size_t thread_index = thread_indexer_(); - Status status = func(thread_index); - if (!status.ok()) - { - StopProducing(); - ErrorIfNotOk(status); - return; - } - }); - })); - } - else - { - return func(0); - } - return Status::OK(); - } - - const char *name_; - std::unique_ptr generator_; - - util::AsyncTaskGroup task_group_; - ThreadIndexer thread_indexer_; - }; - - Result TpchGen::Make(ExecPlan *plan, float scale_factor, int64_t batch_size) - { - TpchGen result(plan, scale_factor, batch_size); - return result; - } + std::vector> kTypes = { + int32(), + utf8(), + utf8(), + int32(), + fixed_size_binary(15), + decimal(12, 2), + fixed_size_binary(10), + utf8(), + }; + + Status ProduceCallback(size_t thread_index) { + if (done_.load()) return Status::OK(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + tld.custkey_start = rows_generated_.fetch_add(batch_size_); + if (tld.custkey_start >= rows_to_generate_) return Status::OK(); + + tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.custkey_start); + + tld.batch.resize(CUSTOMER::kNumCols); + std::fill(tld.batch.begin(), tld.batch.end(), Datum()); + for (int col : gen_list_) RETURN_NOT_OK(kGenerators[col](thread_index)); + + std::vector result(gen_list_.size()); + for (size_t i = 0; i < gen_list_.size(); i++) { + int col_idx = gen_list_[i]; + result[i] = tld.batch[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); + int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; + int64_t batches_generated_before_this_one = batches_outputted_.fetch_add(1); + bool is_last_batch = batches_generated_before_this_one == (batches_to_generate - 1); + output_callback_(std::move(eb)); + if (is_last_batch) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + return Status::OK(); + } + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + Status AllocateColumn(size_t thread_index, int column) { + 
ThreadLocalData& tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(tld.to_generate * byte_width)); + ArrayData ad(kTypes[column], tld.to_generate, {nullptr, std::move(buff)}); + tld.batch[column] = std::move(ad); + return Status::OK(); + } + + Status C_CUSTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_CUSTKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_CUSTKEY)); + int32_t* c_custkey = reinterpret_cast( + tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + c_custkey[irow] = static_cast(tld.custkey_start + irow + 1); + } + } + return Status::OK(); + } + + Status C_NAME(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_NAME].kind() == Datum::NONE) { + RETURN_NOT_OK(C_CUSTKEY(thread_index)); + const int32_t* c_custkey = reinterpret_cast( + tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, + AllocateBuffer((tld.to_generate + 1) * sizeof(int32_t))); + int32_t* offsets = reinterpret_cast(offset_buff->mutable_data()); + const char* customer = "Customer#"; + const size_t customer_length = std::strlen(customer); + offsets[0] = 0; + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + int num_digits = GetNumDigits(c_custkey[irow]); + int num_chars = std::max(num_digits, 9); + offsets[irow + 1] = + static_cast(offsets[irow] + num_chars + customer_length); + } + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, + AllocateBuffer(offsets[tld.to_generate])); + char* str = reinterpret_cast(str_buff->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + char* out = str + offsets[irow]; + std::memcpy(out, customer, customer_length); + AppendNumberPaddedToNineDigits(out + customer_length, c_custkey[irow]); + } + ArrayData ad(utf8(), tld.to_generate, + {nullptr, std::move(offset_buff), std::move(str_buff)}); + tld.batch[CUSTOMER::C_NAME] = std::move(ad); + } + return Status::OK(); + } + + Status C_ADDRESS(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_ADDRESS].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_ADDRESS], + RandomVString(tld.rng, tld.to_generate, 10, 40)); + } + return Status::OK(); + } + + Status C_NATIONKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_NATIONKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_NATIONKEY)); + std::uniform_int_distribution dist(0, 24); + int32_t* c_nationkey = reinterpret_cast( + tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) + c_nationkey[irow] = dist(tld.rng); + } + return Status::OK(); + } + + Status C_PHONE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_PHONE].kind() == Datum::NONE) { + RETURN_NOT_OK(C_NATIONKEY(thread_index)); + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_PHONE)); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_PHONE]); + const int32_t* c_nationkey = reinterpret_cast( + 
tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->data()); + char* c_phone = reinterpret_cast( + tld.batch[CUSTOMER::C_PHONE].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + GeneratePhoneNumber(c_phone + irow * byte_width, tld.rng, c_nationkey[irow]); + } + } + return Status::OK(); + } + + Status C_ACCTBAL(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_ACCTBAL].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_ACCTBAL)); + Decimal128* c_acctbal = reinterpret_cast( + tld.batch[CUSTOMER::C_ACCTBAL].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(-99999, 999999); + for (int64_t irow = 0; irow < tld.to_generate; irow++) + c_acctbal[irow] = {dist(tld.rng)}; + } + return Status::OK(); + } + + Status C_MKTSEGMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_MKTSEGMENT].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_MKTSEGMENT)); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_MKTSEGMENT]); + char* c_mktsegment = reinterpret_cast( + tld.batch[CUSTOMER::C_MKTSEGMENT].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(0, kNumSegments - 1); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + char* out = c_mktsegment + irow * byte_width; + int str_idx = dist(tld.rng); + std::strncpy(out, Segments[str_idx], byte_width); + } + } + return Status::OK(); + } + + Status C_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], + g_text.GenerateComments(tld.to_generate, 29, 116, tld.rng)); + } + return Status::OK(); + } + + struct ThreadLocalData { + random::pcg32_fast rng; + int64_t custkey_start; + int64_t to_generate; + std::vector batch; + }; + std::vector thread_local_data_; + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t rows_to_generate_{0}; + std::atomic rows_generated_ = {0}; + float scale_factor_{0}; + int64_t batch_size_{0}; + std::vector gen_list_; + std::shared_ptr schema_; +}; + +class OrdersGenerator : public TpchTableGenerator { + public: + explicit OrdersGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) {} + + Status Init(std::vector columns, float scale_factor, + int64_t batch_size) override { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetOrdersOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + Status ProduceCallback(size_t) { + if (done_.load()) return Status::OK(); + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, 
gen_->NextOrdersBatch()); + if (!maybe_batch.has_value()) { + int64_t batches_generated = gen_->orders_batches_generated(); + if (batches_generated == batches_outputted_.load()) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + output_callback_(std::move(batch)); + batches_outputted_++; + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + float scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; +}; + +class LineitemGenerator : public TpchTableGenerator { + public: + explicit LineitemGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) {} + + Status Init(std::vector columns, float scale_factor, + int64_t batch_size) override { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetLineItemOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + Status ProduceCallback(size_t) { + if (done_.load()) return Status::OK(); + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextLineItemBatch()); + if (!maybe_batch.has_value()) { + int64_t batches_generated = gen_->lineitem_batches_generated(); + if (batches_generated == batches_outputted_.load()) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + output_callback_(std::move(batch)); + batches_outputted_++; + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + float scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; +}; + +class NationGenerator : public TpchTableGenerator { + public: + Status Init(std::vector columns, float /*scale_factor*/, + int64_t /*batch_size*/) override { + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns(columns, kTypes, kNameMap, column_indices_)); + rng_.seed(arrow_vendored::pcg_extras::seed_seq_from{}); + return Status::OK(); + } + + Status StartProducing(size_t /*num_threads*/, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback /*schedule_task_callback*/) override { + std::shared_ptr N_NATIONKEY_buffer = + Buffer::Wrap(kNationKey, sizeof(kNationKey)); + ArrayData N_NATIONKEY_arraydata(int32(), kRowCount, + {nullptr, std::move(N_NATIONKEY_buffer)}); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr N_NAME_buffer, + AllocateBuffer(kRowCount * 
kNameByteWidth)); + char* N_NAME = reinterpret_cast(N_NAME_buffer->mutable_data()); + for (size_t i = 0; i < kRowCount; i++) + std::strncpy(N_NAME + kNameByteWidth * i, kCountryNames[i], kNameByteWidth); + ArrayData N_NAME_arraydata(fixed_size_binary(kNameByteWidth), kRowCount, + {nullptr, std::move(N_NAME_buffer)}); + + std::shared_ptr N_REGIONKEY_buffer = + Buffer::Wrap(kRegionKey, sizeof(kRegionKey)); + ArrayData N_REGIONKEY_arraydata(int32(), kRowCount, + {nullptr, std::move(N_REGIONKEY_buffer)}); + + ARROW_ASSIGN_OR_RAISE(Datum N_COMMENT_datum, + g_text.GenerateComments(kRowCount, 31, 114, rng_)); + + std::vector fields = { + std::move(N_NATIONKEY_arraydata), std::move(N_NAME_arraydata), + std::move(N_REGIONKEY_arraydata), std::move(N_COMMENT_datum)}; + + std::vector result; + for (const int& col : column_indices_) result.push_back(fields[col]); + ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); + output_callback(std::move(batch)); + finished_callback(static_cast(1)); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + random::pcg32_fast rng_; + + static constexpr size_t kRowCount = 25; + static constexpr int32_t kNameByteWidth = 25; + const int32_t kNationKey[kRowCount] = {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24}; + const char* kCountryNames[kRowCount] = { + "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", + "ETHIOPIA", "FRANCE", "GERMANY", "INDIA", "INDONESIA", + "IRAN", "IRAQ", "JAPAN", "JORDAN", "KENYA", + "MOROCCO", "MOZAMBIQUE", "PERU", "CHINA", "ROMANIA", + "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES"}; + const int32_t kRegionKey[kRowCount] = {0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, + 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1}; + + struct NATION { + enum { + N_NATIONKEY, + N_NAME, + N_REGIONKEY, + N_COMMENT, + }; + }; + + const std::unordered_map kNameMap = { + {"N_NATIONKEY", NATION::N_NATIONKEY}, + {"N_NAME", NATION::N_NAME}, + {"N_REGIONKEY", NATION::N_REGIONKEY}, + {"N_COMMENT", NATION::N_COMMENT}, + }; + + std::vector> kTypes = { + int32(), + fixed_size_binary(kNameByteWidth), + int32(), + utf8(), + }; + + std::shared_ptr schema_; + std::vector column_indices_; +}; + +class RegionGenerator : public TpchTableGenerator { + public: + Status Init(std::vector columns, float /*scale_factor*/, + int64_t /*batch_size*/) override { + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns(columns, kTypes, kNameMap, column_indices_)); + rng_.seed(arrow_vendored::pcg_extras::seed_seq_from{}); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback /*schedule_task_callback*/) override { + std::shared_ptr R_REGIONKEY_buffer = + Buffer::Wrap(kRegionKey, sizeof(kRegionKey)); + ArrayData R_REGIONKEY_arraydata(int32(), kRowCount, + {nullptr, std::move(R_REGIONKEY_buffer)}); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr R_NAME_buffer, + AllocateBuffer(kRowCount * kNameByteWidth)); + char* R_NAME_data = reinterpret_cast(R_NAME_buffer->mutable_data()); + for (size_t i = 0; i < kRowCount; i++) + std::strncpy(R_NAME_data + kNameByteWidth * i, kRegionNames[i], kNameByteWidth); + ArrayData R_NAME_arraydata(kTypes[static_cast(REGION::R_NAME)], kRowCount, + {nullptr, std::move(R_NAME_buffer)}); + + ARROW_ASSIGN_OR_RAISE(Datum R_COMMENT_datum, + g_text.GenerateComments(kRowCount, 31, 115, rng_)); + + std::vector fields = 
{std::move(R_REGIONKEY_arraydata), + std::move(R_NAME_arraydata), std::move(R_COMMENT_datum)}; + std::vector result; + for (const int& col : column_indices_) result.push_back(fields[col]); + ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); + output_callback(std::move(batch)); + finished_callback(static_cast(1)); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + random::pcg32_fast rng_; + + static constexpr size_t kRowCount = 5; + static constexpr int32_t kNameByteWidth = 25; + const int32_t kRegionKey[kRowCount] = {0, 1, 2, 3, 4}; + const char* kRegionNames[kRowCount] = {"AFRICA", "AMERICA", "ASIA", "EUROPE", + "MIDDLE EAST"}; + + struct REGION { + enum { + R_REGIONKEY, + R_NAME, + R_COMMENT, + kNumColumns, + }; + }; + + const std::unordered_map kNameMap = { + {"R_REGIONKEY", REGION::R_REGIONKEY}, + {"R_NAME", REGION::R_NAME}, + {"R_COMMENT", REGION::R_COMMENT}, + }; + + const std::vector> kTypes = { + int32(), + fixed_size_binary(kNameByteWidth), + utf8(), + }; + + std::shared_ptr schema_; + std::vector column_indices_; +}; + +class TpchNode : public ExecNode { + public: + TpchNode(ExecPlan* plan, const char* name, + std::unique_ptr generator) + : ExecNode(plan, {}, {}, generator->schema(), /*num_outputs=*/1), + name_(name), + generator_(std::move(generator)) {} + + const char* kind_name() const override { return name_; } + + [[noreturn]] static void NoInputs() { + Unreachable("TPC-H node should never have any inputs"); + } + + [[noreturn]] void InputReceived(ExecNode*, ExecBatch) override { NoInputs(); } + + [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); } + + [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); } + + Status StartProducing() override { + return generator_->StartProducing( + thread_indexer_.Capacity(), + [this](ExecBatch batch) { this->OutputBatchCallback(std::move(batch)); }, + [this](int64_t num_batches) { this->FinishedCallback(num_batches); }, + [this](std::function func) -> Status { + return this->ScheduleTaskCallback(std::move(func)); + }); + } + + void PauseProducing(ExecNode* output) override {} + void ResumeProducing(ExecNode* output) override {} + + void StopProducing(ExecNode* output) override { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); + } + + void StopProducing() override { + if (generator_->Abort()) std::ignore = task_group_.End(); + } + + Future<> finished() override { return task_group_.OnFinished(); } + + private: + void OutputBatchCallback(ExecBatch batch) { + outputs_[0]->InputReceived(this, std::move(batch)); + } + + void FinishedCallback(int64_t total_num_batches) { + outputs_[0]->InputFinished(this, static_cast(total_num_batches)); + std::ignore = task_group_.End(); + } + + Status ScheduleTaskCallback(std::function func) { + auto executor = plan_->exec_context()->executor(); + if (executor) { + RETURN_NOT_OK(task_group_.AddTask([&] { + return executor->Submit([this, func] { + size_t thread_index = thread_indexer_(); + Status status = func(thread_index); + if (!status.ok()) { + StopProducing(); + ErrorIfNotOk(status); + return; + } + }); + })); + } else { + return func(0); + } + return Status::OK(); + } - template - Result TpchGen::CreateNode(const char *name, std::vector columns) - { - std::unique_ptr generator = arrow::internal::make_unique(); - RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); - return plan_->EmplaceNode(plan_, name, std::move(generator)); - } + const char* name_; + 
std::unique_ptr<TpchTableGenerator> generator_;
- Result<ExecNode*> TpchGen::Supplier(std::vector<std::string> columns)
- {
- return CreateNode<SupplierGenerator>("Supplier", std::move(columns));
- }
+ util::AsyncTaskGroup task_group_;
+ ThreadIndexer thread_indexer_;
+};
- Result<ExecNode*> TpchGen::Part(std::vector<std::string> columns)
- {
- if(!part_and_part_supp_generator_)
- {
- part_and_part_supp_generator_ = std::make_shared<PartAndPartSupplierGenerator>();
- }
- std::unique_ptr<TpchTableGenerator> generator = arrow::internal::make_unique<PartGenerator>(part_and_part_supp_generator_);
- RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
- return plan_->EmplaceNode<TpchNode>(plan_, "Part", std::move(generator));
- }
+Result<TpchGen> TpchGen::Make(ExecPlan* plan, float scale_factor, int64_t batch_size) {
+ TpchGen result(plan, scale_factor, batch_size);
+ return result;
+}
- Result<ExecNode*> TpchGen::PartSupp(std::vector<std::string> columns)
- {
- if(!part_and_part_supp_generator_)
- {
- part_and_part_supp_generator_ = std::make_shared<PartAndPartSupplierGenerator>();
- }
- std::unique_ptr<TpchTableGenerator> generator = arrow::internal::make_unique<PartSuppGenerator>(part_and_part_supp_generator_);
- RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
- return plan_->EmplaceNode<TpchNode>(plan_, "PartSupp", std::move(generator));
- }
+template <typename Generator>
+Result<ExecNode*> TpchGen::CreateNode(const char* name,
+ std::vector<std::string> columns) {
+ std::unique_ptr<TpchTableGenerator> generator = arrow::internal::make_unique<Generator>();
+ RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
+ return plan_->EmplaceNode<TpchNode>(plan_, name, std::move(generator));
+}
- Result<ExecNode*> TpchGen::Customer(std::vector<std::string> columns)
- {
- return CreateNode<CustomerGenerator>("Customer", std::move(columns));
- }
+Result<ExecNode*> TpchGen::Supplier(std::vector<std::string> columns) {
+ return CreateNode<SupplierGenerator>("Supplier", std::move(columns));
+}
- Result<ExecNode*> TpchGen::Orders(std::vector<std::string> columns)
- {
- if(!orders_and_line_item_generator_)
- {
- orders_and_line_item_generator_ = std::make_shared<OrdersAndLineItemGenerator>();
- }
- std::unique_ptr<TpchTableGenerator> generator = arrow::internal::make_unique<OrdersGenerator>(orders_and_line_item_generator_);
- RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
- return plan_->EmplaceNode<TpchNode>(plan_, "Orders", std::move(generator));
- }
+Result<ExecNode*> TpchGen::Part(std::vector<std::string> columns) {
+ if (!part_and_part_supp_generator_) {
+ part_and_part_supp_generator_ = std::make_shared<PartAndPartSupplierGenerator>();
+ }
+ std::unique_ptr<TpchTableGenerator> generator =
+ arrow::internal::make_unique<PartGenerator>(part_and_part_supp_generator_);
+ RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
+ return plan_->EmplaceNode<TpchNode>(plan_, "Part", std::move(generator));
+}
- Result<ExecNode*> TpchGen::Lineitem(std::vector<std::string> columns)
- {
- if(!orders_and_line_item_generator_)
- {
- orders_and_line_item_generator_ = std::make_shared<OrdersAndLineItemGenerator>();
- }
- std::unique_ptr<TpchTableGenerator> generator = arrow::internal::make_unique<LineitemGenerator>(orders_and_line_item_generator_);
- RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
- return plan_->EmplaceNode<TpchNode>(plan_, "Lineitem", std::move(generator));
- }
+Result<ExecNode*> TpchGen::PartSupp(std::vector<std::string> columns) {
+ if (!part_and_part_supp_generator_) {
+ part_and_part_supp_generator_ = std::make_shared<PartAndPartSupplierGenerator>();
+ }
+ std::unique_ptr<TpchTableGenerator> generator =
+ arrow::internal::make_unique<PartSuppGenerator>(part_and_part_supp_generator_);
+ RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
+ return plan_->EmplaceNode<TpchNode>(plan_, "PartSupp", std::move(generator));
+}
- Result<ExecNode*> TpchGen::Nation(std::vector<std::string> columns)
- {
- return CreateNode<NationGenerator>("Nation", std::move(columns));
- }
+Result<ExecNode*> TpchGen::Customer(std::vector<std::string> columns) {
+ return CreateNode<CustomerGenerator>("Customer", std::move(columns));
+}
- Result<ExecNode*> TpchGen::Region(std::vector<std::string> columns)
- {
- return CreateNode<RegionGenerator>("Region", std::move(columns));
- }
- }
+Result<ExecNode*> TpchGen::Orders(std::vector<std::string> columns) {
+ if (!orders_and_line_item_generator_) {
+ orders_and_line_item_generator_ = std::make_shared<OrdersAndLineItemGenerator>();
+ }
+ std::unique_ptr<TpchTableGenerator> generator =
+ arrow::internal::make_unique<OrdersGenerator>(orders_and_line_item_generator_);
+ RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
+ return plan_->EmplaceNode<TpchNode>(plan_, "Orders", std::move(generator));
+}
+
+Result<ExecNode*> TpchGen::Lineitem(std::vector<std::string> columns) {
+ if (!orders_and_line_item_generator_) {
+ orders_and_line_item_generator_ = std::make_shared<OrdersAndLineItemGenerator>();
+ }
+ std::unique_ptr<TpchTableGenerator> generator =
+ arrow::internal::make_unique<LineitemGenerator>(orders_and_line_item_generator_);
+ RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_));
+ return plan_->EmplaceNode<TpchNode>(plan_, "Lineitem", std::move(generator));
+}
+
+Result<ExecNode*> TpchGen::Nation(std::vector<std::string> columns) {
+ return CreateNode<NationGenerator>("Nation", std::move(columns));
+}
+
+Result<ExecNode*> TpchGen::Region(std::vector<std::string> columns) {
+ return CreateNode<RegionGenerator>("Region", std::move(columns));
 }
+} // namespace compute
+} // namespace arrow
diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h
index f286a66abe7..302057efbee 100644
--- a/cpp/src/arrow/compute/exec/tpch_node.h
+++ b/cpp/src/arrow/compute/exec/tpch_node.h
@@ -17,62 +17,55 @@
 #pragma once
-#include "arrow/compute/exec/options.h"
+#include
+#include
 #include "arrow/compute/exec/exec_plan.h"
+#include "arrow/compute/exec/options.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/util/pcg_random.h"
-#include
-#include
-
-namespace arrow
-{
- namespace compute
- {
- class OrdersAndLineItemGenerator;
- class PartAndPartSupplierGenerator;
+namespace arrow {
+namespace compute {
+class OrdersAndLineItemGenerator;
+class PartAndPartSupplierGenerator;
- class ARROW_EXPORT TpchGen
- {
- public:
- /*
- * \brief Create a factory for nodes that generate TPC-H data
- *
- * Note: Individual tables will reference each other. It is important that you only create a single TpchGen
- * instance for each plan and then you can create nodes for each table from that single TpchGen instance.
- * Note: Every batch will be scheduled as a new task using the ExecPlan's scheduler.
- */
- static Result<TpchGen> Make(ExecPlan *plan, float scale_factor = 1.0f, int64_t batch_size = 4096);
+class ARROW_EXPORT TpchGen {
+ public:
+ /*
+ * \brief Create a factory for nodes that generate TPC-H data
+ *
+ * Note: Individual tables will reference each other. It is important that you only
+ * create a single TpchGen instance for each plan and then you can create nodes for each
+ * table from that single TpchGen instance. Note: Every batch will be scheduled as a new
+ * task using the ExecPlan's scheduler.
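+ *
+ * A rough usage sketch (illustrative only; error handling, the remaining plan
+ * wiring, and the column lists are omitted or abbreviated):
+ *
+ *   TpchGen gen = *TpchGen::Make(plan, /*scale_factor=*/0.25f);
+ *   ExecNode* customer = *gen.Customer();  // empty column list selects all columns
+ *   ExecNode* nation = *gen.Nation({"N_NATIONKEY", "N_NAME"});
+ *   // Feed these source nodes into downstream declarations (e.g. a sink node),
+ *   // then start the plan as usual.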
+ */
+ static Result<TpchGen> Make(ExecPlan* plan, float scale_factor = 1.0f,
+ int64_t batch_size = 4096);
- Result<ExecNode*> Supplier(std::vector<std::string> columns = {});
- Result<ExecNode*> Part(std::vector<std::string> columns = {});
- Result<ExecNode*> PartSupp(std::vector<std::string> columns = {});
- Result<ExecNode*> Customer(std::vector<std::string> columns = {});
- Result<ExecNode*> Orders(std::vector<std::string> columns = {});
- Result<ExecNode*> Lineitem(std::vector<std::string> columns = {});
- Result<ExecNode*> Nation(std::vector<std::string> columns = {});
- Result<ExecNode*> Region(std::vector<std::string> columns = {});
+ Result<ExecNode*> Supplier(std::vector<std::string> columns = {});
+ Result<ExecNode*> Part(std::vector<std::string> columns = {});
+ Result<ExecNode*> PartSupp(std::vector<std::string> columns = {});
+ Result<ExecNode*> Customer(std::vector<std::string> columns = {});
+ Result<ExecNode*> Orders(std::vector<std::string> columns = {});
+ Result<ExecNode*> Lineitem(std::vector<std::string> columns = {});
+ Result<ExecNode*> Nation(std::vector<std::string> columns = {});
+ Result<ExecNode*> Region(std::vector<std::string> columns = {});
- private:
- TpchGen(ExecPlan *plan, float scale_factor, int64_t batch_size)
- : plan_(plan),
- scale_factor_(scale_factor),
- batch_size_(batch_size),
- part_and_part_supp_generator_(nullptr),
- orders_and_line_item_generator_(nullptr)
- {}
+ private:
+ TpchGen(ExecPlan* plan, float scale_factor, int64_t batch_size)
+ : plan_(plan), scale_factor_(scale_factor), batch_size_(batch_size) {}
- template <typename Generator>
- Result<ExecNode*> CreateNode(const char *name, std::vector<std::string> columns);
+ template <typename Generator>
+ Result<ExecNode*> CreateNode(const char* name, std::vector<std::string> columns);
- ExecPlan *plan_;
- float scale_factor_;
- int64_t batch_size_;
+ ExecPlan* plan_;
+ float scale_factor_;
+ int64_t batch_size_;
- std::shared_ptr<PartAndPartSupplierGenerator> part_and_part_supp_generator_;
- std::shared_ptr<OrdersAndLineItemGenerator> orders_and_line_item_generator_;
- };
- }
-}
+ std::shared_ptr<PartAndPartSupplierGenerator> part_and_part_supp_generator_{};
+ std::shared_ptr<OrdersAndLineItemGenerator> orders_and_line_item_generator_{};
+};
+} // namespace compute
+} // namespace arrow
diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc
index 686b909811a..21ff4fb8a83 100644
--- a/cpp/src/arrow/compute/exec/tpch_node_test.cc
+++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc
@@ -18,12 +18,13 @@
 #include
 #include "arrow/api.h"
+#include "arrow/array/validate.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec/test_util.h"
+#include "arrow/compute/exec/tpch_node.h"
 #include "arrow/compute/exec/util.h"
 #include "arrow/compute/kernels/row_encoder.h"
 #include "arrow/compute/kernels/test_util.h"
-#include "arrow/compute/exec/tpch_node.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/matchers.h"
 #include "arrow/testing/random.h"
@@ -31,698 +32,592 @@
 #include "arrow/util/make_unique.h"
 #include "arrow/util/pcg_random.h"
 #include "arrow/util/thread_pool.h"
-#include "arrow/array/validate.h"
 #include
-#include
 #include
+#include
-namespace arrow
-{
- namespace compute
- {
- static constexpr uint32_t kStartDate = 8035; // January 1, 1992 is 8035 days after January 1, 1970
- static constexpr uint32_t kEndDate = 10591; // December 12, 1998 is 10591 days after January 1, 1970
-
- void ValidateBatch(const ExecBatch &batch)
- {
- for(const Datum &d : batch.values)
- ASSERT_OK(arrow::internal::ValidateArray(*d.array()));
- }
-
- void VerifyUniqueKey(
- std::unordered_set<int32_t> &seen,
- const Datum &d,
- int32_t min,
- int32_t max)
- {
- const int32_t *keys = reinterpret_cast<const int32_t *>(d.array()->buffers[1]->data());
- int64_t num_keys = d.length();
- for(int64_t i = 0; i < num_keys; i++)
- {
- ASSERT_TRUE(seen.insert(keys[i]).second);
- ASSERT_LE(keys[i], max);
- ASSERT_GE(keys[i], min);
- }
- }
-
- void VerifyStringAndNumber_Single(
- const char *row,
- const char *prefix,
- const int64_t i,
- const int32_t *nums,
- int
byte_width, - bool verify_padding) - { - int num_offset = static_cast(std::strlen(prefix)); - ASSERT_EQ(std::memcmp(row, prefix, num_offset), 0) << row << ", prefix=" << prefix << ", i=" << i; - const char *num_str = row + num_offset; - int64_t num = 0; - int ibyte = static_cast(num_offset); - for(; *num_str && ibyte < byte_width; ibyte++) - { - num *= 10; - ASSERT_TRUE(std::isdigit(*num_str)); - num += *num_str++ - '0'; - } - if(nums) - { - ASSERT_EQ(static_cast(num), nums[i]); - } - if(verify_padding) - { - int num_chars = ibyte - num_offset; - ASSERT_GE(num_chars, 9); - } - } - - void VerifyStringAndNumber_FixedWidth( - const Datum &strings, - const Datum &numbers, - int byte_width, - const char *prefix, - bool verify_padding = true) - { - int64_t length = strings.length(); - const char *str = reinterpret_cast( - strings.array()->buffers[1]->data()); - - const int32_t *nums = nullptr; - if(numbers.kind() != Datum::NONE) - { - ASSERT_EQ(length, numbers.length()); - nums = reinterpret_cast( - numbers.array()->buffers[1]->data()); - } - - for(int64_t i = 0; i < length; i++) - { - const char *row = str + i * byte_width; - VerifyStringAndNumber_Single(row, prefix, i, nums, byte_width, verify_padding); - } - } - - void VerifyStringAndNumber_Varlen( - const Datum &strings, - const Datum &numbers, - const char *prefix, - bool verify_padding = true) - { - int64_t length = strings.length(); - const int32_t *offsets = reinterpret_cast( - strings.array()->buffers[1]->data()); - const char *str = reinterpret_cast( - strings.array()->buffers[2]->data()); - - const int32_t *nums = nullptr; - if(numbers.kind() != Datum::NONE) - { - ASSERT_EQ(length, numbers.length()); - nums = reinterpret_cast( - numbers.array()->buffers[1]->data()); - } - - for(int64_t i = 0; i < length; i++) - { - char tmp_str[256] = {}; - int32_t start = offsets[i]; - int32_t str_len = offsets[i + 1] - offsets[i]; - std::memcpy(tmp_str, str + start, str_len); - VerifyStringAndNumber_Single( - tmp_str, - prefix, - i, - nums, - sizeof(tmp_str), - verify_padding); - } - } - - void VerifyVString(const Datum &d, int min_length, int max_length) - { - int64_t length = d.length(); - const int32_t *off = reinterpret_cast( - d.array()->buffers[1]->data()); - const char *str = reinterpret_cast( - d.array()->buffers[2]->data()); - for(int64_t i = 0; i < length; i++) - { - int32_t start = off[i]; - int32_t end = off[i + 1]; - int32_t str_len = end - start; - ASSERT_LE(str_len, max_length); - ASSERT_GE(str_len, min_length); - for(int32_t i = start; i < end; i++) - { - bool is_valid = std::isdigit(str[i]) || std::isalpha(str[i]) || str[i] == ',' || str[i] == ' '; - ASSERT_TRUE(is_valid) << "Character " << str[i] << " is not a digit, a letter, a comma, or a space"; - } - } - } - - void VerifyModuloBetween(const Datum &d, int32_t min, int32_t max, int32_t mod) - { - int64_t length = d.length(); - const int32_t *n = reinterpret_cast(d.array()->buffers[1]->data()); - for(int64_t i = 0; i < length; i++) - { - int32_t m = n[i] % mod; - ASSERT_GE(m, min) << "Value must be between " << min << " and " << max << " mod " << mod << ", " << n[i] << " % " << mod << " = " << m; - ASSERT_LE(m, max) << "Value must be between " << min << " and " << max << " mod " << mod << ", " << n[i] << " % " << mod << " = " << m; - } - } - - void VerifyAllBetween(const Datum &d, int32_t min, int32_t max) - { - int64_t length = d.length(); - const int32_t *n = reinterpret_cast(d.array()->buffers[1]->data()); - for(int64_t i = 0; i < length; i++) - { - ASSERT_GE(n[i], min) << 
"Value must be between " << min << " and " << max << ", got " << n[i]; - ASSERT_LE(n[i], max) << "Value must be between " << min << " and " << max << ", got " << n[i]; - } - } - - void VerifyNationKey(const Datum &d) - { - VerifyAllBetween(d, 0, 24); - } - - void VerifyPhone(const Datum &d) - { - int64_t length = d.length(); - const char *phones = reinterpret_cast(d.array()->buffers[1]->data()); - constexpr int kByteWidth = 15; // This is common for all PHONE columns - for(int64_t i = 0; i < length; i++) - { - const char *row = phones + i * kByteWidth; - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_EQ(*row++, '-'); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_EQ(*row++, '-'); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_EQ(*row++, '-'); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - } - } - - void VerifyDecimalsBetween(const Datum &d, int64_t min, int64_t max) - { - int64_t length = d.length(); - const Decimal128 *decs = reinterpret_cast( - d.array()->buffers[1]->data()); - for(int64_t i = 0; i < length; i++) - { - int64_t val = static_cast(decs[i]); - ASSERT_LE(val, max); - ASSERT_GE(val, min); - } - } - - void VerifyCorrectNumberOfWords_Varlen(const Datum &d, int num_words) - { - int expected_num_spaces = num_words - 1; - int64_t length = d.length(); - const int32_t *offsets = reinterpret_cast( - d.array()->buffers[1]->data()); - const char *str = reinterpret_cast( - d.array()->buffers[2]->data()); - - for(int64_t i = 0; i < length; i++) - { - int actual_num_spaces = 0; - - int32_t start = offsets[i]; - int32_t end = offsets[i + 1]; - int32_t str_len = end - start; - char tmp_str[256] = {}; - std::memcpy(tmp_str, str + start, str_len); - bool is_only_alphas_or_spaces = true; - for(int32_t j = offsets[i]; j < offsets[i + 1]; j++) - { - bool is_space = str[j] == ' '; - actual_num_spaces += is_space; - is_only_alphas_or_spaces &= (is_space || std::isalpha(str[j])); - } - ASSERT_TRUE(is_only_alphas_or_spaces) << "Words must be composed only of letters, got " << tmp_str; - ASSERT_EQ(actual_num_spaces, expected_num_spaces) << "Wrong number of spaces in " << tmp_str; - } - } - - void VerifyCorrectNumberOfWords_FixedWidth( - const Datum &d, - int num_words, - int byte_width) - { - int expected_num_spaces = num_words - 1; - int64_t length = d.length(); - const char *str = reinterpret_cast( - d.array()->buffers[1]->data()); - - for(int64_t i = 0; i < length; i++) - { - int actual_num_spaces = 0; - const char *row = str + i * byte_width; - bool is_only_alphas_or_spaces = true; - for(int32_t j = 0; j < byte_width && row[j]; j++) - { - bool is_space = row[j] == ' '; - actual_num_spaces += is_space; - is_only_alphas_or_spaces &= (is_space || std::isalpha(row[j])); - } - ASSERT_TRUE(is_only_alphas_or_spaces) << "Words must be composed only of letters, got " << row; - ASSERT_EQ(actual_num_spaces, expected_num_spaces) << "Wrong number of spaces in " << row; - } - } - - void VerifyOneOf(const Datum &d, const std::unordered_set &possibilities) - { - int64_t length = d.length(); - const char *col = reinterpret_cast( - d.array()->buffers[1]->data()); - for(int64_t i = 0; i < length; i++) - ASSERT_TRUE(possibilities.find(col[i]) != possibilities.end()); - } - - void VerifyOneOf( - const Datum &d, - int32_t byte_width, - const 
std::unordered_set &possibilities) - { - int64_t length = d.length(); - const char *col = reinterpret_cast( - d.array()->buffers[1]->data()); - for(int64_t i = 0; i < length; i++) - { - const char *row = col + i * byte_width; - char tmp_str[256] = {}; - std::memcpy(tmp_str, row, byte_width); - ASSERT_TRUE(possibilities.find(tmp_str) != possibilities.end()) << tmp_str << " is not a valid string."; - } - } - - void CountInstances(std::unordered_map &counts, const Datum &d) - { - int64_t length = d.length(); - const int32_t *nums = reinterpret_cast( - d.array()->buffers[1]->data()); - for(int64_t i = 0; i < length; i++) - counts[nums[i]]++; - } - - void CountModifiedComments(const Datum &d, int &good_count, int &bad_count) - { - int64_t length = d.length(); - const int32_t *offsets = reinterpret_cast( - d.array()->buffers[1]->data()); - const char *str = reinterpret_cast( - d.array()->buffers[2]->data()); - // Length of S_COMMENT is at most 100 - char tmp_string[101]; - for(int64_t i = 0; i < length; i++) - { - const char *row = str + offsets[i]; - int32_t row_length = offsets[i + 1] - offsets[i]; - std::memset(tmp_string, 0, sizeof(tmp_string)); - std::memcpy(tmp_string, row, row_length); - char *customer = std::strstr(tmp_string, "Customer"); - char *recommends = std::strstr(tmp_string, "Recommends"); - char *complaints = std::strstr(tmp_string, "Complaints"); - if(customer) - { - ASSERT_TRUE((recommends != nullptr) ^ (complaints != nullptr)); - if(recommends) - good_count++; - if(complaints) - bad_count++; - } - } - } - - TEST(TpchNode, ScaleFactor) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get(), 0.25f); - ExecNode *table = *gen.Supplier(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - - int64_t kExpectedRows = 2500; - int64_t num_rows = 0; - for(auto &batch : res) - num_rows += batch.length; - ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - - TEST(TpchNode, Supplier) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode *table = *gen.Supplier(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - - int64_t kExpectedRows = 10000; - int64_t num_rows = 0; - - std::unordered_set seen_suppkey; - int good_count = 0; - int bad_count = 0; - for(auto &batch : res) - { - ValidateBatch(batch); - VerifyUniqueKey( - seen_suppkey, - batch[0], +namespace arrow { +namespace compute { +static constexpr uint32_t kStartDate = + 8035; // January 1, 1992 is 8035 days after January 1, 1970 +static constexpr uint32_t kEndDate = + 10591; // December 12, 1998 is 10591 days after January 1, 1970 + +void ValidateBatch(const ExecBatch& batch) { + for (const Datum& d : batch.values) + ASSERT_OK(arrow::internal::ValidateArray(*d.array())); +} + +void VerifyUniqueKey(std::unordered_set& seen, const Datum& d, int32_t min, + int32_t max) { + const int32_t* keys = reinterpret_cast(d.array()->buffers[1]->data()); + int64_t num_keys = 
d.length(); + for (int64_t i = 0; i < num_keys; i++) { + ASSERT_TRUE(seen.insert(keys[i]).second); + ASSERT_LE(keys[i], max); + ASSERT_GE(keys[i], min); + } +} + +void VerifyStringAndNumber_Single(const char* row, const char* prefix, const int64_t i, + const int32_t* nums, int byte_width, + bool verify_padding) { + int num_offset = static_cast(std::strlen(prefix)); + ASSERT_EQ(std::memcmp(row, prefix, num_offset), 0) + << row << ", prefix=" << prefix << ", i=" << i; + const char* num_str = row + num_offset; + int64_t num = 0; + int ibyte = static_cast(num_offset); + for (; *num_str && ibyte < byte_width; ibyte++) { + num *= 10; + ASSERT_TRUE(std::isdigit(*num_str)); + num += *num_str++ - '0'; + } + if (nums) { + ASSERT_EQ(static_cast(num), nums[i]); + } + if (verify_padding) { + int num_chars = ibyte - num_offset; + ASSERT_GE(num_chars, 9); + } +} + +void VerifyStringAndNumber_FixedWidth(const Datum& strings, const Datum& numbers, + int byte_width, const char* prefix, + bool verify_padding = true) { + int64_t length = strings.length(); + const char* str = reinterpret_cast(strings.array()->buffers[1]->data()); + + const int32_t* nums = nullptr; + if (numbers.kind() != Datum::NONE) { + ASSERT_EQ(length, numbers.length()); + nums = reinterpret_cast(numbers.array()->buffers[1]->data()); + } + + for (int64_t i = 0; i < length; i++) { + const char* row = str + i * byte_width; + VerifyStringAndNumber_Single(row, prefix, i, nums, byte_width, verify_padding); + } +} + +void VerifyStringAndNumber_Varlen(const Datum& strings, const Datum& numbers, + const char* prefix, bool verify_padding = true) { + int64_t length = strings.length(); + const int32_t* offsets = + reinterpret_cast(strings.array()->buffers[1]->data()); + const char* str = reinterpret_cast(strings.array()->buffers[2]->data()); + + const int32_t* nums = nullptr; + if (numbers.kind() != Datum::NONE) { + ASSERT_EQ(length, numbers.length()); + nums = reinterpret_cast(numbers.array()->buffers[1]->data()); + } + + for (int64_t i = 0; i < length; i++) { + char tmp_str[256] = {}; + int32_t start = offsets[i]; + int32_t str_len = offsets[i + 1] - offsets[i]; + std::memcpy(tmp_str, str + start, str_len); + VerifyStringAndNumber_Single(tmp_str, prefix, i, nums, sizeof(tmp_str), + verify_padding); + } +} + +void VerifyVString(const Datum& d, int min_length, int max_length) { + int64_t length = d.length(); + const int32_t* off = reinterpret_cast(d.array()->buffers[1]->data()); + const char* str = reinterpret_cast(d.array()->buffers[2]->data()); + for (int64_t i = 0; i < length; i++) { + int32_t start = off[i]; + int32_t end = off[i + 1]; + int32_t str_len = end - start; + ASSERT_LE(str_len, max_length); + ASSERT_GE(str_len, min_length); + for (int32_t i = start; i < end; i++) { + bool is_valid = + std::isdigit(str[i]) || std::isalpha(str[i]) || str[i] == ',' || str[i] == ' '; + ASSERT_TRUE(is_valid) << "Character " << str[i] + << " is not a digit, a letter, a comma, or a space"; + } + } +} + +void VerifyModuloBetween(const Datum& d, int32_t min, int32_t max, int32_t mod) { + int64_t length = d.length(); + const int32_t* n = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) { + int32_t m = n[i] % mod; + ASSERT_GE(m, min) << "Value must be between " << min << " and " << max << " mod " + << mod << ", " << n[i] << " % " << mod << " = " << m; + ASSERT_LE(m, max) << "Value must be between " << min << " and " << max << " mod " + << mod << ", " << n[i] << " % " << mod << " = " << m; + } +} + +void VerifyAllBetween(const 
Datum& d, int32_t min, int32_t max) { + int64_t length = d.length(); + const int32_t* n = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) { + ASSERT_GE(n[i], min) << "Value must be between " << min << " and " << max << ", got " + << n[i]; + ASSERT_LE(n[i], max) << "Value must be between " << min << " and " << max << ", got " + << n[i]; + } +} + +void VerifyNationKey(const Datum& d) { VerifyAllBetween(d, 0, 24); } + +void VerifyPhone(const Datum& d) { + int64_t length = d.length(); + const char* phones = reinterpret_cast(d.array()->buffers[1]->data()); + constexpr int kByteWidth = 15; // This is common for all PHONE columns + for (int64_t i = 0; i < length; i++) { + const char* row = phones + i * kByteWidth; + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, '-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, '-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_EQ(*row++, '-'); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::isdigit(*row++)); + } +} + +void VerifyDecimalsBetween(const Datum& d, int64_t min, int64_t max) { + int64_t length = d.length(); + const Decimal128* decs = + reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) { + int64_t val = static_cast(decs[i]); + ASSERT_LE(val, max); + ASSERT_GE(val, min); + } +} + +void VerifyCorrectNumberOfWords_Varlen(const Datum& d, int num_words) { + int expected_num_spaces = num_words - 1; + int64_t length = d.length(); + const int32_t* offsets = + reinterpret_cast(d.array()->buffers[1]->data()); + const char* str = reinterpret_cast(d.array()->buffers[2]->data()); + + for (int64_t i = 0; i < length; i++) { + int actual_num_spaces = 0; + + int32_t start = offsets[i]; + int32_t end = offsets[i + 1]; + int32_t str_len = end - start; + char tmp_str[256] = {}; + std::memcpy(tmp_str, str + start, str_len); + bool is_only_alphas_or_spaces = true; + for (int32_t j = offsets[i]; j < offsets[i + 1]; j++) { + bool is_space = str[j] == ' '; + actual_num_spaces += is_space; + is_only_alphas_or_spaces &= (is_space || std::isalpha(str[j])); + } + ASSERT_TRUE(is_only_alphas_or_spaces) + << "Words must be composed only of letters, got " << tmp_str; + ASSERT_EQ(actual_num_spaces, expected_num_spaces) + << "Wrong number of spaces in " << tmp_str; + } +} + +void VerifyCorrectNumberOfWords_FixedWidth(const Datum& d, int num_words, + int byte_width) { + int expected_num_spaces = num_words - 1; + int64_t length = d.length(); + const char* str = reinterpret_cast(d.array()->buffers[1]->data()); + + for (int64_t i = 0; i < length; i++) { + int actual_num_spaces = 0; + const char* row = str + i * byte_width; + bool is_only_alphas_or_spaces = true; + for (int32_t j = 0; j < byte_width && row[j]; j++) { + bool is_space = row[j] == ' '; + actual_num_spaces += is_space; + is_only_alphas_or_spaces &= (is_space || std::isalpha(row[j])); + } + ASSERT_TRUE(is_only_alphas_or_spaces) + << "Words must be composed only of letters, got " << row; + ASSERT_EQ(actual_num_spaces, expected_num_spaces) + << "Wrong number of spaces in " << row; + } +} + +void VerifyOneOf(const Datum& d, const std::unordered_set& possibilities) { + int64_t length = d.length(); + const char* col = reinterpret_cast(d.array()->buffers[1]->data()); + for 
(int64_t i = 0; i < length; i++) + ASSERT_TRUE(possibilities.find(col[i]) != possibilities.end()); +} + +void VerifyOneOf(const Datum& d, int32_t byte_width, + const std::unordered_set& possibilities) { + int64_t length = d.length(); + const char* col = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) { + const char* row = col + i * byte_width; + char tmp_str[256] = {}; + std::memcpy(tmp_str, row, byte_width); + ASSERT_TRUE(possibilities.find(tmp_str) != possibilities.end()) + << tmp_str << " is not a valid string."; + } +} + +void CountInstances(std::unordered_map& counts, const Datum& d) { + int64_t length = d.length(); + const int32_t* nums = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) counts[nums[i]]++; +} + +void CountModifiedComments(const Datum& d, int& good_count, int& bad_count) { + int64_t length = d.length(); + const int32_t* offsets = + reinterpret_cast(d.array()->buffers[1]->data()); + const char* str = reinterpret_cast(d.array()->buffers[2]->data()); + // Length of S_COMMENT is at most 100 + char tmp_string[101]; + for (int64_t i = 0; i < length; i++) { + const char* row = str + offsets[i]; + int32_t row_length = offsets[i + 1] - offsets[i]; + std::memset(tmp_string, 0, sizeof(tmp_string)); + std::memcpy(tmp_string, row, row_length); + char* customer = std::strstr(tmp_string, "Customer"); + char* recommends = std::strstr(tmp_string, "Recommends"); + char* complaints = std::strstr(tmp_string, "Complaints"); + if (customer) { + ASSERT_TRUE((recommends != nullptr) ^ (complaints != nullptr)); + if (recommends) good_count++; + if (complaints) bad_count++; + } + } +} + +TEST(TpchNode, ScaleFactor) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get(), 0.25f); + ExecNode* table = *gen.Supplier(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + int64_t kExpectedRows = 2500; + int64_t num_rows = 0; + for (auto& batch : res) num_rows += batch.length; + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); +} + +TEST(TpchNode, Supplier) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode* table = *gen.Supplier(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + int64_t kExpectedRows = 10000; + int64_t num_rows = 0; + + std::unordered_set seen_suppkey; + int good_count = 0; + int bad_count = 0; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(seen_suppkey, batch[0], /*min=*/1, /*max=*/static_cast(kExpectedRows)); - VerifyStringAndNumber_FixedWidth(batch[1], batch[0], /*byte_width=*/25, "Supplie#r"); - VerifyVString(batch[2], /*min_length=*/10, /*max_length=*/40); - VerifyNationKey(batch[3]); - VerifyPhone(batch[4]); - VerifyDecimalsBetween(batch[5], -99999, 999999); - CountModifiedComments(batch[6], good_count, bad_count); - num_rows += batch.length; - } - ASSERT_EQ(seen_suppkey.size(), kExpectedRows); - 
ASSERT_EQ(num_rows, kExpectedRows); - ASSERT_EQ(good_count, 5); - ASSERT_EQ(bad_count, 5); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - - TEST(TpchNode, Part) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode *table = *gen.Part(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - - int64_t kExpectedRows = 200000; - int64_t num_rows = 0; - - std::unordered_set seen_partkey; - for(auto &batch : res) - { - ValidateBatch(batch); - VerifyUniqueKey( - seen_partkey, - batch[0], + VerifyStringAndNumber_FixedWidth(batch[1], batch[0], /*byte_width=*/25, "Supplie#r"); + VerifyVString(batch[2], /*min_length=*/10, /*max_length=*/40); + VerifyNationKey(batch[3]); + VerifyPhone(batch[4]); + VerifyDecimalsBetween(batch[5], -99999, 999999); + CountModifiedComments(batch[6], good_count, bad_count); + num_rows += batch.length; + } + ASSERT_EQ(seen_suppkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + ASSERT_EQ(good_count, 5); + ASSERT_EQ(bad_count, 5); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); +} + +TEST(TpchNode, Part) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode* table = *gen.Part(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + int64_t kExpectedRows = 200000; + int64_t num_rows = 0; + + std::unordered_set seen_partkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(seen_partkey, batch[0], /*min=*/1, /*max=*/static_cast(kExpectedRows)); - VerifyCorrectNumberOfWords_Varlen( - batch[1], - /*num_words*=*/5); - VerifyStringAndNumber_FixedWidth( - batch[2], - Datum(), - /*byte_width=*/25, - "Manufacturer#", - /*verify_padding=*/false); - VerifyStringAndNumber_FixedWidth( - batch[3], - Datum(), - /*byte_width=*/10, - "Brand#", - /*verify_padding=*/false); - VerifyCorrectNumberOfWords_Varlen( - batch[4], - /*num_words=*/3); - VerifyAllBetween(batch[5], /*min=*/1, /*max=*/50); - VerifyCorrectNumberOfWords_FixedWidth( - batch[6], - /*num_words=*/2, - /*byte_width=*/10); - num_rows += batch.length; - } - ASSERT_EQ(seen_partkey.size(), kExpectedRows); - ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - - TEST(TpchNode, PartSupp) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode *table = *gen.PartSupp(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - - constexpr int64_t kExpectedRows = 800000; - int64_t num_rows = 0; - - std::unordered_map counts; - for(auto &batch : res) - { - ValidateBatch(batch); - CountInstances(counts, batch[0]); - VerifyAllBetween(batch[2], 1, 9999); - VerifyDecimalsBetween(batch[3], 100, 100000); 
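// PARTSUPP holds exactly four supplier rows per part, so at SF = 1 there are
// 800,000 rows (4 * 200,000 parts); the per-partkey count of 4 is asserted
// below. The decimal bounds above are unscaled scale-2 values: 100..100000
// corresponds to a PS_SUPPLYCOST of 1.00..1000.00, and PS_AVAILQTY (batch[2])
// is drawn from [1, 9999].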
- num_rows += batch.length; - } - for(auto &partkey : counts) - ASSERT_EQ(partkey.second, 4) << "Key " << partkey.first << " has count " << partkey.second; - ASSERT_EQ(counts.size(), kExpectedRows / 4); - - ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - - TEST(TpchNode, Customer) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode *table = *gen.Customer(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - - const int64_t kExpectedRows = 150000; - int64_t num_rows = 0; - - std::unordered_set seen_custkey; - for(auto &batch : res) - { - ValidateBatch(batch); - VerifyUniqueKey( - seen_custkey, - batch[0], + VerifyCorrectNumberOfWords_Varlen(batch[1], + /*num_words*=*/5); + VerifyStringAndNumber_FixedWidth(batch[2], Datum(), + /*byte_width=*/25, "Manufacturer#", + /*verify_padding=*/false); + VerifyStringAndNumber_FixedWidth(batch[3], Datum(), + /*byte_width=*/10, "Brand#", + /*verify_padding=*/false); + VerifyCorrectNumberOfWords_Varlen(batch[4], + /*num_words=*/3); + VerifyAllBetween(batch[5], /*min=*/1, /*max=*/50); + VerifyCorrectNumberOfWords_FixedWidth(batch[6], + /*num_words=*/2, + /*byte_width=*/10); + num_rows += batch.length; + } + ASSERT_EQ(seen_partkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); +} + +TEST(TpchNode, PartSupp) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode* table = *gen.PartSupp(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + constexpr int64_t kExpectedRows = 800000; + int64_t num_rows = 0; + + std::unordered_map counts; + for (auto& batch : res) { + ValidateBatch(batch); + CountInstances(counts, batch[0]); + VerifyAllBetween(batch[2], 1, 9999); + VerifyDecimalsBetween(batch[3], 100, 100000); + num_rows += batch.length; + } + for (auto& partkey : counts) + ASSERT_EQ(partkey.second, 4) + << "Key " << partkey.first << " has count " << partkey.second; + ASSERT_EQ(counts.size(), kExpectedRows / 4); + + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); +} + +TEST(TpchNode, Customer) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode* table = *gen.Customer(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + const int64_t kExpectedRows = 150000; + int64_t num_rows = 0; + + std::unordered_set seen_custkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(seen_custkey, batch[0], /*min=*/1, /*max=*/static_cast(kExpectedRows)); - VerifyStringAndNumber_Varlen( - batch[1], - batch[0], - "Customer#"); - 
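// Per the TPC-H spec, C_NAME is the text "Customer#" followed by the customer
// key zero-padded to nine digits (e.g. custkey 42 gives "Customer#000000042");
// VerifyStringAndNumber_Varlen checks each name in batch[1] against the key in
// batch[0]. The C_ADDRESS check below expects a random v-string of 10 to 40
// characters.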
VerifyVString(batch[2], /*min=*/10, /*max=*/40); - VerifyNationKey(batch[3]); - VerifyPhone(batch[4]); - VerifyDecimalsBetween(batch[5], -99999, 999999); - VerifyCorrectNumberOfWords_FixedWidth( - batch[6], - /*num_words=*/1, - /*byte_width=*/10); - num_rows += batch.length; - } - ASSERT_EQ(seen_custkey.size(), kExpectedRows); - ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - - TEST(TpchNode, Orders) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode *table = *gen.Orders(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - - constexpr int64_t kExpectedRows = 1500000; - int64_t num_rows = 0; - - std::unordered_set seen_orderkey; - for(auto &batch : res) - { - ValidateBatch(batch); - VerifyUniqueKey( - seen_orderkey, - batch[0], + VerifyStringAndNumber_Varlen(batch[1], batch[0], "Customer#"); + VerifyVString(batch[2], /*min=*/10, /*max=*/40); + VerifyNationKey(batch[3]); + VerifyPhone(batch[4]); + VerifyDecimalsBetween(batch[5], -99999, 999999); + VerifyCorrectNumberOfWords_FixedWidth(batch[6], + /*num_words=*/1, + /*byte_width=*/10); + num_rows += batch.length; + } + ASSERT_EQ(seen_custkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); +} + +TEST(TpchNode, Orders) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode* table = *gen.Orders(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + constexpr int64_t kExpectedRows = 1500000; + int64_t num_rows = 0; + + std::unordered_set seen_orderkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(seen_orderkey, batch[0], /*min=*/1, /*max=*/static_cast(4 * kExpectedRows)); - VerifyAllBetween(batch[1], /*min=*/1, /*max=*/static_cast(kExpectedRows)); - VerifyModuloBetween(batch[1], /*min=*/1, /*max=*/2, /*mod=*/3); - VerifyOneOf(batch[2], { 'F', 'O', 'P' }); - VerifyAllBetween(batch[4], kStartDate, kEndDate - 151); - VerifyOneOf(batch[5], - /*byte_width=*/15, - { - "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW", - }); - VerifyStringAndNumber_FixedWidth( - batch[6], - Datum(), - /*byte_width=*/15, - "Clerk#", - /*verify_padding=*/true); - VerifyAllBetween(batch[7], /*min=*/0, /*max=*/0); - num_rows += batch.length; - } - ASSERT_EQ(seen_orderkey.size(), kExpectedRows); - ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - - TEST(TpchNode, Lineitem) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode *table = *gen.Lineitem(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - 
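// A note on the kStartDate / kEndDate arithmetic behind the date assertions in
// the Orders and Lineitem tests: TPC-H fixes STARTDATE = 1992-01-01 (which is
// 22 * 365 + 5 leap days = 8035 days after the 1970-01-01 epoch) and
// ENDDATE = 1998-12-31 (day 10591), and draws O_ORDERDATE uniformly from
// [STARTDATE, ENDDATE - 151 days]. L_SHIPDATE is O_ORDERDATE plus 1..121 days,
// L_COMMITDATE is O_ORDERDATE plus 30..90 days, and L_RECEIPTDATE is
// L_SHIPDATE plus 1..30 days, which is exactly where the VerifyAllBetween
// bounds on those columns come from.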
std::unordered_map counts; - for(auto &batch : res) - { - ValidateBatch(batch); - CountInstances(counts, batch[0]); - VerifyAllBetween(batch[1], /*min=*/1, /*max=*/200000); - VerifyAllBetween(batch[3], /*min=*/1, /*max=*/7); - VerifyDecimalsBetween(batch[4], /*min=*/100, /*max=*/5000); - VerifyDecimalsBetween(batch[6], /*min=*/0, /*max=*/10); - VerifyDecimalsBetween(batch[7], /*min=*/0, /*max=*/8); - VerifyOneOf(batch[8], { 'R', 'A', 'N' }); - VerifyOneOf(batch[9], { 'O', 'F' }); - VerifyAllBetween(batch[10], kStartDate + 1, kEndDate - 151 + 121); - VerifyAllBetween(batch[11], kStartDate + 30, kEndDate - 151 + 90); - VerifyAllBetween(batch[12], kStartDate + 2, kEndDate - 151 + 121 + 30); - VerifyOneOf( - batch[13], - /*byte_width=*/25, - { - "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN", - }); - VerifyOneOf( - batch[14], - /*byte_width=*/10, - { - "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB", - }); - } - for(auto &count : counts) - { - ASSERT_GE(count.second, 1); - ASSERT_LE(count.second, 7); - } - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - - TEST(TpchNode, Nation) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode *table = *gen.Nation(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - - constexpr int64_t kExpectedRows = 25; - int64_t num_rows = 0; - - std::unordered_set seen_nationkey; - for(auto &batch : res) - { - ValidateBatch(batch); - VerifyUniqueKey(seen_nationkey, batch[0], 0, kExpectedRows - 1); - VerifyOneOf( - batch[1], - /*byte_width=*/25, - { - "ALGERIA", "ARGENTINA", "BRAZIL", - "CANADA", "EGYPT", "ETHIOPIA", - "FRANCE", "GERMANY", "INDIA", - "INDONESIA", "IRAN", "IRAQ", - "JAPAN", "JORDAN", "KENYA", - "MOROCCO", "MOZAMBIQUE", "PERU", - "CHINA", "ROMANIA", "SAUDI ARABIA", - "VIETNAM", "RUSSIA", "UNITED KINGDOM", - "UNITED STATES" - }); - VerifyAllBetween(batch[2], 0, 4); - num_rows += batch.length; - } - ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - - TEST(TpchNode, Region) - { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode *table = *gen.Region(); - AsyncGenerator> sink_gen; - Declaration sink("sink", { Declaration::Input(table) }, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - - constexpr int64_t kExpectedRows = 5; - int64_t num_rows = 0; - - std::unordered_set seen_regionkey; - for(auto &batch : res) - { - ValidateBatch(batch); - VerifyUniqueKey(seen_regionkey, batch[0], 0, kExpectedRows - 1); - VerifyOneOf( - batch[1], - /*byte_width=*/25, - { - "AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST" - }); - - num_rows += batch.length; - } - ASSERT_EQ(num_rows, 5); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); - } - } + VerifyAllBetween(batch[1], /*min=*/1, /*max=*/static_cast(kExpectedRows)); + VerifyModuloBetween(batch[1], /*min=*/1, /*max=*/2, /*mod=*/3); + VerifyOneOf(batch[2], {'F', 'O', 'P'}); + VerifyAllBetween(batch[4], kStartDate, kEndDate - 151); + VerifyOneOf(batch[5], + /*byte_width=*/15, + { + 
"1-URGENT", + "2-HIGH", + "3-MEDIUM", + "4-NOT SPECIFIED", + "5-LOW", + }); + VerifyStringAndNumber_FixedWidth(batch[6], Datum(), + /*byte_width=*/15, "Clerk#", + /*verify_padding=*/true); + VerifyAllBetween(batch[7], /*min=*/0, /*max=*/0); + num_rows += batch.length; + } + ASSERT_EQ(seen_orderkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); +} + +TEST(TpchNode, Lineitem) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode* table = *gen.Lineitem(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + std::unordered_map counts; + for (auto& batch : res) { + ValidateBatch(batch); + CountInstances(counts, batch[0]); + VerifyAllBetween(batch[1], /*min=*/1, /*max=*/200000); + VerifyAllBetween(batch[3], /*min=*/1, /*max=*/7); + VerifyDecimalsBetween(batch[4], /*min=*/100, /*max=*/5000); + VerifyDecimalsBetween(batch[6], /*min=*/0, /*max=*/10); + VerifyDecimalsBetween(batch[7], /*min=*/0, /*max=*/8); + VerifyOneOf(batch[8], {'R', 'A', 'N'}); + VerifyOneOf(batch[9], {'O', 'F'}); + VerifyAllBetween(batch[10], kStartDate + 1, kEndDate - 151 + 121); + VerifyAllBetween(batch[11], kStartDate + 30, kEndDate - 151 + 90); + VerifyAllBetween(batch[12], kStartDate + 2, kEndDate - 151 + 121 + 30); + VerifyOneOf(batch[13], + /*byte_width=*/25, + { + "DELIVER IN PERSON", + "COLLECT COD", + "NONE", + "TAKE BACK RETURN", + }); + VerifyOneOf(batch[14], + /*byte_width=*/10, + { + "REG AIR", + "AIR", + "RAIL", + "SHIP", + "TRUCK", + "MAIL", + "FOB", + }); + } + for (auto& count : counts) { + ASSERT_GE(count.second, 1); + ASSERT_LE(count.second, 7); + } + arrow::internal::GetCpuThreadPool()->WaitForIdle(); +} + +TEST(TpchNode, Nation) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode* table = *gen.Nation(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + constexpr int64_t kExpectedRows = 25; + int64_t num_rows = 0; + + std::unordered_set seen_nationkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(seen_nationkey, batch[0], 0, kExpectedRows - 1); + VerifyOneOf( + batch[1], + /*byte_width=*/25, + {"ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", + "ETHIOPIA", "FRANCE", "GERMANY", "INDIA", "INDONESIA", + "IRAN", "IRAQ", "JAPAN", "JORDAN", "KENYA", + "MOROCCO", "MOZAMBIQUE", "PERU", "CHINA", "ROMANIA", + "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES"}); + VerifyAllBetween(batch[2], 0, 4); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, kExpectedRows); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); +} + +TEST(TpchNode, Region) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(&ctx); + TpchGen gen = *TpchGen::Make(plan.get()); + ExecNode* table = *gen.Region(); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); + 
std::ignore = *sink.AddToPlan(plan.get()); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + + constexpr int64_t kExpectedRows = 5; + int64_t num_rows = 0; + + std::unordered_set seen_regionkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(seen_regionkey, batch[0], 0, kExpectedRows - 1); + VerifyOneOf(batch[1], + /*byte_width=*/25, + {"AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST"}); + + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 5); + arrow::internal::GetCpuThreadPool()->WaitForIdle(); } +} // namespace compute +} // namespace arrow diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 9d3ccc73b52..73c25366584 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -24,8 +24,8 @@ #include #include #include -// TODO: We probably don't want to add dataset + filesystem here, so instead we'll probably -// want to move the definition of Tpch_Dbgen_Write if it works +// TODO: We probably don't want to add dataset + filesystem here, so instead we'll +// probably want to move the definition of Tpch_Dbgen_Write if it works #include #include #include @@ -303,14 +303,11 @@ std::shared_ptr ExecNode_TableSourceNode( // [[arrow::export]] std::shared_ptr Tpch_Dbgen( - const std::shared_ptr& plan, - int scale_factor, - std::string table_name - ) { - + const std::shared_ptr& plan, int scale_factor, + std::string table_name) { auto gen = ValueOrStop(arrow::compute::TpchGen::Make(plan.get(), scale_factor)); - compute::ExecNode *table; + compute::ExecNode* table; if (table_name == "part") { table = ValueOrStop(gen.Part()); } else if (table_name == "supplier") { @@ -333,40 +330,38 @@ std::shared_ptr Tpch_Dbgen( arrow::AsyncGenerator> sink_gen; - MakeExecNodeOrStop("sink", plan.get(), {table}, - compute::SinkNodeOptions{&sink_gen}); + MakeExecNodeOrStop("sink", plan.get(), {table}, compute::SinkNodeOptions{&sink_gen}); StopIfNotOk(plan->Validate()); StopIfNotOk(plan->StartProducing()); // If the generator is destroyed before being completely drained, inform plan std::shared_ptr stop_producing{nullptr, [plan](...) 
{ - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); + bool not_finished_yet = + plan->finished().TryAddCallback([&plan] { + return [plan](const arrow::Status&) {}; + }); - if (not_finished_yet) { - plan->StopProducing(); - } - }}; + if (not_finished_yet) { + plan->StopProducing(); + } + }}; return compute::MakeGeneratorReader( - table->output_schema(), - [stop_producing, plan, sink_gen] { return sink_gen(); }, gc_memory_pool()); + table->output_schema(), [stop_producing, plan, sink_gen] { return sink_gen(); }, + gc_memory_pool()); } // [[arrow::export]] -void Tpch_Dbgen_Write( - const std::shared_ptr& plan, - int scale_factor, - std::string table_name, - const std::shared_ptr& filesystem, std::string base_dir, - arrow::dataset::ExistingDataBehavior existing_data_behavior, int max_partitions -) { +void Tpch_Dbgen_Write(const std::shared_ptr& plan, int scale_factor, + std::string table_name, + const std::shared_ptr& filesystem, + std::string base_dir, + arrow::dataset::ExistingDataBehavior existing_data_behavior, + int max_partitions) { auto gen = ValueOrStop(arrow::compute::TpchGen::Make(plan.get(), scale_factor)); - compute::ExecNode *table; + compute::ExecNode* table; if (table_name == "part") { table = ValueOrStop(gen.Part()); } else if (table_name == "supplier") { @@ -388,14 +383,15 @@ void Tpch_Dbgen_Write( } // TODO: unhardcode this once it's working - auto base_path = base_dir + "/parquet_dataset"; + auto base_path = base_dir + "/parquet_dataset"; filesystem->CreateDir(base_path); auto format = std::make_shared(); ds::FileSystemDatasetWriteOptions write_options; write_options.file_write_options = format->DefaultWriteOptions(); - write_options.existing_data_behavior = ds::ExistingDataBehavior::kDeleteMatchingPartitions; + write_options.existing_data_behavior = + ds::ExistingDataBehavior::kDeleteMatchingPartitions; write_options.filesystem = filesystem; write_options.base_dir = base_path; write_options.partitioning = arrow::dataset::Partitioning::Default(); @@ -406,8 +402,7 @@ void Tpch_Dbgen_Write( // but I ran into namespace issues when doing it so I took it out to see if it // worked, but maybe that's what's causing the sefault? 
const ds::WriteNodeOptions options = - ds::WriteNodeOptions{write_options, table->output_schema()}; - + ds::WriteNodeOptions{write_options, table->output_schema()}; MakeExecNodeOrStop("consuming_sink", plan.get(), {table}, options); From 4b16296b4ef8cd3b3d440e8b7f8af32a89a16788 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 16 Mar 2022 13:45:38 -0700 Subject: [PATCH 22/34] Remove R stuff (so we can put it in another PR) --- r/DESCRIPTION | 1 - r/NAMESPACE | 1 - r/R/arrowExports.R | 8 --- r/R/tpch.R | 54 --------------- r/man/tpch_dbgen.Rd | 20 ------ r/src/arrowExports.cpp | 39 ----------- r/src/compute-exec.cpp | 128 ----------------------------------- r/tests/testthat/test-tpch.R | 53 --------------- 8 files changed, 304 deletions(-) delete mode 100644 r/R/tpch.R delete mode 100644 r/man/tpch_dbgen.Rd delete mode 100644 r/tests/testthat/test-tpch.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index ecbbfb79ac2..36a55c05b26 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -126,5 +126,4 @@ Collate: 'reexports-bit64.R' 'reexports-tidyselect.R' 'schema.R' - 'tpch.R' 'util.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index b24cad1fdb4..ae06e8e03aa 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -290,7 +290,6 @@ export(time64) export(timestamp) export(to_arrow) export(to_duckdb) -export(tpch_dbgen) export(type) export(uint16) export(uint32) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 01b73a71a96..e56e157413e 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -444,14 +444,6 @@ ExecNode_TableSourceNode <- function(plan, table) { .Call(`_arrow_ExecNode_TableSourceNode`, plan, table) } -Tpch_Dbgen <- function(plan, scale_factor, table_name) { - .Call(`_arrow_Tpch_Dbgen`, plan, scale_factor, table_name) -} - -Tpch_Dbgen_Write <- function(plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions) { - invisible(.Call(`_arrow_Tpch_Dbgen_Write`, plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions)) -} - RecordBatch__cast <- function(batch, schema, options) { .Call(`_arrow_RecordBatch__cast`, batch, schema, options) } diff --git a/r/R/tpch.R b/r/R/tpch.R deleted file mode 100644 index ef0e002a6e5..00000000000 --- a/r/R/tpch.R +++ /dev/null @@ -1,54 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -tpch_tables <- c("customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier") - - -#' Generate a RecordBatchReader with TPC-H data in it -#' -#' @param table the table to generate -#' @param scale_factor the scale factor to generate -#' -#' @return a RecordBatchReader that will contain the generated data -#' @export -#' -#' @keywords internal -tpch_dbgen <- function(table = tpch_tables, scale_factor) { - table <- match.arg(table) - - Tpch_Dbgen(ExecPlan$create(), scale_factor, table) -} - -tpch_dbgen_write <- function(table = tpch_tables, scale_factor, path, ...) { - table <- match.arg(table) - - path_and_fs <- get_path_and_filesystem(path) - - existing_data_behavior <- 0L - max_partitions <- 1024L - - Tpch_Dbgen_Write( - ExecPlan$create(), - scale_factor, - table, - path_and_fs$fs, - path_and_fs$path, - existing_data_behavior, - max_partitions - ) -} - diff --git a/r/man/tpch_dbgen.Rd b/r/man/tpch_dbgen.Rd deleted file mode 100644 index 88cc1cf1857..00000000000 --- a/r/man/tpch_dbgen.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tpch.R -\name{tpch_dbgen} -\alias{tpch_dbgen} -\title{Generate a RecordBatchReader with TPC-H data in it} -\usage{ -tpch_dbgen(table = tpch_tables, scale_factor) -} -\arguments{ -\item{table}{the table to generate} - -\item{scale_factor}{the scale factor to generate} -} -\value{ -a RecordBatchReader that will contain the generated data -} -\description{ -Generate a RecordBatchReader with TPC-H data in it -} -\keyword{internal} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 8a781cf0bea..8508b601703 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1749,45 +1749,6 @@ extern "C" SEXP _arrow_ExecNode_TableSourceNode(SEXP plan_sexp, SEXP table_sexp) } #endif -// compute-exec.cpp -#if defined(ARROW_R_WITH_ARROW) -std::shared_ptr Tpch_Dbgen(const std::shared_ptr& plan, int scale_factor, std::string table_name); -extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type plan(plan_sexp); - arrow::r::Input::type scale_factor(scale_factor_sexp); - arrow::r::Input::type table_name(table_name_sexp); - return cpp11::as_sexp(Tpch_Dbgen(plan, scale_factor, table_name)); -END_CPP11 -} -#else -extern "C" SEXP _arrow_Tpch_Dbgen(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp){ - Rf_error("Cannot call Tpch_Dbgen(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); -} -#endif - -// compute-exec.cpp -#if defined(ARROW_R_WITH_ARROW) -void Tpch_Dbgen_Write(const std::shared_ptr& plan, int scale_factor, std::string table_name, const std::shared_ptr& filesystem, std::string base_dir, arrow::dataset::ExistingDataBehavior existing_data_behavior, int max_partitions); -extern "C" SEXP _arrow_Tpch_Dbgen_Write(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP existing_data_behavior_sexp, SEXP max_partitions_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type plan(plan_sexp); - arrow::r::Input::type scale_factor(scale_factor_sexp); - arrow::r::Input::type table_name(table_name_sexp); - arrow::r::Input&>::type filesystem(filesystem_sexp); - arrow::r::Input::type base_dir(base_dir_sexp); - arrow::r::Input::type existing_data_behavior(existing_data_behavior_sexp); - arrow::r::Input::type max_partitions(max_partitions_sexp); - Tpch_Dbgen_Write(plan, scale_factor, table_name, filesystem, base_dir, existing_data_behavior, max_partitions); - return R_NilValue; -END_CPP11 -} -#else -extern "C" SEXP _arrow_Tpch_Dbgen_Write(SEXP plan_sexp, SEXP scale_factor_sexp, SEXP table_name_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP existing_data_behavior_sexp, SEXP max_partitions_sexp){ - Rf_error("Cannot call Tpch_Dbgen_Write(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); -} -#endif - // compute.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr RecordBatch__cast(const std::shared_ptr& batch, const std::shared_ptr& schema, cpp11::list options); diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 73c25366584..f46c3cefb36 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -23,12 +23,6 @@ #include #include #include -#include -// TODO: We probably don't want to add dataset + filesystem here, so instead we'll -// probably want to move the definition of Tpch_Dbgen_Write if it works -#include -#include -#include #include #include #include @@ -38,10 +32,6 @@ #include namespace compute = ::arrow::compute; -// TODO: We probably don't want to add dataset + fs here, so instead we'll probably -// want to move the definition of Tpch_Dbgen_Write if it works -namespace ds = ::arrow::dataset; -namespace fs = ::arrow::fs; std::shared_ptr make_compute_options(std::string func_name, cpp11::list options); @@ -301,122 +291,4 @@ std::shared_ptr ExecNode_TableSourceNode( return MakeExecNodeOrStop("table_source", plan.get(), {}, options); } -// [[arrow::export]] -std::shared_ptr Tpch_Dbgen( - const std::shared_ptr& plan, int scale_factor, - std::string table_name) { - auto gen = ValueOrStop(arrow::compute::TpchGen::Make(plan.get(), scale_factor)); - - compute::ExecNode* table; - if (table_name == "part") { - table = ValueOrStop(gen.Part()); - } else if (table_name == "supplier") { - table = ValueOrStop(gen.Supplier()); - } else if (table_name == "partsupp") { - table = ValueOrStop(gen.PartSupp()); - } else if (table_name == "customer") { - table = ValueOrStop(gen.Customer()); - } else if (table_name == "nation") { - table = ValueOrStop(gen.Nation()); - } else if (table_name == "lineitem") { - table = ValueOrStop(gen.Lineitem()); - } else if (table_name == "region") { - table = ValueOrStop(gen.Region()); - } else if (table_name == "orders") { - table = ValueOrStop(gen.Orders()); - } else { - cpp11::stop("That's not a valid table name"); - } - - arrow::AsyncGenerator> sink_gen; - - MakeExecNodeOrStop("sink", plan.get(), {table}, 
compute::SinkNodeOptions{&sink_gen}); - - StopIfNotOk(plan->Validate()); - StopIfNotOk(plan->StartProducing()); - - // If the generator is destroyed before being completely drained, inform plan - std::shared_ptr stop_producing{nullptr, [plan](...) { - bool not_finished_yet = - plan->finished().TryAddCallback([&plan] { - return [plan](const arrow::Status&) {}; - }); - - if (not_finished_yet) { - plan->StopProducing(); - } - }}; - - return compute::MakeGeneratorReader( - table->output_schema(), [stop_producing, plan, sink_gen] { return sink_gen(); }, - gc_memory_pool()); -} - -// [[arrow::export]] -void Tpch_Dbgen_Write(const std::shared_ptr& plan, int scale_factor, - std::string table_name, - const std::shared_ptr& filesystem, - std::string base_dir, - arrow::dataset::ExistingDataBehavior existing_data_behavior, - int max_partitions) { - auto gen = ValueOrStop(arrow::compute::TpchGen::Make(plan.get(), scale_factor)); - - compute::ExecNode* table; - if (table_name == "part") { - table = ValueOrStop(gen.Part()); - } else if (table_name == "supplier") { - table = ValueOrStop(gen.Supplier()); - } else if (table_name == "partsupp") { - table = ValueOrStop(gen.PartSupp()); - } else if (table_name == "customer") { - table = ValueOrStop(gen.Customer()); - } else if (table_name == "nation") { - table = ValueOrStop(gen.Nation()); - } else if (table_name == "lineitem") { - table = ValueOrStop(gen.Lineitem()); - } else if (table_name == "region") { - table = ValueOrStop(gen.Region()); - } else if (table_name == "orders") { - table = ValueOrStop(gen.Orders()); - } else { - cpp11::stop("That's not a valid table name"); - } - - // TODO: unhardcode this once it's working - auto base_path = base_dir + "/parquet_dataset"; - filesystem->CreateDir(base_path); - - auto format = std::make_shared(); - - ds::FileSystemDatasetWriteOptions write_options; - write_options.file_write_options = format->DefaultWriteOptions(); - write_options.existing_data_behavior = - ds::ExistingDataBehavior::kDeleteMatchingPartitions; - write_options.filesystem = filesystem; - write_options.base_dir = base_path; - write_options.partitioning = arrow::dataset::Partitioning::Default(); - write_options.basename_template = "part{i}.parquet"; - write_options.max_partitions = 1024; - - // TODO: this had a checked_cast in front of it in the code I adapted it from - // but I ran into namespace issues when doing it so I took it out to see if it - // worked, but maybe that's what's causing the sefault? - const ds::WriteNodeOptions options = - ds::WriteNodeOptions{write_options, table->output_schema()}; - - MakeExecNodeOrStop("consuming_sink", plan.get(), {table}, options); - - cpp11::message("Just after consume"); - - StopIfNotOk(plan->Validate()); - - cpp11::message("Just after validate"); - - StopIfNotOk(plan->StartProducing()); - - cpp11::message("Just after start"); - - StopIfNotOk(plan->finished().status()); -} - #endif diff --git a/r/tests/testthat/test-tpch.R b/r/tests/testthat/test-tpch.R deleted file mode 100644 index eedf8954807..00000000000 --- a/r/tests/testthat/test-tpch.R +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -test_that("tpch_dbgen()", { - lineitem_rbr <- tpch_dbgen("lineitem", 1) - lineitem_tab <- lineitem_rbr$read_table() - expect_identical(ncol(lineitem_tab), 16L) - - # and check a handful of types - expect_type_equal(lineitem_tab[["L_ORDERKEY"]], int32()) - expect_type_equal(lineitem_tab[["L_RECEIPTDATE"]], date32()) - - region_rbr <- tpch_dbgen("region", 1) - region_tab <- region_rbr$read_table() - expect_identical(dim(region_tab), c(5L, 3L)) - - # and check a handful of types - expect_type_equal(region_tab[["R_REGIONKEY"]], int32()) - expect_type_equal(region_tab[["R_COMMENT"]], string()) - - part_rbr <- tpch_dbgen("part", 1) - part_tab <- part_rbr$read_table() - expect_identical(dim(part_tab), c(200000L, 9L)) - - # and check a handful of types - expect_type_equal(part_tab[["P_PARTKEY"]], int32()) - expect_type_equal(part_tab[["P_NAME"]], string()) -}) - -# these three are tested above, but test that we can get tables for all the rest -tpch_tables_up <- setdiff(tpch_tables, c("lineitem", "region", "part")) - -for (table_name in tpch_tables_up) { - test_that(paste0("Generating table: ", table_name), { - rbr <- tpch_dbgen(table_name, 1) - tab <- rbr$read_table() - expect_r6_class(tab, "Table") - }) -} From 7e9dce3bccf442f421093e0fa5ae16e43bd673b7 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 17 Mar 2022 22:43:49 -0700 Subject: [PATCH 23/34] Respond to comments --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 12 +- cpp/src/arrow/compute/exec/tpch_node.cc | 245 +++++++++++-------- cpp/src/arrow/compute/exec/tpch_node.h | 20 +- cpp/src/arrow/compute/exec/tpch_node_test.cc | 100 +++++--- cpp/src/arrow/compute/kernels/vector_sort.cc | 3 +- 5 files changed, 227 insertions(+), 153 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index db5161055af..3dcfddb63b9 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "benchmark/benchmark.h" +#include #include "arrow/compute/cast.h" #include "arrow/compute/exec/test_util.h" @@ -25,19 +25,20 @@ namespace arrow { namespace compute { +namespace internal { std::shared_ptr Plan_Q1(AsyncGenerator>* sink_gen, int scale_factor) { ExecContext* ctx = default_exec_context(); *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); std::shared_ptr plan = *ExecPlan::Make(ctx); - TpchGen gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); + TpchGen gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); ExecNode* lineitem = *gen.Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", "L_SHIPDATE", "L_RETURNFLAG", "L_LINESTATUS"}); - std::shared_ptr sept_2_1998 = std::make_shared( + auto sept_2_1998 = std::make_shared( 10471); // September 2, 1998 is 10471 days after January 1, 1970 Expression filter = less_equal(field_ref("L_SHIPDATE"), literal(std::move(sept_2_1998))); @@ -79,7 +80,6 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin {"hash_mean", &sum_opts}, {"hash_count", &count_opts}}; std::vector cols = {2, 3, 4, 5, 6, 7, 8, 2}; - std::vector names = {"sum_qty", "sum_base_price", "sum_disc_price", "sum_charge", "avg_qty", "avg_price", "avg_disc", "count_order"}; @@ -114,7 +114,7 @@ static void BM_Tpch_Q1(benchmark::State& st) { } } -BENCHMARK(BM_Tpch_Q1)->Args({1})->ArgNames({"SF"}); - +BENCHMARK(BM_Tpch_Q1)->Args({1})->ArgNames({"ScaleFactor"}); +} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 1ada5d5398a..954631e26d8 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -16,7 +16,9 @@ // under the License. #include "arrow/compute/exec/tpch_node.h" +#include "arrow/util/formatting.h" #include "arrow/util/future.h" +#include "arrow/util/io_util.h" #include "arrow/util/make_unique.h" #include "arrow/util/unreachable.h" @@ -32,8 +34,41 @@ namespace arrow { using internal::checked_cast; +using internal::GetRandomSeed; namespace compute { +namespace internal { +/* +Architecture of the generator: + This is a multithreaded implementation of TPC-H's DBGen data generator. For each table + that doesn't depend on any other tables, it gets its own generator. For tables that + are dependent on each other (namely [ORDERS, LINEITEM] and [PART, PARTSUPP]), we +implement a "double generator". Each generator is given a list of columns to generate. +Columns are then generated lazily a batch at a time. If column A depends on another column +B, column A will ensure that column B is always generated. We don't have to worry about +breaking cycles of dependencies because the dependency graph is acyclic (so there's no +code to even bother breaking cycles). Double generators work by maintaining two output +queues. Batches for each of the tables are generated in sync, and batches that are not +being output immediately are appended to the queue. + + To generate a batch, we grab a lock, increment a counter, check the output queue (if +applicable), and then call the necessary generator functions. To generate a column, we +first check if it's already been generated. If not, we allocate the batch and then fill it +according to the spec. + + There are a few types of columns that get generated: + - Primary Keys: incrementing counters from 1 to N (with N being the number of rows). +These are generated by incrementing a counter (this counter is gated under a lock). 
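For example, a batch of 1000 rows claimed at an offset of 5000 simply holds the keys 5001 through 6000.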
+ - V-String: Random-length string of alphanumerics + - Phone Number: 2, 3, 3, and 4, digit numbers separated by -'s + - Random Numbers: random numbers within some range + - Expressions: expressions based on some other columns + Please consult the spec for intended behavior of each individual column. Columns are +generated by a function of the same name (so e.g. the column PS_PARTKEY is generated by a +function PS_PARTKEY). +*/ + +namespace { const char* NameParts[] = { "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", "blanched", "blue", "blush", "brown", "burlywood", @@ -52,42 +87,42 @@ const char* NameParts[] = { "tan", "thistle", "tomato", "turquoise", "violet", "wheat", "white", "yellow", }; -static constexpr size_t kNumNameParts = sizeof(NameParts) / sizeof(NameParts[0]); +constexpr size_t kNumNameParts = sizeof(NameParts) / sizeof(NameParts[0]); const char* Types_1[] = { "STANDARD ", "SMALL ", "MEDIUM ", "LARGE ", "ECONOMY ", "PROMO ", }; -static constexpr size_t kNumTypes_1 = sizeof(Types_1) / sizeof(Types_1[0]); +constexpr size_t kNumTypes_1 = sizeof(Types_1) / sizeof(Types_1[0]); const char* Types_2[] = { "ANODIZED ", "BURNISHED ", "PLATED ", "POLISHED ", "BRUSHED ", }; -static constexpr size_t kNumTypes_2 = sizeof(Types_2) / sizeof(Types_2[0]); +constexpr size_t kNumTypes_2 = sizeof(Types_2) / sizeof(Types_2[0]); const char* Types_3[] = { "TIN", "NICKEL", "BRASS", "STEEL", "COPPER", }; -static constexpr size_t kNumTypes_3 = sizeof(Types_3) / sizeof(Types_3[0]); +constexpr size_t kNumTypes_3 = sizeof(Types_3) / sizeof(Types_3[0]); const char* Containers_1[] = { "SM ", "LG ", "MD ", "JUMBO ", "WRAP ", }; -static constexpr size_t kNumContainers_1 = sizeof(Containers_1) / sizeof(Containers_1[0]); +constexpr size_t kNumContainers_1 = sizeof(Containers_1) / sizeof(Containers_1[0]); const char* Containers_2[] = { "CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM", }; -static constexpr size_t kNumContainers_2 = sizeof(Containers_2) / sizeof(Containers_2[0]); +constexpr size_t kNumContainers_2 = sizeof(Containers_2) / sizeof(Containers_2[0]); const char* Segments[] = { "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD", }; -static constexpr size_t kNumSegments = sizeof(Segments) / sizeof(Segments[0]); +constexpr size_t kNumSegments = sizeof(Segments) / sizeof(Segments[0]); const char* Priorities[] = { "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW", }; -static constexpr size_t kNumPriorities = sizeof(Priorities) / sizeof(Priorities[0]); +constexpr size_t kNumPriorities = sizeof(Priorities) / sizeof(Priorities[0]); const char* Instructions[] = { "DELIVER IN PERSON", @@ -95,12 +130,12 @@ const char* Instructions[] = { "NONE", "TAKE BACK RETURN", }; -static constexpr size_t kNumInstructions = sizeof(Instructions) / sizeof(Instructions[0]); +constexpr size_t kNumInstructions = sizeof(Instructions) / sizeof(Instructions[0]); const char* Modes[] = { "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB", }; -static constexpr size_t kNumModes = sizeof(Modes) / sizeof(Modes[0]); +constexpr size_t kNumModes = sizeof(Modes) / sizeof(Modes[0]); const char* Nouns[] = { "foxes ", "ideas ", "theodolites ", "pinto beans ", "instructions ", @@ -113,7 +148,7 @@ const char* Nouns[] = { "sentiments ", "decoys ", "realms ", "pains ", "grouches ", "escapades ", }; -static constexpr size_t kNumNouns = sizeof(Nouns) / sizeof(Nouns[0]); +constexpr size_t kNumNouns = sizeof(Nouns) / sizeof(Nouns[0]); const char* Verbs[] = { "sleep ", "wake ", "are ", 
"cajole ", "haggle ", "nag ", "use ", @@ -123,7 +158,7 @@ const char* Verbs[] = { "poach ", "serve ", "run ", "dazzle ", "snooze ", "doze ", "unwind ", "kindle ", "play ", "hang ", "believe ", "doubt ", }; -static constexpr size_t kNumVerbs = sizeof(Verbs) / sizeof(Verbs[0]); +constexpr size_t kNumVerbs = sizeof(Verbs) / sizeof(Verbs[0]); const char* Adjectives[] = { "furious ", "sly ", "careful ", "blithe ", "quick ", "fluffy ", "slow ", @@ -131,7 +166,7 @@ const char* Adjectives[] = { "stealthy ", "permanent ", "enticing ", "idle ", "busy ", "regular ", "final ", "ironic ", "even ", "bold ", "silent ", }; -static constexpr size_t kNumAdjectives = sizeof(Adjectives) / sizeof(Adjectives[0]); +constexpr size_t kNumAdjectives = sizeof(Adjectives) / sizeof(Adjectives[0]); const char* Adverbs[] = { "sometimes ", "always ", "never ", "furiously ", "slyly ", "carefully ", @@ -140,7 +175,7 @@ const char* Adverbs[] = { "permanently ", "enticingly ", "idly ", "busily ", "regularly ", "finally ", "ironically ", "evenly ", "boldly ", "silently ", }; -static constexpr size_t kNumAdverbs = sizeof(Adverbs) / sizeof(Adverbs[0]); +constexpr size_t kNumAdverbs = sizeof(Adverbs) / sizeof(Adverbs[0]); const char* Prepositions[] = { "about ", "above ", "according to ", "across ", "after ", "against ", @@ -152,7 +187,7 @@ const char* Prepositions[] = { "since ", "through ", "throughout ", "to ", "toward ", "under ", "until ", "up ", "upon ", "without ", "with ", "within ", }; -static constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); +constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); const char* Auxiliaries[] = { "do ", @@ -174,12 +209,23 @@ const char* Auxiliaries[] = { "need to ", "try to ", }; -static constexpr size_t kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); +constexpr size_t kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); const char* Terminators[] = { ".", ";", ":", "?", "!", "--", }; -static constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); +constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); + +constexpr uint32_t kStartDate = + 8035; // January 1, 1992 is 8035 days after January 1, 1970 +constexpr uint32_t kCurrentDate = + 9298; // June 17, 1995 is 9298 days after January 1, 1970 +constexpr uint32_t kEndDate = + 10591; // December 12, 1998 is 10591 days after January 1, 1970 + +std::uniform_int_distribution kSeedDist(std::numeric_limits::min(), + std::numeric_limits::max()); +} // namespace // The spec says to generate a 300 MB string according to a grammar. This is a // concurrent implementation of the generator. 
Each thread generates the text in @@ -450,8 +496,8 @@ class TpchTableGenerator { using ScheduleCallback = std::function; using AbortCallback = std::function; - virtual Status Init(std::vector columns, float scale_factor, - int64_t batch_size) = 0; + virtual Status Init(std::vector columns, double scale_factor, + int64_t batch_size, int64_t seed) = 0; virtual Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, FinishedCallback finished_callback, @@ -467,6 +513,7 @@ class TpchTableGenerator { virtual ~TpchTableGenerator() = default; protected: + int64_t seed_ = {0}; std::atomic done_ = {false}; std::atomic batches_outputted_ = {0}; }; @@ -494,20 +541,14 @@ int GetNumDigits(int64_t x) { if (x < 10000000000000000ll) return 16; if (x < 100000000000000000ll) return 17; if (x < 1000000000000000000ll) return 18; + Unreachable("Positive 64-bit integer should never have more than 18 digits"); return -1; } void AppendNumberPaddedToNineDigits(char* out, int64_t x) { - // We do all of this to avoid calling snprintf, which needs to handle locale, - // which can be slow, especially on Mac and Windows. - int num_digits = GetNumDigits(x); - int num_padding_zeros = std::max(9 - num_digits, 0); - std::memset(out, '0', static_cast(num_padding_zeros)); - while (x > 0) { - *(out + num_padding_zeros + num_digits - 1) = ('0' + x % 10); - num_digits -= 1; - x /= 10; - } + size_t kPad = 9; + out += std::max(kPad, static_cast(GetNumDigits(x))); + arrow::internal::detail::FormatAllDigitsLeftPadded(x, kPad, '0', &out); } Result> SetOutputColumns( @@ -564,15 +605,6 @@ Result RandomVString(random::pcg32_fast& rng, int64_t num_rows, int32_t m return std::move(ad); } -void AppendNumber(char*& out, int num_digits, int32_t x) { - out += (num_digits - 1); - while (x > 0) { - *out-- = '0' + (x % 10); - x /= 10; - } - out += (num_digits + 1); -} - void GeneratePhoneNumber(char* out, random::pcg32_fast& rng, int32_t country) { std::uniform_int_distribution three_digit(100, 999); std::uniform_int_distribution four_digit(1000, 9999); @@ -581,37 +613,32 @@ void GeneratePhoneNumber(char* out, random::pcg32_fast& rng, int32_t country) { int32_t l1 = three_digit(rng); int32_t l2 = three_digit(rng); int32_t l3 = four_digit(rng); - AppendNumber(out, 2, country_code); - *out++ = '-'; - AppendNumber(out, 3, l1); - *out++ = '-'; - AppendNumber(out, 3, l2); - *out++ = '-'; - AppendNumber(out, 4, l3); -} -static constexpr uint32_t kStartDate = - 8035; // January 1, 1992 is 8035 days after January 1, 1970 -static constexpr uint32_t kCurrentDate = - 9298; // June 17, 1995 is 9298 days after January 1, 1970 -static constexpr uint32_t kEndDate = - 10591; // December 12, 1998 is 10591 days after January 1, 1970 + out += 15; + arrow::internal::detail::FormatAllDigits(l3, &out); + *(--out) = '-'; + arrow::internal::detail::FormatAllDigits(l2, &out); + *(--out) = '-'; + arrow::internal::detail::FormatAllDigits(l1, &out); + *(--out) = '-'; + arrow::internal::detail::FormatTwoDigits(country_code, &out); +} using GenerateColumnFn = std::function; class PartAndPartSupplierGenerator { public: - Status Init(size_t num_threads, int64_t batch_size, float scale_factor) { + Status Init(size_t num_threads, int64_t batch_size, double scale_factor, int64_t seed) { if (!inited_) { inited_ = true; batch_size_ = batch_size; scale_factor_ = scale_factor; - arrow_vendored::pcg_extras::seed_seq_from seq; thread_local_data_.resize(num_threads); + random::pcg64_fast seed_rng(seed); for (ThreadLocalData& tld : thread_local_data_) { constexpr 
int kMaxNumDistinctStrings = 5; tld.string_indices.resize(kMaxNumDistinctStrings * batch_size_); - tld.rng.seed(seq); + tld.rng.seed(kSeedDist(seed_rng)); } part_rows_to_generate_ = static_cast(scale_factor_ * 200000); } @@ -1135,6 +1162,7 @@ class PartAndPartSupplierGenerator { ipartsupp++, irun++) { int32_t supplier = static_cast(ipartsupp); int32_t partkey = p_partkey[ipart]; + // Magic formula from TPCH spec. ps_suppkey[batch_offset++] = (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; } @@ -1233,7 +1261,7 @@ class PartAndPartSupplierGenerator { std::queue part_output_queue_; std::queue partsupp_output_queue_; int64_t batch_size_{0}; - float scale_factor_{0}; + double scale_factor_{0}; int64_t part_rows_to_generate_{0}; int64_t part_rows_generated_{0}; std::vector part_cols_; @@ -1247,17 +1275,17 @@ class PartAndPartSupplierGenerator { class OrdersAndLineItemGenerator { public: - Status Init(size_t num_threads, int64_t batch_size, float scale_factor) { + Status Init(size_t num_threads, int64_t batch_size, double scale_factor, int64_t seed) { if (!inited_) { inited_ = true; batch_size_ = batch_size; scale_factor_ = scale_factor; - arrow_vendored::pcg_extras::seed_seq_from seq; thread_local_data_.resize(num_threads); + random::pcg64_fast seed_rng(seed); for (ThreadLocalData& tld : thread_local_data_) { tld.items_per_order.resize(batch_size_); - tld.rng.seed(seq); + tld.rng.seed(kSeedDist(seed_rng)); } orders_rows_to_generate_ = static_cast(scale_factor_ * 150000 * 10); } @@ -2335,7 +2363,7 @@ class OrdersAndLineItemGenerator { std::queue orders_output_queue_; std::queue lineitem_output_queue_; int64_t batch_size_; - float scale_factor_; + double scale_factor_; int64_t orders_rows_to_generate_; int64_t orders_rows_generated_; std::vector orders_cols_; @@ -2348,8 +2376,8 @@ class OrdersAndLineItemGenerator { class SupplierGenerator : public TpchTableGenerator { public: - Status Init(std::vector columns, float scale_factor, - int64_t batch_size) override { + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { scale_factor_ = scale_factor; batch_size_ = batch_size; rows_to_generate_ = static_cast(scale_factor_ * 10000); @@ -2357,7 +2385,10 @@ class SupplierGenerator : public TpchTableGenerator { ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns(columns, kTypes, kNameMap, gen_list_)); - random::pcg32_fast rng; + random::pcg64_fast seed_rng(seed); + seed_ = kSeedDist(seed_rng); + + random::pcg32_fast rng(kSeedDist(seed_rng)); std::uniform_int_distribution dist(0, rows_to_generate_ - 1); size_t num_special_rows = static_cast(5 * scale_factor_); std::unordered_set good_rows_set; @@ -2385,9 +2416,8 @@ class SupplierGenerator : public TpchTableGenerator { Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, FinishedCallback finished_callback, ScheduleCallback schedule_callback) override { - arrow_vendored::pcg_extras::seed_seq_from seq; thread_local_data_.resize(num_threads); - for (ThreadLocalData& tld : thread_local_data_) tld.rng.seed(seq); + for (ThreadLocalData& tld : thread_local_data_) tld.rng.seed(GetRandomSeed()); output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); @@ -2616,7 +2646,7 @@ class SupplierGenerator : public TpchTableGenerator { ScheduleCallback schedule_callback_; int64_t rows_to_generate_; std::atomic rows_generated_; - float scale_factor_; + double scale_factor_; int64_t batch_size_; std::vector gen_list_; std::shared_ptr schema_; @@ -2627,8 
+2657,9 @@ class PartGenerator : public TpchTableGenerator { explicit PartGenerator(std::shared_ptr gen) : gen_(std::move(gen)) {} - Status Init(std::vector columns, float scale_factor, - int64_t batch_size) override { + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; scale_factor_ = scale_factor; batch_size_ = batch_size; ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetPartOutputColumns(columns)); @@ -2638,7 +2669,7 @@ class PartGenerator : public TpchTableGenerator { Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, FinishedCallback finished_callback, ScheduleCallback schedule_callback) override { - RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_, seed_)); output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); @@ -2675,7 +2706,7 @@ class PartGenerator : public TpchTableGenerator { FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - float scale_factor_; + double scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -2685,8 +2716,9 @@ class PartSuppGenerator : public TpchTableGenerator { explicit PartSuppGenerator(std::shared_ptr gen) : gen_(std::move(gen)) {} - Status Init(std::vector columns, float scale_factor, - int64_t batch_size) override { + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; scale_factor_ = scale_factor; batch_size_ = batch_size; ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetPartSuppOutputColumns(columns)); @@ -2696,7 +2728,7 @@ class PartSuppGenerator : public TpchTableGenerator { Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, FinishedCallback finished_callback, ScheduleCallback schedule_callback) override { - RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_, seed_)); output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); @@ -2734,15 +2766,16 @@ class PartSuppGenerator : public TpchTableGenerator { FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - float scale_factor_; + double scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; class CustomerGenerator : public TpchTableGenerator { public: - Status Init(std::vector columns, float scale_factor, - int64_t batch_size) override { + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; scale_factor_ = scale_factor; batch_size_ = batch_size; rows_to_generate_ = static_cast(scale_factor_ * 150000); @@ -2755,9 +2788,9 @@ class CustomerGenerator : public TpchTableGenerator { Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, FinishedCallback finished_callback, ScheduleCallback schedule_callback) override { - arrow_vendored::pcg_extras::seed_seq_from seq; thread_local_data_.resize(num_threads); - for (ThreadLocalData& tld : thread_local_data_) tld.rng.seed(seq); + random::pcg64_fast seed_rng(seed_); + for (ThreadLocalData& tld : thread_local_data_) tld.rng.seed(kSeedDist(seed_rng)); output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); @@ 
-2989,7 +3022,7 @@ class CustomerGenerator : public TpchTableGenerator { ScheduleCallback schedule_callback_; int64_t rows_to_generate_{0}; std::atomic rows_generated_ = {0}; - float scale_factor_{0}; + double scale_factor_{0}; int64_t batch_size_{0}; std::vector gen_list_; std::shared_ptr schema_; @@ -3000,8 +3033,9 @@ class OrdersGenerator : public TpchTableGenerator { explicit OrdersGenerator(std::shared_ptr gen) : gen_(std::move(gen)) {} - Status Init(std::vector columns, float scale_factor, - int64_t batch_size) override { + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; scale_factor_ = scale_factor; batch_size_ = batch_size; ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetOrdersOutputColumns(columns)); @@ -3011,7 +3045,7 @@ class OrdersGenerator : public TpchTableGenerator { Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, FinishedCallback finished_callback, ScheduleCallback schedule_callback) override { - RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_, seed_)); output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); @@ -3048,7 +3082,7 @@ class OrdersGenerator : public TpchTableGenerator { FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - float scale_factor_; + double scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; @@ -3058,8 +3092,9 @@ class LineitemGenerator : public TpchTableGenerator { explicit LineitemGenerator(std::shared_ptr gen) : gen_(std::move(gen)) {} - Status Init(std::vector columns, float scale_factor, - int64_t batch_size) override { + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; scale_factor_ = scale_factor; batch_size_ = batch_size; ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetLineItemOutputColumns(columns)); @@ -3069,7 +3104,7 @@ class LineitemGenerator : public TpchTableGenerator { Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, FinishedCallback finished_callback, ScheduleCallback schedule_callback) override { - RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_)); + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_, seed_)); output_callback_ = std::move(output_callback); finished_callback_ = std::move(finished_callback); schedule_callback_ = std::move(schedule_callback); @@ -3107,18 +3142,19 @@ class LineitemGenerator : public TpchTableGenerator { FinishedCallback finished_callback_; ScheduleCallback schedule_callback_; int64_t batch_size_; - float scale_factor_; + double scale_factor_; std::shared_ptr gen_; std::shared_ptr schema_; }; class NationGenerator : public TpchTableGenerator { public: - Status Init(std::vector columns, float /*scale_factor*/, - int64_t /*batch_size*/) override { + Status Init(std::vector columns, double /*scale_factor*/, + int64_t /*batch_size*/, int64_t seed) override { ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns(columns, kTypes, kNameMap, column_indices_)); - rng_.seed(arrow_vendored::pcg_extras::seed_seq_from{}); + seed_ = seed; + rng_.seed(seed_); return Status::OK(); } @@ -3206,11 +3242,12 @@ class NationGenerator : public TpchTableGenerator { class RegionGenerator : public TpchTableGenerator { public: - Status Init(std::vector columns, float /*scale_factor*/, - int64_t 
/*batch_size*/) override { + Status Init(std::vector columns, double /*scale_factor*/, + int64_t /*batch_size*/, int64_t seed) override { ARROW_ASSIGN_OR_RAISE(schema_, SetOutputColumns(columns, kTypes, kNameMap, column_indices_)); - rng_.seed(arrow_vendored::pcg_extras::seed_seq_from{}); + seed_ = seed; + rng_.seed(seed_); return Status::OK(); } @@ -3359,8 +3396,10 @@ class TpchNode : public ExecNode { ThreadIndexer thread_indexer_; }; -Result TpchGen::Make(ExecPlan* plan, float scale_factor, int64_t batch_size) { - TpchGen result(plan, scale_factor, batch_size); +Result TpchGen::Make(ExecPlan* plan, double scale_factor, int64_t batch_size, + util::optional seed) { + if (!seed.has_value()) seed = GetRandomSeed(); + TpchGen result(plan, scale_factor, batch_size, *seed); return result; } @@ -3368,7 +3407,8 @@ template Result TpchGen::CreateNode(const char* name, std::vector columns) { std::unique_ptr generator = arrow::internal::make_unique(); - RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, name, std::move(generator)); } @@ -3382,7 +3422,8 @@ Result TpchGen::Part(std::vector columns) { } std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); - RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, "Part", std::move(generator)); } @@ -3392,7 +3433,8 @@ Result TpchGen::PartSupp(std::vector columns) { } std::unique_ptr generator = arrow::internal::make_unique(part_and_part_supp_generator_); - RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, "PartSupp", std::move(generator)); } @@ -3406,7 +3448,8 @@ Result TpchGen::Orders(std::vector columns) { } std::unique_ptr generator = arrow::internal::make_unique(orders_and_line_item_generator_); - RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, "Orders", std::move(generator)); } @@ -3416,7 +3459,8 @@ Result TpchGen::Lineitem(std::vector columns) { } std::unique_ptr generator = arrow::internal::make_unique(orders_and_line_item_generator_); - RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_)); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, "Lineitem", std::move(generator)); } @@ -3427,5 +3471,6 @@ Result TpchGen::Nation(std::vector columns) { Result TpchGen::Region(std::vector columns) { return CreateNode("Region", std::move(columns)); } +} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index 302057efbee..42dd9e4adbc 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -28,6 +28,7 @@ namespace arrow { namespace compute { +namespace internal { class OrdersAndLineItemGenerator; class PartAndPartSupplierGenerator; @@ -41,9 +42,13 @@ class ARROW_EXPORT TpchGen { * table from that single 
TpchGen instance. Note: Every batch will be scheduled as a new * task using the ExecPlan's scheduler. */ - static Result Make(ExecPlan* plan, float scale_factor = 1.0f, - int64_t batch_size = 4096); + static Result Make(ExecPlan* plan, double scale_factor = 1.0, + int64_t batch_size = 4096, + util::optional seed = util::nullopt); + // The below methods will create and add an ExecNode to the plan that generates + // data for the desired table. If columns is empty, all columns will be generated. + // The methods return the added ExecNode, which should be used for inputs. Result Supplier(std::vector columns = {}); Result Part(std::vector columns = {}); Result PartSupp(std::vector columns = {}); @@ -54,18 +59,23 @@ class ARROW_EXPORT TpchGen { Result Region(std::vector columns = {}); private: - TpchGen(ExecPlan* plan, float scale_factor, int64_t batch_size) - : plan_(plan), scale_factor_(scale_factor), batch_size_(batch_size) {} + TpchGen(ExecPlan* plan, double scale_factor, int64_t batch_size, int64_t seed) + : plan_(plan), + scale_factor_(scale_factor), + batch_size_(batch_size), + seed_rng_(seed) {} template Result CreateNode(const char* name, std::vector columns); ExecPlan* plan_; - float scale_factor_; + double scale_factor_; int64_t batch_size_; + random::pcg64_fast seed_rng_; std::shared_ptr part_and_part_supp_generator_{}; std::shared_ptr orders_and_line_item_generator_{}; }; +} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index 21ff4fb8a83..9f8fefff235 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -17,8 +17,6 @@ #include -#include "arrow/api.h" -#include "arrow/array/validate.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/tpch_node.h" @@ -39,50 +37,63 @@ namespace arrow { namespace compute { +namespace internal { static constexpr uint32_t kStartDate = 8035; // January 1, 1992 is 8035 days after January 1, 1970 static constexpr uint32_t kEndDate = 10591; // December 12, 1998 is 10591 days after January 1, 1970 +// Verifies that the data is valid Arrow and ensures it's not null. 
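As an aside on the seeding hook introduced above (illustration only, not part of the patch series): passing an explicit seed to TpchGen::Make should make repeated runs emit identical tables, while leaving it unset falls back to a randomly chosen seed. A minimal sketch, assuming the TpchGen API declared in the header diff above and the StartAndCollect helper from test_util:

// Hypothetical sketch: generate the Supplier table with a fixed seed so that
// two plans built this way should produce the same batches. Assumes the
// includes and namespaces of tpch_node_test.cc (arrow::compute::internal).
Result<std::vector<ExecBatch>> GenerateSupplierWithSeed(int64_t seed) {
  ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool());
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ExecPlan> plan, ExecPlan::Make(&ctx));
  ARROW_ASSIGN_OR_RAISE(TpchGen gen,
                        TpchGen::Make(plan.get(), /*scale_factor=*/1.0,
                                      /*batch_size=*/4096, seed));
  ARROW_ASSIGN_OR_RAISE(ExecNode* supplier, gen.Supplier());
  AsyncGenerator<util::optional<ExecBatch>> sink_gen;
  Declaration sink("sink", {Declaration::Input(supplier)}, SinkNodeOptions{&sink_gen});
  ARROW_RETURN_NOT_OK(sink.AddToPlan(plan.get()));
  auto fut = StartAndCollect(plan.get(), sink_gen);
  return fut.MoveResult();
}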
void ValidateBatch(const ExecBatch& batch) { - for (const Datum& d : batch.values) - ASSERT_OK(arrow::internal::ValidateArray(*d.array())); + for (const Datum& d : batch.values) { + ASSERT_EQ(d.array()->buffers[0].get(), nullptr); + ASSERT_OK(d.make_array()->ValidateFull()); + } } -void VerifyUniqueKey(std::unordered_set& seen, const Datum& d, int32_t min, +// Verifies that each element is seen exactly once, and that it's between min and max +// inclusive +void VerifyUniqueKey(std::unordered_set* seen, const Datum& d, int32_t min, int32_t max) { const int32_t* keys = reinterpret_cast(d.array()->buffers[1]->data()); int64_t num_keys = d.length(); for (int64_t i = 0; i < num_keys; i++) { - ASSERT_TRUE(seen.insert(keys[i]).second); + ASSERT_TRUE(seen->insert(keys[i]).second); ASSERT_LE(keys[i], max); ASSERT_GE(keys[i], min); } } -void VerifyStringAndNumber_Single(const char* row, const char* prefix, const int64_t i, - const int32_t* nums, int byte_width, +void VerifyStringAndNumber_Single(const util::string_view& row, const char* prefix, + const int64_t i, const int32_t* nums, bool verify_padding) { - int num_offset = static_cast(std::strlen(prefix)); - ASSERT_EQ(std::memcmp(row, prefix, num_offset), 0) - << row << ", prefix=" << prefix << ", i=" << i; - const char* num_str = row + num_offset; + size_t num_offset = static_cast(std::strlen(prefix)); + ASSERT_TRUE(row.starts_with(prefix)) << row << ", prefix=" << prefix << ", i=" << i; + const char* num_str = row.data() + num_offset; int64_t num = 0; - int ibyte = static_cast(num_offset); - for (; *num_str && ibyte < byte_width; ibyte++) { + size_t ibyte = num_offset; + // Parse the number out + for (; *num_str && ibyte < row.size(); ibyte++) { num *= 10; ASSERT_TRUE(std::isdigit(*num_str)); num += *num_str++ - '0'; } + // If nums is not null, ensure it matches the parsed number if (nums) { ASSERT_EQ(static_cast(num), nums[i]); } + // TPC-H requires only ever requires padding up to 9 digits, so we ensure that + // the total length of the string was at least 9 (could be more for bigger numbers). if (verify_padding) { - int num_chars = ibyte - num_offset; + int64_t num_chars = static_cast(ibyte - num_offset); ASSERT_GE(num_chars, 9); } } +// Verifies that each row is the string "prefix" followed by a number. If numbers is not +// EMPTY, it also checks that the number following the prefix is equal to the +// corresponding row in numbers. Some TPC-H data is padded to 9 zeros, which this function +// can optionally verify as well. This string function verifies fixed width columns. 
void VerifyStringAndNumber_FixedWidth(const Datum& strings, const Datum& numbers, int byte_width, const char* prefix, bool verify_padding = true) { @@ -97,10 +108,12 @@ void VerifyStringAndNumber_FixedWidth(const Datum& strings, const Datum& numbers for (int64_t i = 0; i < length; i++) { const char* row = str + i * byte_width; - VerifyStringAndNumber_Single(row, prefix, i, nums, byte_width, verify_padding); + util::string_view view(row, byte_width); + VerifyStringAndNumber_Single(view, prefix, i, nums, verify_padding); } } +// Same as above but for variable length columns void VerifyStringAndNumber_Varlen(const Datum& strings, const Datum& numbers, const char* prefix, bool verify_padding = true) { int64_t length = strings.length(); @@ -115,15 +128,16 @@ void VerifyStringAndNumber_Varlen(const Datum& strings, const Datum& numbers, } for (int64_t i = 0; i < length; i++) { - char tmp_str[256] = {}; int32_t start = offsets[i]; int32_t str_len = offsets[i + 1] - offsets[i]; - std::memcpy(tmp_str, str + start, str_len); - VerifyStringAndNumber_Single(tmp_str, prefix, i, nums, sizeof(tmp_str), - verify_padding); + util::string_view view(str + start, str_len); + VerifyStringAndNumber_Single(view, prefix, i, nums, verify_padding); } } +// Verifies that each row is a V-string, which is defined in the spec to be +// a string of random length between min_length and max_length, that is composed +// of alphanumeric characters, commas, or spaces. void VerifyVString(const Datum& d, int min_length, int max_length) { int64_t length = d.length(); const int32_t* off = reinterpret_cast(d.array()->buffers[1]->data()); @@ -143,6 +157,7 @@ void VerifyVString(const Datum& d, int min_length, int max_length) { } } +// Verifies that each 32-bit element modulo "mod" is between min and max. void VerifyModuloBetween(const Datum& d, int32_t min, int32_t max, int32_t mod) { int64_t length = d.length(); const int32_t* n = reinterpret_cast(d.array()->buffers[1]->data()); @@ -155,6 +170,7 @@ void VerifyModuloBetween(const Datum& d, int32_t min, int32_t max, int32_t mod) } } +// Verifies that each 32-bit element is between min and max. void VerifyAllBetween(const Datum& d, int32_t min, int32_t max) { int64_t length = d.length(); const int32_t* n = reinterpret_cast(d.array()->buffers[1]->data()); @@ -168,6 +184,7 @@ void VerifyAllBetween(const Datum& d, int32_t min, int32_t max) { void VerifyNationKey(const Datum& d) { VerifyAllBetween(d, 0, 24); } +// Verifies that each row satisfies the phone number spec. void VerifyPhone(const Datum& d) { int64_t length = d.length(); const char* phones = reinterpret_cast(d.array()->buffers[1]->data()); @@ -192,6 +209,7 @@ void VerifyPhone(const Datum& d) { } } +// Verifies that each decimal is between min and max void VerifyDecimalsBetween(const Datum& d, int64_t min, int64_t max) { int64_t length = d.length(); const Decimal128* decs = @@ -203,6 +221,8 @@ void VerifyDecimalsBetween(const Datum& d, int64_t min, int64_t max) { } } +// Verifies that each variable-length row is a series of words separated by +// spaces. Number of words is determined by the number of spaces. void VerifyCorrectNumberOfWords_Varlen(const Datum& d, int num_words) { int expected_num_spaces = num_words - 1; int64_t length = d.length(); @@ -231,6 +251,7 @@ void VerifyCorrectNumberOfWords_Varlen(const Datum& d, int num_words) { } } +// Same as above but for fixed width columns. 
void VerifyCorrectNumberOfWords_FixedWidth(const Datum& d, int num_words, int byte_width) { int expected_num_spaces = num_words - 1; @@ -253,6 +274,7 @@ void VerifyCorrectNumberOfWords_FixedWidth(const Datum& d, int num_words, } } +// Verifies that each row of the single-byte-wide column is one of the possibilities. void VerifyOneOf(const Datum& d, const std::unordered_set& possibilities) { int64_t length = d.length(); const char* col = reinterpret_cast(d.array()->buffers[1]->data()); @@ -260,25 +282,31 @@ void VerifyOneOf(const Datum& d, const std::unordered_set& possibilities) ASSERT_TRUE(possibilities.find(col[i]) != possibilities.end()); } +// Verifies that each fixed-width row is one of the possibilities void VerifyOneOf(const Datum& d, int32_t byte_width, - const std::unordered_set& possibilities) { + const std::unordered_set& possibilities) { int64_t length = d.length(); const char* col = reinterpret_cast(d.array()->buffers[1]->data()); for (int64_t i = 0; i < length; i++) { const char* row = col + i * byte_width; - char tmp_str[256] = {}; - std::memcpy(tmp_str, row, byte_width); - ASSERT_TRUE(possibilities.find(tmp_str) != possibilities.end()) - << tmp_str << " is not a valid string."; + int32_t row_len = 0; + while (row[row_len] && row_len < byte_width) row_len++; + util::string_view view(row, row_len); + ASSERT_TRUE(possibilities.find(view) != possibilities.end()) + << view << " is not a valid string."; } } +// Counts the number of instances of each integer void CountInstances(std::unordered_map& counts, const Datum& d) { int64_t length = d.length(); const int32_t* nums = reinterpret_cast(d.array()->buffers[1]->data()); for (int64_t i = 0; i < length; i++) counts[nums[i]]++; } +// For the S_COMMENT column, some of the columns must be modified to contain +// "Customer...Complaints" or "Customer...Recommends". This function counts the number of +// good and bad comments. 
void CountModifiedComments(const Datum& d, int& good_count, int& bad_count) { int64_t length = d.length(); const int32_t* offsets = @@ -317,7 +345,6 @@ TEST(TpchNode, ScaleFactor) { int64_t num_rows = 0; for (auto& batch : res) num_rows += batch.length; ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Supplier) { @@ -339,7 +366,7 @@ TEST(TpchNode, Supplier) { int bad_count = 0; for (auto& batch : res) { ValidateBatch(batch); - VerifyUniqueKey(seen_suppkey, batch[0], + VerifyUniqueKey(&seen_suppkey, batch[0], /*min=*/1, /*max=*/static_cast(kExpectedRows)); VerifyStringAndNumber_FixedWidth(batch[1], batch[0], /*byte_width=*/25, "Supplie#r"); @@ -354,7 +381,6 @@ TEST(TpchNode, Supplier) { ASSERT_EQ(num_rows, kExpectedRows); ASSERT_EQ(good_count, 5); ASSERT_EQ(bad_count, 5); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Part) { @@ -374,7 +400,7 @@ TEST(TpchNode, Part) { std::unordered_set seen_partkey; for (auto& batch : res) { ValidateBatch(batch); - VerifyUniqueKey(seen_partkey, batch[0], + VerifyUniqueKey(&seen_partkey, batch[0], /*min=*/1, /*max=*/static_cast(kExpectedRows)); VerifyCorrectNumberOfWords_Varlen(batch[1], @@ -395,7 +421,6 @@ TEST(TpchNode, Part) { } ASSERT_EQ(seen_partkey.size(), kExpectedRows); ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, PartSupp) { @@ -426,7 +451,6 @@ TEST(TpchNode, PartSupp) { ASSERT_EQ(counts.size(), kExpectedRows / 4); ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Customer) { @@ -446,7 +470,7 @@ TEST(TpchNode, Customer) { std::unordered_set seen_custkey; for (auto& batch : res) { ValidateBatch(batch); - VerifyUniqueKey(seen_custkey, batch[0], + VerifyUniqueKey(&seen_custkey, batch[0], /*min=*/1, /*max=*/static_cast(kExpectedRows)); VerifyStringAndNumber_Varlen(batch[1], batch[0], "Customer#"); @@ -461,7 +485,6 @@ TEST(TpchNode, Customer) { } ASSERT_EQ(seen_custkey.size(), kExpectedRows); ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Orders) { @@ -481,7 +504,7 @@ TEST(TpchNode, Orders) { std::unordered_set seen_orderkey; for (auto& batch : res) { ValidateBatch(batch); - VerifyUniqueKey(seen_orderkey, batch[0], + VerifyUniqueKey(&seen_orderkey, batch[0], /*min=*/1, /*max=*/static_cast(4 * kExpectedRows)); VerifyAllBetween(batch[1], /*min=*/1, /*max=*/static_cast(kExpectedRows)); @@ -505,7 +528,6 @@ TEST(TpchNode, Orders) { } ASSERT_EQ(seen_orderkey.size(), kExpectedRows); ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Lineitem) { @@ -556,7 +578,6 @@ TEST(TpchNode, Lineitem) { ASSERT_GE(count.second, 1); ASSERT_LE(count.second, 7); } - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Nation) { @@ -576,7 +597,7 @@ TEST(TpchNode, Nation) { std::unordered_set seen_nationkey; for (auto& batch : res) { ValidateBatch(batch); - VerifyUniqueKey(seen_nationkey, batch[0], 0, kExpectedRows - 1); + VerifyUniqueKey(&seen_nationkey, batch[0], 0, kExpectedRows - 1); VerifyOneOf( batch[1], /*byte_width=*/25, @@ -589,7 +610,6 @@ TEST(TpchNode, Nation) { num_rows += batch.length; } ASSERT_EQ(num_rows, kExpectedRows); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } TEST(TpchNode, Region) { @@ -609,7 +629,7 @@ TEST(TpchNode, Region) { std::unordered_set seen_regionkey; for (auto& batch : res) { ValidateBatch(batch); - 
VerifyUniqueKey(seen_regionkey, batch[0], 0, kExpectedRows - 1); + VerifyUniqueKey(&seen_regionkey, batch[0], 0, kExpectedRows - 1); VerifyOneOf(batch[1], /*byte_width=*/25, {"AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST"}); @@ -617,7 +637,7 @@ TEST(TpchNode, Region) { num_rows += batch.length; } ASSERT_EQ(num_rows, 5); - arrow::internal::GetCpuThreadPool()->WaitForIdle(); } +} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index ffa3b30a5d4..1f62ce5b42a 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -890,8 +890,7 @@ class TableSorter { TableSorter(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const Table& table, const SortOptions& options) - : status_(), - ctx_(ctx), + : ctx_(ctx), table_(table), batches_(MakeBatches(table, &status_)), options_(options), From 8e18bf6182bc1a294e30c1e346927f3fa9177276 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 18 Mar 2022 11:10:24 -0700 Subject: [PATCH 24/34] Switch to regex, switch to named args --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 15 +++++++++------ cpp/src/arrow/compute/exec/tpch_node_test.cc | 18 +++--------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 3dcfddb63b9..905ab05ba94 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -70,7 +70,7 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin std::vector project_names = { "l_returnflag", "l_linestatus", "sum_qty", "sum_base_price", "sum_disc_price", "sum_charge", "avg_qty", "avg_price", "avg_disc"}; - ProjectNodeOptions project_opts(std::move(projection_list)); + ProjectNodeOptions project_opts(std::move(projection_list), std::move(project_names)); ScalarAggregateOptions sum_opts = ScalarAggregateOptions::Defaults(); CountOptions count_opts(CountOptions::CountMode::ALL); @@ -79,16 +79,19 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin {"hash_sum", &sum_opts}, {"hash_mean", &sum_opts}, {"hash_mean", &sum_opts}, {"hash_mean", &sum_opts}, {"hash_count", &count_opts}}; - std::vector cols = {2, 3, 4, 5, 6, 7, 8, 2}; + std::vector to_aggregate = {"sum_qty", "sum_base_price", "sum_disc_price", + "sum_charge", "avg_qty", "avg_price", + "avg_disc", "sum_qty"}; + std::vector names = {"sum_qty", "sum_base_price", "sum_disc_price", "sum_charge", "avg_qty", "avg_price", "avg_disc", "count_order"}; - std::vector keys = {"L_RETURNFLAG", "L_LINESTATUS"}; - AggregateNodeOptions agg_opts(aggs, cols, names, keys); + std::vector keys = {"l_returnflag", "l_linestatus"}; + AggregateNodeOptions agg_opts(aggs, to_aggregate, names, keys); - SortKey l_returnflag_key("L_RETURNFLAG"); - SortKey l_linestatus_key("L_LINESTATUS"); + SortKey l_returnflag_key("l_returnflag"); + SortKey l_linestatus_key("l_linestatus"); SortOptions sort_opts({l_returnflag_key, l_linestatus_key}); OrderBySinkNodeOptions order_by_opts(sort_opts, sink_gen); diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index 9f8fefff235..a69e20a8a6e 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -32,6 +32,7 @@ #include "arrow/util/thread_pool.h" #include +#include #include #include @@ -189,23 +190,10 @@ void VerifyPhone(const Datum& d) 
{ int64_t length = d.length(); const char* phones = reinterpret_cast(d.array()->buffers[1]->data()); constexpr int kByteWidth = 15; // This is common for all PHONE columns + std::regex exp("\\d{2}-\\d{3}-\\d{3}-\\d{4}"); for (int64_t i = 0; i < length; i++) { const char* row = phones + i * kByteWidth; - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_EQ(*row++, '-'); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_EQ(*row++, '-'); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_EQ(*row++, '-'); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); - ASSERT_TRUE(std::isdigit(*row++)); + ASSERT_TRUE(std::regex_match(row, row + kByteWidth, exp)); } } From 36967e94750ba9bf555a8cb60b1aa54bd190b032 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 24 Mar 2022 13:24:09 -0700 Subject: [PATCH 25/34] Respond to more comments, fix spurious crash (I think) --- cpp/src/arrow/compute/exec/tpch_node.cc | 2 +- cpp/src/arrow/compute/exec/tpch_node_test.cc | 148 ++++++------------- cpp/src/arrow/util/async_util.cc | 27 ++++ cpp/src/arrow/util/async_util.h | 4 + 4 files changed, 75 insertions(+), 106 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 954631e26d8..bcd16fd58cd 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -3372,7 +3372,7 @@ class TpchNode : public ExecNode { Status ScheduleTaskCallback(std::function func) { auto executor = plan_->exec_context()->executor(); if (executor) { - RETURN_NOT_OK(task_group_.AddTask([&] { + RETURN_NOT_OK(task_group_.AddTaskIfNotEnded([&] { return executor->Submit([this, func] { size_t thread_index = thread_indexer_(); Status status = func(thread_index); diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index a69e20a8a6e..4eb81068eed 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -44,6 +44,20 @@ static constexpr uint32_t kStartDate = static constexpr uint32_t kEndDate = 10591; // December 12, 1998 is 10591 days after January 1, 1970 +Result> GenerateTable( + Result (TpchGen::*table)(std::vector), + double scale_factor = 1.0) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, ExecPlan::Make(&ctx)); + ARROW_ASSIGN_OR_RAISE(TpchGen gen, TpchGen::Make(plan.get(), scale_factor)); + ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.*table)({}))); + AsyncGenerator> sink_gen; + Declaration sink("sink", {Declaration::Input(table_node)}, SinkNodeOptions{&sink_gen}); + ARROW_RETURN_NOT_OK(sink.AddToPlan(plan.get())); + auto fut = StartAndCollect(plan.get(), sink_gen); + return fut.MoveResult(); +} + // Verifies that the data is valid Arrow and ensures it's not null. 
void ValidateBatch(const ExecBatch& batch) { for (const Datum& d : batch.values) { @@ -224,18 +238,17 @@ void VerifyCorrectNumberOfWords_Varlen(const Datum& d, int num_words) { int32_t start = offsets[i]; int32_t end = offsets[i + 1]; int32_t str_len = end - start; - char tmp_str[256] = {}; - std::memcpy(tmp_str, str + start, str_len); + util::string_view view(str + start, str_len); bool is_only_alphas_or_spaces = true; - for (int32_t j = offsets[i]; j < offsets[i + 1]; j++) { - bool is_space = str[j] == ' '; + for (const char& c : view) { + bool is_space = c == ' '; actual_num_spaces += is_space; - is_only_alphas_or_spaces &= (is_space || std::isalpha(str[j])); + is_only_alphas_or_spaces &= (is_space || std::isalpha(c)); } ASSERT_TRUE(is_only_alphas_or_spaces) - << "Words must be composed only of letters, got " << tmp_str; + << "Words must be composed only of letters, got " << view; ASSERT_EQ(actual_num_spaces, expected_num_spaces) - << "Wrong number of spaces in " << tmp_str; + << "Wrong number of spaces in " << view; } } @@ -286,48 +299,37 @@ void VerifyOneOf(const Datum& d, int32_t byte_width, } // Counts the number of instances of each integer -void CountInstances(std::unordered_map& counts, const Datum& d) { +void CountInstances(std::unordered_map* counts, const Datum& d) { int64_t length = d.length(); const int32_t* nums = reinterpret_cast(d.array()->buffers[1]->data()); - for (int64_t i = 0; i < length; i++) counts[nums[i]]++; + for (int64_t i = 0; i < length; i++) (*counts)[nums[i]]++; } // For the S_COMMENT column, some of the columns must be modified to contain // "Customer...Complaints" or "Customer...Recommends". This function counts the number of // good and bad comments. -void CountModifiedComments(const Datum& d, int& good_count, int& bad_count) { +void CountModifiedComments(const Datum& d, int* good_count, int* bad_count) { int64_t length = d.length(); const int32_t* offsets = reinterpret_cast(d.array()->buffers[1]->data()); const char* str = reinterpret_cast(d.array()->buffers[2]->data()); - // Length of S_COMMENT is at most 100 - char tmp_string[101]; for (int64_t i = 0; i < length; i++) { const char* row = str + offsets[i]; int32_t row_length = offsets[i + 1] - offsets[i]; - std::memset(tmp_string, 0, sizeof(tmp_string)); - std::memcpy(tmp_string, row, row_length); - char* customer = std::strstr(tmp_string, "Customer"); - char* recommends = std::strstr(tmp_string, "Recommends"); - char* complaints = std::strstr(tmp_string, "Complaints"); + util::string_view view(row, row_length); + bool customer = view.find("Customer") != util::string_view::npos; + bool recommends = view.find("Recommends") != util::string_view::npos; + bool complaints = view.find("Complaints") != util::string_view::npos; if (customer) { - ASSERT_TRUE((recommends != nullptr) ^ (complaints != nullptr)); - if (recommends) good_count++; - if (complaints) bad_count++; + ASSERT_TRUE(recommends ^ complaints); + if (recommends) *good_count += 1; + if (complaints) *bad_count += 1; } } } TEST(TpchNode, ScaleFactor) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get(), 0.25f); - ExecNode* table = *gen.Supplier(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); + ASSERT_OK_AND_ASSIGN(auto res, 
GenerateTable(&TpchGen::Supplier, 0.25)); int64_t kExpectedRows = 2500; int64_t num_rows = 0; @@ -336,16 +338,7 @@ TEST(TpchNode, ScaleFactor) { } TEST(TpchNode, Supplier) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode* table = *gen.Supplier(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); - + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Supplier)); int64_t kExpectedRows = 10000; int64_t num_rows = 0; @@ -362,7 +355,7 @@ TEST(TpchNode, Supplier) { VerifyNationKey(batch[3]); VerifyPhone(batch[4]); VerifyDecimalsBetween(batch[5], -99999, 999999); - CountModifiedComments(batch[6], good_count, bad_count); + CountModifiedComments(batch[6], &good_count, &bad_count); num_rows += batch.length; } ASSERT_EQ(seen_suppkey.size(), kExpectedRows); @@ -372,15 +365,7 @@ TEST(TpchNode, Supplier) { } TEST(TpchNode, Part) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode* table = *gen.Part(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Part)); int64_t kExpectedRows = 200000; int64_t num_rows = 0; @@ -412,15 +397,7 @@ TEST(TpchNode, Part) { } TEST(TpchNode, PartSupp) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode* table = *gen.PartSupp(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::PartSupp)); constexpr int64_t kExpectedRows = 800000; int64_t num_rows = 0; @@ -428,7 +405,7 @@ TEST(TpchNode, PartSupp) { std::unordered_map counts; for (auto& batch : res) { ValidateBatch(batch); - CountInstances(counts, batch[0]); + CountInstances(&counts, batch[0]); VerifyAllBetween(batch[2], 1, 9999); VerifyDecimalsBetween(batch[3], 100, 100000); num_rows += batch.length; @@ -442,15 +419,7 @@ TEST(TpchNode, PartSupp) { } TEST(TpchNode, Customer) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode* table = *gen.Customer(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Customer)); const int64_t kExpectedRows = 150000; int64_t num_rows = 0; @@ -476,15 +445,7 @@ TEST(TpchNode, Customer) { } TEST(TpchNode, Orders) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen 
gen = *TpchGen::Make(plan.get()); - ExecNode* table = *gen.Orders(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Orders)); constexpr int64_t kExpectedRows = 1500000; int64_t num_rows = 0; @@ -519,19 +480,12 @@ TEST(TpchNode, Orders) { } TEST(TpchNode, Lineitem) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode* table = *gen.Lineitem(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Lineitem)); + std::unordered_map counts; for (auto& batch : res) { ValidateBatch(batch); - CountInstances(counts, batch[0]); + CountInstances(&counts, batch[0]); VerifyAllBetween(batch[1], /*min=*/1, /*max=*/200000); VerifyAllBetween(batch[3], /*min=*/1, /*max=*/7); VerifyDecimalsBetween(batch[4], /*min=*/100, /*max=*/5000); @@ -569,15 +523,7 @@ TEST(TpchNode, Lineitem) { } TEST(TpchNode, Nation) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode* table = *gen.Nation(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Nation)); constexpr int64_t kExpectedRows = 25; int64_t num_rows = 0; @@ -601,15 +547,7 @@ TEST(TpchNode, Nation) { } TEST(TpchNode, Region) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(&ctx); - TpchGen gen = *TpchGen::Make(plan.get()); - ExecNode* table = *gen.Region(); - AsyncGenerator> sink_gen; - Declaration sink("sink", {Declaration::Input(table)}, SinkNodeOptions{&sink_gen}); - std::ignore = *sink.AddToPlan(plan.get()); - auto fut = StartAndCollect(plan.get(), sink_gen); - auto res = *fut.MoveResult(); + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Region)); constexpr int64_t kExpectedRows = 5; int64_t num_rows = 0; diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc index 45355019ab8..3dacaa18b44 100644 --- a/cpp/src/arrow/util/async_util.cc +++ b/cpp/src/arrow/util/async_util.cc @@ -57,6 +57,22 @@ Status AsyncTaskGroup::AddTask(std::function>()> task) { return AddTaskUnlocked(*maybe_task_fut, std::move(guard)); } +Status AsyncTaskGroup::AddTaskIfNotEnded(std::function>()> task) { + auto guard = mutex_.Lock(); + if (finished_adding_) { + return Status::OK(); + } + if (!err_.ok()) { + return err_; + } + Result> maybe_task_fut = task(); + if (!maybe_task_fut.ok()) { + err_ = maybe_task_fut.status(); + return err_; + } + return AddTaskUnlocked(*maybe_task_fut, std::move(guard)); +} + Status AsyncTaskGroup::AddTaskUnlocked(const Future<>& task_fut, util::Mutex::Guard guard) { // If the task is already finished there is nothing to track so lets save @@ -89,6 +105,17 @@ Status 
AsyncTaskGroup::AddTask(const Future<>& task_fut) { return AddTaskUnlocked(task_fut, std::move(guard)); } +Status AsyncTaskGroup::AddTaskIfNotEnded(const Future<>& task_fut) { + auto guard = mutex_.Lock(); + if (finished_adding_) { + return Status::OK(); + } + if (!err_.ok()) { + return err_; + } + return AddTaskUnlocked(task_fut, std::move(guard)); +} + Future<> AsyncTaskGroup::End() { auto guard = mutex_.Lock(); finished_adding_ = true; diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h index fdac025030a..ab43aeee197 100644 --- a/cpp/src/arrow/util/async_util.h +++ b/cpp/src/arrow/util/async_util.h @@ -116,8 +116,12 @@ class ARROW_EXPORT AsyncTaskGroup { /// If WaitForTasksToFinish has been called and the returned future has been marked /// completed then adding a task will fail. Status AddTask(std::function>()> task); + /// Same as AddTask but doesn't add the task if End() has been called. + Status AddTaskIfNotEnded(std::function>()> task); /// Add a task that has already been started Status AddTask(const Future<>& task); + /// Same as AddTask but doesn't add the task if End() has been called. + Status AddTaskIfNotEnded(const Future<>& task); /// Signal that top level tasks are done being added /// /// It is allowed for tasks to be added after this call provided the future has not yet From d8016d67e77752fbeb6ce7931b2d7aaf4f007826 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 24 Mar 2022 13:40:42 -0700 Subject: [PATCH 26/34] Fix rebase error --- cpp/src/arrow/compute/exec/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index 1292213dc45..b2a21c2bd6b 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -32,7 +32,6 @@ add_arrow_compute_test(hash_join_node_test hash_join_node_test.cc bloom_filter_test.cc key_hash_test.cc) -add_arrow_compute_test(hash_join_node_test PREFIX "arrow-compute") add_arrow_compute_test(tpch_node_test PREFIX "arrow-compute") add_arrow_compute_test(union_node_test PREFIX "arrow-compute") add_arrow_compute_test(util_test PREFIX "arrow-compute") From 38958c6b33a06b2bab1c6003a62a0e5ffd03e364 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 24 Mar 2022 16:21:41 -0700 Subject: [PATCH 27/34] Make constant tables have static storage --- cpp/src/arrow/compute/exec/tpch_node.cc | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index bcd16fd58cd..37788edd0b4 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -3201,17 +3201,17 @@ class NationGenerator : public TpchTableGenerator { static constexpr size_t kRowCount = 25; static constexpr int32_t kNameByteWidth = 25; - const int32_t kNationKey[kRowCount] = {0, 1, 2, 3, 4, 5, 6, 7, 8, + static constexpr int32_t kNationKey[kRowCount] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; - const char* kCountryNames[kRowCount] = { + static constexpr const char* kCountryNames[kRowCount] = { "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", "FRANCE", "GERMANY", "INDIA", "INDONESIA", "IRAN", "IRAQ", "JAPAN", "JORDAN", "KENYA", "MOROCCO", "MOZAMBIQUE", "PERU", "CHINA", "ROMANIA", "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES"}; - const int32_t kRegionKey[kRowCount] = {0, 1, 1, 1, 4, 0, 3, 
3, 2, 2, 4, 4, 2, - 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1}; + static constexpr int32_t kRegionKey[kRowCount] = {0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, + 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1}; struct NATION { enum { @@ -3240,6 +3240,10 @@ class NationGenerator : public TpchTableGenerator { std::vector column_indices_; }; +constexpr int32_t NationGenerator::kNationKey[NationGenerator::kRowCount]; +constexpr const char *NationGenerator::kCountryNames[NationGenerator::kRowCount]; +constexpr int32_t NationGenerator::kRegionKey[NationGenerator::kRowCount]; + class RegionGenerator : public TpchTableGenerator { public: Status Init(std::vector columns, double /*scale_factor*/, @@ -3286,9 +3290,9 @@ class RegionGenerator : public TpchTableGenerator { static constexpr size_t kRowCount = 5; static constexpr int32_t kNameByteWidth = 25; - const int32_t kRegionKey[kRowCount] = {0, 1, 2, 3, 4}; - const char* kRegionNames[kRowCount] = {"AFRICA", "AMERICA", "ASIA", "EUROPE", - "MIDDLE EAST"}; + static constexpr int32_t kRegionKey[kRowCount] = {0, 1, 2, 3, 4}; + static constexpr const char* kRegionNames[kRowCount] = {"AFRICA", "AMERICA", "ASIA", "EUROPE", + "MIDDLE EAST"}; struct REGION { enum { @@ -3315,6 +3319,9 @@ class RegionGenerator : public TpchTableGenerator { std::vector column_indices_; }; +constexpr int32_t RegionGenerator::kRegionKey[RegionGenerator::kRowCount]; +constexpr const char *RegionGenerator::kRegionNames[RegionGenerator::kRowCount]; + class TpchNode : public ExecNode { public: TpchNode(ExecPlan* plan, const char* name, From 4caad353bf7c7e3d28595706b47aa04ed036585b Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 24 Mar 2022 17:26:09 -0700 Subject: [PATCH 28/34] clang-format --- cpp/src/arrow/compute/exec/tpch_node.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 37788edd0b4..5e5b2d75b80 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -3202,8 +3202,8 @@ class NationGenerator : public TpchTableGenerator { static constexpr size_t kRowCount = 25; static constexpr int32_t kNameByteWidth = 25; static constexpr int32_t kNationKey[kRowCount] = {0, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24}; + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24}; static constexpr const char* kCountryNames[kRowCount] = { "ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", "ETHIOPIA", "FRANCE", "GERMANY", "INDIA", "INDONESIA", @@ -3211,7 +3211,7 @@ class NationGenerator : public TpchTableGenerator { "MOROCCO", "MOZAMBIQUE", "PERU", "CHINA", "ROMANIA", "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES"}; static constexpr int32_t kRegionKey[kRowCount] = {0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, - 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1}; + 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1}; struct NATION { enum { @@ -3241,7 +3241,7 @@ class NationGenerator : public TpchTableGenerator { }; constexpr int32_t NationGenerator::kNationKey[NationGenerator::kRowCount]; -constexpr const char *NationGenerator::kCountryNames[NationGenerator::kRowCount]; +constexpr const char* NationGenerator::kCountryNames[NationGenerator::kRowCount]; constexpr int32_t NationGenerator::kRegionKey[NationGenerator::kRowCount]; class RegionGenerator : public TpchTableGenerator { @@ -3291,8 +3291,8 @@ class RegionGenerator : public TpchTableGenerator { static constexpr size_t kRowCount = 5; static constexpr 
int32_t kNameByteWidth = 25; static constexpr int32_t kRegionKey[kRowCount] = {0, 1, 2, 3, 4}; - static constexpr const char* kRegionNames[kRowCount] = {"AFRICA", "AMERICA", "ASIA", "EUROPE", - "MIDDLE EAST"}; + static constexpr const char* kRegionNames[kRowCount] = {"AFRICA", "AMERICA", "ASIA", + "EUROPE", "MIDDLE EAST"}; struct REGION { enum { @@ -3320,7 +3320,7 @@ class RegionGenerator : public TpchTableGenerator { }; constexpr int32_t RegionGenerator::kRegionKey[RegionGenerator::kRowCount]; -constexpr const char *RegionGenerator::kRegionNames[RegionGenerator::kRowCount]; +constexpr const char* RegionGenerator::kRegionNames[RegionGenerator::kRowCount]; class TpchNode : public ExecNode { public: From dd37803fae6a56cc7493687af93477e01180d231 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 28 Mar 2022 14:51:33 -0700 Subject: [PATCH 29/34] Add comment --- cpp/src/arrow/compute/exec/tpch_node.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 5e5b2d75b80..e016be3341b 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -3378,6 +3378,14 @@ class TpchNode : public ExecNode { Status ScheduleTaskCallback(std::function func) { auto executor = plan_->exec_context()->executor(); + + // Due to the way that the generators schedule tasks, there may be more tasks + // than output batches. After outputting the last batch, the generator will + // end the task group, but there may still be other threads that try to schedule + // tasks while the task group is being ended. This can result in adding tasks after + // the task group is ended. If those tasks were to be executed, correctness would + // not be affected as they'd see the generator is done and exit immediately. As such, + // if the task group is ended we can just skip scheduling these tasks in general. if (executor) { RETURN_NOT_OK(task_group_.AddTaskIfNotEnded([&] { return executor->Submit([this, func] { From a7adc5c1ad6ac8d6af44f89ac9b7e125150953d9 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 29 Mar 2022 15:15:41 +0200 Subject: [PATCH 30/34] Some nits --- cpp/src/arrow/compute/exec/tpch_node_test.cc | 37 +++++++++++--------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index 4eb81068eed..c7f960644f2 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -39,6 +39,7 @@ namespace arrow { namespace compute { namespace internal { + static constexpr uint32_t kStartDate = 8035; // January 1, 1992 is 8035 days after January 1, 1970 static constexpr uint32_t kEndDate = @@ -61,8 +62,11 @@ Result> GenerateTable( // Verifies that the data is valid Arrow and ensures it's not null. 
void ValidateBatch(const ExecBatch& batch) { for (const Datum& d : batch.values) { - ASSERT_EQ(d.array()->buffers[0].get(), nullptr); - ASSERT_OK(d.make_array()->ValidateFull()); + ASSERT_EQ(d.kind(), Datum::ARRAY); + const auto array = d.make_array(); + ASSERT_OK(array->ValidateFull()); + TestInitialized(*array); + ASSERT_EQ(array->data()->buffers[0].get(), nullptr); } } @@ -79,28 +83,27 @@ void VerifyUniqueKey(std::unordered_set* seen, const Datum& d, int32_t } } -void VerifyStringAndNumber_Single(const util::string_view& row, const char* prefix, - const int64_t i, const int32_t* nums, - bool verify_padding) { - size_t num_offset = static_cast(std::strlen(prefix)); +void VerifyStringAndNumber_Single(const util::string_view& row, + const util::string_view& prefix, const int64_t i, + const int32_t* nums, bool verify_padding) { ASSERT_TRUE(row.starts_with(prefix)) << row << ", prefix=" << prefix << ", i=" << i; - const char* num_str = row.data() + num_offset; + const char* num_str = row.data() + prefix.size(); + const char* num_str_end = row.data() + row.size(); int64_t num = 0; - size_t ibyte = num_offset; - // Parse the number out - for (; *num_str && ibyte < row.size(); ibyte++) { + // Parse the number out; note that it can be padded with NUL chars at the end + for (; *num_str && num_str < num_str_end; num_str++) { num *= 10; - ASSERT_TRUE(std::isdigit(*num_str)); - num += *num_str++ - '0'; + ASSERT_TRUE(std::isdigit(*num_str)) << row << ", prefix=" << prefix << ", i=" << i; + num += *num_str - '0'; } // If nums is not null, ensure it matches the parsed number if (nums) { - ASSERT_EQ(static_cast(num), nums[i]); + ASSERT_EQ(num, nums[i]); } // TPC-H requires only ever requires padding up to 9 digits, so we ensure that // the total length of the string was at least 9 (could be more for bigger numbers). if (verify_padding) { - int64_t num_chars = static_cast(ibyte - num_offset); + const auto num_chars = num_str - (row.data() + prefix.size()); ASSERT_GE(num_chars, 9); } } @@ -110,7 +113,7 @@ void VerifyStringAndNumber_Single(const util::string_view& row, const char* pref // corresponding row in numbers. Some TPC-H data is padded to 9 zeros, which this function // can optionally verify as well. This string function verifies fixed width columns. 
void VerifyStringAndNumber_FixedWidth(const Datum& strings, const Datum& numbers, - int byte_width, const char* prefix, + int byte_width, const util::string_view& prefix, bool verify_padding = true) { int64_t length = strings.length(); const char* str = reinterpret_cast(strings.array()->buffers[1]->data()); @@ -130,7 +133,8 @@ void VerifyStringAndNumber_FixedWidth(const Datum& strings, const Datum& numbers // Same as above but for variable length columns void VerifyStringAndNumber_Varlen(const Datum& strings, const Datum& numbers, - const char* prefix, bool verify_padding = true) { + const util::string_view& prefix, + bool verify_padding = true) { int64_t length = strings.length(); const int32_t* offsets = reinterpret_cast(strings.array()->buffers[1]->data()); @@ -564,6 +568,7 @@ TEST(TpchNode, Region) { } ASSERT_EQ(num_rows, 5); } + } // namespace internal } // namespace compute } // namespace arrow From 31e693e478b2168283bf921bdd3fb11ff6738c71 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 29 Mar 2022 15:33:40 +0200 Subject: [PATCH 31/34] Hide implementation details --- cpp/src/arrow/compute/exec/expression.h | 2 +- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 8 ++- cpp/src/arrow/compute/exec/tpch_node.cc | 70 +++++++++++++++----- cpp/src/arrow/compute/exec/tpch_node.h | 53 +++++---------- cpp/src/arrow/compute/exec/tpch_node_test.cc | 5 +- 5 files changed, 79 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index 6c2d9e8e2a5..dbc8da7bbb1 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -117,7 +117,7 @@ class ARROW_EXPORT Expression { // post-bind properties ValueDescr descr; - internal::SmallVector indices; + ::arrow::internal::SmallVector indices; }; const Parameter* parameter() const; diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 905ab05ba94..98a32265712 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -18,6 +18,7 @@ #include #include "arrow/compute/cast.h" +#include "arrow/compute/exec/options.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/tpch_node.h" #include "arrow/testing/future_util.h" @@ -32,11 +33,12 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin ExecContext* ctx = default_exec_context(); *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); std::shared_ptr plan = *ExecPlan::Make(ctx); - TpchGen gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); + std::unique_ptr gen = + *TpchGen::Make(plan.get(), static_cast(scale_factor)); ExecNode* lineitem = - *gen.Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", "L_SHIPDATE", - "L_RETURNFLAG", "L_LINESTATUS"}); + *gen->Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", + "L_SHIPDATE", "L_RETURNFLAG", "L_LINESTATUS"}); auto sept_2_1998 = std::make_shared( 10471); // September 2, 1998 is 10471 days after January 1, 1970 diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index e016be3341b..48d77ee0334 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -16,10 +16,13 @@ // under the License. 
#include "arrow/compute/exec/tpch_node.h" +#include "arrow/buffer.h" +#include "arrow/compute/exec/exec_plan.h" #include "arrow/util/formatting.h" #include "arrow/util/future.h" #include "arrow/util/io_util.h" #include "arrow/util/make_unique.h" +#include "arrow/util/pcg_random.h" #include "arrow/util/unreachable.h" #include @@ -69,6 +72,7 @@ function PS_PARTKEY). */ namespace { + const char* NameParts[] = { "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", "blanched", "blue", "blush", "brown", "burlywood", @@ -225,8 +229,6 @@ constexpr uint32_t kEndDate = std::uniform_int_distribution kSeedDist(std::numeric_limits::min(), std::numeric_limits::max()); -} // namespace - // The spec says to generate a 300 MB string according to a grammar. This is a // concurrent implementation of the generator. Each thread generates the text in // (up to) 8KB chunks of text. The generator maintains a cursor into the @@ -3411,27 +3413,49 @@ class TpchNode : public ExecNode { ThreadIndexer thread_indexer_; }; -Result TpchGen::Make(ExecPlan* plan, double scale_factor, int64_t batch_size, - util::optional seed) { - if (!seed.has_value()) seed = GetRandomSeed(); - TpchGen result(plan, scale_factor, batch_size, *seed); - return result; -} +class TpchGenImpl : public TpchGen { + public: + Result Supplier(std::vector columns = {}) override; + Result Part(std::vector columns = {}) override; + Result PartSupp(std::vector columns = {}) override; + Result Customer(std::vector columns = {}) override; + Result Orders(std::vector columns = {}) override; + Result Lineitem(std::vector columns = {}) override; + Result Nation(std::vector columns = {}) override; + Result Region(std::vector columns = {}) override; + + TpchGenImpl(ExecPlan* plan, double scale_factor, int64_t batch_size, int64_t seed) + : plan_(plan), + scale_factor_(scale_factor), + batch_size_(batch_size), + seed_rng_(seed) {} + + template + Result CreateNode(const char* name, std::vector columns); + + ExecPlan* plan_; + double scale_factor_; + int64_t batch_size_; + random::pcg64_fast seed_rng_; + + std::shared_ptr part_and_part_supp_generator_{}; + std::shared_ptr orders_and_line_item_generator_{}; +}; template -Result TpchGen::CreateNode(const char* name, - std::vector columns) { +Result TpchGenImpl::CreateNode(const char* name, + std::vector columns) { std::unique_ptr generator = arrow::internal::make_unique(); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, name, std::move(generator)); } -Result TpchGen::Supplier(std::vector columns) { +Result TpchGenImpl::Supplier(std::vector columns) { return CreateNode("Supplier", std::move(columns)); } -Result TpchGen::Part(std::vector columns) { +Result TpchGenImpl::Part(std::vector columns) { if (!part_and_part_supp_generator_) { part_and_part_supp_generator_ = std::make_shared(); } @@ -3442,7 +3466,7 @@ Result TpchGen::Part(std::vector columns) { return plan_->EmplaceNode(plan_, "Part", std::move(generator)); } -Result TpchGen::PartSupp(std::vector columns) { +Result TpchGenImpl::PartSupp(std::vector columns) { if (!part_and_part_supp_generator_) { part_and_part_supp_generator_ = std::make_shared(); } @@ -3453,11 +3477,11 @@ Result TpchGen::PartSupp(std::vector columns) { return plan_->EmplaceNode(plan_, "PartSupp", std::move(generator)); } -Result TpchGen::Customer(std::vector columns) { +Result TpchGenImpl::Customer(std::vector columns) { return CreateNode("Customer", std::move(columns)); } -Result 
TpchGen::Orders(std::vector columns) { +Result TpchGenImpl::Orders(std::vector columns) { if (!orders_and_line_item_generator_) { orders_and_line_item_generator_ = std::make_shared(); } @@ -3468,7 +3492,7 @@ Result TpchGen::Orders(std::vector columns) { return plan_->EmplaceNode(plan_, "Orders", std::move(generator)); } -Result TpchGen::Lineitem(std::vector columns) { +Result TpchGenImpl::Lineitem(std::vector columns) { if (!orders_and_line_item_generator_) { orders_and_line_item_generator_ = std::make_shared(); } @@ -3479,13 +3503,23 @@ Result TpchGen::Lineitem(std::vector columns) { return plan_->EmplaceNode(plan_, "Lineitem", std::move(generator)); } -Result TpchGen::Nation(std::vector columns) { +Result TpchGenImpl::Nation(std::vector columns) { return CreateNode("Nation", std::move(columns)); } -Result TpchGen::Region(std::vector columns) { +Result TpchGenImpl::Region(std::vector columns) { return CreateNode("Region", std::move(columns)); } + +} // namespace + +Result> TpchGen::Make(ExecPlan* plan, double scale_factor, + int64_t batch_size, + util::optional seed) { + if (!seed.has_value()) seed = GetRandomSeed(); + return std::unique_ptr(new TpchGenImpl(plan, scale_factor, batch_size, *seed)); +} + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index 42dd9e4adbc..fb9376982b1 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -17,23 +17,23 @@ #pragma once +#include #include #include -#include "arrow/compute/exec/exec_plan.h" -#include "arrow/compute/exec/options.h" + +#include "arrow/compute/type_fwd.h" #include "arrow/result.h" #include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/util/pcg_random.h" +#include "arrow/util/optional.h" namespace arrow { namespace compute { namespace internal { -class OrdersAndLineItemGenerator; -class PartAndPartSupplierGenerator; class ARROW_EXPORT TpchGen { public: + virtual ~TpchGen() = default; + /* * \brief Create a factory for nodes that generate TPC-H data * @@ -42,40 +42,23 @@ class ARROW_EXPORT TpchGen { * table from that single TpchGen instance. Note: Every batch will be scheduled as a new * task using the ExecPlan's scheduler. */ - static Result Make(ExecPlan* plan, double scale_factor = 1.0, - int64_t batch_size = 4096, - util::optional seed = util::nullopt); + static Result> Make( + ExecPlan* plan, double scale_factor = 1.0, int64_t batch_size = 4096, + util::optional seed = util::nullopt); // The below methods will create and add an ExecNode to the plan that generates // data for the desired table. If columns is empty, all columns will be generated. // The methods return the added ExecNode, which should be used for inputs. 
- Result Supplier(std::vector columns = {}); - Result Part(std::vector columns = {}); - Result PartSupp(std::vector columns = {}); - Result Customer(std::vector columns = {}); - Result Orders(std::vector columns = {}); - Result Lineitem(std::vector columns = {}); - Result Nation(std::vector columns = {}); - Result Region(std::vector columns = {}); - - private: - TpchGen(ExecPlan* plan, double scale_factor, int64_t batch_size, int64_t seed) - : plan_(plan), - scale_factor_(scale_factor), - batch_size_(batch_size), - seed_rng_(seed) {} - - template - Result CreateNode(const char* name, std::vector columns); - - ExecPlan* plan_; - double scale_factor_; - int64_t batch_size_; - random::pcg64_fast seed_rng_; - - std::shared_ptr part_and_part_supp_generator_{}; - std::shared_ptr orders_and_line_item_generator_{}; + virtual Result Supplier(std::vector columns = {}) = 0; + virtual Result Part(std::vector columns = {}) = 0; + virtual Result PartSupp(std::vector columns = {}) = 0; + virtual Result Customer(std::vector columns = {}) = 0; + virtual Result Orders(std::vector columns = {}) = 0; + virtual Result Lineitem(std::vector columns = {}) = 0; + virtual Result Nation(std::vector columns = {}) = 0; + virtual Result Region(std::vector columns = {}) = 0; }; + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index c7f960644f2..8face53eebb 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -50,8 +50,9 @@ Result> GenerateTable( double scale_factor = 1.0) { ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, ExecPlan::Make(&ctx)); - ARROW_ASSIGN_OR_RAISE(TpchGen gen, TpchGen::Make(plan.get(), scale_factor)); - ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.*table)({}))); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr gen, + TpchGen::Make(plan.get(), scale_factor)); + ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.get()->*table)({}))); AsyncGenerator> sink_gen; Declaration sink("sink", {Declaration::Input(table_node)}, SinkNodeOptions{&sink_gen}); ARROW_RETURN_NOT_OK(sink.AddToPlan(plan.get())); From df884c17f65626234702e83dd8cd505e1b129f7d Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 29 Mar 2022 10:44:51 -0700 Subject: [PATCH 32/34] Revert "Hide implementation details" This reverts commit 31e693e478b2168283bf921bdd3fb11ff6738c71. 
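For reference, the change this commit reverts (and which a later patch in the series re-applies) is the standard way of hiding implementation details behind a stable header: the public class keeps only pure virtual methods plus a static factory returning std::unique_ptr, and the concrete subclass lives in an anonymous namespace inside the .cc file. A minimal sketch of that idiom with hypothetical names (Widget, WidgetImpl), written as a single translation unit for brevity:

#include <memory>
#include <string>

// What would live in the public header: no data members, no generator headers.
class Widget {
 public:
  virtual ~Widget() = default;
  static std::unique_ptr<Widget> Make(int seed);  // factory instead of a constructor
  virtual std::string Describe() const = 0;       // the public API surface
};

// What would live in the .cc file: the only code that knows the concrete type.
namespace {
class WidgetImpl : public Widget {
 public:
  explicit WidgetImpl(int seed) : seed_(seed) {}
  std::string Describe() const override { return "widget #" + std::to_string(seed_); }

 private:
  int seed_;  // implementation detail, invisible to users of the header
};
}  // namespace

std::unique_ptr<Widget> Widget::Make(int seed) {
  return std::unique_ptr<Widget>(new WidgetImpl(seed));
}

int main() {
  std::unique_ptr<Widget> w = Widget::Make(42);
  return w->Describe().empty() ? 1 : 0;
}

The cost is one heap allocation in Make() and a virtual call per method; the benefit is that private members such as the pcg64 seed generator and the shared table generators no longer leak into every translation unit that includes tpch_node.h, which is exactly what the TpchGenImpl change does.
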
--- cpp/src/arrow/compute/exec/expression.h | 2 +- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 8 +-- cpp/src/arrow/compute/exec/tpch_node.cc | 70 +++++--------------- cpp/src/arrow/compute/exec/tpch_node.h | 53 ++++++++++----- cpp/src/arrow/compute/exec/tpch_node_test.cc | 5 +- 5 files changed, 59 insertions(+), 79 deletions(-) diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index dbc8da7bbb1..6c2d9e8e2a5 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -117,7 +117,7 @@ class ARROW_EXPORT Expression { // post-bind properties ValueDescr descr; - ::arrow::internal::SmallVector indices; + internal::SmallVector indices; }; const Parameter* parameter() const; diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 98a32265712..905ab05ba94 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -18,7 +18,6 @@ #include #include "arrow/compute/cast.h" -#include "arrow/compute/exec/options.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/tpch_node.h" #include "arrow/testing/future_util.h" @@ -33,12 +32,11 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin ExecContext* ctx = default_exec_context(); *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); std::shared_ptr plan = *ExecPlan::Make(ctx); - std::unique_ptr gen = - *TpchGen::Make(plan.get(), static_cast(scale_factor)); + TpchGen gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); ExecNode* lineitem = - *gen->Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", - "L_SHIPDATE", "L_RETURNFLAG", "L_LINESTATUS"}); + *gen.Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", "L_SHIPDATE", + "L_RETURNFLAG", "L_LINESTATUS"}); auto sept_2_1998 = std::make_shared( 10471); // September 2, 1998 is 10471 days after January 1, 1970 diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 48d77ee0334..e016be3341b 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -16,13 +16,10 @@ // under the License. #include "arrow/compute/exec/tpch_node.h" -#include "arrow/buffer.h" -#include "arrow/compute/exec/exec_plan.h" #include "arrow/util/formatting.h" #include "arrow/util/future.h" #include "arrow/util/io_util.h" #include "arrow/util/make_unique.h" -#include "arrow/util/pcg_random.h" #include "arrow/util/unreachable.h" #include @@ -72,7 +69,6 @@ function PS_PARTKEY). */ namespace { - const char* NameParts[] = { "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", "blanched", "blue", "blush", "brown", "burlywood", @@ -229,6 +225,8 @@ constexpr uint32_t kEndDate = std::uniform_int_distribution kSeedDist(std::numeric_limits::min(), std::numeric_limits::max()); +} // namespace + // The spec says to generate a 300 MB string according to a grammar. This is a // concurrent implementation of the generator. Each thread generates the text in // (up to) 8KB chunks of text. 
The generator maintains a cursor into the @@ -3413,49 +3411,27 @@ class TpchNode : public ExecNode { ThreadIndexer thread_indexer_; }; -class TpchGenImpl : public TpchGen { - public: - Result Supplier(std::vector columns = {}) override; - Result Part(std::vector columns = {}) override; - Result PartSupp(std::vector columns = {}) override; - Result Customer(std::vector columns = {}) override; - Result Orders(std::vector columns = {}) override; - Result Lineitem(std::vector columns = {}) override; - Result Nation(std::vector columns = {}) override; - Result Region(std::vector columns = {}) override; - - TpchGenImpl(ExecPlan* plan, double scale_factor, int64_t batch_size, int64_t seed) - : plan_(plan), - scale_factor_(scale_factor), - batch_size_(batch_size), - seed_rng_(seed) {} - - template - Result CreateNode(const char* name, std::vector columns); - - ExecPlan* plan_; - double scale_factor_; - int64_t batch_size_; - random::pcg64_fast seed_rng_; - - std::shared_ptr part_and_part_supp_generator_{}; - std::shared_ptr orders_and_line_item_generator_{}; -}; +Result TpchGen::Make(ExecPlan* plan, double scale_factor, int64_t batch_size, + util::optional seed) { + if (!seed.has_value()) seed = GetRandomSeed(); + TpchGen result(plan, scale_factor, batch_size, *seed); + return result; +} template -Result TpchGenImpl::CreateNode(const char* name, - std::vector columns) { +Result TpchGen::CreateNode(const char* name, + std::vector columns) { std::unique_ptr generator = arrow::internal::make_unique(); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, name, std::move(generator)); } -Result TpchGenImpl::Supplier(std::vector columns) { +Result TpchGen::Supplier(std::vector columns) { return CreateNode("Supplier", std::move(columns)); } -Result TpchGenImpl::Part(std::vector columns) { +Result TpchGen::Part(std::vector columns) { if (!part_and_part_supp_generator_) { part_and_part_supp_generator_ = std::make_shared(); } @@ -3466,7 +3442,7 @@ Result TpchGenImpl::Part(std::vector columns) { return plan_->EmplaceNode(plan_, "Part", std::move(generator)); } -Result TpchGenImpl::PartSupp(std::vector columns) { +Result TpchGen::PartSupp(std::vector columns) { if (!part_and_part_supp_generator_) { part_and_part_supp_generator_ = std::make_shared(); } @@ -3477,11 +3453,11 @@ Result TpchGenImpl::PartSupp(std::vector columns) { return plan_->EmplaceNode(plan_, "PartSupp", std::move(generator)); } -Result TpchGenImpl::Customer(std::vector columns) { +Result TpchGen::Customer(std::vector columns) { return CreateNode("Customer", std::move(columns)); } -Result TpchGenImpl::Orders(std::vector columns) { +Result TpchGen::Orders(std::vector columns) { if (!orders_and_line_item_generator_) { orders_and_line_item_generator_ = std::make_shared(); } @@ -3492,7 +3468,7 @@ Result TpchGenImpl::Orders(std::vector columns) { return plan_->EmplaceNode(plan_, "Orders", std::move(generator)); } -Result TpchGenImpl::Lineitem(std::vector columns) { +Result TpchGen::Lineitem(std::vector columns) { if (!orders_and_line_item_generator_) { orders_and_line_item_generator_ = std::make_shared(); } @@ -3503,23 +3479,13 @@ Result TpchGenImpl::Lineitem(std::vector columns) { return plan_->EmplaceNode(plan_, "Lineitem", std::move(generator)); } -Result TpchGenImpl::Nation(std::vector columns) { +Result TpchGen::Nation(std::vector columns) { return CreateNode("Nation", std::move(columns)); } -Result TpchGenImpl::Region(std::vector columns) { +Result 
TpchGen::Region(std::vector columns) { return CreateNode("Region", std::move(columns)); } - -} // namespace - -Result> TpchGen::Make(ExecPlan* plan, double scale_factor, - int64_t batch_size, - util::optional seed) { - if (!seed.has_value()) seed = GetRandomSeed(); - return std::unique_ptr(new TpchGenImpl(plan, scale_factor, batch_size, *seed)); -} - } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index fb9376982b1..42dd9e4adbc 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -17,23 +17,23 @@ #pragma once -#include #include #include - -#include "arrow/compute/type_fwd.h" +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/compute/exec/options.h" #include "arrow/result.h" #include "arrow/status.h" -#include "arrow/util/optional.h" +#include "arrow/type.h" +#include "arrow/util/pcg_random.h" namespace arrow { namespace compute { namespace internal { +class OrdersAndLineItemGenerator; +class PartAndPartSupplierGenerator; class ARROW_EXPORT TpchGen { public: - virtual ~TpchGen() = default; - /* * \brief Create a factory for nodes that generate TPC-H data * @@ -42,23 +42,40 @@ class ARROW_EXPORT TpchGen { * table from that single TpchGen instance. Note: Every batch will be scheduled as a new * task using the ExecPlan's scheduler. */ - static Result> Make( - ExecPlan* plan, double scale_factor = 1.0, int64_t batch_size = 4096, - util::optional seed = util::nullopt); + static Result Make(ExecPlan* plan, double scale_factor = 1.0, + int64_t batch_size = 4096, + util::optional seed = util::nullopt); // The below methods will create and add an ExecNode to the plan that generates // data for the desired table. If columns is empty, all columns will be generated. // The methods return the added ExecNode, which should be used for inputs. 
- virtual Result Supplier(std::vector columns = {}) = 0; - virtual Result Part(std::vector columns = {}) = 0; - virtual Result PartSupp(std::vector columns = {}) = 0; - virtual Result Customer(std::vector columns = {}) = 0; - virtual Result Orders(std::vector columns = {}) = 0; - virtual Result Lineitem(std::vector columns = {}) = 0; - virtual Result Nation(std::vector columns = {}) = 0; - virtual Result Region(std::vector columns = {}) = 0; -}; + Result Supplier(std::vector columns = {}); + Result Part(std::vector columns = {}); + Result PartSupp(std::vector columns = {}); + Result Customer(std::vector columns = {}); + Result Orders(std::vector columns = {}); + Result Lineitem(std::vector columns = {}); + Result Nation(std::vector columns = {}); + Result Region(std::vector columns = {}); + + private: + TpchGen(ExecPlan* plan, double scale_factor, int64_t batch_size, int64_t seed) + : plan_(plan), + scale_factor_(scale_factor), + batch_size_(batch_size), + seed_rng_(seed) {} + template + Result CreateNode(const char* name, std::vector columns); + + ExecPlan* plan_; + double scale_factor_; + int64_t batch_size_; + random::pcg64_fast seed_rng_; + + std::shared_ptr part_and_part_supp_generator_{}; + std::shared_ptr orders_and_line_item_generator_{}; +}; } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index 8face53eebb..c7f960644f2 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -50,9 +50,8 @@ Result> GenerateTable( double scale_factor = 1.0) { ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, ExecPlan::Make(&ctx)); - ARROW_ASSIGN_OR_RAISE(std::unique_ptr gen, - TpchGen::Make(plan.get(), scale_factor)); - ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.get()->*table)({}))); + ARROW_ASSIGN_OR_RAISE(TpchGen gen, TpchGen::Make(plan.get(), scale_factor)); + ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.*table)({}))); AsyncGenerator> sink_gen; Declaration sink("sink", {Declaration::Input(table_node)}, SinkNodeOptions{&sink_gen}); ARROW_RETURN_NOT_OK(sink.AddToPlan(plan.get())); From 3a8583b43c81d260e70b7ee4745d4fcf5bdbbff0 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 29 Mar 2022 11:13:24 -0700 Subject: [PATCH 33/34] Fix ASAN issue --- cpp/src/arrow/compute/exec/tpch_node.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index e016be3341b..2bf559f331b 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -3162,7 +3162,7 @@ class NationGenerator : public TpchTableGenerator { FinishedCallback finished_callback, ScheduleCallback /*schedule_task_callback*/) override { std::shared_ptr N_NATIONKEY_buffer = - Buffer::Wrap(kNationKey, sizeof(kNationKey)); + Buffer::Wrap(kNationKey, kRowCount); ArrayData N_NATIONKEY_arraydata(int32(), kRowCount, {nullptr, std::move(N_NATIONKEY_buffer)}); @@ -3175,7 +3175,7 @@ class NationGenerator : public TpchTableGenerator { {nullptr, std::move(N_NAME_buffer)}); std::shared_ptr N_REGIONKEY_buffer = - Buffer::Wrap(kRegionKey, sizeof(kRegionKey)); + Buffer::Wrap(kRegionKey, kRowCount); ArrayData N_REGIONKEY_arraydata(int32(), kRowCount, {nullptr, std::move(N_REGIONKEY_buffer)}); @@ -3259,7 +3259,7 @@ class RegionGenerator : public 
TpchTableGenerator { FinishedCallback finished_callback, ScheduleCallback /*schedule_task_callback*/) override { std::shared_ptr R_REGIONKEY_buffer = - Buffer::Wrap(kRegionKey, sizeof(kRegionKey)); + Buffer::Wrap(kRegionKey, kRowCount); ArrayData R_REGIONKEY_arraydata(int32(), kRowCount, {nullptr, std::move(R_REGIONKEY_buffer)}); From 8bc6b8c5a8e94821773ef050e4c0bde9dfb3ffc9 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 29 Mar 2022 12:27:36 -0700 Subject: [PATCH 34/34] Revert "Revert "Hide implementation details"" This reverts commit df884c17f65626234702e83dd8cd505e1b129f7d. --- cpp/src/arrow/compute/exec/expression.h | 2 +- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 8 +- cpp/src/arrow/compute/exec/tpch_node.cc | 79 ++++++++++++++------ cpp/src/arrow/compute/exec/tpch_node.h | 53 +++++-------- cpp/src/arrow/compute/exec/tpch_node_test.cc | 5 +- 5 files changed, 82 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index 6c2d9e8e2a5..dbc8da7bbb1 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -117,7 +117,7 @@ class ARROW_EXPORT Expression { // post-bind properties ValueDescr descr; - internal::SmallVector indices; + ::arrow::internal::SmallVector indices; }; const Parameter* parameter() const; diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 905ab05ba94..98a32265712 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -18,6 +18,7 @@ #include #include "arrow/compute/cast.h" +#include "arrow/compute/exec/options.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/tpch_node.h" #include "arrow/testing/future_util.h" @@ -32,11 +33,12 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin ExecContext* ctx = default_exec_context(); *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); std::shared_ptr plan = *ExecPlan::Make(ctx); - TpchGen gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); + std::unique_ptr gen = + *TpchGen::Make(plan.get(), static_cast(scale_factor)); ExecNode* lineitem = - *gen.Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", "L_SHIPDATE", - "L_RETURNFLAG", "L_LINESTATUS"}); + *gen->Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", + "L_SHIPDATE", "L_RETURNFLAG", "L_LINESTATUS"}); auto sept_2_1998 = std::make_shared( 10471); // September 2, 1998 is 10471 days after January 1, 1970 diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index 2bf559f331b..c7397970d79 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -16,10 +16,13 @@ // under the License. #include "arrow/compute/exec/tpch_node.h" +#include "arrow/buffer.h" +#include "arrow/compute/exec/exec_plan.h" #include "arrow/util/formatting.h" #include "arrow/util/future.h" #include "arrow/util/io_util.h" #include "arrow/util/make_unique.h" +#include "arrow/util/pcg_random.h" #include "arrow/util/unreachable.h" #include @@ -69,6 +72,7 @@ function PS_PARTKEY). 
*/ namespace { + const char* NameParts[] = { "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", "blanched", "blue", "blush", "brown", "burlywood", @@ -225,8 +229,6 @@ constexpr uint32_t kEndDate = std::uniform_int_distribution kSeedDist(std::numeric_limits::min(), std::numeric_limits::max()); -} // namespace - // The spec says to generate a 300 MB string according to a grammar. This is a // concurrent implementation of the generator. Each thread generates the text in // (up to) 8KB chunks of text. The generator maintains a cursor into the @@ -3161,8 +3163,7 @@ class NationGenerator : public TpchTableGenerator { Status StartProducing(size_t /*num_threads*/, OutputBatchCallback output_callback, FinishedCallback finished_callback, ScheduleCallback /*schedule_task_callback*/) override { - std::shared_ptr N_NATIONKEY_buffer = - Buffer::Wrap(kNationKey, kRowCount); + std::shared_ptr N_NATIONKEY_buffer = Buffer::Wrap(kNationKey, kRowCount); ArrayData N_NATIONKEY_arraydata(int32(), kRowCount, {nullptr, std::move(N_NATIONKEY_buffer)}); @@ -3174,8 +3175,7 @@ class NationGenerator : public TpchTableGenerator { ArrayData N_NAME_arraydata(fixed_size_binary(kNameByteWidth), kRowCount, {nullptr, std::move(N_NAME_buffer)}); - std::shared_ptr N_REGIONKEY_buffer = - Buffer::Wrap(kRegionKey, kRowCount); + std::shared_ptr N_REGIONKEY_buffer = Buffer::Wrap(kRegionKey, kRowCount); ArrayData N_REGIONKEY_arraydata(int32(), kRowCount, {nullptr, std::move(N_REGIONKEY_buffer)}); @@ -3258,8 +3258,7 @@ class RegionGenerator : public TpchTableGenerator { Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, FinishedCallback finished_callback, ScheduleCallback /*schedule_task_callback*/) override { - std::shared_ptr R_REGIONKEY_buffer = - Buffer::Wrap(kRegionKey, kRowCount); + std::shared_ptr R_REGIONKEY_buffer = Buffer::Wrap(kRegionKey, kRowCount); ArrayData R_REGIONKEY_arraydata(int32(), kRowCount, {nullptr, std::move(R_REGIONKEY_buffer)}); @@ -3411,27 +3410,49 @@ class TpchNode : public ExecNode { ThreadIndexer thread_indexer_; }; -Result TpchGen::Make(ExecPlan* plan, double scale_factor, int64_t batch_size, - util::optional seed) { - if (!seed.has_value()) seed = GetRandomSeed(); - TpchGen result(plan, scale_factor, batch_size, *seed); - return result; -} +class TpchGenImpl : public TpchGen { + public: + Result Supplier(std::vector columns = {}) override; + Result Part(std::vector columns = {}) override; + Result PartSupp(std::vector columns = {}) override; + Result Customer(std::vector columns = {}) override; + Result Orders(std::vector columns = {}) override; + Result Lineitem(std::vector columns = {}) override; + Result Nation(std::vector columns = {}) override; + Result Region(std::vector columns = {}) override; + + TpchGenImpl(ExecPlan* plan, double scale_factor, int64_t batch_size, int64_t seed) + : plan_(plan), + scale_factor_(scale_factor), + batch_size_(batch_size), + seed_rng_(seed) {} + + template + Result CreateNode(const char* name, std::vector columns); + + ExecPlan* plan_; + double scale_factor_; + int64_t batch_size_; + random::pcg64_fast seed_rng_; + + std::shared_ptr part_and_part_supp_generator_{}; + std::shared_ptr orders_and_line_item_generator_{}; +}; template -Result TpchGen::CreateNode(const char* name, - std::vector columns) { +Result TpchGenImpl::CreateNode(const char* name, + std::vector columns) { std::unique_ptr generator = arrow::internal::make_unique(); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, 
kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, name, std::move(generator)); } -Result TpchGen::Supplier(std::vector columns) { +Result TpchGenImpl::Supplier(std::vector columns) { return CreateNode("Supplier", std::move(columns)); } -Result TpchGen::Part(std::vector columns) { +Result TpchGenImpl::Part(std::vector columns) { if (!part_and_part_supp_generator_) { part_and_part_supp_generator_ = std::make_shared(); } @@ -3442,7 +3463,7 @@ Result TpchGen::Part(std::vector columns) { return plan_->EmplaceNode(plan_, "Part", std::move(generator)); } -Result TpchGen::PartSupp(std::vector columns) { +Result TpchGenImpl::PartSupp(std::vector columns) { if (!part_and_part_supp_generator_) { part_and_part_supp_generator_ = std::make_shared(); } @@ -3453,11 +3474,11 @@ Result TpchGen::PartSupp(std::vector columns) { return plan_->EmplaceNode(plan_, "PartSupp", std::move(generator)); } -Result TpchGen::Customer(std::vector columns) { +Result TpchGenImpl::Customer(std::vector columns) { return CreateNode("Customer", std::move(columns)); } -Result TpchGen::Orders(std::vector columns) { +Result TpchGenImpl::Orders(std::vector columns) { if (!orders_and_line_item_generator_) { orders_and_line_item_generator_ = std::make_shared(); } @@ -3468,7 +3489,7 @@ Result TpchGen::Orders(std::vector columns) { return plan_->EmplaceNode(plan_, "Orders", std::move(generator)); } -Result TpchGen::Lineitem(std::vector columns) { +Result TpchGenImpl::Lineitem(std::vector columns) { if (!orders_and_line_item_generator_) { orders_and_line_item_generator_ = std::make_shared(); } @@ -3479,13 +3500,23 @@ Result TpchGen::Lineitem(std::vector columns) { return plan_->EmplaceNode(plan_, "Lineitem", std::move(generator)); } -Result TpchGen::Nation(std::vector columns) { +Result TpchGenImpl::Nation(std::vector columns) { return CreateNode("Nation", std::move(columns)); } -Result TpchGen::Region(std::vector columns) { +Result TpchGenImpl::Region(std::vector columns) { return CreateNode("Region", std::move(columns)); } + +} // namespace + +Result> TpchGen::Make(ExecPlan* plan, double scale_factor, + int64_t batch_size, + util::optional seed) { + if (!seed.has_value()) seed = GetRandomSeed(); + return std::unique_ptr(new TpchGenImpl(plan, scale_factor, batch_size, *seed)); +} + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index 42dd9e4adbc..fb9376982b1 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -17,23 +17,23 @@ #pragma once +#include #include #include -#include "arrow/compute/exec/exec_plan.h" -#include "arrow/compute/exec/options.h" + +#include "arrow/compute/type_fwd.h" #include "arrow/result.h" #include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/util/pcg_random.h" +#include "arrow/util/optional.h" namespace arrow { namespace compute { namespace internal { -class OrdersAndLineItemGenerator; -class PartAndPartSupplierGenerator; class ARROW_EXPORT TpchGen { public: + virtual ~TpchGen() = default; + /* * \brief Create a factory for nodes that generate TPC-H data * @@ -42,40 +42,23 @@ class ARROW_EXPORT TpchGen { * table from that single TpchGen instance. Note: Every batch will be scheduled as a new * task using the ExecPlan's scheduler. 
   */
-  static Result<TpchGen> Make(ExecPlan* plan, double scale_factor = 1.0,
-                              int64_t batch_size = 4096,
-                              util::optional<int64_t> seed = util::nullopt);
+  static Result<std::unique_ptr<TpchGen>> Make(
+      ExecPlan* plan, double scale_factor = 1.0, int64_t batch_size = 4096,
+      util::optional<int64_t> seed = util::nullopt);
 
   // The below methods will create and add an ExecNode to the plan that generates
   // data for the desired table. If columns is empty, all columns will be generated.
   // The methods return the added ExecNode, which should be used for inputs.
-  Result<ExecNode*> Supplier(std::vector<std::string> columns = {});
-  Result<ExecNode*> Part(std::vector<std::string> columns = {});
-  Result<ExecNode*> PartSupp(std::vector<std::string> columns = {});
-  Result<ExecNode*> Customer(std::vector<std::string> columns = {});
-  Result<ExecNode*> Orders(std::vector<std::string> columns = {});
-  Result<ExecNode*> Lineitem(std::vector<std::string> columns = {});
-  Result<ExecNode*> Nation(std::vector<std::string> columns = {});
-  Result<ExecNode*> Region(std::vector<std::string> columns = {});
-
- private:
-  TpchGen(ExecPlan* plan, double scale_factor, int64_t batch_size, int64_t seed)
-      : plan_(plan),
-        scale_factor_(scale_factor),
-        batch_size_(batch_size),
-        seed_rng_(seed) {}
-
-  template <typename Generator>
-  Result<ExecNode*> CreateNode(const char* name, std::vector<std::string> columns);
-
-  ExecPlan* plan_;
-  double scale_factor_;
-  int64_t batch_size_;
-  random::pcg64_fast seed_rng_;
-
-  std::shared_ptr<PartAndPartSupplierGenerator> part_and_part_supp_generator_{};
-  std::shared_ptr<OrdersAndLineItemGenerator> orders_and_line_item_generator_{};
+  virtual Result<ExecNode*> Supplier(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Part(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> PartSupp(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Customer(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Orders(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Lineitem(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Nation(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Region(std::vector<std::string> columns = {}) = 0;
 };
+
 }  // namespace internal
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc
index c7f960644f2..8face53eebb 100644
--- a/cpp/src/arrow/compute/exec/tpch_node_test.cc
+++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc
@@ -50,8 +50,9 @@ Result<std::shared_ptr<Table>> GenerateTable(
     double scale_factor = 1.0) {
   ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool());
   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ExecPlan> plan, ExecPlan::Make(&ctx));
-  ARROW_ASSIGN_OR_RAISE(TpchGen gen, TpchGen::Make(plan.get(), scale_factor));
-  ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.*table)({})));
+  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<TpchGen> gen,
+                        TpchGen::Make(plan.get(), scale_factor));
+  ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.get()->*table)({})));
   AsyncGenerator<util::optional<ExecBatch>> sink_gen;
   Declaration sink("sink", {Declaration::Input(table_node)}, SinkNodeOptions{&sink_gen});
   ARROW_RETURN_NOT_OK(sink.AddToPlan(plan.get()));
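With the interface restored, callers hold the generator through std::unique_ptr<TpchGen>, as the updated GenerateTable() helper above shows. A condensed sketch of the same call pattern outside the test harness, generating the NATION table into a sink (starting the plan and draining sink_gen are omitted; the function name is hypothetical, types and headers follow this patch):

#include "arrow/compute/exec/exec_plan.h"
#include "arrow/compute/exec/options.h"
#include "arrow/compute/exec/tpch_node.h"
#include "arrow/memory_pool.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/async_generator.h"
#include "arrow/util/optional.h"
#include "arrow/util/thread_pool.h"

namespace arrow {

Status GenerateNationExample() {
  compute::ExecContext ctx(default_memory_pool(), internal::GetCpuThreadPool());
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<compute::ExecPlan> plan,
                        compute::ExecPlan::Make(&ctx));
  // Make() now hands back the abstract interface behind a unique_ptr.
  ARROW_ASSIGN_OR_RAISE(
      std::unique_ptr<compute::internal::TpchGen> gen,
      compute::internal::TpchGen::Make(plan.get(), /*scale_factor=*/1.0));
  // An empty column list asks for every column of the table.
  ARROW_ASSIGN_OR_RAISE(compute::ExecNode * nation, gen->Nation({}));
  AsyncGenerator<util::optional<compute::ExecBatch>> sink_gen;
  compute::Declaration sink("sink", {compute::Declaration::Input(nation)},
                            compute::SinkNodeOptions{&sink_gen});
  ARROW_RETURN_NOT_OK(sink.AddToPlan(plan.get()));
  return Status::OK();
}

}  // namespace arrow
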