diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fb5d603d6e8..57483266757 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -399,6 +399,7 @@ if(ARROW_COMPUTE) compute/exec/sink_node.cc compute/exec/source_node.cc compute/exec/task_util.cc + compute/exec/tpch_node.cc compute/exec/union_node.cc compute/exec/util.cc compute/function.cc diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index 53ba77b6088..b2a21c2bd6b 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -32,11 +32,14 @@ add_arrow_compute_test(hash_join_node_test hash_join_node_test.cc bloom_filter_test.cc key_hash_test.cc) +add_arrow_compute_test(tpch_node_test PREFIX "arrow-compute") add_arrow_compute_test(union_node_test PREFIX "arrow-compute") add_arrow_compute_test(util_test PREFIX "arrow-compute") add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(tpch_benchmark PREFIX "arrow-compute") + if(ARROW_BUILD_OPENMP_BENCHMARKS) find_package(OpenMP REQUIRED) add_arrow_benchmark(hash_join_benchmark diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index 6c2d9e8e2a5..dbc8da7bbb1 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -117,7 +117,7 @@ class ARROW_EXPORT Expression { // post-bind properties ValueDescr descr; - internal::SmallVector indices; + ::arrow::internal::SmallVector indices; }; const Parameter* parameter() const; diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc new file mode 100644 index 00000000000..98a32265712 --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/cast.h" +#include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/test_util.h" +#include "arrow/compute/exec/tpch_node.h" +#include "arrow/testing/future_util.h" +#include "arrow/util/make_unique.h" + +namespace arrow { +namespace compute { +namespace internal { + +std::shared_ptr Plan_Q1(AsyncGenerator>* sink_gen, + int scale_factor) { + ExecContext* ctx = default_exec_context(); + *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + std::shared_ptr plan = *ExecPlan::Make(ctx); + std::unique_ptr gen = + *TpchGen::Make(plan.get(), static_cast(scale_factor)); + + ExecNode* lineitem = + *gen->Lineitem({"L_QUANTITY", "L_EXTENDEDPRICE", "L_TAX", "L_DISCOUNT", + "L_SHIPDATE", "L_RETURNFLAG", "L_LINESTATUS"}); + + auto sept_2_1998 = std::make_shared( + 10471); // September 2, 1998 is 10471 days after January 1, 1970 + Expression filter = + less_equal(field_ref("L_SHIPDATE"), literal(std::move(sept_2_1998))); + FilterNodeOptions filter_opts(filter); + + Expression l_returnflag = field_ref("L_RETURNFLAG"); + Expression l_linestatus = field_ref("L_LINESTATUS"); + Expression quantity = field_ref("L_QUANTITY"); + Expression base_price = field_ref("L_EXTENDEDPRICE"); + + std::shared_ptr decimal_1 = + 
std::make_shared(Decimal128{0, 100}, decimal(12, 2)); + Expression discount_multiplier = + call("subtract", {literal(decimal_1), field_ref("L_DISCOUNT")}); + Expression tax_multiplier = call("add", {literal(decimal_1), field_ref("L_TAX")}); + Expression disc_price = + call("multiply", {field_ref("L_EXTENDEDPRICE"), discount_multiplier}); + Expression charge = + call("multiply", + {call("cast", + {call("multiply", {field_ref("L_EXTENDEDPRICE"), discount_multiplier})}, + compute::CastOptions::Unsafe(decimal(12, 2))), + tax_multiplier}); + Expression discount = field_ref("L_DISCOUNT"); + + std::vector projection_list = {l_returnflag, l_linestatus, quantity, + base_price, disc_price, charge, + quantity, base_price, discount}; + std::vector project_names = { + "l_returnflag", "l_linestatus", "sum_qty", "sum_base_price", "sum_disc_price", + "sum_charge", "avg_qty", "avg_price", "avg_disc"}; + ProjectNodeOptions project_opts(std::move(projection_list), std::move(project_names)); + + ScalarAggregateOptions sum_opts = ScalarAggregateOptions::Defaults(); + CountOptions count_opts(CountOptions::CountMode::ALL); + std::vector aggs = { + {"hash_sum", &sum_opts}, {"hash_sum", &sum_opts}, {"hash_sum", &sum_opts}, + {"hash_sum", &sum_opts}, {"hash_mean", &sum_opts}, {"hash_mean", &sum_opts}, + {"hash_mean", &sum_opts}, {"hash_count", &count_opts}}; + + std::vector to_aggregate = {"sum_qty", "sum_base_price", "sum_disc_price", + "sum_charge", "avg_qty", "avg_price", + "avg_disc", "sum_qty"}; + + std::vector names = {"sum_qty", "sum_base_price", "sum_disc_price", + "sum_charge", "avg_qty", "avg_price", + "avg_disc", "count_order"}; + + std::vector keys = {"l_returnflag", "l_linestatus"}; + AggregateNodeOptions agg_opts(aggs, to_aggregate, names, keys); + + SortKey l_returnflag_key("l_returnflag"); + SortKey l_linestatus_key("l_linestatus"); + SortOptions sort_opts({l_returnflag_key, l_linestatus_key}); + OrderBySinkNodeOptions order_by_opts(sort_opts, sink_gen); + + Declaration 
filter_decl("filter", {Declaration::Input(lineitem)}, filter_opts); + Declaration project_decl("project", project_opts); + Declaration aggregate_decl("aggregate", agg_opts); + Declaration orderby_decl("order_by_sink", order_by_opts); + + Declaration q1 = + Declaration::Sequence({filter_decl, project_decl, aggregate_decl, orderby_decl}); + std::ignore = *q1.AddToPlan(plan.get()); + return plan; +} + +static void BM_Tpch_Q1(benchmark::State& st) { + for (auto _ : st) { + st.PauseTiming(); + AsyncGenerator> sink_gen; + std::shared_ptr plan = Plan_Q1(&sink_gen, static_cast(st.range(0))); + st.ResumeTiming(); + auto fut = StartAndCollect(plan.get(), sink_gen); + auto res = *fut.MoveResult(); + } +} + +BENCHMARK(BM_Tpch_Q1)->Args({1})->ArgNames({"ScaleFactor"}); +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc new file mode 100644 index 00000000000..c7397970d79 --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -0,0 +1,3522 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/compute/exec/tpch_node.h" +#include "arrow/buffer.h" +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/util/formatting.h" +#include "arrow/util/future.h" +#include "arrow/util/io_util.h" +#include "arrow/util/make_unique.h" +#include "arrow/util/pcg_random.h" +#include "arrow/util/unreachable.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace arrow { +using internal::checked_cast; +using internal::GetRandomSeed; + +namespace compute { +namespace internal { +/* +Architecture of the generator: + This is a multithreaded implementation of TPC-H's DBGen data generator. For each table + that doesn't depend on any other tables, it gets its own generator. For tables that + are dependent on each other (namely [ORDERS, LINEITEM] and [PART, PARTSUPP]), we +implement a "double generator". Each generator is given a list of columns to generate. +Columns are then generated lazily a batch at a time. If column A depends on another column +B, column A will ensure that column B is always generated. We don't have to worry about +breaking cycles of dependencies because the dependency graph is acyclic (so there's no +code to even bother breaking cycles). Double generators work by maintaining two output +queues. Batches for each of the tables are generated in sync, and batches that are not +being output immediately are appended to the queue. + + To generate a batch, we grab a lock, increment a counter, check the output queue (if +applicable), and then call the necessary generator functions. To generate a column, we +first check if it's already been generated. If not, we allocate the batch and then fill it +according to the spec. + + There are a few types of columns that get generated: + - Primary Keys: incrementing counters from 1 to N (with N being the number of rows). +These are generated by incrementing a counter (this counter is gated under a lock). 
+ - V-String: Random-length string of alphanumerics + - Phone Number: 2, 3, 3, and 4, digit numbers separated by -'s + - Random Numbers: random numbers within some range + - Expressions: expressions based on some other columns + Please consult the spec for intended behavior of each individual column. Columns are +generated by a function of the same name (so e.g. the column PS_PARTKEY is generated by a +function PS_PARTKEY). +*/ + +namespace { + +const char* NameParts[] = { + "almond", "antique", "aquamarine", "azure", "beige", "bisque", + "black", "blanched", "blue", "blush", "brown", "burlywood", + "burnished", "chartreuse", "chiffon", "chocolate", "coral", "cornflower", + "cornsilk", "cream", "cyan", "dark", "deep", "dim", + "dodger", "drab", "firebrick", "floral", "forest", "frosted", + "gainsboro", "ghost", "goldenrod", "green", "grey", "honeydew", + "hot", "indian", "ivory", "khaki", "lace", "lavender", + "lawn", "lemon", "light", "lime", "linen", "magenta", + "maroon", "medium", "metallic", "midnight", "mint", "misty", + "moccasin", "navajo", "navy", "olive", "orange", "orchid", + "pale", "papaya", "peach", "peru", "pink", "plum", + "powder", "puff", "purple", "red", "rose", "rosy", + "royal", "saddle", "salmon", "sandy", "seashell", "sienna", + "sky", "slate", "smoke", "snow", "spring", "steel", + "tan", "thistle", "tomato", "turquoise", "violet", "wheat", + "white", "yellow", +}; +constexpr size_t kNumNameParts = sizeof(NameParts) / sizeof(NameParts[0]); + +const char* Types_1[] = { + "STANDARD ", "SMALL ", "MEDIUM ", "LARGE ", "ECONOMY ", "PROMO ", +}; +constexpr size_t kNumTypes_1 = sizeof(Types_1) / sizeof(Types_1[0]); + +const char* Types_2[] = { + "ANODIZED ", "BURNISHED ", "PLATED ", "POLISHED ", "BRUSHED ", +}; +constexpr size_t kNumTypes_2 = sizeof(Types_2) / sizeof(Types_2[0]); + +const char* Types_3[] = { + "TIN", "NICKEL", "BRASS", "STEEL", "COPPER", +}; +constexpr size_t kNumTypes_3 = sizeof(Types_3) / sizeof(Types_3[0]); + +const char* 
Containers_1[] = { + "SM ", "LG ", "MD ", "JUMBO ", "WRAP ", +}; +constexpr size_t kNumContainers_1 = sizeof(Containers_1) / sizeof(Containers_1[0]); + +const char* Containers_2[] = { + "CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM", +}; +constexpr size_t kNumContainers_2 = sizeof(Containers_2) / sizeof(Containers_2[0]); + +const char* Segments[] = { + "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD", +}; +constexpr size_t kNumSegments = sizeof(Segments) / sizeof(Segments[0]); + +const char* Priorities[] = { + "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW", +}; +constexpr size_t kNumPriorities = sizeof(Priorities) / sizeof(Priorities[0]); + +const char* Instructions[] = { + "DELIVER IN PERSON", + "COLLECT COD", + "NONE", + "TAKE BACK RETURN", +}; +constexpr size_t kNumInstructions = sizeof(Instructions) / sizeof(Instructions[0]); + +const char* Modes[] = { + "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB", +}; +constexpr size_t kNumModes = sizeof(Modes) / sizeof(Modes[0]); + +const char* Nouns[] = { + "foxes ", "ideas ", "theodolites ", "pinto beans ", "instructions ", + "dependencies ", "excuses ", "platelets ", "asymptotes ", "courts ", + "dolphins ", "multipliers ", "sautemes ", "warthogs ", "frets ", + "dinos ", "attainments ", "somas ", "Tiresias '", "patterns ", + "forges ", "braids ", "hockey players ", "frays ", "warhorses ", + "dugouts ", "notomis ", "epitaphs ", "pearls ", "tithes ", + "waters ", "orbits ", "gifts ", "sheaves ", "depths ", + "sentiments ", "decoys ", "realms ", "pains ", "grouches ", + "escapades ", +}; +constexpr size_t kNumNouns = sizeof(Nouns) / sizeof(Nouns[0]); + +const char* Verbs[] = { + "sleep ", "wake ", "are ", "cajole ", "haggle ", "nag ", "use ", + "boost ", "affix ", "detect ", "integrate ", "maintain ", "nod ", "was ", + "lose ", "sublate ", "solve ", "thrash ", "promise ", "engage ", "hinder ", + "print ", "x-ray ", "breach ", "eat ", "grow ", "impress ", "mold ", + "poach ", 
"serve ", "run ", "dazzle ", "snooze ", "doze ", "unwind ", + "kindle ", "play ", "hang ", "believe ", "doubt ", +}; +constexpr size_t kNumVerbs = sizeof(Verbs) / sizeof(Verbs[0]); + +const char* Adjectives[] = { + "furious ", "sly ", "careful ", "blithe ", "quick ", "fluffy ", "slow ", + "quiet ", "ruthless ", "thin ", "close ", "dogged ", "daring ", "brave ", + "stealthy ", "permanent ", "enticing ", "idle ", "busy ", "regular ", "final ", + "ironic ", "even ", "bold ", "silent ", +}; +constexpr size_t kNumAdjectives = sizeof(Adjectives) / sizeof(Adjectives[0]); + +const char* Adverbs[] = { + "sometimes ", "always ", "never ", "furiously ", "slyly ", "carefully ", + "blithely ", "quickly ", "fluffily ", "slowly ", "quietly ", "ruthlessly ", + "thinly ", "closely ", "doggedly ", "daringly ", "bravely ", "stealthily ", + "permanently ", "enticingly ", "idly ", "busily ", "regularly ", "finally ", + "ironically ", "evenly ", "boldly ", "silently ", +}; +constexpr size_t kNumAdverbs = sizeof(Adverbs) / sizeof(Adverbs[0]); + +const char* Prepositions[] = { + "about ", "above ", "according to ", "across ", "after ", "against ", + "along ", "alongside of ", "among ", "around ", "at ", "atop ", + "before ", "behind ", "beneath ", "beside ", "besides ", "between ", + "beyond ", "beyond ", "by ", "despite ", "during ", "except ", + "for ", "from ", "in place of ", "inside ", "instead of ", "into ", + "near ", "of ", "on ", "outside ", "over ", "past ", + "since ", "through ", "throughout ", "to ", "toward ", "under ", + "until ", "up ", "upon ", "without ", "with ", "within ", +}; +constexpr size_t kNumPrepositions = sizeof(Prepositions) / sizeof(Prepositions[0]); + +const char* Auxiliaries[] = { + "do ", + "may ", + "might ", + "shall ", + "will ", + "would ", + "can ", + "could ", + "should ", + "ought to ", + "must ", + "will have to ", + "shall have to ", + "could have to ", + "should have to ", + "must have to ", + "need to ", + "try to ", +}; +constexpr size_t 
kNumAuxiliaries = sizeof(Auxiliaries) / sizeof(Auxiliaries[0]); + +const char* Terminators[] = { + ".", ";", ":", "?", "!", "--", +}; +constexpr size_t kNumTerminators = sizeof(Terminators) / sizeof(Terminators[0]); + +constexpr uint32_t kStartDate = + 8035; // January 1, 1992 is 8035 days after January 1, 1970 +constexpr uint32_t kCurrentDate = + 9298; // June 17, 1995 is 9298 days after January 1, 1970 +constexpr uint32_t kEndDate = + 10591; // December 12, 1998 is 10591 days after January 1, 1970 + +std::uniform_int_distribution kSeedDist(std::numeric_limits::min(), + std::numeric_limits::max()); +// The spec says to generate a 300 MB string according to a grammar. This is a +// concurrent implementation of the generator. Each thread generates the text in +// (up to) 8KB chunks of text. The generator maintains a cursor into the +// 300 MB buffer. After generating the chunk, the cursor is incremented +// to reserve space, and the chunk is memcpy-d in. +// This text is used to generate the COMMENT columns. To generate a comment, the spec +// says to pick a random length and a random offset into the 300 MB buffer (it does +// not specify it should be word/sentence aligned), and that slice of text becomes +// the comment. 
+class TpchPseudotext { + public: + Status EnsureInitialized(random::pcg32_fast& rng); + Result GenerateComments(size_t num_comments, size_t min_length, + size_t max_length, random::pcg32_fast& rng); + + private: + bool GenerateWord(int64_t& offset, random::pcg32_fast& rng, char* arr, + const char** words, size_t num_choices); + bool GenerateNoun(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateVerb(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateAdjective(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateAdverb(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GeneratePreposition(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateAuxiliary(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateTerminator(int64_t& offset, random::pcg32_fast& rng, char* arr); + + bool GenerateNounPhrase(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GenerateVerbPhrase(int64_t& offset, random::pcg32_fast& rng, char* arr); + bool GeneratePrepositionalPhrase(int64_t& offset, random::pcg32_fast& rng, char* arr); + + bool GenerateSentence(int64_t& offset, random::pcg32_fast& rng, char* arr); + + std::atomic done_ = {false}; + int64_t generated_offset_{0}; + std::mutex text_guard_; + std::unique_ptr text_; + static constexpr int64_t kChunkSize = 8192; + static constexpr int64_t kTextBytes = 300 * 1024 * 1024; // 300 MB +}; + +static TpchPseudotext g_text; + +Status TpchPseudotext::EnsureInitialized(random::pcg32_fast& rng) { + if (done_.load()) return Status::OK(); + + { + std::lock_guard lock(text_guard_); + if (!text_) { + ARROW_ASSIGN_OR_RAISE(text_, AllocateBuffer(kTextBytes)); + } + } + char* out = reinterpret_cast(text_->mutable_data()); + char temp_buff[kChunkSize]; + + while (!done_.load()) { + int64_t known_valid_offset = 0; + int64_t try_offset = 0; + while (GenerateSentence(try_offset, rng, temp_buff)) known_valid_offset = try_offset; + + bool last_one; + int64_t 
offset; + int64_t memcpy_size; + { + std::lock_guard lock(text_guard_); + if (done_.load()) return Status::OK(); + int64_t bytes_remaining = kTextBytes - generated_offset_; + memcpy_size = std::min(known_valid_offset, bytes_remaining); + offset = generated_offset_; + generated_offset_ += memcpy_size; + last_one = generated_offset_ == kTextBytes; + } + std::memcpy(out + offset, temp_buff, memcpy_size); + if (last_one) done_.store(true); + } + return Status::OK(); +} + +Result TpchPseudotext::GenerateComments(size_t num_comments, size_t min_length, + size_t max_length, + random::pcg32_fast& rng) { + RETURN_NOT_OK(EnsureInitialized(rng)); + std::uniform_int_distribution length_dist(min_length, max_length); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buffer, + AllocateBuffer(sizeof(int32_t) * (num_comments + 1))); + int32_t* offsets = reinterpret_cast(offset_buffer->mutable_data()); + offsets[0] = 0; + for (size_t i = 1; i <= num_comments; i++) + offsets[i] = offsets[i - 1] + static_cast(length_dist(rng)); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr comment_buffer, + AllocateBuffer(offsets[num_comments])); + char* comments = reinterpret_cast(comment_buffer->mutable_data()); + for (size_t i = 0; i < num_comments; i++) { + size_t length = offsets[i + 1] - offsets[i]; + std::uniform_int_distribution offset_dist(0, kTextBytes - length); + size_t offset_in_text = offset_dist(rng); + std::memcpy(comments + offsets[i], text_->data() + offset_in_text, length); + } + ArrayData ad(utf8(), num_comments, + {nullptr, std::move(offset_buffer), std::move(comment_buffer)}); + return std::move(ad); +} + +bool TpchPseudotext::GenerateWord(int64_t& offset, random::pcg32_fast& rng, char* arr, + const char** words, size_t num_choices) { + std::uniform_int_distribution dist(0, num_choices - 1); + const char* word = words[dist(rng)]; + size_t length = std::strlen(word); + if (offset + length > kChunkSize) return false; + std::memcpy(arr + offset, word, length); + offset += length; + return 
true; +} + +bool TpchPseudotext::GenerateNoun(int64_t& offset, random::pcg32_fast& rng, char* arr) { + return GenerateWord(offset, rng, arr, Nouns, kNumNouns); +} + +bool TpchPseudotext::GenerateVerb(int64_t& offset, random::pcg32_fast& rng, char* arr) { + return GenerateWord(offset, rng, arr, Verbs, kNumVerbs); +} + +bool TpchPseudotext::GenerateAdjective(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + return GenerateWord(offset, rng, arr, Adjectives, kNumAdjectives); +} + +bool TpchPseudotext::GenerateAdverb(int64_t& offset, random::pcg32_fast& rng, char* arr) { + return GenerateWord(offset, rng, arr, Adverbs, kNumAdverbs); +} + +bool TpchPseudotext::GeneratePreposition(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + return GenerateWord(offset, rng, arr, Prepositions, kNumPrepositions); +} + +bool TpchPseudotext::GenerateAuxiliary(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + return GenerateWord(offset, rng, arr, Auxiliaries, kNumAuxiliaries); +} + +bool TpchPseudotext::GenerateTerminator(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + bool result = GenerateWord(offset, rng, arr, Terminators, kNumTerminators); + // Swap the space with the terminator + if (result) std::swap(*(arr + offset - 2), *(arr + offset - 1)); + return result; +} + +bool TpchPseudotext::GenerateNounPhrase(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + std::uniform_int_distribution dist(0, 3); + const char* comma_space = ", "; + bool success = true; + switch (dist(rng)) { + case 0: + success &= GenerateNoun(offset, rng, arr); + break; + case 1: + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); + break; + case 2: + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateWord(--offset, rng, arr, &comma_space, 1); + success &= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); + break; + case 3: + success &= GenerateAdverb(offset, rng, arr); + success 
&= GenerateAdjective(offset, rng, arr); + success &= GenerateNoun(offset, rng, arr); + break; + default: + Unreachable("Random number should be between 0 and 3 inclusive"); + break; + } + return success; +} + +bool TpchPseudotext::GenerateVerbPhrase(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + std::uniform_int_distribution dist(0, 3); + bool success = true; + switch (dist(rng)) { + case 0: + success &= GenerateVerb(offset, rng, arr); + break; + case 1: + success &= GenerateAuxiliary(offset, rng, arr); + success &= GenerateVerb(offset, rng, arr); + break; + case 2: + success &= GenerateVerb(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); + break; + case 3: + success &= GenerateAuxiliary(offset, rng, arr); + success &= GenerateVerb(offset, rng, arr); + success &= GenerateAdverb(offset, rng, arr); + break; + default: + Unreachable("Random number should be between 0 and 3 inclusive"); + break; + } + return success; +} + +bool TpchPseudotext::GeneratePrepositionalPhrase(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + const char* the_space = "the "; + bool success = true; + success &= GeneratePreposition(offset, rng, arr); + success &= GenerateWord(offset, rng, arr, &the_space, 1); + success &= GenerateNounPhrase(offset, rng, arr); + return success; +} + +bool TpchPseudotext::GenerateSentence(int64_t& offset, random::pcg32_fast& rng, + char* arr) { + std::uniform_int_distribution dist(0, 4); + bool success = true; + switch (dist(rng)) { + case 0: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + case 1: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + case 2: + success &= GenerateNounPhrase(offset, rng, arr); + success &= 
GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + case 3: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GenerateNounPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + case 4: + success &= GenerateNounPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateVerbPhrase(offset, rng, arr); + success &= GeneratePrepositionalPhrase(offset, rng, arr); + success &= GenerateTerminator(offset, rng, arr); + break; + default: + Unreachable("Random number should be between 0 and 5 inclusive"); + break; + } + return success; +} + +class TpchTableGenerator { + public: + using OutputBatchCallback = std::function; + using FinishedCallback = std::function; + using GenerateFn = std::function; + using ScheduleCallback = std::function; + using AbortCallback = std::function; + + virtual Status Init(std::vector columns, double scale_factor, + int64_t batch_size, int64_t seed) = 0; + + virtual Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) = 0; + + bool Abort() { + bool expected = false; + return done_.compare_exchange_strong(expected, true); + } + + virtual std::shared_ptr schema() const = 0; + + virtual ~TpchTableGenerator() = default; + + protected: + int64_t seed_ = {0}; + std::atomic done_ = {false}; + std::atomic batches_outputted_ = {0}; +}; + +int GetNumDigits(int64_t x) { + // This if statement chain is for MAXIMUM SPEED + // Source: + // https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c + ARROW_DCHECK(x >= 0); + if (x < 10ll) return 1; + if (x < 100ll) return 2; + if (x < 1000ll) return 3; + if (x < 10000ll) 
return 4; + if (x < 100000ll) return 5; + if (x < 1000000ll) return 6; + if (x < 10000000ll) return 7; + if (x < 100000000ll) return 8; + if (x < 1000000000ll) return 9; + if (x < 10000000000ll) return 10; + if (x < 100000000000ll) return 11; + if (x < 1000000000000ll) return 12; + if (x < 10000000000000ll) return 13; + if (x < 100000000000000ll) return 14; + if (x < 1000000000000000ll) return 15; + if (x < 10000000000000000ll) return 16; + if (x < 100000000000000000ll) return 17; + if (x < 1000000000000000000ll) return 18; + Unreachable("Positive 64-bit integer should never have more than 18 digits"); + return -1; +} + +void AppendNumberPaddedToNineDigits(char* out, int64_t x) { + size_t kPad = 9; + out += std::max(kPad, static_cast(GetNumDigits(x))); + arrow::internal::detail::FormatAllDigitsLeftPadded(x, kPad, '0', &out); +} + +Result> SetOutputColumns( + const std::vector& columns, + const std::vector>& types, + const std::unordered_map& name_map, std::vector& gen_list) { + gen_list.clear(); + std::vector> fields; + if (columns.empty()) { + fields.resize(name_map.size()); + gen_list.resize(name_map.size()); + for (auto pair : name_map) { + int col_idx = pair.second; + fields[col_idx] = field(pair.first, types[col_idx]); + gen_list[col_idx] = col_idx; + } + return schema(std::move(fields)); + } else { + for (const std::string& col : columns) { + auto entry = name_map.find(col); + if (entry == name_map.end()) return Status::Invalid("Not a valid column name"); + int col_idx = static_cast(entry->second); + fields.push_back(field(col, types[col_idx])); + gen_list.push_back(col_idx); + } + return schema(std::move(fields)); + } +} + +Result RandomVString(random::pcg32_fast& rng, int64_t num_rows, int32_t min_length, + int32_t max_length) { + std::uniform_int_distribution length_dist(min_length, max_length); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, + AllocateBuffer((num_rows + 1) * sizeof(int32_t))); + int32_t* offsets = 
reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for (int64_t i = 1; i <= num_rows; i++) offsets[i] = offsets[i - 1] + length_dist(rng); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, + AllocateBuffer(offsets[num_rows])); + char* str = reinterpret_cast(str_buff->mutable_data()); + + // Spec says to pick random alphanumeric characters from a set of at least + // 64 symbols. Now, let's think critically here: 26 letters in the alphabet, + // so 52 total for upper and lower case, and 10 possible digits gives 62 + // characters... + // dbgen solves this by including a space and a comma as well, so we'll + // copy that. + const char alpha_numerics[65] = + "0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,"; + std::uniform_int_distribution char_dist(0, 63); + for (int32_t i = 0; i < offsets[num_rows]; i++) str[i] = alpha_numerics[char_dist(rng)]; + + ArrayData ad(utf8(), num_rows, {nullptr, std::move(offset_buff), std::move(str_buff)}); + return std::move(ad); +} + +void GeneratePhoneNumber(char* out, random::pcg32_fast& rng, int32_t country) { + std::uniform_int_distribution three_digit(100, 999); + std::uniform_int_distribution four_digit(1000, 9999); + + int32_t country_code = country + 10; + int32_t l1 = three_digit(rng); + int32_t l2 = three_digit(rng); + int32_t l3 = four_digit(rng); + + out += 15; + arrow::internal::detail::FormatAllDigits(l3, &out); + *(--out) = '-'; + arrow::internal::detail::FormatAllDigits(l2, &out); + *(--out) = '-'; + arrow::internal::detail::FormatAllDigits(l1, &out); + *(--out) = '-'; + arrow::internal::detail::FormatTwoDigits(country_code, &out); +} + +using GenerateColumnFn = std::function; +class PartAndPartSupplierGenerator { + public: + Status Init(size_t num_threads, int64_t batch_size, double scale_factor, int64_t seed) { + if (!inited_) { + inited_ = true; + batch_size_ = batch_size; + scale_factor_ = scale_factor; + + thread_local_data_.resize(num_threads); + random::pcg64_fast seed_rng(seed); + 
  // Produces the next batch of the PART table, or nullopt once all
  // scale_factor * 200,000 rows have been claimed.
  //
  // PART and PARTSUPP are generated together because PARTSUPP columns are
  // derived from the PART batch (kPartSuppRowsPerPart rows per part). Any
  // PARTSUPP batches produced as a side effect are pushed onto
  // partsupp_output_queue_ for NextPartSuppBatch to drain, and symmetrically
  // this method first drains part_output_queue_ (filled by
  // NextPartSuppBatch) before generating fresh rows.
  //
  // Thread-safe: the row-range claim and both queues are mutex-protected;
  // generation itself runs on per-thread state (ThreadLocalData).
  Result<util::optional<ExecBatch>> NextPartBatch() {
    size_t thread_index = thread_indexer_();
    ThreadLocalData& tld = thread_local_data_[thread_index];
    {
      std::lock_guard<std::mutex> lock(part_output_queue_mutex_);
      if (!part_output_queue_.empty()) {
        // A PART batch was already produced as a by-product of
        // NextPartSuppBatch; hand it out instead of generating more rows.
        ExecBatch batch = std::move(part_output_queue_.front());
        part_output_queue_.pop();
        return std::move(batch);
      } else if (part_rows_generated_ == part_rows_to_generate_) {
        return util::nullopt;
      } else {
        // Claim the next contiguous range of part keys for this thread.
        tld.partkey_start = part_rows_generated_;
        tld.part_to_generate =
            std::min(batch_size_, part_rows_to_generate_ - part_rows_generated_);
        part_rows_generated_ += tld.part_to_generate;

        // Account for the PARTSUPP batches this PART batch will spawn, so
        // counters stay consistent even before they are materialized.
        int64_t num_ps_batches = PartsuppBatchesToGenerate(thread_index);
        part_batches_generated_.fetch_add(1);
        partsupp_batches_generated_.fetch_add(num_ps_batches);
        ARROW_DCHECK(part_rows_generated_ <= part_rows_to_generate_);
      }
    }
    // Reset per-thread output slots, then run only the generators for the
    // columns actually requested (generators resolve their own dependencies).
    tld.part.resize(PART::kNumCols);
    std::fill(tld.part.begin(), tld.part.end(), Datum());
    RETURN_NOT_OK(InitPartsupp(thread_index));

    for (int col : part_cols_) RETURN_NOT_OK(kPartGenerators[col](thread_index));
    for (int col : partsupp_cols_) RETURN_NOT_OK(kPartsuppGenerators[col](thread_index));

    std::vector<Datum> part_result(part_cols_.size());
    for (size_t i = 0; i < part_cols_.size(); i++) {
      int col_idx = part_cols_[i];
      part_result[i] = tld.part[col_idx];
    }
    if (!partsupp_cols_.empty()) {
      // Package the derived PARTSUPP batches and enqueue them for
      // NextPartSuppBatch.
      std::vector<ExecBatch> partsupp_results;
      for (size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) {
        std::vector<Datum> partsupp_result(partsupp_cols_.size());
        for (size_t icol = 0; icol < partsupp_cols_.size(); icol++) {
          int col_idx = partsupp_cols_[icol];
          partsupp_result[icol] = tld.partsupp[ibatch][col_idx];
        }
        ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result)));
        partsupp_results.emplace_back(std::move(eb));
      }
      {
        std::lock_guard<std::mutex> guard(partsupp_output_queue_mutex_);
        for (ExecBatch& eb : partsupp_results) {
          partsupp_output_queue_.emplace(std::move(eb));
        }
      }
    }
    return ExecBatch::Make(std::move(part_result));
  }
partsupp_cols_) RETURN_NOT_OK(kPartsuppGenerators[col](thread_index)); + if (!part_cols_.empty()) { + std::vector part_result(part_cols_.size()); + for (size_t i = 0; i < part_cols_.size(); i++) { + int col_idx = part_cols_[i]; + part_result[i] = tld.part[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch part_batch, + ExecBatch::Make(std::move(part_result))); + { + std::lock_guard lock(part_output_queue_mutex_); + part_output_queue_.emplace(std::move(part_batch)); + } + } + std::vector partsupp_results; + for (size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) { + std::vector partsupp_result(partsupp_cols_.size()); + for (size_t icol = 0; icol < partsupp_cols_.size(); icol++) { + int col_idx = partsupp_cols_[icol]; + partsupp_result[icol] = tld.partsupp[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(partsupp_result))); + partsupp_results.emplace_back(std::move(eb)); + } + // Return the first batch, enqueue the rest. + { + std::lock_guard lock(partsupp_output_queue_mutex_); + for (size_t i = 1; i < partsupp_results.size(); i++) + partsupp_output_queue_.emplace(std::move(partsupp_results[i])); + } + return std::move(partsupp_results[0]); + } + + private: +#define FOR_EACH_PART_COLUMN(F) \ + F(P_PARTKEY) \ + F(P_NAME) \ + F(P_MFGR) \ + F(P_BRAND) \ + F(P_TYPE) \ + F(P_SIZE) \ + F(P_CONTAINER) \ + F(P_RETAILPRICE) \ + F(P_COMMENT) + +#define FOR_EACH_PARTSUPP_COLUMN(F) \ + F(PS_PARTKEY) \ + F(PS_SUPPKEY) \ + F(PS_AVAILQTY) \ + F(PS_SUPPLYCOST) \ + F(PS_COMMENT) + +#define MAKE_ENUM(col) col, + struct PART { + enum { + FOR_EACH_PART_COLUMN(MAKE_ENUM) kNumCols, + }; + }; + struct PARTSUPP { + enum { + FOR_EACH_PARTSUPP_COLUMN(MAKE_ENUM) kNumCols, + }; + }; + +#define MAKE_STRING_MAP(col) {#col, PART::col}, + const std::unordered_map kPartNameMap = { + FOR_EACH_PART_COLUMN(MAKE_STRING_MAP)}; +#undef MAKE_STRING_MAP +#define MAKE_STRING_MAP(col) {#col, PARTSUPP::col}, + const std::unordered_map kPartsuppNameMap = { + 
FOR_EACH_PARTSUPP_COLUMN(MAKE_STRING_MAP)}; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector kPartGenerators = {FOR_EACH_PART_COLUMN(MAKE_FN_ARRAY)}; + std::vector kPartsuppGenerators = { + FOR_EACH_PARTSUPP_COLUMN(MAKE_FN_ARRAY)}; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_LINEITEM_COLUMN +#undef FOR_EACH_ORDERS_COLUMN + + const std::vector> kPartTypes = { + int32(), utf8(), fixed_size_binary(25), fixed_size_binary(10), + utf8(), int32(), fixed_size_binary(10), decimal(12, 2), + utf8(), + }; + + const std::vector> kPartsuppTypes = { + int32(), int32(), int32(), decimal(12, 2), utf8(), + }; + + Status AllocatePartBatch(size_t thread_index, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.part[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(tld.part_to_generate * byte_width)); + ArrayData ad(kPartTypes[column], tld.part_to_generate, {nullptr, std::move(buff)}); + tld.part[column] = std::move(ad); + return Status::OK(); + } + + Status P_PARTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_PARTKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_PARTKEY)); + int32_t* p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.part_to_generate; i++) { + p_partkey[i] = static_cast(tld.partkey_start + i + 1); + ARROW_DCHECK(1 <= p_partkey[i] && p_partkey[i] <= part_rows_to_generate_); + } + } + return Status::OK(); + } + + Status P_NAME(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_NAME].kind() == Datum::NONE) { + std::uniform_int_distribution dist( + 0, static_cast(kNumNameParts - 1)); + 
ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, + AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); + int32_t* offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + size_t string_length = 0; + for (int ipart = 0; ipart < 5; ipart++) { + uint8_t name_part_index = static_cast(dist(tld.rng)); + tld.string_indices[irow * 5 + ipart] = name_part_index; + string_length += std::strlen(NameParts[name_part_index]); + } + // Add 4 because there is a space between each word (i.e. four spaces) + offsets[irow + 1] = static_cast(offsets[irow] + string_length + 4); + } + // Add an extra byte for the space after in the very last string. + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, + AllocateBuffer(offsets[tld.part_to_generate] + 1)); + char* strings = reinterpret_cast(string_buffer->mutable_data()); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + char* row = strings + offsets[irow]; + for (int ipart = 0; ipart < 5; ipart++) { + uint8_t name_part_index = tld.string_indices[irow * 5 + ipart]; + const char* part = NameParts[name_part_index]; + size_t length = std::strlen(part); + std::memcpy(row, part, length); + row += length; + *row++ = ' '; + } + } + ArrayData ad(kPartTypes[PART::P_NAME], tld.part_to_generate, + {nullptr, std::move(offset_buff), std::move(string_buffer)}); + Datum datum(ad); + tld.part[PART::P_NAME] = std::move(datum); + } + return Status::OK(); + } + + Status P_MFGR(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_MFGR].kind() == Datum::NONE) { + std::uniform_int_distribution dist(1, 5); + const char* manufacturer = "Manufacturer#"; + const size_t manufacturer_length = std::strlen(manufacturer); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_MFGR)); + char* p_mfgr = reinterpret_cast( + tld.part[PART::P_MFGR].array()->buffers[1]->mutable_data()); + int32_t byte_width = 
arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + std::strncpy(p_mfgr + byte_width * irow, manufacturer, byte_width); + char mfgr_id = '0' + dist(tld.rng); + *(p_mfgr + byte_width * irow + manufacturer_length) = mfgr_id; + } + } + return Status::OK(); + } + + Status P_BRAND(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_BRAND].kind() == Datum::NONE) { + RETURN_NOT_OK(P_MFGR(thread_index)); + std::uniform_int_distribution dist(1, 5); + const char* brand = "Brand#"; + const size_t brand_length = std::strlen(brand); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_BRAND)); + const char* p_mfgr = reinterpret_cast( + tld.part[PART::P_MFGR].array()->buffers[1]->data()); + char* p_brand = reinterpret_cast( + tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); + int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_BRAND]); + int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); + const size_t mfgr_id_offset = std::strlen("Manufacturer#"); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + char* row = p_brand + byte_width * irow; + char mfgr_id = *(p_mfgr + irow * mfgr_byte_width + mfgr_id_offset); + char brand_id = '0' + dist(tld.rng); + std::strncpy(row, brand, byte_width); + *(row + brand_length) = mfgr_id; + *(row + brand_length + 1) = brand_id; + irow += 0; + } + } + return Status::OK(); + } + + Status P_TYPE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_TYPE].kind() == Datum::NONE) { + using D = std::uniform_int_distribution; + D dists[] = { + D{0, static_cast(kNumTypes_1 - 1)}, + D{0, static_cast(kNumTypes_2 - 1)}, + D{0, static_cast(kNumTypes_3 - 1)}, + }; + + const char** types[] = {Types_1, Types_2, Types_3}; + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, + 
AllocateBuffer((tld.part_to_generate + 1) * sizeof(int32_t))); + int32_t* offsets = reinterpret_cast(offset_buff->mutable_data()); + offsets[0] = 0; + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + size_t string_length = 0; + for (int ipart = 0; ipart < 3; ipart++) { + uint8_t name_part_index = static_cast(dists[ipart](tld.rng)); + tld.string_indices[irow * 3 + ipart] = name_part_index; + string_length += std::strlen(types[ipart][name_part_index]); + } + offsets[irow + 1] = static_cast(offsets[irow] + string_length); + } + ARROW_ASSIGN_OR_RAISE(std::unique_ptr string_buffer, + AllocateBuffer(offsets[tld.part_to_generate])); + char* strings = reinterpret_cast(string_buffer->mutable_data()); + for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { + char* row = strings + offsets[irow]; + for (int ipart = 0; ipart < 3; ipart++) { + uint8_t name_part_index = tld.string_indices[irow * 3 + ipart]; + const char* part = types[ipart][name_part_index]; + size_t length = std::strlen(part); + std::memcpy(row, part, length); + row += length; + } + } + ArrayData ad(kPartTypes[PART::P_TYPE], tld.part_to_generate, + {nullptr, std::move(offset_buff), std::move(string_buffer)}); + Datum datum(ad); + tld.part[PART::P_TYPE] = std::move(datum); + } + return Status::OK(); + } + + Status P_SIZE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_SIZE].kind() == Datum::NONE) { + std::uniform_int_distribution dist(1, 50); + RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_SIZE)); + int32_t* p_size = reinterpret_cast( + tld.part[PART::P_SIZE].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.part_to_generate; i++) p_size[i] = dist(tld.rng); + } + return Status::OK(); + } + + Status P_CONTAINER(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PART::P_CONTAINER].kind() == Datum::NONE) { + std::uniform_int_distribution dist1( + 0, 
  // Generates P_RETAILPRICE as decimal(12,2) using the deterministic TPC-H
  // formula (section 4.2.3):
  //   price-in-cents = 90000 + ((partkey / 10) mod 20001) + 100 * (partkey mod 1000)
  // i.e. the value is fully determined by the part key (no RNG). Depends on
  // P_PARTKEY, which is generated first if needed.
  Status P_RETAILPRICE(size_t thread_index) {
    ThreadLocalData& tld = thread_local_data_[thread_index];
    if (tld.part[PART::P_RETAILPRICE].kind() == Datum::NONE) {
      RETURN_NOT_OK(P_PARTKEY(thread_index));
      RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_RETAILPRICE));
      const int32_t* p_partkey = reinterpret_cast<const int32_t*>(
          tld.part[PART::P_PARTKEY].array()->buffers[1]->data());
      Decimal128* p_retailprice = reinterpret_cast<Decimal128*>(
          tld.part[PART::P_RETAILPRICE].array()->buffers[1]->mutable_data());
      for (int64_t irow = 0; irow < tld.part_to_generate; irow++) {
        int32_t partkey = p_partkey[irow];
        // Stored with scale 2, so this integer is the price in cents.
        int64_t retail_price =
            (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000));
        p_retailprice[irow] = {retail_price};
      }
    }
    return Status::OK();
  }
return Status::OK(); + } + + int64_t PartsuppBatchesToGenerate(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + int64_t num_batches = (ps_to_generate + batch_size_ - 1) / batch_size_; + return num_batches; + } + + Status InitPartsupp(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + tld.generated_partsupp.reset(); + int64_t num_batches = PartsuppBatchesToGenerate(thread_index); + tld.partsupp.resize(num_batches); + for (std::vector& batch : tld.partsupp) { + batch.resize(PARTSUPP::kNumCols); + std::fill(batch.begin(), batch.end(), Datum()); + } + return Status::OK(); + } + + Status AllocatePartSuppBatch(size_t thread_index, size_t ibatch, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + int32_t byte_width = arrow::internal::GetByteWidth(*kPartsuppTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(batch_size_ * byte_width)); + ArrayData ad(kPartsuppTypes[column], batch_size_, {nullptr, std::move(buff)}); + tld.partsupp[ibatch][column] = std::move(ad); + return Status::OK(); + } + + Status PS_PARTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_partsupp[PARTSUPP::PS_PARTKEY]) { + tld.generated_partsupp[PARTSUPP::PS_PARTKEY] = true; + RETURN_NOT_OK(P_PARTKEY(thread_index)); + const int32_t* p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + int64_t ipartsupp = 0; + int64_t ipart = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + for (int64_t irow = 0; irow < ps_to_generate; ibatch++) { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_PARTKEY)); + int32_t* ps_partkey = + reinterpret_cast(tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY] + .array() + ->buffers[1] + ->mutable_data()); + int64_t next_run = 
std::min(batch_size_, ps_to_generate - irow); + + int64_t batch_offset = 0; + for (int64_t irun = 0; irun < next_run;) { + for (; ipartsupp < kPartSuppRowsPerPart && irun < next_run; ipartsupp++, irun++) + ps_partkey[batch_offset++] = p_partkey[ipart]; + + if (ipartsupp == kPartSuppRowsPerPart) { + ipartsupp = 0; + ipart++; + } + } + irow += next_run; + tld.partsupp[ibatch][PARTSUPP::PS_PARTKEY].array()->length = batch_offset; + } + } + return Status::OK(); + } + + Status PS_SUPPKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_partsupp[PARTSUPP::PS_SUPPKEY]) { + tld.generated_partsupp[PARTSUPP::PS_SUPPKEY] = true; + RETURN_NOT_OK(P_PARTKEY(thread_index)); + const int32_t* p_partkey = reinterpret_cast( + tld.part[PART::P_PARTKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + int64_t ipartsupp = 0; + int64_t ipart = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + const int32_t S = static_cast(scale_factor_ * 10000); + for (int64_t irow = 0; irow < ps_to_generate; ibatch++) { + RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_SUPPKEY)); + int32_t* ps_suppkey = + reinterpret_cast(tld.partsupp[ibatch][PARTSUPP::PS_SUPPKEY] + .array() + ->buffers[1] + ->mutable_data()); + int64_t next_run = std::min(batch_size_, ps_to_generate - irow); + + int64_t batch_offset = 0; + for (int64_t irun = 0; irun < next_run;) { + for (; ipartsupp < kPartSuppRowsPerPart && irun < next_run; + ipartsupp++, irun++) { + int32_t supplier = static_cast(ipartsupp); + int32_t partkey = p_partkey[ipart]; + // Magic formula from TPCH spec. 
  // Generates PS_AVAILQTY: a uniform random quantity in [1, 9999]
  // (TPC-H 4.2.3). The kPartSuppRowsPerPart * part_to_generate PARTSUPP rows
  // derived from this thread's PART batch are written across consecutive
  // fixed-size batches; the final batch may be short. Guarded by the
  // generated_partsupp bitset so the column is generated at most once per
  // claimed batch.
  Status PS_AVAILQTY(size_t thread_index) {
    ThreadLocalData& tld = thread_local_data_[thread_index];
    if (!tld.generated_partsupp[PARTSUPP::PS_AVAILQTY]) {
      tld.generated_partsupp[PARTSUPP::PS_AVAILQTY] = true;
      std::uniform_int_distribution<int32_t> dist(1, 9999);
      int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate;
      int64_t ibatch = 0;
      for (int64_t irow = 0; irow < ps_to_generate; ibatch++) {
        RETURN_NOT_OK(AllocatePartSuppBatch(thread_index, ibatch, PARTSUPP::PS_AVAILQTY));
        int32_t* ps_availqty =
            reinterpret_cast<int32_t*>(tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY]
                                           .array()
                                           ->buffers[1]
                                           ->mutable_data());
        // Clamp the run length: the last batch may hold fewer than
        // batch_size_ rows.
        int64_t next_run = std::min(batch_size_, ps_to_generate - irow);
        for (int64_t irun = 0; irun < next_run; irun++) ps_availqty[irun] = dist(tld.rng);

        tld.partsupp[ibatch][PARTSUPP::PS_AVAILQTY].array()->length = next_run;
        irow += next_run;
      }
    }
    return Status::OK();
  }
next_run; irun++) + ps_supplycost[irun] = {dist(tld.rng)}; + + tld.partsupp[ibatch][PARTSUPP::PS_SUPPLYCOST].array()->length = next_run; + irow += next_run; + } + } + return Status::OK(); + } + + Status PS_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.part[PARTSUPP::PS_COMMENT].kind() == Datum::NONE) { + int64_t irow = 0; + int64_t ps_to_generate = kPartSuppRowsPerPart * tld.part_to_generate; + for (size_t ibatch = 0; ibatch < tld.partsupp.size(); ibatch++) { + int64_t num_rows = std::min(batch_size_, ps_to_generate - irow); + ARROW_ASSIGN_OR_RAISE(tld.partsupp[ibatch][PARTSUPP::PS_COMMENT], + g_text.GenerateComments(num_rows, 49, 198, tld.rng)); + irow += num_rows; + } + } + return Status::OK(); + } + + struct ThreadLocalData { + std::vector part; + std::vector string_indices; + int64_t part_to_generate{0}; + int64_t partkey_start{0}; + + std::vector> partsupp; + std::bitset generated_partsupp; + random::pcg32_fast rng; + }; + std::vector thread_local_data_; + + bool inited_ = false; + std::mutex part_output_queue_mutex_; + std::mutex partsupp_output_queue_mutex_; + std::queue part_output_queue_; + std::queue partsupp_output_queue_; + int64_t batch_size_{0}; + double scale_factor_{0}; + int64_t part_rows_to_generate_{0}; + int64_t part_rows_generated_{0}; + std::vector part_cols_; + std::vector partsupp_cols_; + ThreadIndexer thread_indexer_; + + std::atomic part_batches_generated_ = {0}; + std::atomic partsupp_batches_generated_ = {0}; + static constexpr int64_t kPartSuppRowsPerPart = 4; +}; + +class OrdersAndLineItemGenerator { + public: + Status Init(size_t num_threads, int64_t batch_size, double scale_factor, int64_t seed) { + if (!inited_) { + inited_ = true; + batch_size_ = batch_size; + scale_factor_ = scale_factor; + + thread_local_data_.resize(num_threads); + random::pcg64_fast seed_rng(seed); + for (ThreadLocalData& tld : thread_local_data_) { + tld.items_per_order.resize(batch_size_); + 
tld.rng.seed(kSeedDist(seed_rng)); + } + orders_rows_to_generate_ = static_cast(scale_factor_ * 150000 * 10); + } + return Status::OK(); + } + + int64_t orders_batches_generated() const { return orders_batches_generated_.load(); } + + int64_t lineitem_batches_generated() const { + return lineitem_batches_generated_.load(); + } + + Result> SetOrdersOutputColumns( + const std::vector& cols) { + return SetOutputColumns(cols, kOrdersTypes, kOrdersNameMap, orders_cols_); + } + + Result> SetLineItemOutputColumns( + const std::vector& cols) { + return SetOutputColumns(cols, kLineitemTypes, kLineitemNameMap, lineitem_cols_); + } + + Result> NextOrdersBatch() { + size_t thread_index = thread_indexer_(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + { + std::lock_guard lock(orders_output_queue_mutex_); + if (!orders_output_queue_.empty()) { + ExecBatch batch = std::move(orders_output_queue_.front()); + orders_output_queue_.pop(); + return std::move(batch); + } else if (orders_rows_generated_ == orders_rows_to_generate_) { + return util::nullopt; + } else { + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = + std::min(batch_size_, orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + orders_batches_generated_.fetch_add(1); + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); + } + } + tld.orders.resize(ORDERS::kNumCols); + std::fill(tld.orders.begin(), tld.orders.end(), Datum()); + RETURN_NOT_OK(GenerateRowCounts(thread_index)); + tld.first_batch_offset = 0; + tld.generated_lineitem.reset(); + + for (int col : orders_cols_) RETURN_NOT_OK(kOrdersGenerators[col](thread_index)); + for (int col : lineitem_cols_) RETURN_NOT_OK(kLineitemGenerators[col](thread_index)); + + std::vector orders_result(orders_cols_.size()); + for (size_t i = 0; i < orders_cols_.size(); i++) { + int col_idx = orders_cols_[i]; + orders_result[i] = tld.orders[col_idx]; + } + if (!lineitem_cols_.empty()) 
{ + std::vector lineitem_results; + for (size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) { + std::vector lineitem_result(lineitem_cols_.size()); + for (size_t icol = 0; icol < lineitem_cols_.size(); icol++) { + int col_idx = lineitem_cols_[icol]; + lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); + lineitem_results.emplace_back(std::move(eb)); + } + { + std::lock_guard guard(lineitem_output_queue_mutex_); + for (ExecBatch& eb : lineitem_results) { + lineitem_output_queue_.emplace(std::move(eb)); + } + } + } + return ExecBatch::Make(std::move(orders_result)); + } + + Result> NextLineItemBatch() { + size_t thread_index = thread_indexer_(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + ExecBatch queued; + bool from_queue = false; + { + std::lock_guard lock(lineitem_output_queue_mutex_); + if (!lineitem_output_queue_.empty()) { + queued = std::move(lineitem_output_queue_.front()); + lineitem_output_queue_.pop(); + from_queue = true; + } + } + tld.first_batch_offset = 0; + if (from_queue) { + ARROW_DCHECK(queued.length <= batch_size_); + tld.first_batch_offset = queued.length; + if (queued.length == batch_size_) return std::move(queued); + } + { + std::lock_guard lock(orders_output_queue_mutex_); + if (orders_rows_generated_ == orders_rows_to_generate_) { + if (from_queue) return std::move(queued); + return util::nullopt; + } + + tld.orderkey_start = orders_rows_generated_; + tld.orders_to_generate = + std::min(batch_size_, orders_rows_to_generate_ - orders_rows_generated_); + orders_rows_generated_ += tld.orders_to_generate; + orders_batches_generated_.fetch_add(1ll); + ARROW_DCHECK(orders_rows_generated_ <= orders_rows_to_generate_); + } + tld.orders.resize(ORDERS::kNumCols); + std::fill(tld.orders.begin(), tld.orders.end(), Datum()); + RETURN_NOT_OK(GenerateRowCounts(thread_index)); + tld.generated_lineitem.reset(); + if (from_queue) { + 
lineitem_batches_generated_.fetch_sub(1); + for (size_t i = 0; i < lineitem_cols_.size(); i++) + if (tld.lineitem[0][lineitem_cols_[i]].kind() == Datum::NONE) + tld.lineitem[0][lineitem_cols_[i]] = std::move(queued[i]); + } + + for (int col : orders_cols_) RETURN_NOT_OK(kOrdersGenerators[col](thread_index)); + for (int col : lineitem_cols_) RETURN_NOT_OK(kLineitemGenerators[col](thread_index)); + + if (!orders_cols_.empty()) { + std::vector orders_result(orders_cols_.size()); + for (size_t i = 0; i < orders_cols_.size(); i++) { + int col_idx = orders_cols_[i]; + orders_result[i] = tld.orders[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch orders_batch, + ExecBatch::Make(std::move(orders_result))); + { + std::lock_guard lock(orders_output_queue_mutex_); + orders_output_queue_.emplace(std::move(orders_batch)); + } + } + std::vector lineitem_results; + for (size_t ibatch = 0; ibatch < tld.lineitem.size(); ibatch++) { + std::vector lineitem_result(lineitem_cols_.size()); + for (size_t icol = 0; icol < lineitem_cols_.size(); icol++) { + int col_idx = lineitem_cols_[icol]; + lineitem_result[icol] = tld.lineitem[ibatch][col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(lineitem_result))); + lineitem_results.emplace_back(std::move(eb)); + } + lineitem_batches_generated_.fetch_add(static_cast(lineitem_results.size())); + // Return the first batch, enqueue the rest. 
+ { + std::lock_guard lock(lineitem_output_queue_mutex_); + for (size_t i = 1; i < lineitem_results.size(); i++) + lineitem_output_queue_.emplace(std::move(lineitem_results[i])); + } + return std::move(lineitem_results[0]); + } + + private: +#define FOR_EACH_ORDERS_COLUMN(F) \ + F(O_ORDERKEY) \ + F(O_CUSTKEY) \ + F(O_ORDERSTATUS) \ + F(O_TOTALPRICE) \ + F(O_ORDERDATE) \ + F(O_ORDERPRIORITY) \ + F(O_CLERK) \ + F(O_SHIPPRIORITY) \ + F(O_COMMENT) + +#define FOR_EACH_LINEITEM_COLUMN(F) \ + F(L_ORDERKEY) \ + F(L_PARTKEY) \ + F(L_SUPPKEY) \ + F(L_LINENUMBER) \ + F(L_QUANTITY) \ + F(L_EXTENDEDPRICE) \ + F(L_DISCOUNT) \ + F(L_TAX) \ + F(L_RETURNFLAG) \ + F(L_LINESTATUS) \ + F(L_SHIPDATE) \ + F(L_COMMITDATE) \ + F(L_RECEIPTDATE) \ + F(L_SHIPINSTRUCT) \ + F(L_SHIPMODE) \ + F(L_COMMENT) + +#define MAKE_ENUM(col) col, + struct ORDERS { + enum { + FOR_EACH_ORDERS_COLUMN(MAKE_ENUM) kNumCols, + }; + }; + struct LINEITEM { + enum { + FOR_EACH_LINEITEM_COLUMN(MAKE_ENUM) kNumCols, + }; + }; + +#define MAKE_STRING_MAP(col) {#col, ORDERS::col}, + const std::unordered_map kOrdersNameMap = { + FOR_EACH_ORDERS_COLUMN(MAKE_STRING_MAP)}; +#undef MAKE_STRING_MAP +#define MAKE_STRING_MAP(col) {#col, LINEITEM::col}, + const std::unordered_map kLineitemNameMap = { + FOR_EACH_LINEITEM_COLUMN(MAKE_STRING_MAP)}; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + const std::vector kOrdersGenerators = { + FOR_EACH_ORDERS_COLUMN(MAKE_FN_ARRAY)}; + const std::vector kLineitemGenerators = { + FOR_EACH_LINEITEM_COLUMN(MAKE_FN_ARRAY)}; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_LINEITEM_COLUMN +#undef FOR_EACH_ORDERS_COLUMN + + const std::vector> kOrdersTypes = {int32(), + int32(), + fixed_size_binary(1), + decimal(12, 2), + date32(), + fixed_size_binary(15), + fixed_size_binary(15), + int32(), + utf8()}; + + const std::vector> kLineitemTypes = { + int32(), + int32(), + int32(), + int32(), + decimal(12, 2), + decimal(12, 2), + 
decimal(12, 2), + decimal(12, 2), + fixed_size_binary(1), + fixed_size_binary(1), + date32(), + date32(), + date32(), + fixed_size_binary(25), + fixed_size_binary(10), + utf8(), + }; + + Status AllocateOrdersBatch(size_t thread_index, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.orders[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(tld.orders_to_generate * byte_width)); + ArrayData ad(kOrdersTypes[column], tld.orders_to_generate, + {nullptr, std::move(buff)}); + tld.orders[column] = std::move(ad); + return Status::OK(); + } + + Status O_ORDERKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_ORDERKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERKEY)); + int32_t* o_orderkey = reinterpret_cast( + tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) { + int32_t orderkey_index = static_cast(tld.orderkey_start + i); + int32_t index_of_run = orderkey_index / 8; + int32_t index_in_run = orderkey_index % 8; + o_orderkey[i] = (index_of_run * 32 + index_in_run + 1); + ARROW_DCHECK(1 <= o_orderkey[i] && o_orderkey[i] <= 4 * orders_rows_to_generate_); + } + } + return Status::OK(); + } + + Status O_CUSTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_CUSTKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CUSTKEY)); + + // Spec says it must be a random number between 1 and SF*150000 that is not + // divisible by 3. Rather than repeatedly generating numbers until we get to + // a non-divisible-by-3 number, we just generate a number between + // 0 and SF * 50000 - 1, multiply by 3, and then add either 1 or 2. 
+ int32_t sf_50k = static_cast(scale_factor_ * 50000); + std::uniform_int_distribution base_dist(0, sf_50k - 1); + std::uniform_int_distribution offset_dist(1, 2); + int32_t* o_custkey = reinterpret_cast( + tld.orders[ORDERS::O_CUSTKEY].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) + o_custkey[i] = 3 * base_dist(tld.rng) + offset_dist(tld.rng); + } + return Status::OK(); + } + + Status O_ORDERSTATUS(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_ORDERSTATUS].kind() == Datum::NONE) { + RETURN_NOT_OK(L_LINESTATUS(thread_index)); + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERSTATUS)); + + char* o_orderstatus = reinterpret_cast( + tld.orders[ORDERS::O_ORDERSTATUS].array()->buffers[1]->mutable_data()); + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + bool all_f = true; + bool all_o = true; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + const char* l_linestatus = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->buffers[1]->data()); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; + iline++, irun++, batch_offset++) { + all_f &= l_linestatus[batch_offset] == 'F'; + all_o &= l_linestatus[batch_offset] == 'O'; + } + if (iline == tld.items_per_order[iorder]) { + iline = 0; + ARROW_DCHECK(!(all_f && all_o)); + if (all_f) + o_orderstatus[iorder] = 'F'; + else if (all_o) + o_orderstatus[iorder] = 'O'; + else + o_orderstatus[iorder] = 'P'; + iorder++; + } + } + irow += next_run; + batch_offset = 0; + } + } + return Status::OK(); + } + + Status O_TOTALPRICE(size_t thread_index) { + ThreadLocalData& tld = 
thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_TOTALPRICE].kind() == Datum::NONE) { + RETURN_NOT_OK(L_EXTENDEDPRICE(thread_index)); + RETURN_NOT_OK(L_TAX(thread_index)); + RETURN_NOT_OK(L_DISCOUNT(thread_index)); + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_TOTALPRICE)); + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + int64_t sum = 0; + Decimal128* o_totalprice = reinterpret_cast( + tld.orders[ORDERS::O_TOTALPRICE].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + + const Decimal128* l_extendedprice = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->buffers[1]->data()); + const Decimal128* l_tax = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->data()); + const Decimal128* l_discount = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->buffers[1]->data()); + + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; + iline++, irun++, batch_offset++) { + int64_t eprice = static_cast(l_extendedprice[batch_offset]); + int64_t tax = static_cast(l_tax[batch_offset]); + int64_t discount = static_cast(l_discount[batch_offset]); + sum += (eprice * (100 + tax) * (100 - discount)); + } + if (iline == tld.items_per_order[iorder]) { + sum /= 100 * 100; + o_totalprice[iorder] = {sum}; + iline = 0; + iorder++; + } + } + irow += next_run; + batch_offset = 0; + } + } + return Status::OK(); + } + + Status O_ORDERDATE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_ORDERDATE].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERDATE)); + + 
std::uniform_int_distribution dist(kStartDate, kEndDate - 151); + uint32_t* o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) o_orderdate[i] = dist(tld.rng); + } + return Status::OK(); + } + + Status O_ORDERPRIORITY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_ORDERPRIORITY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERPRIORITY)); + int32_t byte_width = + arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_ORDERPRIORITY]); + std::uniform_int_distribution dist(0, kNumPriorities - 1); + char* o_orderpriority = reinterpret_cast( + tld.orders[ORDERS::O_ORDERPRIORITY].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) { + const char* str = Priorities[dist(tld.rng)]; + std::strncpy(o_orderpriority + i * byte_width, str, byte_width); + } + } + return Status::OK(); + } + + Status O_CLERK(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_CLERK].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); + int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_CLERK]); + int64_t max_clerk_id = static_cast(scale_factor_ * 1000); + std::uniform_int_distribution dist(1, max_clerk_id); + char* o_clerk = reinterpret_cast( + tld.orders[ORDERS::O_CLERK].array()->buffers[1]->mutable_data()); + for (int64_t i = 0; i < tld.orders_to_generate; i++) { + const char* clerk = "Clerk#"; + const size_t clerk_length = std::strlen(clerk); + int64_t clerk_number = dist(tld.rng); + char* output = o_clerk + i * byte_width; + std::strncpy(output, clerk, byte_width); + AppendNumberPaddedToNineDigits(output + clerk_length, clerk_number); + } + } + return Status::OK(); + } + + Status O_SHIPPRIORITY(size_t thread_index) { + 
ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_SHIPPRIORITY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_SHIPPRIORITY)); + int32_t* o_shippriority = reinterpret_cast( + tld.orders[ORDERS::O_SHIPPRIORITY].array()->buffers[1]->mutable_data()); + std::memset(o_shippriority, 0, tld.orders_to_generate * sizeof(int32_t)); + } + return Status::OK(); + } + + Status O_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.orders[ORDERS::O_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE( + tld.orders[ORDERS::O_COMMENT], + g_text.GenerateComments(tld.orders_to_generate, 19, 78, tld.rng)); + } + return Status::OK(); + } + + Status GenerateRowCounts(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + std::uniform_int_distribution length_dist(1, 7); + tld.lineitem_to_generate = 0; + tld.items_per_order.clear(); + for (int64_t i = 0; i < tld.orders_to_generate; i++) { + int length = length_dist(tld.rng); + tld.items_per_order.push_back(length); + tld.lineitem_to_generate += length; + } + int64_t num_batches = + (tld.first_batch_offset + tld.lineitem_to_generate + batch_size_ - 1) / + batch_size_; + tld.lineitem.resize(num_batches); + for (std::vector& batch : tld.lineitem) { + batch.resize(LINEITEM::kNumCols); + std::fill(batch.begin(), batch.end(), Datum()); + } + return Status::OK(); + } + + Status AllocateLineItemBufferIfNeeded(size_t thread_index, size_t ibatch, int column, + size_t& out_batch_offset) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.lineitem[ibatch][column].kind() == Datum::NONE) { + int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(batch_size_ * byte_width)); + ArrayData ad(kLineitemTypes[column], batch_size_, {nullptr, std::move(buff)}); + tld.lineitem[ibatch][column] = std::move(ad); 
+ out_batch_offset = 0; + } + if (ibatch == 0) out_batch_offset = tld.first_batch_offset; + + return Status::OK(); + } + + Status L_ORDERKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_ORDERKEY]) { + tld.generated_lineitem[LINEITEM::L_ORDERKEY] = true; + RETURN_NOT_OK(O_ORDERKEY(thread_index)); + const int32_t* o_orderkey = reinterpret_cast( + tld.orders[ORDERS::O_ORDERKEY].array()->buffers[1]->data()); + + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_ORDERKEY, batch_offset)); + int32_t* l_linenumber = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_ORDERKEY] + .array() + ->buffers[1] + ->mutable_data()); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_linenumber[batch_offset++] = o_orderkey[iorder]; + if (iline == tld.items_per_order[iorder]) { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_ORDERKEY].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_PARTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_PARTKEY]) { + tld.generated_lineitem[LINEITEM::L_PARTKEY] = true; + + size_t ibatch = 0; + int32_t max_partkey = static_cast(scale_factor_ * 200000); + std::uniform_int_distribution dist(1, max_partkey); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_PARTKEY, 
batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t* l_partkey = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_PARTKEY] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t i = 0; i < next_run; i++, batch_offset++) + l_partkey[batch_offset] = dist(tld.rng); + + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SUPPKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_SUPPKEY]) { + tld.generated_lineitem[LINEITEM::L_SUPPKEY] = true; + RETURN_NOT_OK(L_PARTKEY(thread_index)); + + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 3); + const int32_t S = static_cast(scale_factor_ * 10000); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset = 0; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_SUPPKEY, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t* l_suppkey = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SUPPKEY] + .array() + ->buffers[1] + ->mutable_data()); + const int32_t* l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); + for (int64_t i = 0; i < next_run; i++) { + int32_t supplier = dist(tld.rng); + int32_t partkey = l_partkey[batch_offset]; + // Fun fact: the parentheses for this expression are unbalanced in the TPC-H + // spec. 
+ l_suppkey[batch_offset++] = + (partkey + (supplier * ((S / 4) + (partkey - 1) / S))) % S + 1; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SUPPKEY].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_LINENUMBER(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_LINENUMBER]) { + tld.generated_lineitem[LINEITEM::L_LINENUMBER] = true; + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_LINENUMBER, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + int32_t* l_linenumber = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_LINENUMBER] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; + iline++, irun++) { + l_linenumber[batch_offset++] = (iline + 1); + ARROW_DCHECK(1 <= (iline + 1) && (iline + 1) <= 7); + } + if (iline == tld.items_per_order[iorder]) { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_LINENUMBER].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_QUANTITY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_QUANTITY]) { + tld.generated_lineitem[LINEITEM::L_QUANTITY] = true; + + size_t ibatch = 0; + std::uniform_int_distribution dist(1, 50); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_QUANTITY, batch_offset)); + int64_t 
remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128* l_quantity = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_QUANTITY] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t i = 0; i < next_run; i++) { + // Multiply by 100 because the type is decimal(12, 2), so the decimal goes after + // two digits + int64_t quantity = dist(tld.rng) * 100; + l_quantity[batch_offset++] = {quantity}; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_EXTENDEDPRICE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE]) { + tld.generated_lineitem[LINEITEM::L_EXTENDEDPRICE] = true; + RETURN_NOT_OK(L_PARTKEY(thread_index)); + RETURN_NOT_OK(L_QUANTITY(thread_index)); + size_t ibatch = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_EXTENDEDPRICE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + const int32_t* l_partkey = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_PARTKEY].array()->buffers[1]->data()); + const Decimal128* l_quantity = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_QUANTITY].array()->buffers[1]->data()); + Decimal128* l_extendedprice = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + int64_t partkey = static_cast(l_partkey[batch_offset]); + // Divide by 100 to recover the integer representation (not Decimal). 
+ int64_t quantity = static_cast(l_quantity[batch_offset]) / 100; + + // Spec says to divide by 100, but that happens automatically due to this being + // stored to two decimal points. + int64_t retail_price = + (90000 + ((partkey / 10) % 20001) + 100 * (partkey % 1000)); + int64_t extended_price = retail_price * quantity; + l_extendedprice[batch_offset] = {extended_price}; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_EXTENDEDPRICE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_DISCOUNT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_DISCOUNT]) { + tld.generated_lineitem[LINEITEM::L_DISCOUNT] = true; + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 10); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_DISCOUNT, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128* l_discount = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_DISCOUNT] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) + l_discount[batch_offset] = {dist(tld.rng)}; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_DISCOUNT].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_TAX(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_TAX]) { + tld.generated_lineitem[LINEITEM::L_TAX] = true; + size_t ibatch = 0; + std::uniform_int_distribution dist(0, 8); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + 
LINEITEM::L_TAX, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + Decimal128* l_tax = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->buffers[1]->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) + l_tax[batch_offset] = {dist(tld.rng)}; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_TAX].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_RETURNFLAG(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_RETURNFLAG]) { + tld.generated_lineitem[LINEITEM::L_RETURNFLAG] = true; + RETURN_NOT_OK(L_RECEIPTDATE(thread_index)); + size_t ibatch = 0; + std::uniform_int_distribution dist; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_RETURNFLAG, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char* l_returnflag = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG] + .array() + ->buffers[1] + ->mutable_data()); + const uint32_t* l_receiptdate = reinterpret_cast( + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + if (l_receiptdate[batch_offset] <= kCurrentDate) { + uint32_t r = dist(tld.rng); + l_returnflag[batch_offset] = (r % 2 == 1) ? 
'R' : 'A'; + } else { + l_returnflag[batch_offset] = 'N'; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_RETURNFLAG].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_LINESTATUS(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_LINESTATUS]) { + tld.generated_lineitem[LINEITEM::L_LINESTATUS] = true; + RETURN_NOT_OK(L_SHIPDATE(thread_index)); + size_t ibatch = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_LINESTATUS, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char* l_linestatus = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_LINESTATUS] + .array() + ->buffers[1] + ->mutable_data()); + const uint32_t* l_shipdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPDATE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + if (l_shipdate[batch_offset] > kCurrentDate) + l_linestatus[batch_offset] = 'O'; + else + l_linestatus[batch_offset] = 'F'; + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_LINESTATUS].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPDATE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_SHIPDATE]) { + tld.generated_lineitem[LINEITEM::L_SHIPDATE] = true; + RETURN_NOT_OK(O_ORDERDATE(thread_index)); + const int32_t* o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); + std::uniform_int_distribution dist(1, 121); + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for (int64_t irow = 0; irow < 
tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_SHIPDATE, batch_offset)); + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t* l_shipdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPDATE] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_shipdate[batch_offset++] = o_orderdate[iorder] + dist(tld.rng); + if (iline == tld.items_per_order[iorder]) { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPDATE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_COMMITDATE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_COMMITDATE]) { + tld.generated_lineitem[LINEITEM::L_COMMITDATE] = true; + const int32_t* o_orderdate = reinterpret_cast( + tld.orders[ORDERS::O_ORDERDATE].array()->buffers[1]->data()); + std::uniform_int_distribution dist(30, 90); + size_t ibatch = 0; + size_t iorder = 0; + int32_t iline = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_COMMITDATE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t* l_commitdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_COMMITDATE] + .array() + ->buffers[1] + ->mutable_data()); + for (int64_t irun = 0; irun < next_run;) { + for (; iline < tld.items_per_order[iorder] && irun < next_run; iline++, irun++) + l_commitdate[batch_offset++] = 
o_orderdate[iorder] + dist(tld.rng); + if (iline == tld.items_per_order[iorder]) { + iline = 0; + iorder++; + } + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_COMMITDATE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_RECEIPTDATE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_RECEIPTDATE]) { + tld.generated_lineitem[LINEITEM::L_RECEIPTDATE] = true; + RETURN_NOT_OK(L_SHIPDATE(thread_index)); + size_t ibatch = 0; + std::uniform_int_distribution dist(1, 30); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_RECEIPTDATE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + uint32_t* l_receiptdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE] + .array() + ->buffers[1] + ->mutable_data()); + const uint32_t* l_shipdate = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPDATE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) + l_receiptdate[batch_offset] = l_shipdate[batch_offset] + dist(tld.rng); + + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_RECEIPTDATE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPINSTRUCT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT]) { + tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT] = true; + int32_t byte_width = + arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPINSTRUCT]); + size_t ibatch = 0; + std::uniform_int_distribution dist(0, kNumInstructions - 1); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; 
ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded( + thread_index, ibatch, LINEITEM::L_SHIPINSTRUCT, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char* l_shipinstruct = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + const char* str = Instructions[dist(tld.rng)]; + // Note that we don't have to memset the buffer to 0 because strncpy pads each + // string with 0's anyway + std::strncpy(l_shipinstruct + batch_offset * byte_width, str, byte_width); + } + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_SHIPINSTRUCT].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_SHIPMODE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_SHIPMODE]) { + tld.generated_lineitem[LINEITEM::L_SHIPMODE] = true; + int32_t byte_width = + arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPMODE]); + size_t ibatch = 0; + std::uniform_int_distribution dist(0, kNumModes - 1); + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + size_t batch_offset; + RETURN_NOT_OK(AllocateLineItemBufferIfNeeded(thread_index, ibatch, + LINEITEM::L_SHIPMODE, batch_offset)); + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + char* l_shipmode = + reinterpret_cast(tld.lineitem[ibatch][LINEITEM::L_SHIPMODE] + .array() + ->buffers[1] + ->mutable_data()); + + for (int64_t i = 0; i < next_run; i++, batch_offset++) { + const char* str = Modes[dist(tld.rng)]; + std::strncpy(l_shipmode + batch_offset * byte_width, str, byte_width); + } + irow += next_run; + 
tld.lineitem[ibatch][LINEITEM::L_SHIPMODE].array()->length = + static_cast(batch_offset); + } + } + return Status::OK(); + } + + Status L_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (!tld.generated_lineitem[LINEITEM::L_COMMENT]) { + tld.generated_lineitem[LINEITEM::L_COMMENT] = true; + + size_t batch_offset = tld.first_batch_offset; + size_t ibatch = 0; + for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { + // Comments are kind of sneaky: we always generate the full batch and then just + // bump the length + if (tld.lineitem[ibatch][LINEITEM::L_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.lineitem[ibatch][LINEITEM::L_COMMENT], + g_text.GenerateComments(batch_size_, 10, 43, tld.rng)); + batch_offset = 0; + } + + int64_t remaining_in_batch = static_cast(batch_size_ - batch_offset); + int64_t next_run = std::min(tld.lineitem_to_generate - irow, remaining_in_batch); + + batch_offset += next_run; + irow += next_run; + tld.lineitem[ibatch][LINEITEM::L_COMMENT].array()->length = batch_offset; + } + } + return Status::OK(); + } + + struct ThreadLocalData { + std::vector orders; + int64_t orders_to_generate; + int64_t orderkey_start; + + std::vector> lineitem; + std::vector items_per_order; + int64_t lineitem_to_generate; + int64_t first_batch_offset; + std::bitset generated_lineitem; + random::pcg32_fast rng; + }; + std::vector thread_local_data_; + + bool inited_ = false; + std::mutex orders_output_queue_mutex_; + std::mutex lineitem_output_queue_mutex_; + std::queue orders_output_queue_; + std::queue lineitem_output_queue_; + int64_t batch_size_; + double scale_factor_; + int64_t orders_rows_to_generate_; + int64_t orders_rows_generated_; + std::vector orders_cols_; + std::vector lineitem_cols_; + ThreadIndexer thread_indexer_; + + std::atomic orders_batches_generated_ = {0}; + std::atomic lineitem_batches_generated_ = {0}; +}; + +class SupplierGenerator : public TpchTableGenerator { + 
public: + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + scale_factor_ = scale_factor; + batch_size_ = batch_size; + rows_to_generate_ = static_cast(scale_factor_ * 10000); + rows_generated_.store(0); + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns(columns, kTypes, kNameMap, gen_list_)); + + random::pcg64_fast seed_rng(seed); + seed_ = kSeedDist(seed_rng); + + random::pcg32_fast rng(kSeedDist(seed_rng)); + std::uniform_int_distribution dist(0, rows_to_generate_ - 1); + size_t num_special_rows = static_cast(5 * scale_factor_); + std::unordered_set good_rows_set; + while (good_rows_set.size() < num_special_rows) { + int64_t row = dist(rng); + good_rows_set.insert(row); + } + std::unordered_set bad_rows_set; + while (bad_rows_set.size() < num_special_rows) { + int64_t bad_row; + do { + bad_row = dist(rng); + } while (good_rows_set.find(bad_row) != good_rows_set.end()); + bad_rows_set.insert(bad_row); + } + good_rows_.clear(); + bad_rows_.clear(); + good_rows_.insert(good_rows_.end(), good_rows_set.begin(), good_rows_set.end()); + bad_rows_.insert(bad_rows_.end(), bad_rows_set.begin(), bad_rows_set.end()); + std::sort(good_rows_.begin(), good_rows_.end()); + std::sort(bad_rows_.begin(), bad_rows_.end()); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + thread_local_data_.resize(num_threads); + for (ThreadLocalData& tld : thread_local_data_) tld.rng.seed(GetRandomSeed()); + + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return 
schema_; } + + private: +#define FOR_EACH_COLUMN(F) \ + F(S_SUPPKEY) \ + F(S_NAME) \ + F(S_ADDRESS) \ + F(S_NATIONKEY) \ + F(S_PHONE) \ + F(S_ACCTBAL) \ + F(S_COMMENT) + +#define MAKE_ENUM(col) col, + struct SUPPLIER { + enum { + FOR_EACH_COLUMN(MAKE_ENUM) kNumCols, + }; + }; +#undef MAKE_ENUM +#define MAKE_STRING_MAP(col) {#col, SUPPLIER::col}, + const std::unordered_map kNameMap = { + FOR_EACH_COLUMN(MAKE_STRING_MAP)}; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector kGenerators = {FOR_EACH_COLUMN(MAKE_FN_ARRAY)}; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_COLUMN + + std::vector> kTypes = { + int32(), fixed_size_binary(25), utf8(), + int32(), fixed_size_binary(15), decimal(12, 2), + utf8(), + }; + + Status ProduceCallback(size_t thread_index) { + if (done_.load()) return Status::OK(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + tld.suppkey_start = rows_generated_.fetch_add(batch_size_); + if (tld.suppkey_start >= rows_to_generate_) return Status::OK(); + + tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.suppkey_start); + + tld.batch.resize(SUPPLIER::kNumCols); + std::fill(tld.batch.begin(), tld.batch.end(), Datum()); + for (int col : gen_list_) RETURN_NOT_OK(kGenerators[col](thread_index)); + + std::vector result(gen_list_.size()); + for (size_t i = 0; i < gen_list_.size(); i++) { + int col_idx = gen_list_[i]; + result[i] = tld.batch[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); + int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; + int64_t batches_outputted_before_this_one = batches_outputted_.fetch_add(1); + bool is_last_batch = batches_outputted_before_this_one == (batches_to_generate - 1); + output_callback_(std::move(eb)); + if (is_last_batch) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + 
return Status::OK(); + } + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + Status AllocateColumn(size_t thread_index, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(tld.to_generate * byte_width)); + ArrayData ad(kTypes[column], tld.to_generate, {nullptr, std::move(buff)}); + tld.batch[column] = std::move(ad); + return Status::OK(); + } + + Status S_SUPPKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_SUPPKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_SUPPKEY)); + int32_t* s_suppkey = reinterpret_cast( + tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + s_suppkey[irow] = static_cast(tld.suppkey_start + irow + 1); + } + } + return Status::OK(); + } + + Status S_NAME(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_NAME].kind() == Datum::NONE) { + RETURN_NOT_OK(S_SUPPKEY(thread_index)); + const int32_t* s_suppkey = reinterpret_cast( + tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->data()); + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NAME)); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_NAME]); + char* s_name = reinterpret_cast( + tld.batch[SUPPLIER::S_NAME].array()->buffers[1]->mutable_data()); + // Look man, I'm just following the spec ok? 
Section 4.2.3 as of March 1 2022 + const char* supplier = "Supplie#r"; + const size_t supplier_length = std::strlen(supplier); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + char* out = s_name + byte_width * irow; + std::strncpy(out, supplier, byte_width); + AppendNumberPaddedToNineDigits(out + supplier_length, s_suppkey[irow]); + } + } + return Status::OK(); + } + + Status S_ADDRESS(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_ADDRESS].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_ADDRESS], + RandomVString(tld.rng, tld.to_generate, 10, 40)); + } + return Status::OK(); + } + + Status S_NATIONKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_NATIONKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NATIONKEY)); + std::uniform_int_distribution dist(0, 24); + int32_t* s_nationkey = reinterpret_cast( + tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) + s_nationkey[irow] = dist(tld.rng); + } + return Status::OK(); + } + + Status S_PHONE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_PHONE].kind() == Datum::NONE) { + RETURN_NOT_OK(S_NATIONKEY(thread_index)); + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_PHONE)); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_PHONE]); + const int32_t* s_nationkey = reinterpret_cast( + tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->data()); + char* s_phone = reinterpret_cast( + tld.batch[SUPPLIER::S_PHONE].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + GeneratePhoneNumber(s_phone + irow * byte_width, tld.rng, s_nationkey[irow]); + } + } + return Status::OK(); + } + + Status S_ACCTBAL(size_t thread_index) { 
+ ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_ACCTBAL].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_ACCTBAL)); + Decimal128* s_acctbal = reinterpret_cast( + tld.batch[SUPPLIER::S_ACCTBAL].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(-99999, 999999); + for (int64_t irow = 0; irow < tld.to_generate; irow++) + s_acctbal[irow] = {dist(tld.rng)}; + } + return Status::OK(); + } + + Status S_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[SUPPLIER::S_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.batch[SUPPLIER::S_COMMENT], + g_text.GenerateComments(tld.to_generate, 25, 100, tld.rng)); + ModifyComments(thread_index, "Recommends", good_rows_); + ModifyComments(thread_index, "Complaints", bad_rows_); + } + return Status::OK(); + } + + void ModifyComments(size_t thread_index, const char* review, + const std::vector& indices) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + const int32_t* offsets = reinterpret_cast( + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[1]->data()); + char* str = reinterpret_cast( + tld.batch[SUPPLIER::S_COMMENT].array()->buffers[2]->mutable_data()); + const char* customer = "Customer"; + const int32_t customer_length = static_cast(std::strlen(customer)); + const int32_t review_length = static_cast(std::strlen(review)); + + auto it = std::lower_bound(indices.begin(), indices.end(), tld.suppkey_start); + for (; it != indices.end() && *it < tld.suppkey_start + tld.to_generate; it++) { + int64_t idx_in_batch = *it - tld.suppkey_start; + char* out = str + offsets[idx_in_batch]; + int32_t str_length = offsets[idx_in_batch + 1] - offsets[idx_in_batch]; + std::uniform_int_distribution gap_dist( + 0, str_length - customer_length - review_length); + int32_t gap = gap_dist(tld.rng); + int32_t total_length = customer_length + gap + review_length; + 
std::uniform_int_distribution start_dist(0, str_length - total_length); + int32_t start = start_dist(tld.rng); + std::memcpy(out + start, customer, customer_length); + std::memcpy(out + start + customer_length + gap, review, review_length); + } + } + + struct ThreadLocalData { + random::pcg32_fast rng; + int64_t suppkey_start; + int64_t to_generate; + std::vector batch; + }; + std::vector thread_local_data_; + std::vector good_rows_; + std::vector bad_rows_; + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t rows_to_generate_; + std::atomic rows_generated_; + double scale_factor_; + int64_t batch_size_; + std::vector gen_list_; + std::shared_ptr schema_; +}; + +class PartGenerator : public TpchTableGenerator { + public: + explicit PartGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) {} + + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetPartOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_, seed_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + Status ProduceCallback(size_t) { + if (done_.load()) return Status::OK(); + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, gen_->NextPartBatch()); + if 
(!maybe_batch.has_value()) { + int64_t batches_generated = gen_->part_batches_generated(); + if (batches_generated == batches_outputted_.load()) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + output_callback_(std::move(batch)); + batches_outputted_++; + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + double scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; +}; + +class PartSuppGenerator : public TpchTableGenerator { + public: + explicit PartSuppGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) {} + + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetPartSuppOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_, seed_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + Status ProduceCallback(size_t) { + if (done_.load()) return Status::OK(); + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + 
gen_->NextPartSuppBatch()); + if (!maybe_batch.has_value()) { + int64_t batches_generated = gen_->partsupp_batches_generated(); + if (batches_generated == batches_outputted_.load()) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + output_callback_(std::move(batch)); + batches_outputted_++; + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + double scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; +}; + +class CustomerGenerator : public TpchTableGenerator { + public: + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; + scale_factor_ = scale_factor; + batch_size_ = batch_size; + rows_to_generate_ = static_cast(scale_factor_ * 150000); + rows_generated_.store(0); + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns(columns, kTypes, kNameMap, gen_list_)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + thread_local_data_.resize(num_threads); + random::pcg64_fast seed_rng(seed_); + for (ThreadLocalData& tld : thread_local_data_) tld.rng.seed(kSeedDist(seed_rng)); + + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: 
+#define FOR_EACH_COLUMN(F) \ + F(C_CUSTKEY) \ + F(C_NAME) \ + F(C_ADDRESS) \ + F(C_NATIONKEY) \ + F(C_PHONE) \ + F(C_ACCTBAL) \ + F(C_MKTSEGMENT) \ + F(C_COMMENT) + +#define MAKE_ENUM(col) col, + struct CUSTOMER { + enum { + FOR_EACH_COLUMN(MAKE_ENUM) kNumCols, + }; + }; +#undef MAKE_ENUM +#define MAKE_STRING_MAP(col) {#col, CUSTOMER::col}, + const std::unordered_map kNameMap = { + FOR_EACH_COLUMN(MAKE_STRING_MAP)}; +#undef MAKE_STRING_MAP +#define MAKE_FN_ARRAY(col) \ + [this](size_t thread_index) { return this->col(thread_index); }, + std::vector kGenerators = {FOR_EACH_COLUMN(MAKE_FN_ARRAY)}; +#undef MAKE_FN_ARRAY +#undef FOR_EACH_COLUMN + + std::vector> kTypes = { + int32(), + utf8(), + utf8(), + int32(), + fixed_size_binary(15), + decimal(12, 2), + fixed_size_binary(10), + utf8(), + }; + + Status ProduceCallback(size_t thread_index) { + if (done_.load()) return Status::OK(); + ThreadLocalData& tld = thread_local_data_[thread_index]; + tld.custkey_start = rows_generated_.fetch_add(batch_size_); + if (tld.custkey_start >= rows_to_generate_) return Status::OK(); + + tld.to_generate = std::min(batch_size_, rows_to_generate_ - tld.custkey_start); + + tld.batch.resize(CUSTOMER::kNumCols); + std::fill(tld.batch.begin(), tld.batch.end(), Datum()); + for (int col : gen_list_) RETURN_NOT_OK(kGenerators[col](thread_index)); + + std::vector result(gen_list_.size()); + for (size_t i = 0; i < gen_list_.size(); i++) { + int col_idx = gen_list_[i]; + result[i] = tld.batch[col_idx]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch eb, ExecBatch::Make(std::move(result))); + int64_t batches_to_generate = (rows_to_generate_ + batch_size_ - 1) / batch_size_; + int64_t batches_generated_before_this_one = batches_outputted_.fetch_add(1); + bool is_last_batch = batches_generated_before_this_one == (batches_to_generate - 1); + output_callback_(std::move(eb)); + if (is_last_batch) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + 
finished_callback_(batches_outputted_.load()); + return Status::OK(); + } + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + Status AllocateColumn(size_t thread_index, int column) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, + AllocateBuffer(tld.to_generate * byte_width)); + ArrayData ad(kTypes[column], tld.to_generate, {nullptr, std::move(buff)}); + tld.batch[column] = std::move(ad); + return Status::OK(); + } + + Status C_CUSTKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_CUSTKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_CUSTKEY)); + int32_t* c_custkey = reinterpret_cast( + tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + c_custkey[irow] = static_cast(tld.custkey_start + irow + 1); + } + } + return Status::OK(); + } + + Status C_NAME(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_NAME].kind() == Datum::NONE) { + RETURN_NOT_OK(C_CUSTKEY(thread_index)); + const int32_t* c_custkey = reinterpret_cast( + tld.batch[CUSTOMER::C_CUSTKEY].array()->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr offset_buff, + AllocateBuffer((tld.to_generate + 1) * sizeof(int32_t))); + int32_t* offsets = reinterpret_cast(offset_buff->mutable_data()); + const char* customer = "Customer#"; + const size_t customer_length = std::strlen(customer); + offsets[0] = 0; + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + int num_digits = GetNumDigits(c_custkey[irow]); + int num_chars = std::max(num_digits, 9); + offsets[irow + 1] = + static_cast(offsets[irow] + num_chars + 
customer_length); + } + ARROW_ASSIGN_OR_RAISE(std::unique_ptr str_buff, + AllocateBuffer(offsets[tld.to_generate])); + char* str = reinterpret_cast(str_buff->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + char* out = str + offsets[irow]; + std::memcpy(out, customer, customer_length); + AppendNumberPaddedToNineDigits(out + customer_length, c_custkey[irow]); + } + ArrayData ad(utf8(), tld.to_generate, + {nullptr, std::move(offset_buff), std::move(str_buff)}); + tld.batch[CUSTOMER::C_NAME] = std::move(ad); + } + return Status::OK(); + } + + Status C_ADDRESS(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_ADDRESS].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_ADDRESS], + RandomVString(tld.rng, tld.to_generate, 10, 40)); + } + return Status::OK(); + } + + Status C_NATIONKEY(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_NATIONKEY].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_NATIONKEY)); + std::uniform_int_distribution dist(0, 24); + int32_t* c_nationkey = reinterpret_cast( + tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; irow < tld.to_generate; irow++) + c_nationkey[irow] = dist(tld.rng); + } + return Status::OK(); + } + + Status C_PHONE(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_PHONE].kind() == Datum::NONE) { + RETURN_NOT_OK(C_NATIONKEY(thread_index)); + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_PHONE)); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_PHONE]); + const int32_t* c_nationkey = reinterpret_cast( + tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->data()); + char* c_phone = reinterpret_cast( + tld.batch[CUSTOMER::C_PHONE].array()->buffers[1]->mutable_data()); + for (int64_t irow = 0; 
irow < tld.to_generate; irow++) { + GeneratePhoneNumber(c_phone + irow * byte_width, tld.rng, c_nationkey[irow]); + } + } + return Status::OK(); + } + + Status C_ACCTBAL(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_ACCTBAL].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_ACCTBAL)); + Decimal128* c_acctbal = reinterpret_cast( + tld.batch[CUSTOMER::C_ACCTBAL].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(-99999, 999999); + for (int64_t irow = 0; irow < tld.to_generate; irow++) + c_acctbal[irow] = {dist(tld.rng)}; + } + return Status::OK(); + } + + Status C_MKTSEGMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_MKTSEGMENT].kind() == Datum::NONE) { + RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_MKTSEGMENT)); + int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_MKTSEGMENT]); + char* c_mktsegment = reinterpret_cast( + tld.batch[CUSTOMER::C_MKTSEGMENT].array()->buffers[1]->mutable_data()); + std::uniform_int_distribution dist(0, kNumSegments - 1); + for (int64_t irow = 0; irow < tld.to_generate; irow++) { + char* out = c_mktsegment + irow * byte_width; + int str_idx = dist(tld.rng); + std::strncpy(out, Segments[str_idx], byte_width); + } + } + return Status::OK(); + } + + Status C_COMMENT(size_t thread_index) { + ThreadLocalData& tld = thread_local_data_[thread_index]; + if (tld.batch[CUSTOMER::C_COMMENT].kind() == Datum::NONE) { + ARROW_ASSIGN_OR_RAISE(tld.batch[CUSTOMER::C_COMMENT], + g_text.GenerateComments(tld.to_generate, 29, 116, tld.rng)); + } + return Status::OK(); + } + + struct ThreadLocalData { + random::pcg32_fast rng; + int64_t custkey_start; + int64_t to_generate; + std::vector batch; + }; + std::vector thread_local_data_; + + OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback 
schedule_callback_; + int64_t rows_to_generate_{0}; + std::atomic rows_generated_ = {0}; + double scale_factor_{0}; + int64_t batch_size_{0}; + std::vector gen_list_; + std::shared_ptr schema_; +}; + +class OrdersGenerator : public TpchTableGenerator { + public: + explicit OrdersGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) {} + + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetOrdersOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_, seed_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + Status ProduceCallback(size_t) { + if (done_.load()) return Status::OK(); + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, gen_->NextOrdersBatch()); + if (!maybe_batch.has_value()) { + int64_t batches_generated = gen_->orders_batches_generated(); + if (batches_generated == batches_outputted_.load()) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + output_callback_(std::move(batch)); + batches_outputted_++; + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + OutputBatchCallback 
output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + double scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; +}; + +class LineitemGenerator : public TpchTableGenerator { + public: + explicit LineitemGenerator(std::shared_ptr gen) + : gen_(std::move(gen)) {} + + Status Init(std::vector columns, double scale_factor, int64_t batch_size, + int64_t seed) override { + seed_ = seed; + scale_factor_ = scale_factor; + batch_size_ = batch_size; + ARROW_ASSIGN_OR_RAISE(schema_, gen_->SetLineItemOutputColumns(columns)); + return Status::OK(); + } + + Status StartProducing(size_t num_threads, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback schedule_callback) override { + RETURN_NOT_OK(gen_->Init(num_threads, batch_size_, scale_factor_, seed_)); + output_callback_ = std::move(output_callback); + finished_callback_ = std::move(finished_callback); + schedule_callback_ = std::move(schedule_callback); + + for (size_t i = 0; i < num_threads; i++) + RETURN_NOT_OK(schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); })); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + private: + Status ProduceCallback(size_t) { + if (done_.load()) return Status::OK(); + ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + gen_->NextLineItemBatch()); + if (!maybe_batch.has_value()) { + int64_t batches_generated = gen_->lineitem_batches_generated(); + if (batches_generated == batches_outputted_.load()) { + bool expected = false; + if (done_.compare_exchange_strong(expected, true)) + finished_callback_(batches_outputted_.load()); + } + return Status::OK(); + } + ExecBatch batch = std::move(*maybe_batch); + output_callback_(std::move(batch)); + batches_outputted_++; + return schedule_callback_( + [this](size_t thread_index) { return this->ProduceCallback(thread_index); }); + } + + 
OutputBatchCallback output_callback_; + FinishedCallback finished_callback_; + ScheduleCallback schedule_callback_; + int64_t batch_size_; + double scale_factor_; + std::shared_ptr gen_; + std::shared_ptr schema_; +}; + +class NationGenerator : public TpchTableGenerator { + public: + Status Init(std::vector columns, double /*scale_factor*/, + int64_t /*batch_size*/, int64_t seed) override { + ARROW_ASSIGN_OR_RAISE(schema_, + SetOutputColumns(columns, kTypes, kNameMap, column_indices_)); + seed_ = seed; + rng_.seed(seed_); + return Status::OK(); + } + + Status StartProducing(size_t /*num_threads*/, OutputBatchCallback output_callback, + FinishedCallback finished_callback, + ScheduleCallback /*schedule_task_callback*/) override { + std::shared_ptr N_NATIONKEY_buffer = Buffer::Wrap(kNationKey, kRowCount); + ArrayData N_NATIONKEY_arraydata(int32(), kRowCount, + {nullptr, std::move(N_NATIONKEY_buffer)}); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr N_NAME_buffer, + AllocateBuffer(kRowCount * kNameByteWidth)); + char* N_NAME = reinterpret_cast(N_NAME_buffer->mutable_data()); + for (size_t i = 0; i < kRowCount; i++) + std::strncpy(N_NAME + kNameByteWidth * i, kCountryNames[i], kNameByteWidth); + ArrayData N_NAME_arraydata(fixed_size_binary(kNameByteWidth), kRowCount, + {nullptr, std::move(N_NAME_buffer)}); + + std::shared_ptr N_REGIONKEY_buffer = Buffer::Wrap(kRegionKey, kRowCount); + ArrayData N_REGIONKEY_arraydata(int32(), kRowCount, + {nullptr, std::move(N_REGIONKEY_buffer)}); + + ARROW_ASSIGN_OR_RAISE(Datum N_COMMENT_datum, + g_text.GenerateComments(kRowCount, 31, 114, rng_)); + + std::vector fields = { + std::move(N_NATIONKEY_arraydata), std::move(N_NAME_arraydata), + std::move(N_REGIONKEY_arraydata), std::move(N_COMMENT_datum)}; + + std::vector result; + for (const int& col : column_indices_) result.push_back(fields[col]); + ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); + output_callback(std::move(batch)); + 
finished_callback(static_cast<int64_t>(1));
FinishedCallback finished_callback, + ScheduleCallback /*schedule_task_callback*/) override { + std::shared_ptr R_REGIONKEY_buffer = Buffer::Wrap(kRegionKey, kRowCount); + ArrayData R_REGIONKEY_arraydata(int32(), kRowCount, + {nullptr, std::move(R_REGIONKEY_buffer)}); + + ARROW_ASSIGN_OR_RAISE(std::unique_ptr R_NAME_buffer, + AllocateBuffer(kRowCount * kNameByteWidth)); + char* R_NAME_data = reinterpret_cast(R_NAME_buffer->mutable_data()); + for (size_t i = 0; i < kRowCount; i++) + std::strncpy(R_NAME_data + kNameByteWidth * i, kRegionNames[i], kNameByteWidth); + ArrayData R_NAME_arraydata(kTypes[static_cast(REGION::R_NAME)], kRowCount, + {nullptr, std::move(R_NAME_buffer)}); + + ARROW_ASSIGN_OR_RAISE(Datum R_COMMENT_datum, + g_text.GenerateComments(kRowCount, 31, 115, rng_)); + + std::vector fields = {std::move(R_REGIONKEY_arraydata), + std::move(R_NAME_arraydata), std::move(R_COMMENT_datum)}; + std::vector result; + for (const int& col : column_indices_) result.push_back(fields[col]); + ARROW_ASSIGN_OR_RAISE(ExecBatch batch, ExecBatch::Make(std::move(result))); + output_callback(std::move(batch)); + finished_callback(static_cast(1)); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + random::pcg32_fast rng_; + + static constexpr size_t kRowCount = 5; + static constexpr int32_t kNameByteWidth = 25; + static constexpr int32_t kRegionKey[kRowCount] = {0, 1, 2, 3, 4}; + static constexpr const char* kRegionNames[kRowCount] = {"AFRICA", "AMERICA", "ASIA", + "EUROPE", "MIDDLE EAST"}; + + struct REGION { + enum { + R_REGIONKEY, + R_NAME, + R_COMMENT, + kNumColumns, + }; + }; + + const std::unordered_map kNameMap = { + {"R_REGIONKEY", REGION::R_REGIONKEY}, + {"R_NAME", REGION::R_NAME}, + {"R_COMMENT", REGION::R_COMMENT}, + }; + + const std::vector> kTypes = { + int32(), + fixed_size_binary(kNameByteWidth), + utf8(), + }; + + std::shared_ptr schema_; + std::vector column_indices_; +}; + +constexpr int32_t 
RegionGenerator::kRegionKey[RegionGenerator::kRowCount]; +constexpr const char* RegionGenerator::kRegionNames[RegionGenerator::kRowCount]; + +class TpchNode : public ExecNode { + public: + TpchNode(ExecPlan* plan, const char* name, + std::unique_ptr generator) + : ExecNode(plan, {}, {}, generator->schema(), /*num_outputs=*/1), + name_(name), + generator_(std::move(generator)) {} + + const char* kind_name() const override { return name_; } + + [[noreturn]] static void NoInputs() { + Unreachable("TPC-H node should never have any inputs"); + } + + [[noreturn]] void InputReceived(ExecNode*, ExecBatch) override { NoInputs(); } + + [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); } + + [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); } + + Status StartProducing() override { + return generator_->StartProducing( + thread_indexer_.Capacity(), + [this](ExecBatch batch) { this->OutputBatchCallback(std::move(batch)); }, + [this](int64_t num_batches) { this->FinishedCallback(num_batches); }, + [this](std::function func) -> Status { + return this->ScheduleTaskCallback(std::move(func)); + }); + } + + void PauseProducing(ExecNode* output) override {} + void ResumeProducing(ExecNode* output) override {} + + void StopProducing(ExecNode* output) override { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); + } + + void StopProducing() override { + if (generator_->Abort()) std::ignore = task_group_.End(); + } + + Future<> finished() override { return task_group_.OnFinished(); } + + private: + void OutputBatchCallback(ExecBatch batch) { + outputs_[0]->InputReceived(this, std::move(batch)); + } + + void FinishedCallback(int64_t total_num_batches) { + outputs_[0]->InputFinished(this, static_cast(total_num_batches)); + std::ignore = task_group_.End(); + } + + Status ScheduleTaskCallback(std::function func) { + auto executor = plan_->exec_context()->executor(); + + // Due to the way that the generators schedule tasks, there may be more tasks 
+ // than output batches. After outputting the last batch, the generator will + // end the task group, but there may still be other threads that try to schedule + // tasks while the task group is being ended. This can result in adding tasks after + // the task group is ended. If those tasks were to be executed, correctness would + // not be affected as they'd see the generator is done and exit immediately. As such, + // if the task group is ended we can just skip scheduling these tasks in general. + if (executor) { + RETURN_NOT_OK(task_group_.AddTaskIfNotEnded([&] { + return executor->Submit([this, func] { + size_t thread_index = thread_indexer_(); + Status status = func(thread_index); + if (!status.ok()) { + StopProducing(); + ErrorIfNotOk(status); + return; + } + }); + })); + } else { + return func(0); + } + return Status::OK(); + } + + const char* name_; + std::unique_ptr generator_; + + util::AsyncTaskGroup task_group_; + ThreadIndexer thread_indexer_; +}; + +class TpchGenImpl : public TpchGen { + public: + Result Supplier(std::vector columns = {}) override; + Result Part(std::vector columns = {}) override; + Result PartSupp(std::vector columns = {}) override; + Result Customer(std::vector columns = {}) override; + Result Orders(std::vector columns = {}) override; + Result Lineitem(std::vector columns = {}) override; + Result Nation(std::vector columns = {}) override; + Result Region(std::vector columns = {}) override; + + TpchGenImpl(ExecPlan* plan, double scale_factor, int64_t batch_size, int64_t seed) + : plan_(plan), + scale_factor_(scale_factor), + batch_size_(batch_size), + seed_rng_(seed) {} + + template + Result CreateNode(const char* name, std::vector columns); + + ExecPlan* plan_; + double scale_factor_; + int64_t batch_size_; + random::pcg64_fast seed_rng_; + + std::shared_ptr part_and_part_supp_generator_{}; + std::shared_ptr orders_and_line_item_generator_{}; +}; + +template +Result TpchGenImpl::CreateNode(const char* name, + std::vector columns) { 
+ std::unique_ptr generator = arrow::internal::make_unique(); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); + return plan_->EmplaceNode(plan_, name, std::move(generator)); +} + +Result TpchGenImpl::Supplier(std::vector columns) { + return CreateNode("Supplier", std::move(columns)); +} + +Result TpchGenImpl::Part(std::vector columns) { + if (!part_and_part_supp_generator_) { + part_and_part_supp_generator_ = std::make_shared(); + } + std::unique_ptr generator = + arrow::internal::make_unique(part_and_part_supp_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); + return plan_->EmplaceNode(plan_, "Part", std::move(generator)); +} + +Result TpchGenImpl::PartSupp(std::vector columns) { + if (!part_and_part_supp_generator_) { + part_and_part_supp_generator_ = std::make_shared(); + } + std::unique_ptr generator = + arrow::internal::make_unique(part_and_part_supp_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); + return plan_->EmplaceNode(plan_, "PartSupp", std::move(generator)); +} + +Result TpchGenImpl::Customer(std::vector columns) { + return CreateNode("Customer", std::move(columns)); +} + +Result TpchGenImpl::Orders(std::vector columns) { + if (!orders_and_line_item_generator_) { + orders_and_line_item_generator_ = std::make_shared(); + } + std::unique_ptr generator = + arrow::internal::make_unique(orders_and_line_item_generator_); + RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); + return plan_->EmplaceNode(plan_, "Orders", std::move(generator)); +} + +Result TpchGenImpl::Lineitem(std::vector columns) { + if (!orders_and_line_item_generator_) { + orders_and_line_item_generator_ = std::make_shared(); + } + std::unique_ptr generator = + arrow::internal::make_unique(orders_and_line_item_generator_); + 
RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, + kSeedDist(seed_rng_))); + return plan_->EmplaceNode(plan_, "Lineitem", std::move(generator)); +} + +Result TpchGenImpl::Nation(std::vector columns) { + return CreateNode("Nation", std::move(columns)); +} + +Result TpchGenImpl::Region(std::vector columns) { + return CreateNode("Region", std::move(columns)); +} + +} // namespace + +Result> TpchGen::Make(ExecPlan* plan, double scale_factor, + int64_t batch_size, + util::optional seed) { + if (!seed.has_value()) seed = GetRandomSeed(); + return std::unique_ptr(new TpchGenImpl(plan, scale_factor, batch_size, *seed)); +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h new file mode 100644 index 00000000000..fb9376982b1 --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/optional.h" + +namespace arrow { +namespace compute { +namespace internal { + +class ARROW_EXPORT TpchGen { + public: + virtual ~TpchGen() = default; + + /* + * \brief Create a factory for nodes that generate TPC-H data + * + * Note: Individual tables will reference each other. It is important that you only + * create a single TpchGen instance for each plan and then you can create nodes for each + * table from that single TpchGen instance. Note: Every batch will be scheduled as a new + * task using the ExecPlan's scheduler. + */ + static Result> Make( + ExecPlan* plan, double scale_factor = 1.0, int64_t batch_size = 4096, + util::optional seed = util::nullopt); + + // The below methods will create and add an ExecNode to the plan that generates + // data for the desired table. If columns is empty, all columns will be generated. + // The methods return the added ExecNode, which should be used for inputs. + virtual Result Supplier(std::vector columns = {}) = 0; + virtual Result Part(std::vector columns = {}) = 0; + virtual Result PartSupp(std::vector columns = {}) = 0; + virtual Result Customer(std::vector columns = {}) = 0; + virtual Result Orders(std::vector columns = {}) = 0; + virtual Result Lineitem(std::vector columns = {}) = 0; + virtual Result Nation(std::vector columns = {}) = 0; + virtual Result Region(std::vector columns = {}) = 0; +}; + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc new file mode 100644 index 00000000000..8face53eebb --- /dev/null +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -0,0 +1,575 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/test_util.h" +#include "arrow/compute/exec/tpch_node.h" +#include "arrow/compute/exec/util.h" +#include "arrow/compute/kernels/row_encoder.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" +#include "arrow/testing/random.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/make_unique.h" +#include "arrow/util/pcg_random.h" +#include "arrow/util/thread_pool.h" + +#include +#include +#include +#include + +namespace arrow { +namespace compute { +namespace internal { + +static constexpr uint32_t kStartDate = + 8035; // January 1, 1992 is 8035 days after January 1, 1970 +static constexpr uint32_t kEndDate = + 10591; // December 12, 1998 is 10591 days after January 1, 1970 + +Result> GenerateTable( + Result (TpchGen::*table)(std::vector), + double scale_factor = 1.0) { + ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, ExecPlan::Make(&ctx)); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr gen, + TpchGen::Make(plan.get(), scale_factor)); + ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.get()->*table)({}))); + AsyncGenerator> sink_gen; + 
Declaration sink("sink", {Declaration::Input(table_node)}, SinkNodeOptions{&sink_gen}); + ARROW_RETURN_NOT_OK(sink.AddToPlan(plan.get())); + auto fut = StartAndCollect(plan.get(), sink_gen); + return fut.MoveResult(); +} + +// Verifies that the data is valid Arrow and ensures it's not null. +void ValidateBatch(const ExecBatch& batch) { + for (const Datum& d : batch.values) { + ASSERT_EQ(d.kind(), Datum::ARRAY); + const auto array = d.make_array(); + ASSERT_OK(array->ValidateFull()); + TestInitialized(*array); + ASSERT_EQ(array->data()->buffers[0].get(), nullptr); + } +} + +// Verifies that each element is seen exactly once, and that it's between min and max +// inclusive +void VerifyUniqueKey(std::unordered_set* seen, const Datum& d, int32_t min, + int32_t max) { + const int32_t* keys = reinterpret_cast(d.array()->buffers[1]->data()); + int64_t num_keys = d.length(); + for (int64_t i = 0; i < num_keys; i++) { + ASSERT_TRUE(seen->insert(keys[i]).second); + ASSERT_LE(keys[i], max); + ASSERT_GE(keys[i], min); + } +} + +void VerifyStringAndNumber_Single(const util::string_view& row, + const util::string_view& prefix, const int64_t i, + const int32_t* nums, bool verify_padding) { + ASSERT_TRUE(row.starts_with(prefix)) << row << ", prefix=" << prefix << ", i=" << i; + const char* num_str = row.data() + prefix.size(); + const char* num_str_end = row.data() + row.size(); + int64_t num = 0; + // Parse the number out; note that it can be padded with NUL chars at the end + for (; *num_str && num_str < num_str_end; num_str++) { + num *= 10; + ASSERT_TRUE(std::isdigit(*num_str)) << row << ", prefix=" << prefix << ", i=" << i; + num += *num_str - '0'; + } + // If nums is not null, ensure it matches the parsed number + if (nums) { + ASSERT_EQ(num, nums[i]); + } + // TPC-H requires only ever requires padding up to 9 digits, so we ensure that + // the total length of the string was at least 9 (could be more for bigger numbers). 
+ if (verify_padding) { + const auto num_chars = num_str - (row.data() + prefix.size()); + ASSERT_GE(num_chars, 9); + } +} + +// Verifies that each row is the string "prefix" followed by a number. If numbers is not +// EMPTY, it also checks that the number following the prefix is equal to the +// corresponding row in numbers. Some TPC-H data is padded to 9 zeros, which this function +// can optionally verify as well. This string function verifies fixed width columns. +void VerifyStringAndNumber_FixedWidth(const Datum& strings, const Datum& numbers, + int byte_width, const util::string_view& prefix, + bool verify_padding = true) { + int64_t length = strings.length(); + const char* str = reinterpret_cast(strings.array()->buffers[1]->data()); + + const int32_t* nums = nullptr; + if (numbers.kind() != Datum::NONE) { + ASSERT_EQ(length, numbers.length()); + nums = reinterpret_cast(numbers.array()->buffers[1]->data()); + } + + for (int64_t i = 0; i < length; i++) { + const char* row = str + i * byte_width; + util::string_view view(row, byte_width); + VerifyStringAndNumber_Single(view, prefix, i, nums, verify_padding); + } +} + +// Same as above but for variable length columns +void VerifyStringAndNumber_Varlen(const Datum& strings, const Datum& numbers, + const util::string_view& prefix, + bool verify_padding = true) { + int64_t length = strings.length(); + const int32_t* offsets = + reinterpret_cast(strings.array()->buffers[1]->data()); + const char* str = reinterpret_cast(strings.array()->buffers[2]->data()); + + const int32_t* nums = nullptr; + if (numbers.kind() != Datum::NONE) { + ASSERT_EQ(length, numbers.length()); + nums = reinterpret_cast(numbers.array()->buffers[1]->data()); + } + + for (int64_t i = 0; i < length; i++) { + int32_t start = offsets[i]; + int32_t str_len = offsets[i + 1] - offsets[i]; + util::string_view view(str + start, str_len); + VerifyStringAndNumber_Single(view, prefix, i, nums, verify_padding); + } +} + +// Verifies that each row is a 
V-string, which is defined in the spec to be +// a string of random length between min_length and max_length, that is composed +// of alphanumeric characters, commas, or spaces. +void VerifyVString(const Datum& d, int min_length, int max_length) { + int64_t length = d.length(); + const int32_t* off = reinterpret_cast(d.array()->buffers[1]->data()); + const char* str = reinterpret_cast(d.array()->buffers[2]->data()); + for (int64_t i = 0; i < length; i++) { + int32_t start = off[i]; + int32_t end = off[i + 1]; + int32_t str_len = end - start; + ASSERT_LE(str_len, max_length); + ASSERT_GE(str_len, min_length); + for (int32_t i = start; i < end; i++) { + bool is_valid = + std::isdigit(str[i]) || std::isalpha(str[i]) || str[i] == ',' || str[i] == ' '; + ASSERT_TRUE(is_valid) << "Character " << str[i] + << " is not a digit, a letter, a comma, or a space"; + } + } +} + +// Verifies that each 32-bit element modulo "mod" is between min and max. +void VerifyModuloBetween(const Datum& d, int32_t min, int32_t max, int32_t mod) { + int64_t length = d.length(); + const int32_t* n = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) { + int32_t m = n[i] % mod; + ASSERT_GE(m, min) << "Value must be between " << min << " and " << max << " mod " + << mod << ", " << n[i] << " % " << mod << " = " << m; + ASSERT_LE(m, max) << "Value must be between " << min << " and " << max << " mod " + << mod << ", " << n[i] << " % " << mod << " = " << m; + } +} + +// Verifies that each 32-bit element is between min and max. 
+void VerifyAllBetween(const Datum& d, int32_t min, int32_t max) { + int64_t length = d.length(); + const int32_t* n = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) { + ASSERT_GE(n[i], min) << "Value must be between " << min << " and " << max << ", got " + << n[i]; + ASSERT_LE(n[i], max) << "Value must be between " << min << " and " << max << ", got " + << n[i]; + } +} + +void VerifyNationKey(const Datum& d) { VerifyAllBetween(d, 0, 24); } + +// Verifies that each row satisfies the phone number spec. +void VerifyPhone(const Datum& d) { + int64_t length = d.length(); + const char* phones = reinterpret_cast(d.array()->buffers[1]->data()); + constexpr int kByteWidth = 15; // This is common for all PHONE columns + std::regex exp("\\d{2}-\\d{3}-\\d{3}-\\d{4}"); + for (int64_t i = 0; i < length; i++) { + const char* row = phones + i * kByteWidth; + ASSERT_TRUE(std::regex_match(row, row + kByteWidth, exp)); + } +} + +// Verifies that each decimal is between min and max +void VerifyDecimalsBetween(const Datum& d, int64_t min, int64_t max) { + int64_t length = d.length(); + const Decimal128* decs = + reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) { + int64_t val = static_cast(decs[i]); + ASSERT_LE(val, max); + ASSERT_GE(val, min); + } +} + +// Verifies that each variable-length row is a series of words separated by +// spaces. Number of words is determined by the number of spaces. 
+void VerifyCorrectNumberOfWords_Varlen(const Datum& d, int num_words) { + int expected_num_spaces = num_words - 1; + int64_t length = d.length(); + const int32_t* offsets = + reinterpret_cast(d.array()->buffers[1]->data()); + const char* str = reinterpret_cast(d.array()->buffers[2]->data()); + + for (int64_t i = 0; i < length; i++) { + int actual_num_spaces = 0; + + int32_t start = offsets[i]; + int32_t end = offsets[i + 1]; + int32_t str_len = end - start; + util::string_view view(str + start, str_len); + bool is_only_alphas_or_spaces = true; + for (const char& c : view) { + bool is_space = c == ' '; + actual_num_spaces += is_space; + is_only_alphas_or_spaces &= (is_space || std::isalpha(c)); + } + ASSERT_TRUE(is_only_alphas_or_spaces) + << "Words must be composed only of letters, got " << view; + ASSERT_EQ(actual_num_spaces, expected_num_spaces) + << "Wrong number of spaces in " << view; + } +} + +// Same as above but for fixed width columns. +void VerifyCorrectNumberOfWords_FixedWidth(const Datum& d, int num_words, + int byte_width) { + int expected_num_spaces = num_words - 1; + int64_t length = d.length(); + const char* str = reinterpret_cast(d.array()->buffers[1]->data()); + + for (int64_t i = 0; i < length; i++) { + int actual_num_spaces = 0; + const char* row = str + i * byte_width; + bool is_only_alphas_or_spaces = true; + for (int32_t j = 0; j < byte_width && row[j]; j++) { + bool is_space = row[j] == ' '; + actual_num_spaces += is_space; + is_only_alphas_or_spaces &= (is_space || std::isalpha(row[j])); + } + ASSERT_TRUE(is_only_alphas_or_spaces) + << "Words must be composed only of letters, got " << row; + ASSERT_EQ(actual_num_spaces, expected_num_spaces) + << "Wrong number of spaces in " << row; + } +} + +// Verifies that each row of the single-byte-wide column is one of the possibilities. 
+void VerifyOneOf(const Datum& d, const std::unordered_set& possibilities) { + int64_t length = d.length(); + const char* col = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) + ASSERT_TRUE(possibilities.find(col[i]) != possibilities.end()); +} + +// Verifies that each fixed-width row is one of the possibilities +void VerifyOneOf(const Datum& d, int32_t byte_width, + const std::unordered_set& possibilities) { + int64_t length = d.length(); + const char* col = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) { + const char* row = col + i * byte_width; + int32_t row_len = 0; + while (row[row_len] && row_len < byte_width) row_len++; + util::string_view view(row, row_len); + ASSERT_TRUE(possibilities.find(view) != possibilities.end()) + << view << " is not a valid string."; + } +} + +// Counts the number of instances of each integer +void CountInstances(std::unordered_map* counts, const Datum& d) { + int64_t length = d.length(); + const int32_t* nums = reinterpret_cast(d.array()->buffers[1]->data()); + for (int64_t i = 0; i < length; i++) (*counts)[nums[i]]++; +} + +// For the S_COMMENT column, some of the columns must be modified to contain +// "Customer...Complaints" or "Customer...Recommends". This function counts the number of +// good and bad comments. 
+void CountModifiedComments(const Datum& d, int* good_count, int* bad_count) { + int64_t length = d.length(); + const int32_t* offsets = + reinterpret_cast(d.array()->buffers[1]->data()); + const char* str = reinterpret_cast(d.array()->buffers[2]->data()); + for (int64_t i = 0; i < length; i++) { + const char* row = str + offsets[i]; + int32_t row_length = offsets[i + 1] - offsets[i]; + util::string_view view(row, row_length); + bool customer = view.find("Customer") != util::string_view::npos; + bool recommends = view.find("Recommends") != util::string_view::npos; + bool complaints = view.find("Complaints") != util::string_view::npos; + if (customer) { + ASSERT_TRUE(recommends ^ complaints); + if (recommends) *good_count += 1; + if (complaints) *bad_count += 1; + } + } +} + +TEST(TpchNode, ScaleFactor) { + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Supplier, 0.25)); + + int64_t kExpectedRows = 2500; + int64_t num_rows = 0; + for (auto& batch : res) num_rows += batch.length; + ASSERT_EQ(num_rows, kExpectedRows); +} + +TEST(TpchNode, Supplier) { + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Supplier)); + int64_t kExpectedRows = 10000; + int64_t num_rows = 0; + + std::unordered_set seen_suppkey; + int good_count = 0; + int bad_count = 0; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(&seen_suppkey, batch[0], + /*min=*/1, + /*max=*/static_cast(kExpectedRows)); + VerifyStringAndNumber_FixedWidth(batch[1], batch[0], /*byte_width=*/25, "Supplie#r"); + VerifyVString(batch[2], /*min_length=*/10, /*max_length=*/40); + VerifyNationKey(batch[3]); + VerifyPhone(batch[4]); + VerifyDecimalsBetween(batch[5], -99999, 999999); + CountModifiedComments(batch[6], &good_count, &bad_count); + num_rows += batch.length; + } + ASSERT_EQ(seen_suppkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); + ASSERT_EQ(good_count, 5); + ASSERT_EQ(bad_count, 5); +} + +TEST(TpchNode, Part) { + ASSERT_OK_AND_ASSIGN(auto res, 
GenerateTable(&TpchGen::Part)); + + int64_t kExpectedRows = 200000; + int64_t num_rows = 0; + + std::unordered_set seen_partkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(&seen_partkey, batch[0], + /*min=*/1, + /*max=*/static_cast(kExpectedRows)); + VerifyCorrectNumberOfWords_Varlen(batch[1], + /*num_words*=*/5); + VerifyStringAndNumber_FixedWidth(batch[2], Datum(), + /*byte_width=*/25, "Manufacturer#", + /*verify_padding=*/false); + VerifyStringAndNumber_FixedWidth(batch[3], Datum(), + /*byte_width=*/10, "Brand#", + /*verify_padding=*/false); + VerifyCorrectNumberOfWords_Varlen(batch[4], + /*num_words=*/3); + VerifyAllBetween(batch[5], /*min=*/1, /*max=*/50); + VerifyCorrectNumberOfWords_FixedWidth(batch[6], + /*num_words=*/2, + /*byte_width=*/10); + num_rows += batch.length; + } + ASSERT_EQ(seen_partkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); +} + +TEST(TpchNode, PartSupp) { + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::PartSupp)); + + constexpr int64_t kExpectedRows = 800000; + int64_t num_rows = 0; + + std::unordered_map counts; + for (auto& batch : res) { + ValidateBatch(batch); + CountInstances(&counts, batch[0]); + VerifyAllBetween(batch[2], 1, 9999); + VerifyDecimalsBetween(batch[3], 100, 100000); + num_rows += batch.length; + } + for (auto& partkey : counts) + ASSERT_EQ(partkey.second, 4) + << "Key " << partkey.first << " has count " << partkey.second; + ASSERT_EQ(counts.size(), kExpectedRows / 4); + + ASSERT_EQ(num_rows, kExpectedRows); +} + +TEST(TpchNode, Customer) { + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Customer)); + + const int64_t kExpectedRows = 150000; + int64_t num_rows = 0; + + std::unordered_set seen_custkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(&seen_custkey, batch[0], + /*min=*/1, + /*max=*/static_cast(kExpectedRows)); + VerifyStringAndNumber_Varlen(batch[1], batch[0], "Customer#"); + VerifyVString(batch[2], /*min=*/10, /*max=*/40); 
+ VerifyNationKey(batch[3]); + VerifyPhone(batch[4]); + VerifyDecimalsBetween(batch[5], -99999, 999999); + VerifyCorrectNumberOfWords_FixedWidth(batch[6], + /*num_words=*/1, + /*byte_width=*/10); + num_rows += batch.length; + } + ASSERT_EQ(seen_custkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); +} + +TEST(TpchNode, Orders) { + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Orders)); + + constexpr int64_t kExpectedRows = 1500000; + int64_t num_rows = 0; + + std::unordered_set seen_orderkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(&seen_orderkey, batch[0], + /*min=*/1, + /*max=*/static_cast(4 * kExpectedRows)); + VerifyAllBetween(batch[1], /*min=*/1, /*max=*/static_cast(kExpectedRows)); + VerifyModuloBetween(batch[1], /*min=*/1, /*max=*/2, /*mod=*/3); + VerifyOneOf(batch[2], {'F', 'O', 'P'}); + VerifyAllBetween(batch[4], kStartDate, kEndDate - 151); + VerifyOneOf(batch[5], + /*byte_width=*/15, + { + "1-URGENT", + "2-HIGH", + "3-MEDIUM", + "4-NOT SPECIFIED", + "5-LOW", + }); + VerifyStringAndNumber_FixedWidth(batch[6], Datum(), + /*byte_width=*/15, "Clerk#", + /*verify_padding=*/true); + VerifyAllBetween(batch[7], /*min=*/0, /*max=*/0); + num_rows += batch.length; + } + ASSERT_EQ(seen_orderkey.size(), kExpectedRows); + ASSERT_EQ(num_rows, kExpectedRows); +} + +TEST(TpchNode, Lineitem) { + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Lineitem)); + + std::unordered_map counts; + for (auto& batch : res) { + ValidateBatch(batch); + CountInstances(&counts, batch[0]); + VerifyAllBetween(batch[1], /*min=*/1, /*max=*/200000); + VerifyAllBetween(batch[3], /*min=*/1, /*max=*/7); + VerifyDecimalsBetween(batch[4], /*min=*/100, /*max=*/5000); + VerifyDecimalsBetween(batch[6], /*min=*/0, /*max=*/10); + VerifyDecimalsBetween(batch[7], /*min=*/0, /*max=*/8); + VerifyOneOf(batch[8], {'R', 'A', 'N'}); + VerifyOneOf(batch[9], {'O', 'F'}); + VerifyAllBetween(batch[10], kStartDate + 1, kEndDate - 151 + 121); + 
VerifyAllBetween(batch[11], kStartDate + 30, kEndDate - 151 + 90); + VerifyAllBetween(batch[12], kStartDate + 2, kEndDate - 151 + 121 + 30); + VerifyOneOf(batch[13], + /*byte_width=*/25, + { + "DELIVER IN PERSON", + "COLLECT COD", + "NONE", + "TAKE BACK RETURN", + }); + VerifyOneOf(batch[14], + /*byte_width=*/10, + { + "REG AIR", + "AIR", + "RAIL", + "SHIP", + "TRUCK", + "MAIL", + "FOB", + }); + } + for (auto& count : counts) { + ASSERT_GE(count.second, 1); + ASSERT_LE(count.second, 7); + } +} + +TEST(TpchNode, Nation) { + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Nation)); + + constexpr int64_t kExpectedRows = 25; + int64_t num_rows = 0; + + std::unordered_set seen_nationkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(&seen_nationkey, batch[0], 0, kExpectedRows - 1); + VerifyOneOf( + batch[1], + /*byte_width=*/25, + {"ALGERIA", "ARGENTINA", "BRAZIL", "CANADA", "EGYPT", + "ETHIOPIA", "FRANCE", "GERMANY", "INDIA", "INDONESIA", + "IRAN", "IRAQ", "JAPAN", "JORDAN", "KENYA", + "MOROCCO", "MOZAMBIQUE", "PERU", "CHINA", "ROMANIA", + "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES"}); + VerifyAllBetween(batch[2], 0, 4); + num_rows += batch.length; + } + ASSERT_EQ(num_rows, kExpectedRows); +} + +TEST(TpchNode, Region) { + ASSERT_OK_AND_ASSIGN(auto res, GenerateTable(&TpchGen::Region)); + + constexpr int64_t kExpectedRows = 5; + int64_t num_rows = 0; + + std::unordered_set seen_regionkey; + for (auto& batch : res) { + ValidateBatch(batch); + VerifyUniqueKey(&seen_regionkey, batch[0], 0, kExpectedRows - 1); + VerifyOneOf(batch[1], + /*byte_width=*/25, + {"AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST"}); + + num_rows += batch.length; + } + ASSERT_EQ(num_rows, 5); +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 7b2968253b3..1f62ce5b42a 100644 --- 
a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -1131,6 +1131,7 @@ class TableSorter { MergeNullsOnly(range_begin, range_middle, range_end, temp_indices, null_count); } + Status status_; ExecContext* ctx_; const Table& table_; const RecordBatchVector batches_; @@ -1141,7 +1142,6 @@ class TableSorter { uint64_t* indices_begin_; uint64_t* indices_end_; Comparator comparator_; - Status status_; }; // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc index 45355019ab8..3dacaa18b44 100644 --- a/cpp/src/arrow/util/async_util.cc +++ b/cpp/src/arrow/util/async_util.cc @@ -57,6 +57,22 @@ Status AsyncTaskGroup::AddTask(std::function>()> task) { return AddTaskUnlocked(*maybe_task_fut, std::move(guard)); } +Status AsyncTaskGroup::AddTaskIfNotEnded(std::function>()> task) { + auto guard = mutex_.Lock(); + if (finished_adding_) { + return Status::OK(); + } + if (!err_.ok()) { + return err_; + } + Result> maybe_task_fut = task(); + if (!maybe_task_fut.ok()) { + err_ = maybe_task_fut.status(); + return err_; + } + return AddTaskUnlocked(*maybe_task_fut, std::move(guard)); +} + Status AsyncTaskGroup::AddTaskUnlocked(const Future<>& task_fut, util::Mutex::Guard guard) { // If the task is already finished there is nothing to track so lets save @@ -89,6 +105,17 @@ Status AsyncTaskGroup::AddTask(const Future<>& task_fut) { return AddTaskUnlocked(task_fut, std::move(guard)); } +Status AsyncTaskGroup::AddTaskIfNotEnded(const Future<>& task_fut) { + auto guard = mutex_.Lock(); + if (finished_adding_) { + return Status::OK(); + } + if (!err_.ok()) { + return err_; + } + return AddTaskUnlocked(task_fut, std::move(guard)); +} + Future<> AsyncTaskGroup::End() { auto guard = mutex_.Lock(); finished_adding_ = true; diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h index fdac025030a..ab43aeee197 100644 --- 
a/cpp/src/arrow/util/async_util.h +++ b/cpp/src/arrow/util/async_util.h @@ -116,8 +116,12 @@ class ARROW_EXPORT AsyncTaskGroup { /// If WaitForTasksToFinish has been called and the returned future has been marked /// completed then adding a task will fail. Status AddTask(std::function>()> task); + /// Same as AddTask but doesn't add the task if End() has been called. + Status AddTaskIfNotEnded(std::function>()> task); /// Add a task that has already been started Status AddTask(const Future<>& task); + /// Same as AddTask but doesn't add the task if End() has been called. + Status AddTaskIfNotEnded(const Future<>& task); /// Signal that top level tasks are done being added /// /// It is allowed for tasks to be added after this call provided the future has not yet diff --git a/cpp/src/arrow/vendored/CMakeLists.txt b/cpp/src/arrow/vendored/CMakeLists.txt index 8d4c323d28a..0fdabc49f7c 100644 --- a/cpp/src/arrow/vendored/CMakeLists.txt +++ b/cpp/src/arrow/vendored/CMakeLists.txt @@ -19,3 +19,4 @@ arrow_install_all_headers("arrow/vendored") add_subdirectory(datetime) add_subdirectory(double-conversion) +add_subdirectory(pcg) diff --git a/cpp/src/arrow/vendored/pcg/CMakeLists.txt b/cpp/src/arrow/vendored/pcg/CMakeLists.txt new file mode 100644 index 00000000000..b4f0aec561b --- /dev/null +++ b/cpp/src/arrow/vendored/pcg/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +arrow_install_all_headers("arrow/vendored/pcg")