diff --git a/cpp/src/arrow/compute/exec/hash_join.cc b/cpp/src/arrow/compute/exec/hash_join.cc index c1e587598fa..5a9afaa5bdf 100644 --- a/cpp/src/arrow/compute/exec/hash_join.cc +++ b/cpp/src/arrow/compute/exec/hash_join.cc @@ -103,7 +103,7 @@ class HashJoinBasicImpl : public HashJoinImpl { filter_ = std::move(filter); output_batch_callback_ = std::move(output_batch_callback); finished_callback_ = std::move(finished_callback); - local_states_.resize(num_threads); + local_states_.resize(num_threads + 1); // +1 for calling thread + worker threads for (size_t i = 0; i < local_states_.size(); ++i) { local_states_[i].is_initialized = false; local_states_[i].is_has_match_initialized = false; @@ -151,6 +151,7 @@ class HashJoinBasicImpl : public HashJoinImpl { } void InitLocalStateIfNeeded(size_t thread_index) { + DCHECK_LT(thread_index, local_states_.size()); ThreadLocalState& local_state = local_states_[thread_index]; if (!local_state.is_initialized) { InitEncoder(0, HashJoinProjection::KEY, &local_state.exec_batch_keys); diff --git a/cpp/src/arrow/compute/exec/hash_join_dict.cc b/cpp/src/arrow/compute/exec/hash_join_dict.cc index b923433b493..ac1fbbaa3df 100644 --- a/cpp/src/arrow/compute/exec/hash_join_dict.cc +++ b/cpp/src/arrow/compute/exec/hash_join_dict.cc @@ -566,7 +566,7 @@ Status HashJoinDictBuildMulti::PostDecode( } void HashJoinDictProbeMulti::Init(size_t num_threads) { - local_states_.resize(num_threads); + local_states_.resize(num_threads + 1); // +1 for calling thread + worker threads for (size_t i = 0; i < local_states_.size(); ++i) { local_states_[i].is_initialized = false; } diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R index 5f4dbfbbd72..319aa8cf98d 100644 --- a/r/tests/testthat/test-dplyr-join.R +++ b/r/tests/testthat/test-dplyr-join.R @@ -249,3 +249,23 @@ test_that("arrow dplyr query correctly filters then joins", { ) ) }) + + +test_that("arrow dplyr query can join with tibble", { + # ARROW-14908 + dir_out <- tempdir() + write_dataset(iris, file.path(dir_out, "iris")) + species_codes <- data.frame( + Species = c("setosa", "versicolor", "virginica"), + code = c("SET", "VER", "VIR") + ) + + withr::with_options( + list(arrow.use_threads = FALSE), + { + iris <- open_dataset(file.path(dir_out, "iris")) + res <- left_join(iris, species_codes) %>% collect() # We should not segfault here. + expect_equal(nrow(res), 150) + } + ) +})