From 73f3603894ca14e06ba03dccf20b963b9d167403 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 14:48:14 +0800 Subject: [PATCH 01/13] Fix gandiva cache size --- cpp/src/gandiva/cache.cc | 38 ++++++++++++++++++------------- cpp/src/gandiva/cache.h | 5 +++++ cpp/src/gandiva/cache_test.cc | 42 ++++++++++++++++++++++++++++++++++- 3 files changed, 69 insertions(+), 16 deletions(-) diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index a1333ccdc5d..9f07bcd892e 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -23,23 +23,31 @@ namespace gandiva { -static const size_t DEFAULT_CACHE_SIZE = 5000; +static const int DEFAULT_CACHE_SIZE = 5000; -int GetCapacity() { - size_t capacity = DEFAULT_CACHE_SIZE; - auto maybe_env_cache_size = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); - if (maybe_env_cache_size.ok()) { - const auto env_cache_size = *std::move(maybe_env_cache_size); - if (!env_cache_size.empty()) { - capacity = std::atol(env_cache_size.c_str()); - if (capacity <= 0) { - ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. " - << "Using default cache size: " << DEFAULT_CACHE_SIZE; - capacity = DEFAULT_CACHE_SIZE; - } - } +namespace internal { +int GetCapacityInternal() { + auto maybe_env_value = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); + if (!maybe_env_value.ok()) { + return DEFAULT_CACHE_SIZE; + } + const auto env_value = *std::move(maybe_env_value); + if (env_value.empty()) { + return DEFAULT_CACHE_SIZE; + } + int capacity = std::atoi(env_value.c_str()); + if (capacity <= 0) { + ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. " + << "Using default cache size: " << DEFAULT_CACHE_SIZE; + return DEFAULT_CACHE_SIZE; } - return static_cast(capacity); + return capacity; +} +} // namespace internal + +int GetCapacity() { + static const int capacity = internal::GetCapacityInternal(); + return capacity; } void LogCacheSize(size_t capacity) { diff --git a/cpp/src/gandiva/cache.h b/cpp/src/gandiva/cache.h index 7cff9b02692..83b65dfb5eb 100644 --- a/cpp/src/gandiva/cache.h +++ b/cpp/src/gandiva/cache.h @@ -25,6 +25,11 @@ namespace gandiva { +namespace internal { +// For testing purposes only. +int GetCapacityInternal(); +} // namespace internal + GANDIVA_EXPORT int GetCapacity(); diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index a146707079f..d4a28f53df3 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -16,10 +16,13 @@ // under the License. #include "gandiva/cache.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/io_util.h" #include namespace gandiva { + class TestCacheKey { public: explicit TestCacheKey(int value) : value_(value) {} @@ -38,5 +41,42 @@ TEST(TestCache, TestGetPut) { ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); } -TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); } +TEST(TestCache, TestGetCacheCapacityDefault) { ASSERT_EQ(GetCapacity(), 5000); } + +TEST(TestCache, TestGetCacheCapacityEnvVar) { + // Empty. + ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "")); + ASSERT_EQ(internal::GetCapacityInternal(), 5000); + + // Non-number. + ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "invalid")); + ASSERT_EQ(internal::GetCapacityInternal(), 5000); + + // Valid positive number. + ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "42")); + ASSERT_EQ(internal::GetCapacityInternal(), 42); + + // Int max. + { + auto str = std::to_string(std::numeric_limits::max()); + ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", str)); + ASSERT_EQ(internal::GetCapacityInternal(), std::numeric_limits::max()); + } + + // Over int max. + { + auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); + ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", str)); + ASSERT_EQ(internal::GetCapacityInternal(), 5000); + } + + // Zero. + ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "0")); + ASSERT_EQ(internal::GetCapacityInternal(), 5000); + + // Negative number. + ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "-1")); + ASSERT_EQ(internal::GetCapacityInternal(), 5000); +} + } // namespace gandiva From 156bb1d22aaad0115bf8f4170d1990d421d8ced7 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 15:00:15 +0800 Subject: [PATCH 02/13] Update doc about the variable --- docs/source/cpp/env_vars.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 116c151824c..ad7a815ef09 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -180,6 +180,8 @@ that changing their value later will have an effect. The number of entries to keep in the Gandiva JIT compilation cache. The cache is in-memory and does not persist across processes. + The value of this variable is expected to be a positive number less or equal + than int32 max, defaults to 5000 otherwise. .. envvar:: HADOOP_HOME From 7f712fd4ed18a232f000955b99ef0327f823a770 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 15:17:37 +0800 Subject: [PATCH 03/13] Refine naming --- cpp/src/gandiva/cache.cc | 4 ++-- cpp/src/gandiva/cache.h | 6 ++++-- cpp/src/gandiva/cache_test.cc | 14 +++++++------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index 9f07bcd892e..1f8e2304012 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -26,7 +26,7 @@ namespace gandiva { static const int DEFAULT_CACHE_SIZE = 5000; namespace internal { -int GetCapacityInternal() { +int GetCapacityFromEnvVar() { auto maybe_env_value = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); if (!maybe_env_value.ok()) { return DEFAULT_CACHE_SIZE; @@ -46,7 +46,7 @@ int GetCapacityInternal() { } // namespace internal int GetCapacity() { - static const int capacity = internal::GetCapacityInternal(); + static const int capacity = internal::GetCapacityFromEnvVar(); return capacity; } diff --git a/cpp/src/gandiva/cache.h b/cpp/src/gandiva/cache.h index 83b65dfb5eb..f50808f0056 100644 --- a/cpp/src/gandiva/cache.h +++ b/cpp/src/gandiva/cache.h @@ -26,8 +26,10 @@ namespace gandiva { namespace internal { -// For testing purposes only. -int GetCapacityInternal(); +// Only called once by GetCapacity(). +// Do the actual work of getting the capacity from env var. +// Also makes the testing easier. +int GetCapacityFromEnvVar(); } // namespace internal GANDIVA_EXPORT diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index d4a28f53df3..bc033b04143 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -46,37 +46,37 @@ TEST(TestCache, TestGetCacheCapacityDefault) { ASSERT_EQ(GetCapacity(), 5000); } TEST(TestCache, TestGetCacheCapacityEnvVar) { // Empty. ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "")); - ASSERT_EQ(internal::GetCapacityInternal(), 5000); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); // Non-number. ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "invalid")); - ASSERT_EQ(internal::GetCapacityInternal(), 5000); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); // Valid positive number. ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "42")); - ASSERT_EQ(internal::GetCapacityInternal(), 42); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), 42); // Int max. { auto str = std::to_string(std::numeric_limits::max()); ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", str)); - ASSERT_EQ(internal::GetCapacityInternal(), std::numeric_limits::max()); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), std::numeric_limits::max()); } // Over int max. { auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", str)); - ASSERT_EQ(internal::GetCapacityInternal(), 5000); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); } // Zero. ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "0")); - ASSERT_EQ(internal::GetCapacityInternal(), 5000); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); // Negative number. ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "-1")); - ASSERT_EQ(internal::GetCapacityInternal(), 5000); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); } } // namespace gandiva From 78e764014446430457f6243c0eeed3a6fb218544 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 15:27:24 +0800 Subject: [PATCH 04/13] Fix link error on windows --- cpp/src/gandiva/cache.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/gandiva/cache.h b/cpp/src/gandiva/cache.h index f50808f0056..d40e6417ea0 100644 --- a/cpp/src/gandiva/cache.h +++ b/cpp/src/gandiva/cache.h @@ -29,6 +29,7 @@ namespace internal { // Only called once by GetCapacity(). // Do the actual work of getting the capacity from env var. // Also makes the testing easier. +GANDIVA_EXPORT int GetCapacityFromEnvVar(); } // namespace internal From 915d731d41e4f9d3ef9e273df77a76e44333aae0 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 17:17:52 +0800 Subject: [PATCH 05/13] Refine --- cpp/src/gandiva/cache_test.cc | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index bc033b04143..5f67cc79382 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -44,39 +44,42 @@ TEST(TestCache, TestGetPut) { TEST(TestCache, TestGetCacheCapacityDefault) { ASSERT_EQ(GetCapacity(), 5000); } TEST(TestCache, TestGetCacheCapacityEnvVar) { + constexpr auto capacity_env_var = "GANDIVA_CACHE_SIZE"; + constexpr auto default_capacity = 5000; + // Empty. - ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); + ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "")); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); // Non-number. - ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "invalid")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); + ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "invalid")); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); // Valid positive number. - ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "42")); + ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "42")); ASSERT_EQ(internal::GetCapacityFromEnvVar(), 42); // Int max. { auto str = std::to_string(std::numeric_limits::max()); - ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", str)); + ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, str)); ASSERT_EQ(internal::GetCapacityFromEnvVar(), std::numeric_limits::max()); } // Over int max. { auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); - ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", str)); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); + ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, str)); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); } // Zero. - ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "0")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); + ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "0")); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); // Negative number. - ASSERT_OK(::arrow::internal::SetEnvVar("GANDIVA_CACHE_SIZE", "-1")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), 5000); + ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "-1")); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); } } // namespace gandiva From 064bb7865877a429a88f491136095f9057beca1d Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 17:19:36 +0800 Subject: [PATCH 06/13] Refine --- cpp/src/gandiva/cache_test.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index 5f67cc79382..00f9daba74a 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -41,12 +41,14 @@ TEST(TestCache, TestGetPut) { ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); } +namespace { +constexpr auto capacity_env_var = "GANDIVA_CACHE_SIZE"; +constexpr auto default_capacity = 5000; +} // namespace + TEST(TestCache, TestGetCacheCapacityDefault) { ASSERT_EQ(GetCapacity(), 5000); } TEST(TestCache, TestGetCacheCapacityEnvVar) { - constexpr auto capacity_env_var = "GANDIVA_CACHE_SIZE"; - constexpr auto default_capacity = 5000; - // Empty. ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "")); ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); From fdca72db4886f0cdbdbcd4c74ed621599d3d2814 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 17:20:12 +0800 Subject: [PATCH 07/13] Refine --- cpp/src/gandiva/cache_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index 00f9daba74a..ee953f4ecad 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -46,7 +46,9 @@ constexpr auto capacity_env_var = "GANDIVA_CACHE_SIZE"; constexpr auto default_capacity = 5000; } // namespace -TEST(TestCache, TestGetCacheCapacityDefault) { ASSERT_EQ(GetCapacity(), 5000); } +TEST(TestCache, TestGetCacheCapacityDefault) { + ASSERT_EQ(GetCapacity(), default_capacity); +} TEST(TestCache, TestGetCacheCapacityEnvVar) { // Empty. From 77a71a70f5704dd59923d0a2c38bcbd73ecee233 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 17:53:26 +0800 Subject: [PATCH 08/13] Fix uncleared env var in tests --- cpp/src/gandiva/cache_test.cc | 48 +++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index ee953f4ecad..2f5e5e7f399 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -18,6 +18,7 @@ #include "gandiva/cache.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/io_util.h" +#include "arrow/util/logging.h" #include @@ -51,39 +52,54 @@ TEST(TestCache, TestGetCacheCapacityDefault) { } TEST(TestCache, TestGetCacheCapacityEnvVar) { + // Uncleared env var may have side-effect to subsequent tests. Use a structure to help + // clearing the env var when leaving the scope. + struct ScopedEnvVar { + ScopedEnvVar(const char* name, const char* value) : name_(std::move(name)) { + ARROW_CHECK_OK(::arrow::internal::SetEnvVar(name_, value)); + } + ~ScopedEnvVar() { ARROW_CHECK_OK(::arrow::internal::DelEnvVar(name_)); } + + private: + const char* name_; + }; + // Empty. - ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + { + ScopedEnvVar env(capacity_env_var, ""); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + } // Non-number. - ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "invalid")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + { + ScopedEnvVar env(capacity_env_var, "invalid"); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + } // Valid positive number. - ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "42")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), 42); + { + ScopedEnvVar env(capacity_env_var, "42"); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), 42); + } // Int max. { auto str = std::to_string(std::numeric_limits::max()); - ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, str)); + ScopedEnvVar env(capacity_env_var, str.c_str()); ASSERT_EQ(internal::GetCapacityFromEnvVar(), std::numeric_limits::max()); } - // Over int max. + // Zero. { - auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); - ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, str)); + ScopedEnvVar env(capacity_env_var, "0"); ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); } - // Zero. - ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "0")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); - // Negative number. - ASSERT_OK(::arrow::internal::SetEnvVar(capacity_env_var, "-1")); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + { + ScopedEnvVar env(capacity_env_var, "-1"); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + } } } // namespace gandiva From 032169ed472d70c0989d2dc848a17ecc0f2a5d02 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 18:19:26 +0800 Subject: [PATCH 09/13] Use std::stoi to detect invalid cases --- cpp/src/gandiva/cache.cc | 11 +++++++++-- cpp/src/gandiva/cache_test.cc | 13 +++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index 1f8e2304012..37b41cde0fe 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -35,8 +35,15 @@ int GetCapacityFromEnvVar() { if (env_value.empty()) { return DEFAULT_CACHE_SIZE; } - int capacity = std::atoi(env_value.c_str()); - if (capacity <= 0) { + int capacity = 0; + size_t length = 0; + bool exception = false; + try { + capacity = std::stoi(env_value.c_str(), &length); + } catch (const std::exception&) { + exception = true; + } + if (length != env_value.length() || exception || capacity <= 0) { ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. " << "Using default cache size: " << DEFAULT_CACHE_SIZE; return DEFAULT_CACHE_SIZE; diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index 2f5e5e7f399..18d1d59ac78 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -76,6 +76,12 @@ TEST(TestCache, TestGetCacheCapacityEnvVar) { ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); } + // Number with invalid suffix. + { + ScopedEnvVar env(capacity_env_var, "42MB"); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + } + // Valid positive number. { ScopedEnvVar env(capacity_env_var, "42"); @@ -100,6 +106,13 @@ TEST(TestCache, TestGetCacheCapacityEnvVar) { ScopedEnvVar env(capacity_env_var, "-1"); ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); } + + // Over int max. + { + auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); + ScopedEnvVar env(capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + } } } // namespace gandiva From 869945eae31597160a9f0286210b66b9340ef9e7 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Mon, 22 Apr 2024 19:14:16 +0800 Subject: [PATCH 10/13] Refine doc --- docs/source/cpp/env_vars.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index ad7a815ef09..0a082b0a5d8 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -180,8 +180,10 @@ that changing their value later will have an effect. The number of entries to keep in the Gandiva JIT compilation cache. The cache is in-memory and does not persist across processes. - The value of this variable is expected to be a positive number less or equal - than int32 max, defaults to 5000 otherwise. + + The default cache size is 5000. The value of this environment variable + should be a positive integer and should not exceed the maximum value + of int32. Otherwise the default value is used. .. envvar:: HADOOP_HOME From d3aa5e9f528347e649e7d2b08fc981253e4f122d Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Thu, 2 May 2024 23:56:08 +0800 Subject: [PATCH 11/13] Address comment --- cpp/src/gandiva/cache.cc | 9 ++++--- cpp/src/gandiva/cache.h | 13 ++++++--- cpp/src/gandiva/cache_test.cc | 50 ++++++++++++++--------------------- 3 files changed, 35 insertions(+), 37 deletions(-) diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index 37b41cde0fe..f8e46cf076f 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -26,7 +26,7 @@ namespace gandiva { static const int DEFAULT_CACHE_SIZE = 5000; namespace internal { -int GetCapacityFromEnvVar() { +int GetCacheCapacityFromEnvVar() { auto maybe_env_value = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); if (!maybe_env_value.ok()) { return DEFAULT_CACHE_SIZE; @@ -52,8 +52,11 @@ int GetCapacityFromEnvVar() { } } // namespace internal -int GetCapacity() { - static const int capacity = internal::GetCapacityFromEnvVar(); +// Deprecated in 17.0.0. Use GetCacheCapacity instead. +int GetCapacity() { return GetCacheCapacity(); } + +int GetCacheCapacity() { + static const int capacity = internal::GetCacheCapacityFromEnvVar(); return capacity; } diff --git a/cpp/src/gandiva/cache.h b/cpp/src/gandiva/cache.h index d40e6417ea0..c19dbb7a0e3 100644 --- a/cpp/src/gandiva/cache.h +++ b/cpp/src/gandiva/cache.h @@ -20,22 +20,27 @@ #include #include +#include "arrow/util/macros.h" #include "gandiva/lru_cache.h" #include "gandiva/visibility.h" namespace gandiva { namespace internal { -// Only called once by GetCapacity(). -// Do the actual work of getting the capacity from env var. +// Only called once by GetCacheCapacity(). +// Do the actual work of getting the cache capacity from env var. // Also makes the testing easier. GANDIVA_EXPORT -int GetCapacityFromEnvVar(); +int GetCacheCapacityFromEnvVar(); } // namespace internal +ARROW_DEPRECATED("Deprecated in 17.0.0. Use GetCacheCapacity instead.") GANDIVA_EXPORT int GetCapacity(); +GANDIVA_EXPORT +int GetCacheCapacity(); + GANDIVA_EXPORT void LogCacheSize(size_t capacity); @@ -44,7 +49,7 @@ class Cache { public: explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); } - Cache() : Cache(GetCapacity()) {} + Cache() : Cache(GetCacheCapacity()) {} ValueType GetObjectCode(const KeyType& cache_key) { std::optional result; diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index 18d1d59ac78..96cf4a12e58 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -43,75 +43,65 @@ TEST(TestCache, TestGetPut) { } namespace { -constexpr auto capacity_env_var = "GANDIVA_CACHE_SIZE"; -constexpr auto default_capacity = 5000; +constexpr auto cache_capacity_env_var = "GANDIVA_CACHE_SIZE"; +constexpr auto default_cache_capacity = 5000; } // namespace TEST(TestCache, TestGetCacheCapacityDefault) { - ASSERT_EQ(GetCapacity(), default_capacity); + ASSERT_EQ(GetCacheCapacity(), default_cache_capacity); } TEST(TestCache, TestGetCacheCapacityEnvVar) { - // Uncleared env var may have side-effect to subsequent tests. Use a structure to help - // clearing the env var when leaving the scope. - struct ScopedEnvVar { - ScopedEnvVar(const char* name, const char* value) : name_(std::move(name)) { - ARROW_CHECK_OK(::arrow::internal::SetEnvVar(name_, value)); - } - ~ScopedEnvVar() { ARROW_CHECK_OK(::arrow::internal::DelEnvVar(name_)); } - - private: - const char* name_; - }; + using ::arrow::EnvVarGuard; // Empty. { - ScopedEnvVar env(capacity_env_var, ""); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + EnvVarGuard guard(cache_capacity_env_var, ""); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); } // Non-number. { - ScopedEnvVar env(capacity_env_var, "invalid"); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + EnvVarGuard guard(cache_capacity_env_var, "invalid"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); } // Number with invalid suffix. { - ScopedEnvVar env(capacity_env_var, "42MB"); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + EnvVarGuard guard(cache_capacity_env_var, "42MB"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); } // Valid positive number. { - ScopedEnvVar env(capacity_env_var, "42"); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), 42); + EnvVarGuard guard(cache_capacity_env_var, "42"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), 42); } // Int max. { auto str = std::to_string(std::numeric_limits::max()); - ScopedEnvVar env(capacity_env_var, str.c_str()); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), std::numeric_limits::max()); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), std::numeric_limits::max()); } // Zero. { - ScopedEnvVar env(capacity_env_var, "0"); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + EnvVarGuard guard(cache_capacity_env_var, "0"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); } // Negative number. { - ScopedEnvVar env(capacity_env_var, "-1"); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + EnvVarGuard guard(cache_capacity_env_var, "-1"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); } // Over int max. { auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); - ScopedEnvVar env(capacity_env_var, str.c_str()); - ASSERT_EQ(internal::GetCapacityFromEnvVar(), default_capacity); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); } } From 0b491bcfe44a340ddb582f843edae89d63aa62f7 Mon Sep 17 00:00:00 2001 From: Ruoxi Sun Date: Fri, 3 May 2024 00:04:43 +0800 Subject: [PATCH 12/13] Some more refinement --- cpp/src/gandiva/cache.cc | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index f8e46cf076f..f49346beabf 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -20,33 +20,30 @@ #include "arrow/result.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace gandiva { -static const int DEFAULT_CACHE_SIZE = 5000; +constexpr auto kCacheCapacityEnvVar = "GANDIVA_CACHE_SIZE"; +constexpr auto kDefaultCacheSize = 5000; namespace internal { int GetCacheCapacityFromEnvVar() { - auto maybe_env_value = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); + auto maybe_env_value = ::arrow::internal::GetEnvVar(kCacheCapacityEnvVar); if (!maybe_env_value.ok()) { - return DEFAULT_CACHE_SIZE; + return kDefaultCacheSize; } const auto env_value = *std::move(maybe_env_value); if (env_value.empty()) { - return DEFAULT_CACHE_SIZE; + return kDefaultCacheSize; } int capacity = 0; - size_t length = 0; - bool exception = false; - try { - capacity = std::stoi(env_value.c_str(), &length); - } catch (const std::exception&) { - exception = true; - } - if (length != env_value.length() || exception || capacity <= 0) { - ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. " - << "Using default cache size: " << DEFAULT_CACHE_SIZE; - return DEFAULT_CACHE_SIZE; + bool ok = ::arrow::internal::ParseValue<::arrow::Int32Type>( + env_value.c_str(), env_value.size(), &capacity); + if (!ok || capacity <= 0) { + ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar + << "Using default cache size: " << kDefaultCacheSize; + return kDefaultCacheSize; } return capacity; } From eaaa659e81dbb62b813c67f334419a2a69d6c6d6 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Fri, 3 May 2024 00:13:25 +0800 Subject: [PATCH 13/13] Update cpp/src/gandiva/cache.cc Co-authored-by: Antoine Pitrou --- cpp/src/gandiva/cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index f49346beabf..2358b08c824 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -42,7 +42,7 @@ int GetCacheCapacityFromEnvVar() { env_value.c_str(), env_value.size(), &capacity); if (!ok || capacity <= 0) { ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar - << "Using default cache size: " << kDefaultCacheSize; + << ". Using default cache size: " << kDefaultCacheSize; return kDefaultCacheSize; } return capacity;