apache · pitrou · Jul 1, 2020 · Jul 1, 2020 · Jul 1, 2020 · Jul 1, 2020
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
@@ -75,6 +75,10 @@ Result<ValueDescr> ResolveOutputFromOptions(KernelContext* ctx,
 
 ARROW_EXPORT extern OutputType kOutputTargetType;
 
+// Add generic casts to out_ty from:
+// - the null type
+// - dictionary with out_ty as given value type
+// - extension types with a compatible storage type
 void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* func);
 
 }  // namespace internal

diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
@@ -467,6 +467,42 @@ struct CastFunctor<Decimal128Type, Decimal128Type> {
   }
 };
 
+// ----------------------------------------------------------------------
+// Real to decimal
+
+// Per-value conversion functor used by the floating-point -> Decimal128 cast.
+// Each value is converted with Decimal128::FromReal using the stored output
+// precision and scale.
+struct RealToDecimal {
+  template <typename OUT, typename RealType>
+  Decimal128 Call(KernelContext* ctx, RealType val) const {
+    auto result = Decimal128::FromReal(val, out_precision_, out_scale_);
+    if (ARROW_PREDICT_FALSE(!result.ok())) {
+      // The failure is only reported on the kernel context when truncation
+      // is not allowed; otherwise it is silently replaced by zero.
+      if (!allow_truncate_) {
+        ctx->SetStatus(result.status());
+      }
+      return Decimal128();  // Zero
+    } else {
+      return *std::move(result);
+    }
+  }
+
+  // NOTE: the declaration order is scale first, then precision; aggregate
+  // initialization of this struct must pass the values in that order.
+  int32_t out_scale_, out_precision_;
+  bool allow_truncate_;
+};
+
+// Cast kernel from a floating-point input type I to Decimal128.
+// The output precision and scale are read from the output array's type, and
+// the per-value conversion is delegated to RealToDecimal.
+template <typename I>
+struct CastFunctor<Decimal128Type, I, enable_if_t<is_floating_type<I>::value>> {
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const auto& options = checked_cast<const CastState*>(ctx->state())->options;
+    ArrayData* output = out->mutable_array();
+    const auto& out_type_inst = checked_cast<const Decimal128Type&>(*output->type);
+    const auto out_scale = out_type_inst.scale();
+    const auto out_precision = out_type_inst.precision();
+
+    // RealToDecimal's fields are {scale, precision, allow_truncate}, in that order.
+    applicator::ScalarUnaryNotNullStateful<Decimal128Type, I, RealToDecimal> kernel(
+        RealToDecimal{out_scale, out_precision, options.allow_decimal_truncate});
+    return kernel.Exec(ctx, batch, out);
+  }
+};
+
 namespace {
 
 template <typename OutType>
@@ -530,10 +566,16 @@ std::shared_ptr<CastFunction> GetCastToFloating(std::string name) {
 std::shared_ptr<CastFunction> GetCastToDecimal() {
   OutputType sig_out_ty(ResolveOutputFromOptions);
 
-  // Cast to decimal
   auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL);
   AddCommonCasts(Type::DECIMAL, sig_out_ty, func.get());
 
+  // Cast from floating point
+  DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
+                            CastFunctor<Decimal128Type, FloatType>::Exec));
+  DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty,
+                            CastFunctor<Decimal128Type, DoubleType>::Exec));
+
+  // Cast from other decimal
   auto exec = CastFunctor<Decimal128Type, Decimal128Type>::Exec;
   // We resolve the output type of this kernel from the CastOptions
   DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType::Array(Type::DECIMAL)}, sig_out_ty,

diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -101,20 +101,11 @@ class TestCast : public TestBase {
     }
   }
 
-  template <typename InType, typename I_TYPE = typename TestCType<InType>::type>
-  void CheckFails(const std::shared_ptr<DataType>& in_type,
-                  const std::vector<I_TYPE>& in_values, const std::vector<bool>& is_valid,
-                  const std::shared_ptr<DataType>& out_type, const CastOptions& options,
-                  bool check_scalar = true) {
-    std::shared_ptr<Array> input;
-    if (is_valid.size() > 0) {
-      ArrayFromVector<InType, I_TYPE>(in_type, is_valid, in_values, &input);
-    } else {
-      ArrayFromVector<InType, I_TYPE>(in_type, in_values, &input);
-    }
-    ASSERT_RAISES(Invalid, Cast(*input, out_type, options));
+  void CheckFails(const Array& input, const std::shared_ptr<DataType>& out_type,
+                  const CastOptions& options, bool check_scalar = true) {
+    ASSERT_RAISES(Invalid, Cast(input, out_type, options));
 
-    if (in_type->id() == Type::DECIMAL || out_type->id() == Type::DECIMAL) {
+    if (input.type_id() == Type::DECIMAL || out_type->id() == Type::DECIMAL) {
       // ARROW-9194
       check_scalar = false;
     }
@@ -124,14 +115,28 @@ class TestCast : public TestBase {
     // cases we will want to check more precisely
     if (check_scalar) {
       int64_t num_failing = 0;
-      for (int64_t i = 0; i < input->length(); ++i) {
-        auto maybe_out = Cast(*input->GetScalar(i), out_type, options);
+      for (int64_t i = 0; i < input.length(); ++i) {
+        auto maybe_out = Cast(*input.GetScalar(i), out_type, options);
         num_failing += static_cast<int>(maybe_out.status().IsInvalid());
       }
       ASSERT_GT(num_failing, 0);
     }
   }
 
+  // Convenience overload: builds the input array from `in_values` (passing
+  // `is_valid` along when it is non-empty) and delegates to the Array-based
+  // CheckFails overload.
+  template <typename InType, typename I_TYPE = typename TestCType<InType>::type>
+  void CheckFails(const std::shared_ptr<DataType>& in_type,
+                  const std::vector<I_TYPE>& in_values, const std::vector<bool>& is_valid,
+                  const std::shared_ptr<DataType>& out_type, const CastOptions& options,
+                  bool check_scalar = true) {
+    std::shared_ptr<Array> input;
+    // An empty `is_valid` selects the ArrayFromVector overload without it.
+    if (is_valid.size() > 0) {
+      ArrayFromVector<InType, I_TYPE>(in_type, is_valid, in_values, &input);
+    } else {
+      ArrayFromVector<InType, I_TYPE>(in_type, in_values, &input);
+    }
+    CheckFails(*input, out_type, options, check_scalar);
+  }
+
   template <typename InType, typename I_TYPE = typename TestCType<InType>::type>
   void CheckFails(const std::vector<I_TYPE>& in_values, const std::vector<bool>& is_valid,
                   const std::shared_ptr<DataType>& out_type, const CastOptions& options,
@@ -202,6 +207,14 @@ class TestCast : public TestBase {
     }
   }
 
+  // Builds the input array of type `in_type` from the JSON text `in_json`
+  // and runs CheckFails on it for a cast to `out_type`.
+  void CheckFailsJSON(const std::shared_ptr<DataType>& in_type,
+                      const std::shared_ptr<DataType>& out_type,
+                      const std::string& in_json, bool check_scalar = true,
+                      const CastOptions& options = CastOptions()) {
+    std::shared_ptr<Array> input = ArrayFromJSON(in_type, in_json);
+    CheckFails(*input, out_type, options, check_scalar);
+  }
+
   template <typename SourceType, typename DestType>
   void TestCastBinaryToString() {
     CastOptions options;
@@ -369,6 +382,23 @@ class TestCast : public TestBase {
 
     // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc
   }
+
+  // Shared checks for casting from `in_type` to decimal(5, 2): in-range
+  // values, overflow without truncation, and overflow with
+  // allow_decimal_truncate set.
+  void TestCastFloatingToDecimal(const std::shared_ptr<DataType>& in_type) {
+    auto out_type = decimal(5, 2);
+
+    // In-range values are rounded to two fractional digits
+    // (e.g. 123.456 -> "123.46", 999.994 -> "999.99").
+    CheckCaseJSON(in_type, out_type, "[0.0, null, 123.45, 123.456, 999.994]",
+                  R"(["0.00", null, "123.45", "123.46", "999.99"])");
+
+    // Overflow
+    CastOptions options{};
+    out_type = decimal(5, 2);
+    CheckFailsJSON(in_type, out_type, "[999.996]", /*check_scalar=*/true, options);
+
+    // With allow_decimal_truncate, the overflowing value (999.996) is expected
+    // to come out as "0.00" instead of making the cast fail.
+    options.allow_decimal_truncate = true;
+    CheckCaseJSON(in_type, out_type, "[0.0, null, 999.996, 123.45, 999.994]",
+                  R"(["0.00", null, "0.00", "123.45", "999.99"])", /*check_scalar=*/true,
+                  options);
+  }
 };
 
 TEST_F(TestCast, SameTypeZeroCopy) {
@@ -901,6 +931,34 @@ TEST_F(TestCast, DecimalToDecimal) {
   check_truncate(decimal(4, 2), v5, is_valid1, decimal(2, 1), e5);
 }
 
+// float32 -> decimal: the shared floating-point checks, plus values whose
+// unscaled decimal representation does not fit in 64 bits.
+TEST_F(TestCast, FloatToDecimal) {
+  auto in_type = float32();
+
+  TestCastFloatingToDecimal(in_type);
+
+  // 2**64 + 2**41 (exactly representable as a float)
+  auto out_type = decimal(20, 0);
+  CheckCaseJSON(in_type, out_type, "[1.8446746e+19, -1.8446746e+19]",
+                R"(["18446746272732807168", "-18446746272732807168"])");
+  out_type = decimal(20, 4);
+  CheckCaseJSON(in_type, out_type, "[1.8446746e+15, -1.8446746e+15]",
+                R"(["1844674627273280.7168", "-1844674627273280.7168"])");
+}
+
+// float64 -> decimal: the shared floating-point checks, plus values whose
+// unscaled decimal representation does not fit in 64 bits.
+TEST_F(TestCast, DoubleToDecimal) {
+  auto in_type = float64();
+
+  TestCastFloatingToDecimal(in_type);
+
+  // 2**64 + 2**12 (exactly representable as a double)
+  auto out_type = decimal(20, 0);
+  CheckCaseJSON(in_type, out_type, "[1.8446744073709556e+19, -1.8446744073709556e+19]",
+                R"(["18446744073709555712", "-18446744073709555712"])");
+  out_type = decimal(20, 4);
+  CheckCaseJSON(in_type, out_type, "[1.8446744073709556e+15, -1.8446744073709556e+15]",
+                R"(["1844674407370955.5712", "-1844674407370955.5712"])");
+}
+
 TEST_F(TestCast, TimestampToTimestamp) {
   CastOptions options;
 

diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
@@ -18,6 +18,7 @@
 #include <algorithm>
 #include <array>
 #include <climits>
+#include <cmath>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
@@ -48,6 +49,117 @@ static const Decimal128 kTenTo36(static_cast<int64_t>(0xC097CE7BC90715),
                                  0xB34B9F1000000000);
 static const Decimal128 kTenTo18(0xDE0B6B3A7640000);
 
+// std::numeric_limits<int64_t>::digits10 as a size_t; also the largest
+// exponent stored in kInt64PowersOfTen.
+static constexpr auto kInt64DecimalDigits =
+    static_cast<size_t>(std::numeric_limits<int64_t>::digits10);
+
+// kInt64PowersOfTen[i] == 10^i for i in [0, kInt64DecimalDigits].
+static constexpr int64_t kInt64PowersOfTen[kInt64DecimalDigits + 1] = {
+    // clang-format off
+    1LL,
+    10LL,
+    100LL,
+    1000LL,
+    10000LL,
+    100000LL,
+    1000000LL,
+    10000000LL,
+    100000000LL,
+    1000000000LL,
+    10000000000LL,
+    100000000000LL,
+    1000000000000LL,
+    10000000000000LL,
+    100000000000000LL,
+    1000000000000000LL,
+    10000000000000000LL,
+    100000000000000000LL,
+    1000000000000000000LL
+    // clang-format on
+};
+
+// Single-precision powers of ten from 1e-38 to 1e38:
+// kFloatPowersOfTen[e + 38] holds the float literal for 10^e.
+static constexpr float kFloatPowersOfTen[2 * 38 + 1] = {
+    1e-38f, 1e-37f, 1e-36f, 1e-35f, 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f,
+    1e-28f, 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f, 1e-20f, 1e-19f,
+    1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f, 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f,
+    1e-8f,  1e-7f,  1e-6f,  1e-5f,  1e-4f,  1e-3f,  1e-2f,  1e-1f,  1e0f,   1e1f,
+    1e2f,   1e3f,   1e4f,   1e5f,   1e6f,   1e7f,   1e8f,   1e9f,   1e10f,  1e11f,
+    1e12f,  1e13f,  1e14f,  1e15f,  1e16f,  1e17f,  1e18f,  1e19f,  1e20f,  1e21f,
+    1e22f,  1e23f,  1e24f,  1e25f,  1e26f,  1e27f,  1e28f,  1e29f,  1e30f,  1e31f,
+    1e32f,  1e33f,  1e34f,  1e35f,  1e36f,  1e37f,  1e38f};
+
+// Double-precision powers of ten from 1e-38 to 1e38:
+// kDoublePowersOfTen[e + 38] holds the double literal for 10^e.
+static constexpr double kDoublePowersOfTen[2 * 38 + 1] = {
+    1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, 1e-28,
+    1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17,
+    1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9,  1e-8,  1e-7,  1e-6,
+    1e-5,  1e-4,  1e-3,  1e-2,  1e-1,  1e0,   1e1,   1e2,   1e3,   1e4,   1e5,
+    1e6,   1e7,   1e8,   1e9,   1e10,  1e11,  1e12,  1e13,  1e14,  1e15,  1e16,
+    1e17,  1e18,  1e19,  1e20,  1e21,  1e22,  1e23,  1e24,  1e25,  1e26,  1e27,
+    1e28,  1e29,  1e30,  1e31,  1e32,  1e33,  1e34,  1e35,  1e36,  1e37,  1e38};
+
+namespace {
+
+// Shared implementation of the floating-point -> Decimal128 conversion.
+// `Derived` must provide a static powers_of_ten() returning a table in which
+// index (e + 38) holds 10^e for e in [-38, 38].
+template <typename Real, typename Derived>
+struct Decimal128FromReal {
+  // Convert `real` to a decimal with the given precision and scale.
+  // Within this struct it is only called with non-negative values.
+  // Returns Status::Invalid when the scaled and rounded value has a
+  // magnitude of at least 10^precision.
+  static Result<Decimal128> FromPositiveReal(Real real, int32_t precision,
+                                             int32_t scale) {
+    auto x = real;
+    // Multiply by 10^scale; the lookup table only covers exponents in
+    // [-38, 38], so fall back to std::pow outside of that range.
+    if (scale >= -38 && scale <= 38) {
+      x *= Derived::powers_of_ten()[scale + 38];
+    } else {
+      x *= std::pow(static_cast<Real>(10), static_cast<Real>(scale));
+    }
+    // Round to an integral value (std::nearbyint uses the current rounding mode)
+    x = std::nearbyint(x);
+    // 10^precision; FromReal DCHECKs that precision is in (0, 38], which
+    // keeps this index inside the table.
+    const auto max_abs = Derived::powers_of_ten()[precision + 38];
+    if (x <= -max_abs || x >= max_abs) {
+      return Status::Invalid("Cannot convert ", real,
+                             " to Decimal128(precision = ", precision,
+                             ", scale = ", scale, "): overflow");
+    }
+    // Extract high and low bits
+    // high = floor(x / 2**64), low = x - high * 2**64
+    const auto high = std::floor(std::ldexp(x, -64));
+    const auto low = x - std::ldexp(high, 64);
+
+    DCHECK_GE(high, -9.223372036854776e+18);  // -2**63
+    DCHECK_LT(high, 9.223372036854776e+18);   // 2**63
+    DCHECK_GE(low, 0);
+    DCHECK_LT(low, 1.8446744073709552e+19);  // 2**64
+    return Decimal128(static_cast<int64_t>(high), static_cast<uint64_t>(low));
+  }
+
+  // Convert `x` to a decimal with the given precision and scale.
+  // Returns Status::Invalid for non-finite input. Negative input is
+  // converted through its magnitude and the result is then negated.
+  static Result<Decimal128> FromReal(Real x, int32_t precision, int32_t scale) {
+    DCHECK_GT(precision, 0);
+    DCHECK_LE(precision, 38);
+
+    if (!std::isfinite(x)) {
+      return Status::Invalid("Cannot convert ", x, " to Decimal128");
+    }
+    if (x < 0) {
+      ARROW_ASSIGN_OR_RAISE(auto dec, FromPositiveReal(-x, precision, scale));
+      return dec.Negate();
+    } else {
+      // Includes negative zero
+      return FromPositiveReal(x, precision, scale);
+    }
+  }
+};
+
+// float instantiation of Decimal128FromReal, backed by kFloatPowersOfTen.
+struct Decimal128FromFloat : public Decimal128FromReal<float, Decimal128FromFloat> {
+  static constexpr const float* powers_of_ten() { return kFloatPowersOfTen; }
+};
+
+// double instantiation of Decimal128FromReal, backed by kDoublePowersOfTen.
+struct Decimal128FromDouble : public Decimal128FromReal<double, Decimal128FromDouble> {
+  static constexpr const double* powers_of_ten() { return kDoublePowersOfTen; }
+};
+
+}  // namespace
+
+// Single-precision overload; forwards to Decimal128FromFloat::FromReal.
+Result<Decimal128> Decimal128::FromReal(float x, int32_t precision, int32_t scale) {
+  return Decimal128FromFloat::FromReal(x, precision, scale);
+}
+
+// Double-precision overload; forwards to Decimal128FromDouble::FromReal.
+Result<Decimal128> Decimal128::FromReal(double x, int32_t precision, int32_t scale) {
+  return Decimal128FromDouble::FromReal(x, precision, scale);
+}
+
 std::string Decimal128::ToIntegerString() const {
   Decimal128 remainder;
   std::stringstream buf;
@@ -154,35 +266,13 @@ std::string Decimal128::ToString(int32_t scale) const {
   return "0." + std::string(static_cast<size_t>(scale - len), '0') + str;
 }
 
-static constexpr auto kInt64DecimalDigits =
-    static_cast<size_t>(std::numeric_limits<int64_t>::digits10);
-static constexpr int64_t kPowersOfTen[kInt64DecimalDigits + 1] = {1LL,
-                                                                  10LL,
-                                                                  100LL,
-                                                                  1000LL,
-                                                                  10000LL,
-                                                                  100000LL,
-                                                                  1000000LL,
-                                                                  10000000LL,
-                                                                  100000000LL,
-                                                                  1000000000LL,
-                                                                  10000000000LL,
-                                                                  100000000000LL,
-                                                                  1000000000000LL,
-                                                                  10000000000000LL,
-                                                                  100000000000000LL,
-                                                                  1000000000000000LL,
-                                                                  10000000000000000LL,
-                                                                  100000000000000000LL,
-                                                                  1000000000000000000LL};
-
 // Iterates over data and for each group of kInt64DecimalDigits multiple out by
 // the appropriate power of 10 necessary to add source parsed as uint64 and
 // then adds the parsed value of source.
 static inline void ShiftAndAdd(const char* data, size_t length, Decimal128* out) {
   for (size_t posn = 0; posn < length;) {
     const size_t group_size = std::min(kInt64DecimalDigits, length - posn);
-    const int64_t multiple = kPowersOfTen[group_size];
+    const int64_t multiple = kInt64PowersOfTen[group_size];
     int64_t chunk = 0;
     ARROW_CHECK(internal::ParseValue<Int64Type>(data + posn, group_size, &chunk));
 

diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h
@@ -32,6 +32,8 @@ namespace arrow {
 
 /// Represents a signed 128-bit integer in two's complement.
 /// Calculations wrap around and overflow is ignored.
+/// The max decimal precision that can be safely represented is
+/// 38 significant digits.
 ///
 /// For a discussion of the algorithms, look at Knuth's volume 2,
 /// Semi-numerical Algorithms section 4.3.1.
@@ -101,6 +103,9 @@ class ARROW_EXPORT Decimal128 : public BasicDecimal128 {
   static Result<Decimal128> FromString(const std::string& s);
   static Result<Decimal128> FromString(const char* s);
 
+  /// \brief Convert a floating-point value to a Decimal128 with the given
+  ///        precision and scale.
+  /// \return error status if the value is not finite or does not fit in
+  ///         the given precision
+  static Result<Decimal128> FromReal(double real, int32_t precision, int32_t scale);
+  static Result<Decimal128> FromReal(float real, int32_t precision, int32_t scale);
+
   /// \brief Convert from a big-endian byte representation. The length must be
   ///        between 1 and 16.
   /// \return error status if the length is an invalid value