From dbec9a159a2a80e2012f89706591eb85f3b016f6 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 28 Sep 2021 18:31:10 -0700 Subject: [PATCH 01/19] [API] Add floor_divide --- python/mxnet/ndarray/numpy/_op.py | 40 ++- python/mxnet/numpy/multiarray.py | 62 ++++- .../numpy/np_elemwise_broadcast_op.cc | 9 + src/operator/mshadow_op.h | 90 +++++++ src/operator/numpy/np_floor_divide-inl.h | 240 ++++++++++++++++++ src/operator/numpy/np_floor_divide.cc | 137 ++++++++++ src/operator/numpy/np_floor_divide.cu | 40 +++ 7 files changed, 616 insertions(+), 2 deletions(-) create mode 100644 src/operator/numpy/np_floor_divide-inl.h create mode 100644 src/operator/numpy/np_floor_divide.cc create mode 100644 src/operator/numpy/np_floor_divide.cu diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index a7465865b707..53d425d8f961 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -50,7 +50,8 @@ 'true_divide', 'nonzero', 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'interp', 'diff', 'ediff1d', 'resize', 'polyval', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', 'atleast_1d', 'atleast_2d', 'atleast_3d', 'fill_diagonal', 'squeeze', - 'where', 'bincount', 'rollaxis', 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'diag', 'diagonal'] + 'where', 'bincount', 'rollaxis', 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'diag', 'diagonal', + 'floor_divide'] @set_module('mxnet.ndarray.numpy') @@ -1167,6 +1168,43 @@ def true_divide(x1, x2, out=None): return _api_internal.true_divide(x1, x2, out) +@set_module('mxnet.ndarray.numpy') +def floor_divide(x1, x2, out=None): + """Return the largest integer smaller or equal to the division of the inputs. + It is equivalent to the Python // operator and pairs with the Python % (remainder), + function so that a = a % b + b * (a // b) up to roundoff. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + x2 : ndarray or scalar + Divisor array. + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + + .. note:: + + This operator now supports automatic type promotion. The resulting type will be determined + according to the following rules: + + * If both inputs are of floating number types, the output is the more precise type. + * If only one of the inputs is floating number type, the result is that type. 
+ * If both inputs are of integer types (including boolean), the output is the more + precise type + """ + if isinstance(x1, numeric_types) and isinstance(x2, numeric_types): + return _np.floor_divide(x1, x2, out=out) + return _api_internal.floor_divide(x1, x2, out) + + @set_module('mxnet.ndarray.numpy') @wrap_np_binary_func def mod(x1, x2, out=None, **kwargs): diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index c2d9db95f471..82bf9c0e1d6b 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -80,7 +80,8 @@ 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'diff', 'ediff1d', 'resize', 'matmul', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', 'polyval', 'where', 'bincount', 'atleast_1d', 'atleast_2d', 'atleast_3d', 'fill_diagonal', 'squeeze', - 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'rollaxis', 'diag', 'diagonal'] + 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'rollaxis', 'diag', 'diagonal', + 'floor_divide'] __all__ += fallback.__all__ @@ -1093,6 +1094,23 @@ def __mul__(self, other): """x.__mul__(y) <=> x * y""" return multiply(self, other) + @wrap_mxnp_np_ufunc + def __floordiv__(self, other): + """x.__floordiv__(y) <=> x // y""" + return floor_divide(self, other) + + @wrap_mxnp_np_ufunc + def __ifloordiv__(self, other): + """x.__ifloordiv__(y) <=> x //= y""" + if not self.writable: + raise ValueError('trying to divide from a readonly ndarray') + return floor_divide(self, other, out=self) + + @wrap_mxnp_np_ufunc + def __rfloordiv__(self, other): + """x.__rfloordiv__(y) <=> y // x""" + return floor_divide(other, self) + def __neg__(self): return negative(self) @@ -3399,6 +3417,48 @@ def true_divide(x1, x2, out=None): return _mx_nd_np.true_divide(x1, x2, out=out) +@set_module('mxnet.numpy') +def floor_divide(x1, x2, out=None): + """Return the largest integer smaller or equal to the division of the inputs. + It is equivalent to the Python // operator and pairs with the Python % (remainder), + function so that a = a % b + b * (a // b) up to roundoff. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + x2 : ndarray or scalar + Divisor array. + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + + .. note:: + + This operator now supports automatic type promotion. The resulting type will be determined + according to the following rules: + + * If both inputs are of floating number types, the output is the more precise type. + * If only one of the inputs is floating number type, the result is that type. 
+ * If both inputs are of integer types (including boolean), the output is the more + precise type + + Examples + -------- + >>> np.floor_divide(7,3) + 2 + >>> np.floor_divide([1., 2., 3., 4.], 2.5) + array([ 0., 0., 1., 1.]) + """ + return _mx_nd_np.floor_divide(x1, x2, out=out) + + @set_module('mxnet.numpy') @wrap_np_binary_func def mod(x1, x2, out=None, **kwargs): diff --git a/src/api/operator/numpy/np_elemwise_broadcast_op.cc b/src/api/operator/numpy/np_elemwise_broadcast_op.cc index 184a4e241eff..2161a421cff5 100644 --- a/src/api/operator/numpy/np_elemwise_broadcast_op.cc +++ b/src/api/operator/numpy/np_elemwise_broadcast_op.cc @@ -61,6 +61,15 @@ MXNET_REGISTER_API("_npi.true_divide") UFuncHelper(args, ret, op, op_scalar, op_rscalar); }); +MXNET_REGISTER_API("_npi.floor_divide") + .set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_floor_divide"); + const nnvm::Op* op_scalar = Op::Get("_npi_floor_divide_scalar"); + const nnvm::Op* op_rscalar = Op::Get("_npi_rfloor_divide_scalar"); + UFuncHelper(args, ret, op, op_scalar, op_rscalar); + }); + MXNET_REGISTER_API("_npi.mod").set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { using namespace runtime; const nnvm::Op* op = Op::Get("_npi_mod"); diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index eee928452f50..091b3481494a 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -221,6 +221,96 @@ struct rtrue_divide : public mxnet_op::tunable { } }; +struct floor_divide : public mxnet_op::tunable { + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + DType c = static_cast(::floor(a / b)); + if ((c * b != a) && (a < DType(0)) != (b < DType(0))) { + return DType(c - 1); + } else { + return c; + } + } + + template ::value, int>::type = 0> + MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { + mshadow::half::half_t a_half = static_cast(a); + mshadow::half::half_t c = static_cast(::floor(a_half / b)); + if ((c * b != a_half) && (a < DType(0)) != (b < mshadow::half::half_t(0))) { + return mshadow::half::half_t(c - 1); + } else { + return c; + } + } + + template ::value, int>::type = 0> + MSHADOW_XINLINE static float Map(DType a, float b) { + float a_float = static_cast(a); + float c = ::floorf(a_float / b); + if ((c * b != a_float) && (a < DType(0)) != (b < float(0))) { + return float(c - 1); + } else { + return c; + } + } + + template ::value, int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { + double a_double = static_cast(a); + double c = ::floor(a_double / b); + if ((c * b != a_double) && (a < DType(0)) != (b < double(0))) { + return double(c - 1); + } else { + return c; + } + } +}; + +struct rfloor_divide : public mxnet_op::tunable { + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + DType c = static_cast(::floor(b / a)); + if ((c * a != b) && (a < DType(0)) != (b < DType(0))) { + return DType(c - 1); + } else { + return c; + } + } + + template ::value, int>::type = 0> + MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { + mshadow::half::half_t a_half = static_cast(a); + mshadow::half::half_t c = static_cast(::floor(b / a_half)); + if ((c * a_half != b) && (a < DType(0)) != (b < mshadow::half::half_t(0))) { + return mshadow::half::half_t(c - 1); + } else { + return c; + } + } + + template ::value, int>::type = 0> + MSHADOW_XINLINE static float Map(DType a, float b) { + float a_float = 
static_cast(a); + float c = ::floorf(b / a_float); + if ((c * a_float != b) && (a < DType(0)) != (b < float(0))) { + return float(c - 1); + } else { + return c; + } + } + + template ::value, int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { + double a_double = static_cast(a); + double c = ::floor(b / a_double); + if ((c * a_double != b) && (a < DType(0)) != (b < double(0))) { + return double(c - 1); + } else { + return c; + } + } +}; + MXNET_BINARY_MATH_OP_NC(left, a); MXNET_BINARY_MATH_OP_NC(right, b); diff --git a/src/operator/numpy/np_floor_divide-inl.h b/src/operator/numpy/np_floor_divide-inl.h new file mode 100644 index 000000000000..070f5bb862cd --- /dev/null +++ b/src/operator/numpy/np_floor_divide-inl.h @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_floor_divide-inl.h + * \brief Function definitions of floor_divide operator + */ + +#ifndef MXNET_OPERATOR_NUMPY_NP_FLOOR_DIVIDE_INL_H_ +#define MXNET_OPERATOR_NUMPY_NP_FLOOR_DIVIDE_INL_H_ + +#include +#include "../../common/utils.h" +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../numpy/np_elemwise_broadcast_op.h" + +namespace mxnet { +namespace op { + +template +void FloorDivideScalarCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + if (req[0] == kNullOp || outputs[0].Size() == 0U) + return; + using namespace mshadow; + using namespace mxnet_op; + using namespace mshadow::expr; + Stream* s = ctx.get_stream(); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; + const TBlob& data = inputs[0]; + const TBlob& out = outputs[0]; + if (common::is_int(out.type_flag_)) { + MXNET_INT_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + Kernel, xpu>::Launch( + s, data.Size(), out.dptr(), data.dptr(), static_cast(alpha)); + }); + }); + } else { + CHECK(out.type_flag_ == mshadow::kFloat32 || out.type_flag_ == mshadow::kFloat64) + << "floor_divide only supports float32 and float64" + " output when input's dtype is " + << type_string(inputs[0].type_flag_); + if (common::is_int(data.type_flag_)) { + MSHADOW_REAL_TYPE_SWITCH(out.type_flag_, ODType, { + MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + Kernel, xpu>::Launch( + s, data.Size(), out.dptr(), data.dptr(), static_cast(alpha)); + }); + }); + }); + } else { + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + Kernel, xpu>::Launch( + s, data.Size(), out.dptr(), data.dptr(), DType(alpha)); + }); + }); + } + } +} + 
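[editorial note on the kernels above] The subtract-one branch that appears in the floor_divide / rfloor_divide functors and in FloorDivideScalarCompute is what turns C++'s truncate-toward-zero integer division into Python-style floor division (for floating-point inputs ::floor already rounds down). A minimal Python model of that rule follows; the name floor_div_ref is illustrative only and is not part of the patch:

    def floor_div_ref(a, b):
        # Reference semantics targeted by the kernels: Python's // operator.
        c = int(a / b)                          # truncates toward zero, like integral '/' in C++
        if c * b != a and (a < 0) != (b < 0):
            c -= 1                              # inexact quotient with opposite signs: round down
        return c

    assert floor_div_ref(7, 3) == 7 // 3 == 2
    assert floor_div_ref(-7, 3) == -7 // 3 == -3
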
+template +void FloorDivideElemwiseCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet_op; + if (req[0] == kNullOp || outputs[0].Size() == 0U) + return; + Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + + const TBlob& lhs = inputs[0]; + const TBlob& rhs = inputs[1]; + const TBlob& out = outputs[0]; + if (lhs.type_flag_ == rhs.type_flag_) { + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); + }); + }); + } else { + // Case when types of the 2 input tensors are different + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // both lhs and rhs are float types, output type is the more precise one + LOG(FATAL) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // one is float type, the other is integer type, the output type should be the same as float + CHECK_EQ(out.type_flag_, common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_) + << "This case out type should be same as the float type"; + if (common::is_float(lhs.type_flag_)) { + // lhs is the float one + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), rhs.dptr(), lhs.dptr()); + }); + }); + }); + } else { + // rhs is the float one + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); + }); + }); + }); + } + } else { + // lhs is integer type, rhs is integer type, output type should be float + LOG(FATAL) << "not implemented yet..."; + } + } +} + +template +void FloorDivideBroadcastCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet_op; + if (outputs[0].shape_.Size() == 0U) + return; + CHECK_EQ(inputs.size(), 2U); + mxnet::TShape new_lshape, new_rshape, new_oshape; + int ndim = BinaryBroadcastShapeCompact( + inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); + if (!ndim) { + FloorDivideElemwiseCompute(attrs, ctx, inputs, req, outputs); + } else { + if (req[0] == kNullOp) + return; + mshadow::Stream* s = ctx.get_stream(); + const TBlob& lhs = inputs[0]; + const TBlob& rhs = inputs[1]; + const TBlob& out = outputs[0]; + BROADCAST_NDIM_SWITCH(ndim, NDim, { + mshadow::Shape oshape = new_oshape.get(); + mshadow::Shape lstride = calc_stride(new_lshape.get()); + mshadow::Shape rstride = calc_stride(new_rshape.get()); + if (lhs.type_flag_ == rhs.type_flag_) { + MSHADOW_TYPE_SWITCH(lhs.type_flag_, DType, { + Kernel, xpu>::template LaunchEx( + s, + new_oshape.Size(), + req[0], + lstride, + rstride, + oshape, + lhs.dptr(), + rhs.dptr(), + out.dptr()); + }); + } else { + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + // lhs and rhs have different float types, the output is the more precise one + LOG(FATAL) << "not implemented yet..."; + } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // one of lhs and rhs is float, the output is the same 
type as the float one + if (common::is_float(lhs.type_flag_)) { + // lhs is float type, output will be the same float type + CHECK_EQ(lhs.type_flag_, out.type_flag_) + << "lhs should have the same type as out, infer type broken?"; + MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { + MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, + xpu>::template LaunchEx(s, + new_oshape.Size(), + req[0], + rstride, + lstride, + oshape, + rhs.dptr(), + lhs.dptr(), + out.dptr()); + }); + }); + } else { + // rhs is float type, output will be the same float type + CHECK_EQ(rhs.type_flag_, out.type_flag_) + << "rhs should have the same type as out, infer type broken?"; + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { + MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { + Kernel, + xpu>::template LaunchEx(s, + new_oshape.Size(), + req[0], + lstride, + rstride, + oshape, + lhs.dptr(), + rhs.dptr(), + out.dptr()); + }); + }); + } + } else { + // lhs and rhs have different integer types, the output is float type + LOG(FATAL) << "not implemented yet..."; + } + } + }); + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_FLOOR_DIVIDE_INL_H_ diff --git a/src/operator/numpy/np_floor_divide.cc b/src/operator/numpy/np_floor_divide.cc new file mode 100644 index 000000000000..11489fb246dd --- /dev/null +++ b/src/operator/numpy/np_floor_divide.cc @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_floor_divide.cc + * \brief CPU Implementation of floor_divide operator. + */ + +#include "./np_floor_divide-inl.h" + +namespace mxnet { +namespace op { + +int FloorDivideOutType(int ltype, int rtype) { + if (common::is_float(ltype) && common::is_float(rtype)) { + // If both inputs are float, return the one with the higher precision + return common::get_more_precise_type(ltype, rtype); + } else if (common::is_float(ltype) || common::is_float(rtype)) { + // If only one of the inputs is float, return that float type + return (common::is_float(ltype)) ? 
ltype : rtype; + } + // If neither of the inputs is float, return the higher precision int dtype + return common::get_more_precise_type(ltype, rtype); +} + +bool FloorDivideType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_GT(in_attrs->size(), 0U); + CHECK_EQ(out_attrs->size(), 1U); + + for (const int dtype : *in_attrs) { + if (dtype == -1) + return false; + } + + const int lhs_dtype = in_attrs->at(0); + const int rhs_dtype = in_attrs->at(1); + TYPE_ASSIGN_CHECK(*out_attrs, 0, FloorDivideOutType(lhs_dtype, rhs_dtype)); + return true; +} + +bool FloorDivideScalarType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_GT(in_attrs->size(), 0U); + CHECK_EQ(out_attrs->size(), 1U); + + for (const int dtype : *in_attrs) { + if (dtype == -1) + return false; + } + + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + bool is_int = param.is_int; + const int lhs_dtype = in_attrs->at(0); + const int rhs_dtype = + common::is_float(lhs_dtype) + ? lhs_dtype + : (is_int ? mxnet::common::GetDefaultIntType() : mxnet::common::GetDefaultDtype()); + TYPE_ASSIGN_CHECK(*out_attrs, 0, FloorDivideOutType(lhs_dtype, rhs_dtype)); + return true; +} + +NNVM_REGISTER_OP(_npi_floor_divide) + .set_num_inputs(2) + .set_num_outputs(1) + .set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs"}; + }) + .set_attr("FInferShape", BinaryBroadcastShape) + .set_attr("FInferType", FloorDivideType) + .set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}, {1, 0}}; + }) + .set_attr("FCompute", FloorDivideBroadcastCompute) + .set_attr("FGradient", MakeZeroGradNodes) + .add_argument("lhs", "NDArray-or-Symbol", "Dividend array") + .add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); + + +NNVM_REGISTER_OP(_npi_floor_divide_scalar) + .set_num_inputs(1) + .set_num_outputs(1) + .set_attr_parser(ParamParser) + .set_attr("FInferShape", ElemwiseShape<1, 1>) + .set_attr("FInferType", FloorDivideScalarType) + .set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) + .set_attr("FCompute", FloorDivideScalarCompute) + .set_attr("FGradient", MakeZeroGradNodes) + .add_argument("data", "NDArray-or-Symbol", "source input") + .add_arguments(NumpyBinaryScalarParam::__FIELDS__()); + +NNVM_REGISTER_OP(_npi_rfloor_divide_scalar) + .set_num_inputs(1) + .set_num_outputs(1) + .set_attr_parser(ParamParser) + .set_attr("FInferShape", ElemwiseShape<1, 1>) + .set_attr("FInferType", FloorDivideType) + .set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +#ifdef _WIN32 + .set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +#endif + .set_attr("FCompute", FloorDivideScalarCompute) + .set_attr("FGradient", MakeZeroGradNodes) + .add_argument("data", "NDArray-or-Symbol", "source input") + .add_arguments(NumpyBinaryScalarParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_floor_divide.cu b/src/operator/numpy/np_floor_divide.cu new file mode 100644 index 000000000000..b6d01b706460 --- /dev/null +++ b/src/operator/numpy/np_floor_divide.cu @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_floor_divide.cu + * \brief GPU Implementation of floor_divide operator. + */ + +#include "./np_floor_divide-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_floor_divide) + .set_attr("FCompute", FloorDivideBroadcastCompute); + +NNVM_REGISTER_OP(_npi_floor_divide_scalar) + .set_attr("FCompute", FloorDivideScalarCompute); + +NNVM_REGISTER_OP(_npi_rfloor_divide_scalar) + .set_attr("FCompute", FloorDivideScalarCompute); + +} // namespace op +} // namespace mxnet From 6e1d0805d81062065fac7d5e72dfae18484b95c5 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 3 Oct 2021 21:55:31 -0700 Subject: [PATCH 02/19] fix lint --- src/operator/mshadow_op.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 418d0e331f2e..02c46b278043 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -224,7 +224,7 @@ struct floor_divide : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { DType c = static_cast(::floor(a / b)); - if ((c * b != a) && (a < DType(0)) != (b < DType(0))) { + if ((c * b != a) && (a < 0) != (b < 0)) { return DType(c - 1); } else { return c; @@ -235,7 +235,7 @@ struct floor_divide : public mxnet_op::tunable { MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { mshadow::half::half_t a_half = static_cast(a); mshadow::half::half_t c = static_cast(::floor(a_half / b)); - if ((c * b != a_half) && (a < DType(0)) != (b < mshadow::half::half_t(0))) { + if ((c * b != a_half) && (a < 0) != (b < 0)) { return mshadow::half::half_t(c - 1); } else { return c; @@ -246,7 +246,7 @@ struct floor_divide : public mxnet_op::tunable { MSHADOW_XINLINE static float Map(DType a, float b) { float a_float = static_cast(a); float c = ::floorf(a_float / b); - if ((c * b != a_float) && (a < DType(0)) != (b < float(0))) { + if ((c * b != a_float) && (a < 0) != (b < 0)) { return float(c - 1); } else { return c; @@ -257,7 +257,7 @@ struct floor_divide : public mxnet_op::tunable { MSHADOW_XINLINE static double Map(DType a, double b) { double a_double = static_cast(a); double c = ::floor(a_double / b); - if ((c * b != a_double) && (a < DType(0)) != (b < double(0))) { + if ((c * b != a_double) && (a < 0) != (b < 0)) { return double(c - 1); } else { return c; @@ -269,7 +269,7 @@ struct rfloor_divide : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { DType c = static_cast(::floor(b / a)); - if ((c * a != b) && (a < DType(0)) != (b < DType(0))) { + if ((c * a != b) && (a < 0) != (b < 0)) { return DType(c - 1); } else { return c; @@ -280,7 +280,7 @@ struct rfloor_divide : public mxnet_op::tunable { MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { mshadow::half::half_t a_half = static_cast(a); mshadow::half::half_t c = static_cast(::floor(b / a_half)); - if 
((c * a_half != b) && (a < DType(0)) != (b < mshadow::half::half_t(0))) { + if ((c * a_half != b) && (a < 0) != (b < 0)) { return mshadow::half::half_t(c - 1); } else { return c; @@ -291,7 +291,7 @@ struct rfloor_divide : public mxnet_op::tunable { MSHADOW_XINLINE static float Map(DType a, float b) { float a_float = static_cast(a); float c = ::floorf(b / a_float); - if ((c * a_float != b) && (a < DType(0)) != (b < float(0))) { + if ((c * a_float != b) && (a < 0) != (b < 0)) { return float(c - 1); } else { return c; @@ -302,7 +302,7 @@ struct rfloor_divide : public mxnet_op::tunable { MSHADOW_XINLINE static double Map(DType a, double b) { double a_double = static_cast(a); double c = ::floor(b / a_double); - if ((c * a_double != b) && (a < DType(0)) != (b < double(0))) { + if ((c * a_double != b) && (a < 0) != (b < 0)) { return double(c - 1); } else { return c; From db91b410761b7b416e817ef972ffb2f9a672fdcb Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 7 Oct 2021 15:42:47 -0700 Subject: [PATCH 03/19] fix sanity --- src/operator/mshadow_op.h | 8 ++++---- src/operator/numpy/np_floor_divide.cc | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 02c46b278043..31ad36670628 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -247,7 +247,7 @@ struct floor_divide : public mxnet_op::tunable { float a_float = static_cast(a); float c = ::floorf(a_float / b); if ((c * b != a_float) && (a < 0) != (b < 0)) { - return float(c - 1); + return c - 1.0f; } else { return c; } @@ -258,7 +258,7 @@ struct floor_divide : public mxnet_op::tunable { double a_double = static_cast(a); double c = ::floor(a_double / b); if ((c * b != a_double) && (a < 0) != (b < 0)) { - return double(c - 1); + return c - 1.0; } else { return c; } @@ -292,7 +292,7 @@ struct rfloor_divide : public mxnet_op::tunable { float a_float = static_cast(a); float c = ::floorf(b / a_float); if ((c * a_float != b) && (a < 0) != (b < 0)) { - return float(c - 1); + return c - 1.0f; } else { return c; } @@ -303,7 +303,7 @@ struct rfloor_divide : public mxnet_op::tunable { double a_double = static_cast(a); double c = ::floor(b / a_double); if ((c * a_double != b) && (a < 0) != (b < 0)) { - return double(c - 1); + return c - 1.0; } else { return c; } diff --git a/src/operator/numpy/np_floor_divide.cc b/src/operator/numpy/np_floor_divide.cc index 11489fb246dd..f89d580c6f9c 100644 --- a/src/operator/numpy/np_floor_divide.cc +++ b/src/operator/numpy/np_floor_divide.cc @@ -107,7 +107,8 @@ NNVM_REGISTER_OP(_npi_floor_divide_scalar) [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) - .set_attr("FCompute", FloorDivideScalarCompute) + .set_attr("FCompute", + FloorDivideScalarCompute) .set_attr("FGradient", MakeZeroGradNodes) .add_argument("data", "NDArray-or-Symbol", "source input") .add_arguments(NumpyBinaryScalarParam::__FIELDS__()); From d14c2e1a614bd649f22f7e7fab22c130f5afe48d Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 17 Oct 2021 18:57:53 -0700 Subject: [PATCH 04/19] update implementation --- python/mxnet/ndarray/numpy/_op.py | 1 + python/mxnet/numpy/multiarray.py | 1 + src/common/cuda/rtc/forward_functions-inl.h | 22 ++ src/operator/mshadow_op.h | 51 +++-- src/operator/numpy/np_floor_divide-inl.h | 240 -------------------- src/operator/numpy/np_floor_divide.cc | 114 +--------- src/operator/numpy/np_floor_divide.cu | 8 +- tests/python/unittest/test_numpy_op.py | 2 + 8 files changed, 69 insertions(+), 370 deletions(-) delete mode 
100644 src/operator/numpy/np_floor_divide-inl.h diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 53d425d8f961..f9c7b553f3fc 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -1169,6 +1169,7 @@ def true_divide(x1, x2, out=None): @set_module('mxnet.ndarray.numpy') +@wrap_np_binary_func def floor_divide(x1, x2, out=None): """Return the largest integer smaller or equal to the division of the inputs. It is equivalent to the Python // operator and pairs with the Python % (remainder), diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 82bf9c0e1d6b..3f7cc1c2efb0 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -3418,6 +3418,7 @@ def true_divide(x1, x2, out=None): @set_module('mxnet.numpy') +@wrap_np_binary_func def floor_divide(x1, x2, out=None): """Return the largest integer smaller or equal to the division of the inputs. It is equivalent to the Python // operator and pairs with the Python % (remainder), diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 7a886a0a9aec..cb93de8c49a6 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -259,6 +259,28 @@ rsub(const DType a, const DType2 b) { return b - a; } +template +__device__ inline mixed_type +floor_divide(const DType a, const DType2 b) { + mixed_type c = op::floor(op::div(a, b)); + if ((c * b != a) && (a < 0) != (b < 0)) { + return mixed_type(c - 1); + } else { + return c; + } +} + +template +__device__ inline mixed_type +rfloor_divide(const DType a, const DType2 b) { + mixed_type c = op::floor(op::div(b, a)); + if ((c * a != b) && (a < 0) != (b < 0)) { + return mixed_type(c - 1); + } else { + return c; + } +} + template __device__ inline mixed_type mul(const DType a, const DType2 b) { diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 31ad36670628..cd12f0166847 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -220,6 +220,8 @@ struct rtrue_divide : public mxnet_op::tunable { } }; +/***** floor_divide ******/ + struct floor_divide : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { @@ -230,7 +232,21 @@ struct floor_divide : public mxnet_op::tunable { return c; } } +}; + +struct rfloor_divide : public mxnet_op::tunable { + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + DType c = static_cast(::floor(b / a)); + if ((c * a != b) && (a < 0) != (b < 0)) { + return DType(c - 1); + } else { + return c; + } + } +}; +struct mixed_floor_divide { template ::value, int>::type = 0> MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { mshadow::half::half_t a_half = static_cast(a); @@ -242,7 +258,10 @@ struct floor_divide : public mxnet_op::tunable { } } - template ::value, int>::type = 0> + template ::value || + std::is_integral::value, + int>::type = 0> MSHADOW_XINLINE static float Map(DType a, float b) { float a_float = static_cast(a); float c = ::floorf(a_float / b); @@ -253,8 +272,11 @@ struct floor_divide : public mxnet_op::tunable { } } - template ::value, int>::type = 0> - MSHADOW_XINLINE static double Map(DType a, double b) { + template ::value || + std::is_same::value || + std::is_integral::value, + int>::type = 0> MSHADOW_XINLINE static double Map(DType a, double b) { double a_double = static_cast(a); double c = ::floor(a_double / b); if ((c * b != 
a_double) && (a < 0) != (b < 0)) { @@ -265,17 +287,7 @@ struct floor_divide : public mxnet_op::tunable { } }; -struct rfloor_divide : public mxnet_op::tunable { - template - MSHADOW_XINLINE static DType Map(DType a, DType b) { - DType c = static_cast(::floor(b / a)); - if ((c * a != b) && (a < 0) != (b < 0)) { - return DType(c - 1); - } else { - return c; - } - } - +struct mixed_rfloor_divide { template ::value, int>::type = 0> MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { mshadow::half::half_t a_half = static_cast(a); @@ -287,7 +299,10 @@ struct rfloor_divide : public mxnet_op::tunable { } } - template ::value, int>::type = 0> + template ::value || + std::is_integral::value, + int>::type = 0> MSHADOW_XINLINE static float Map(DType a, float b) { float a_float = static_cast(a); float c = ::floorf(b / a_float); @@ -298,7 +313,11 @@ struct rfloor_divide : public mxnet_op::tunable { } } - template ::value, int>::type = 0> + template ::value || + std::is_same::value || + std::is_integral::value, + int>::type = 0> MSHADOW_XINLINE static double Map(DType a, double b) { double a_double = static_cast(a); double c = ::floor(b / a_double); diff --git a/src/operator/numpy/np_floor_divide-inl.h b/src/operator/numpy/np_floor_divide-inl.h deleted file mode 100644 index 070f5bb862cd..000000000000 --- a/src/operator/numpy/np_floor_divide-inl.h +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file np_floor_divide-inl.h - * \brief Function definitions of floor_divide operator - */ - -#ifndef MXNET_OPERATOR_NUMPY_NP_FLOOR_DIVIDE_INL_H_ -#define MXNET_OPERATOR_NUMPY_NP_FLOOR_DIVIDE_INL_H_ - -#include -#include "../../common/utils.h" -#include "../tensor/elemwise_binary_broadcast_op.h" -#include "../numpy/np_elemwise_broadcast_op.h" - -namespace mxnet { -namespace op { - -template -void FloorDivideScalarCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - if (req[0] == kNullOp || outputs[0].Size() == 0U) - return; - using namespace mshadow; - using namespace mxnet_op; - using namespace mshadow::expr; - Stream* s = ctx.get_stream(); - const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); - const double alpha = param.scalar; - const TBlob& data = inputs[0]; - const TBlob& out = outputs[0]; - if (common::is_int(out.type_flag_)) { - MXNET_INT_TYPE_SWITCH(outputs[0].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - Kernel, xpu>::Launch( - s, data.Size(), out.dptr(), data.dptr(), static_cast(alpha)); - }); - }); - } else { - CHECK(out.type_flag_ == mshadow::kFloat32 || out.type_flag_ == mshadow::kFloat64) - << "floor_divide only supports float32 and float64" - " output when input's dtype is " - << type_string(inputs[0].type_flag_); - if (common::is_int(data.type_flag_)) { - MSHADOW_REAL_TYPE_SWITCH(out.type_flag_, ODType, { - MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - Kernel, xpu>::Launch( - s, data.Size(), out.dptr(), data.dptr(), static_cast(alpha)); - }); - }); - }); - } else { - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - Kernel, xpu>::Launch( - s, data.Size(), out.dptr(), data.dptr(), DType(alpha)); - }); - }); - } - } -} - -template -void FloorDivideElemwiseCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mxnet_op; - if (req[0] == kNullOp || outputs[0].Size() == 0U) - return; - Stream* s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - - const TBlob& lhs = inputs[0]; - const TBlob& rhs = inputs[1]; - const TBlob& out = outputs[0]; - if (lhs.type_flag_ == rhs.type_flag_) { - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { - Kernel, xpu>::Launch( - s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); - }); - }); - } else { - // Case when types of the 2 input tensors are different - if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { - // both lhs and rhs are float types, output type is the more precise one - LOG(FATAL) << "not implemented yet..."; - } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { - // one is float type, the other is integer type, the output type should be the same as float - CHECK_EQ(out.type_flag_, common::is_float(lhs.type_flag_) ? 
lhs.type_flag_ : rhs.type_flag_) - << "This case out type should be same as the float type"; - if (common::is_float(lhs.type_flag_)) { - // lhs is the float one - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { - MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { - Kernel, xpu>::Launch( - s, out.Size(), out.dptr(), rhs.dptr(), lhs.dptr()); - }); - }); - }); - } else { - // rhs is the float one - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { - MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { - Kernel, xpu>::Launch( - s, out.Size(), out.dptr(), lhs.dptr(), rhs.dptr()); - }); - }); - }); - } - } else { - // lhs is integer type, rhs is integer type, output type should be float - LOG(FATAL) << "not implemented yet..."; - } - } -} - -template -void FloorDivideBroadcastCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mxnet_op; - if (outputs[0].shape_.Size() == 0U) - return; - CHECK_EQ(inputs.size(), 2U); - mxnet::TShape new_lshape, new_rshape, new_oshape; - int ndim = BinaryBroadcastShapeCompact( - inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); - if (!ndim) { - FloorDivideElemwiseCompute(attrs, ctx, inputs, req, outputs); - } else { - if (req[0] == kNullOp) - return; - mshadow::Stream* s = ctx.get_stream(); - const TBlob& lhs = inputs[0]; - const TBlob& rhs = inputs[1]; - const TBlob& out = outputs[0]; - BROADCAST_NDIM_SWITCH(ndim, NDim, { - mshadow::Shape oshape = new_oshape.get(); - mshadow::Shape lstride = calc_stride(new_lshape.get()); - mshadow::Shape rstride = calc_stride(new_rshape.get()); - if (lhs.type_flag_ == rhs.type_flag_) { - MSHADOW_TYPE_SWITCH(lhs.type_flag_, DType, { - Kernel, xpu>::template LaunchEx( - s, - new_oshape.Size(), - req[0], - lstride, - rstride, - oshape, - lhs.dptr(), - rhs.dptr(), - out.dptr()); - }); - } else { - if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { - // lhs and rhs have different float types, the output is the more precise one - LOG(FATAL) << "not implemented yet..."; - } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { - // one of lhs and rhs is float, the output is the same type as the float one - if (common::is_float(lhs.type_flag_)) { - // lhs is float type, output will be the same float type - CHECK_EQ(lhs.type_flag_, out.type_flag_) - << "lhs should have the same type as out, infer type broken?"; - MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { - MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { - Kernel, - xpu>::template LaunchEx(s, - new_oshape.Size(), - req[0], - rstride, - lstride, - oshape, - rhs.dptr(), - lhs.dptr(), - out.dptr()); - }); - }); - } else { - // rhs is float type, output will be the same float type - CHECK_EQ(rhs.type_flag_, out.type_flag_) - << "rhs should have the same type as out, infer type broken?"; - MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { - MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { - Kernel, - xpu>::template LaunchEx(s, - new_oshape.Size(), - req[0], - lstride, - rstride, - oshape, - lhs.dptr(), - rhs.dptr(), - out.dptr()); - }); - }); - } - } else { - // lhs and rhs have different integer types, the output is float type - LOG(FATAL) << "not implemented yet..."; - } - } - }); - } -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_NUMPY_NP_FLOOR_DIVIDE_INL_H_ diff --git 
a/src/operator/numpy/np_floor_divide.cc b/src/operator/numpy/np_floor_divide.cc index f89d580c6f9c..df00f8483110 100644 --- a/src/operator/numpy/np_floor_divide.cc +++ b/src/operator/numpy/np_floor_divide.cc @@ -22,117 +22,17 @@ * \brief CPU Implementation of floor_divide operator. */ -#include "./np_floor_divide-inl.h" +#include "./np_elemwise_broadcast_op.h" namespace mxnet { namespace op { -int FloorDivideOutType(int ltype, int rtype) { - if (common::is_float(ltype) && common::is_float(rtype)) { - // If both inputs are float, return the one with the higher precision - return common::get_more_precise_type(ltype, rtype); - } else if (common::is_float(ltype) || common::is_float(rtype)) { - // If only one of the inputs is float, return that float type - return (common::is_float(ltype)) ? ltype : rtype; - } - // If neither of the inputs is float, return the higher precision int dtype - return common::get_more_precise_type(ltype, rtype); -} - -bool FloorDivideType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_GT(in_attrs->size(), 0U); - CHECK_EQ(out_attrs->size(), 1U); - - for (const int dtype : *in_attrs) { - if (dtype == -1) - return false; - } - - const int lhs_dtype = in_attrs->at(0); - const int rhs_dtype = in_attrs->at(1); - TYPE_ASSIGN_CHECK(*out_attrs, 0, FloorDivideOutType(lhs_dtype, rhs_dtype)); - return true; -} - -bool FloorDivideScalarType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_GT(in_attrs->size(), 0U); - CHECK_EQ(out_attrs->size(), 1U); - - for (const int dtype : *in_attrs) { - if (dtype == -1) - return false; - } - - const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); - bool is_int = param.is_int; - const int lhs_dtype = in_attrs->at(0); - const int rhs_dtype = - common::is_float(lhs_dtype) - ? lhs_dtype - : (is_int ? 
mxnet::common::GetDefaultIntType() : mxnet::common::GetDefaultDtype()); - TYPE_ASSIGN_CHECK(*out_attrs, 0, FloorDivideOutType(lhs_dtype, rhs_dtype)); - return true; -} - -NNVM_REGISTER_OP(_npi_floor_divide) - .set_num_inputs(2) - .set_num_outputs(1) - .set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"lhs", "rhs"}; - }) - .set_attr("FInferShape", BinaryBroadcastShape) - .set_attr("FInferType", FloorDivideType) - .set_attr("FInplaceOption", - [](const NodeAttrs& attrs) { - return std::vector >{{0, 0}, {1, 0}}; - }) - .set_attr("FCompute", FloorDivideBroadcastCompute) - .set_attr("FGradient", MakeZeroGradNodes) - .add_argument("lhs", "NDArray-or-Symbol", "Dividend array") - .add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); - - -NNVM_REGISTER_OP(_npi_floor_divide_scalar) - .set_num_inputs(1) - .set_num_outputs(1) - .set_attr_parser(ParamParser) - .set_attr("FInferShape", ElemwiseShape<1, 1>) - .set_attr("FInferType", FloorDivideScalarType) - .set_attr("FInplaceOption", - [](const NodeAttrs& attrs) { - return std::vector >{{0, 0}}; - }) - .set_attr("FCompute", - FloorDivideScalarCompute) - .set_attr("FGradient", MakeZeroGradNodes) - .add_argument("data", "NDArray-or-Symbol", "source input") - .add_arguments(NumpyBinaryScalarParam::__FIELDS__()); - -NNVM_REGISTER_OP(_npi_rfloor_divide_scalar) - .set_num_inputs(1) - .set_num_outputs(1) - .set_attr_parser(ParamParser) - .set_attr("FInferShape", ElemwiseShape<1, 1>) - .set_attr("FInferType", FloorDivideType) - .set_attr("FInplaceOption", - [](const NodeAttrs& attrs) { - return std::vector >{{0, 0}}; - }) -#ifdef _WIN32 - .set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -#endif - .set_attr("FCompute", FloorDivideScalarCompute) - .set_attr("FGradient", MakeZeroGradNodes) - .add_argument("data", "NDArray-or-Symbol", "source input") - .add_arguments(NumpyBinaryScalarParam::__FIELDS__()); +MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_floor_divide) +.set_attr( + "FCompute", + NumpyBinaryBroadcastComputeWithBool) +.set_attr("FGradient", MakeZeroGradNodes); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_floor_divide.cu b/src/operator/numpy/np_floor_divide.cu index b6d01b706460..0ce87a089f19 100644 --- a/src/operator/numpy/np_floor_divide.cu +++ b/src/operator/numpy/np_floor_divide.cu @@ -28,13 +28,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_floor_divide) - .set_attr("FCompute", FloorDivideBroadcastCompute); - -NNVM_REGISTER_OP(_npi_floor_divide_scalar) - .set_attr("FCompute", FloorDivideScalarCompute); - -NNVM_REGISTER_OP(_npi_rfloor_divide_scalar) - .set_attr("FCompute", FloorDivideScalarCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"floor_divide"}); } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 488f1a80285d..750035d3f1f9 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -3069,6 +3069,8 @@ def forward(self, a, b, *args, **kwargs): [lambda y, x1, x2: onp.broadcast_to(x1, y.shape)]), 'divide': (0.1, 1.0, [lambda y, x1, x2: onp.ones(y.shape) / x2], [lambda y, x1, x2: -x1 / (x2 * x2)]), + 'floor_divide': (0.1, 1.0, [lambda y, x1, x2: onp.zeros(y.shape)], + [lambda y, x1, x2: onp.zeros(y.shape)]), 'mod': (1.0, 10.0, [lambda y, x1, x2: onp.ones(y.shape), lambda y, x1, x2: onp.zeros(y.shape)], From a56a1d1ee455d1c534c00efc1b65ae7554f66c5b Mon Sep 17 
00:00:00 2001 From: barry-jin Date: Sun, 17 Oct 2021 22:12:29 -0700 Subject: [PATCH 05/19] fix lint --- src/operator/mshadow_op.h | 15 ++++++++------- src/operator/numpy/np_floor_divide.cc | 11 ++++++----- src/operator/numpy/np_floor_divide.cu | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 3397862435f8..32a846419989 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -261,7 +261,7 @@ struct mixed_floor_divide { template ::value, int>::type = 0> MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { mshadow::half::half_t a_half = static_cast(a); - mshadow::half::half_t c = static_cast(::floor(a_half / b)); + mshadow::half::half_t c = static_cast(::floor(a_half / b)); if ((c * b != a_half) && (a < 0) != (b < 0)) { return mshadow::half::half_t(c - 1); } else { @@ -275,7 +275,7 @@ struct mixed_floor_divide { int>::type = 0> MSHADOW_XINLINE static float Map(DType a, float b) { float a_float = static_cast(a); - float c = ::floorf(a_float / b); + float c = ::floorf(a_float / b); if ((c * b != a_float) && (a < 0) != (b < 0)) { return c - 1.0f; } else { @@ -287,9 +287,10 @@ struct mixed_floor_divide { typename std::enable_if::value || std::is_same::value || std::is_integral::value, - int>::type = 0> MSHADOW_XINLINE static double Map(DType a, double b) { + int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { double a_double = static_cast(a); - double c = ::floor(a_double / b); + double c = ::floor(a_double / b); if ((c * b != a_double) && (a < 0) != (b < 0)) { return c - 1.0; } else { @@ -302,7 +303,7 @@ struct mixed_rfloor_divide { template ::value, int>::type = 0> MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { mshadow::half::half_t a_half = static_cast(a); - mshadow::half::half_t c = static_cast(::floor(b / a_half)); + mshadow::half::half_t c = static_cast(::floor(b / a_half)); if ((c * a_half != b) && (a < 0) != (b < 0)) { return mshadow::half::half_t(c - 1); } else { @@ -316,7 +317,7 @@ struct mixed_rfloor_divide { int>::type = 0> MSHADOW_XINLINE static float Map(DType a, float b) { float a_float = static_cast(a); - float c = ::floorf(b / a_float); + float c = ::floorf(b / a_float); if ((c * a_float != b) && (a < 0) != (b < 0)) { return c - 1.0f; } else { @@ -331,7 +332,7 @@ struct mixed_rfloor_divide { int>::type = 0> MSHADOW_XINLINE static double Map(DType a, double b) { double a_double = static_cast(a); - double c = ::floor(b / a_double); + double c = ::floor(b / a_double); if ((c * a_double != b) && (a < 0) != (b < 0)) { return c - 1.0; } else { diff --git a/src/operator/numpy/np_floor_divide.cc b/src/operator/numpy/np_floor_divide.cc index df00f8483110..78f6cf58ec7a 100644 --- a/src/operator/numpy/np_floor_divide.cc +++ b/src/operator/numpy/np_floor_divide.cc @@ -28,11 +28,12 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_floor_divide) -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool) -.set_attr("FGradient", MakeZeroGradNodes); + .set_attr("FCompute", + NumpyBinaryBroadcastComputeWithBool) + .set_attr("FGradient", MakeZeroGradNodes); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_floor_divide.cu b/src/operator/numpy/np_floor_divide.cu index 0ce87a089f19..81596d8d21f2 100644 --- a/src/operator/numpy/np_floor_divide.cu +++ b/src/operator/numpy/np_floor_divide.cu @@ -28,7 +28,7 @@ namespace mxnet { namespace op { 
NNVM_REGISTER_OP(_npi_floor_divide) -.set_attr("FCompute", BinaryBroadcastRTCCompute{"floor_divide"}); + .set_attr("FCompute", BinaryBroadcastRTCCompute{"floor_divide"}); } // namespace op } // namespace mxnet From 61546b2dea27c175e3f3da5b4d055e9677b694b3 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 18 Oct 2021 09:47:41 -0700 Subject: [PATCH 06/19] update operator_tune.cc --- src/operator/operator_tune.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index f83885da1ba1..ddb5eebea63a 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -362,17 +362,21 @@ IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::plus); IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::minus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::mul); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::div); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::floor_divide); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::true_divide); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minus_sign); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rminus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rdiv); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rfloor_divide); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::plus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::minus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::mul); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::div); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::floor_divide); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::minus_sign); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rminus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rdiv); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rfloor_divide); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rtrue_divide); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::div_grad); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::div_grad); // NOLINT() From 09e92d61ea0276644548e4b96f656fc57fc6cd29 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 19 Oct 2021 16:00:31 -0700 Subject: [PATCH 07/19] fix --- src/operator/mshadow_op.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 32a846419989..e23fe728ba8c 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -237,7 +237,7 @@ struct floor_divide : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { DType c = static_cast(::floor(a / b)); - if ((c * b != a) && (a < 0) != (b < 0)) { + if ((c * b != a) && ((a < 0) != (b < 0))) { return DType(c - 1); } else { return c; @@ -249,7 +249,7 @@ struct rfloor_divide : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { DType c = static_cast(::floor(b / a)); - if ((c * a != b) && (a < 0) != (b < 0)) { + if ((c * a != b) && ((a < 0) != (b < 0))) { return DType(c - 1); } else { return c; @@ -262,7 +262,7 @@ struct mixed_floor_divide { MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { mshadow::half::half_t a_half = 
static_cast(a); mshadow::half::half_t c = static_cast(::floor(a_half / b)); - if ((c * b != a_half) && (a < 0) != (b < 0)) { + if ((c * b != a_half) && ((a < 0) != (b < 0))) { return mshadow::half::half_t(c - 1); } else { return c; @@ -276,7 +276,7 @@ struct mixed_floor_divide { MSHADOW_XINLINE static float Map(DType a, float b) { float a_float = static_cast(a); float c = ::floorf(a_float / b); - if ((c * b != a_float) && (a < 0) != (b < 0)) { + if ((c * b != a_float) && ((a < 0) != (b < 0))) { return c - 1.0f; } else { return c; @@ -291,7 +291,7 @@ struct mixed_floor_divide { MSHADOW_XINLINE static double Map(DType a, double b) { double a_double = static_cast(a); double c = ::floor(a_double / b); - if ((c * b != a_double) && (a < 0) != (b < 0)) { + if ((c * b != a_double) && ((a < 0) != (b < 0))) { return c - 1.0; } else { return c; @@ -304,7 +304,7 @@ struct mixed_rfloor_divide { MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { mshadow::half::half_t a_half = static_cast(a); mshadow::half::half_t c = static_cast(::floor(b / a_half)); - if ((c * a_half != b) && (a < 0) != (b < 0)) { + if ((c * a_half != b) && ((a < 0) != (b < 0))) { return mshadow::half::half_t(c - 1); } else { return c; @@ -318,7 +318,7 @@ struct mixed_rfloor_divide { MSHADOW_XINLINE static float Map(DType a, float b) { float a_float = static_cast(a); float c = ::floorf(b / a_float); - if ((c * a_float != b) && (a < 0) != (b < 0)) { + if ((c * a_float != b) && ((a < 0) != (b < 0))) { return c - 1.0f; } else { return c; @@ -333,7 +333,7 @@ struct mixed_rfloor_divide { MSHADOW_XINLINE static double Map(DType a, double b) { double a_double = static_cast(a); double c = ::floor(b / a_double); - if ((c * a_double != b) && (a < 0) != (b < 0)) { + if ((c * a_double != b) && ((a < 0) != (b < 0))) { return c - 1.0; } else { return c; From ff6100b2e3dc557b435f034c9b4c7858855f1525 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 20 Oct 2021 15:28:05 -0700 Subject: [PATCH 08/19] fix lint --- src/operator/numpy/np_elemwise_broadcast_op_scalar.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc index 366a8efcf123..4fd1f2c84070 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc @@ -62,12 +62,13 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rpower_scalar) .set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_floor_divide_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", MakeZeroGradNodes); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rfloor_divide_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", MakeZeroGradNodes); + .set_attr("FCompute", + BinaryScalarOp::Compute) + .set_attr("FGradient", MakeZeroGradNodes); } // namespace op } // namespace mxnet From 0d56b5abb36d4c68f81b60b0570f2a6fd3c501dd Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 20 Oct 2021 16:25:22 -0700 Subject: [PATCH 09/19] fix build --- src/operator/mshadow_op.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index a6df1466879c..2e83d73ecb3b 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -272,7 

From ff6100b2e3dc557b435f034c9b4c7858855f1525 Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Wed, 20 Oct 2021 15:28:05 -0700
Subject: [PATCH 08/19] fix lint

---
 src/operator/numpy/np_elemwise_broadcast_op_scalar.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc
index 366a8efcf123..4fd1f2c84070 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc
+++ b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc
@@ -62,12 +62,13 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rpower_scalar)
 .set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"});
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_floor_divide_scalar)
-.set_attr("FCompute", BinaryScalarOp::Compute)
-.set_attr("FGradient", MakeZeroGradNodes);
+    .set_attr("FCompute", BinaryScalarOp::Compute)
+    .set_attr("FGradient", MakeZeroGradNodes);
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rfloor_divide_scalar)
-.set_attr("FCompute", BinaryScalarOp::Compute)
-.set_attr("FGradient", MakeZeroGradNodes);
+    .set_attr("FCompute",
+              BinaryScalarOp::Compute)
+    .set_attr("FGradient", MakeZeroGradNodes);
 
 } // namespace op
 } // namespace mxnet

From 0d56b5abb36d4c68f81b60b0570f2a6fd3c501dd Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Wed, 20 Oct 2021 16:25:22 -0700
Subject: [PATCH 09/19] fix build

---
 src/operator/mshadow_op.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index a6df1466879c..2e83d73ecb3b 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -272,7 +272,7 @@ struct mixed_floor_divide {
   MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
     mshadow::half::half_t a_half = static_cast(a);
     mshadow::half::half_t c = static_cast(::floor(a_half / b));
-    if ((c * b != a_half) && ((a < 0) != (b < 0))) {
+    if ((c * b != a_half) && ((a_half < 0) != (b < 0))) {
       return mshadow::half::half_t(c - 1);
     } else {
       return c;
@@ -286,7 +286,7 @@ struct mixed_floor_divide {
   MSHADOW_XINLINE static float Map(DType a, float b) {
     float a_float = static_cast(a);
     float c = ::floorf(a_float / b);
-    if ((c * b != a_float) && ((a < 0) != (b < 0))) {
+    if ((c * b != a_float) && ((a_float < 0) != (b < 0))) {
       return c - 1.0f;
     } else {
       return c;
@@ -301,7 +301,7 @@ struct mixed_floor_divide {
   MSHADOW_XINLINE static double Map(DType a, double b) {
     double a_double = static_cast(a);
     double c = ::floor(a_double / b);
-    if ((c * b != a_double) && ((a < 0) != (b < 0))) {
+    if ((c * b != a_double) && ((a_double < 0) != (b < 0))) {
       return c - 1.0;
     } else {
       return c;
@@ -314,7 +314,7 @@ struct mixed_rfloor_divide {
   MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
     mshadow::half::half_t a_half = static_cast(a);
     mshadow::half::half_t c = static_cast(::floor(b / a_half));
-    if ((c * a_half != b) && ((a < 0) != (b < 0))) {
+    if ((c * a_half != b) && ((a_half < 0) != (b < 0))) {
       return mshadow::half::half_t(c - 1);
     } else {
       return c;
@@ -328,7 +328,7 @@ struct mixed_rfloor_divide {
   MSHADOW_XINLINE static float Map(DType a, float b) {
     float a_float = static_cast(a);
     float c = ::floorf(b / a_float);
-    if ((c * a_float != b) && ((a < 0) != (b < 0))) {
+    if ((c * a_float != b) && ((a_float < 0) != (b < 0))) {
       return c - 1.0f;
     } else {
       return c;
@@ -343,7 +343,7 @@ struct mixed_rfloor_divide {
   MSHADOW_XINLINE static double Map(DType a, double b) {
     double a_double = static_cast(a);
     double c = ::floor(b / a_double);
-    if ((c * a_double != b) && ((a < 0) != (b < 0))) {
+    if ((c * a_double != b) && ((a_double < 0) != (b < 0))) {
       return c - 1.0;
     } else {
       return c;
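The "fix build" change above swaps the raw `DType` operand `a` for its converted value (`a_half`, `a_float`, `a_double`) in the sign test. A plausible reason, stated here as an assumption since the commit message only says "fix build", is that `a < 0` is problematic when `DType` is unsigned or `bool`: the comparison is always false and typically draws a compiler warning, which fails `-Werror` builds. Comparing the converted value gives the same answer for signed types and is unobjectionable for the rest. A small sketch of the safe form:

```cpp
#include <iostream>

// Illustrative only. With an unsigned element type, comparing the raw value
// against 0 is always false (and usually warns, e.g. under -Wtype-limits,
// which breaks -Werror builds). Comparing the value after conversion to the
// output type behaves consistently for every element type.
template <typename DType>
bool sign_differs(DType a, float b) {
  float a_float = static_cast<float>(a);
  return (a_float < 0) != (b < 0);   // well-defined for signed, unsigned, bool
}

int main() {
  std::cout << sign_differs<int>(-3, 2.0f) << "\n";       // 1: signs differ
  std::cout << sign_differs<unsigned>(3u, -2.0f) << "\n"; // 1: signs differ
  std::cout << sign_differs<bool>(true, 2.0f) << "\n";    // 0: both non-negative
  return 0;
}
```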

From 0b77f03c6450682f64f30b7826f5de6ca4bb6861 Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Thu, 21 Oct 2021 10:29:39 -0700
Subject: [PATCH 10/19] fix include

---
 src/operator/numpy/np_floor_divide.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/numpy/np_floor_divide.cu b/src/operator/numpy/np_floor_divide.cu
index 81596d8d21f2..54fbd9d36642 100644
--- a/src/operator/numpy/np_floor_divide.cu
+++ b/src/operator/numpy/np_floor_divide.cu
@@ -22,7 +22,7 @@
  * \brief GPU Implementation of floor_divide operator.
  */
 
-#include "./np_floor_divide-inl.h"
+#include "./np_elemwise_broadcast_op.h"
 
 namespace mxnet {
 namespace op {

From 2ff4918c3b33d5e6336c4ba5e2046ed2e97a89be Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Thu, 21 Oct 2021 17:48:48 -0700
Subject: [PATCH 11/19] fix rtc functions

---
 src/common/cuda/rtc/forward_functions-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h
index cb93de8c49a6..8b29009520a8 100644
--- a/src/common/cuda/rtc/forward_functions-inl.h
+++ b/src/common/cuda/rtc/forward_functions-inl.h
@@ -262,7 +262,7 @@ rsub(const DType a, const DType2 b) {
 template
 __device__ inline mixed_type
 floor_divide(const DType a, const DType2 b) {
-  mixed_type c = op::floor(op::div(a, b));
+  mixed_type c = ::floor(a / b);
   if ((c * b != a) && (a < 0) != (b < 0)) {
     return mixed_type(c - 1);
   } else {
@@ -273,7 +273,7 @@ floor_divide(const DType a, const DType2 b) {
 template
 __device__ inline mixed_type
 rfloor_divide(const DType a, const DType2 b) {
-  mixed_type c = op::floor(op::div(b, a));
+  mixed_type c = ::floor(b / a);
   if ((c * a != b) && (a < 0) != (b < 0)) {
     return mixed_type(c - 1);
   } else {

From 062ae01a019dc924417e664a3bca6f68207babb3 Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Thu, 21 Oct 2021 20:28:01 -0700
Subject: [PATCH 12/19] add amp list

---
 python/mxnet/amp/lists/symbol_fp16.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/mxnet/amp/lists/symbol_fp16.py b/python/mxnet/amp/lists/symbol_fp16.py
index b561b335d9a7..ca3955b1d236 100644
--- a/python/mxnet/amp/lists/symbol_fp16.py
+++ b/python/mxnet/amp/lists/symbol_fp16.py
@@ -265,6 +265,9 @@
     '_npi_multinomial',
     '_npi_multiply',
     '_npi_multiply_scalar',
+    '_npi_floor_divide',
+    '_npi_floor_divide_scalar',
+    '_npi_rfloor_divide_scalar',
     '_npi_nan_to_num',
     '_npi_negative',
     '_npi_normal',

From 3b9a3f071a9929d50dc40221f1c3e573e0f2ab59 Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Fri, 22 Oct 2021 13:39:59 -0700
Subject: [PATCH 13/19] add floor_divide in GPU

---
 src/operator/numpy/np_elemwise_broadcast_op_scalar.cu | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
index 024d02a21d65..e21bac115ab4 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
+++ b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
@@ -51,5 +51,11 @@ NNVM_REGISTER_OP(_npi_power_scalar)
 NNVM_REGISTER_OP(_npi_rpower_scalar)
 .set_attr("FCompute", BinaryScalarRTCCompute{"rpow"});
 
+NNVM_REGISTER_OP(_npi_floor_divide_scalar)
+.set_attr("FCompute", BinaryScalarRTCCompute{"floor_divide"});
+
+NNVM_REGISTER_OP(_npi_rfloor_divide_scalar)
+.set_attr("FCompute", BinaryScalarRTCCompute{"rfloor_divide"});
+
 } // namespace op
 } // namespace mxnet
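Two scalar kernels are registered above because the scalar can sit on either side of `//`: `_npi_floor_divide_scalar` covers `tensor // scalar`, while `_npi_rfloor_divide_scalar` covers the reflected form `scalar // tensor`. A small host-side sketch of the two element-wise maps (ordinary C++ rather than the RTC kernel strings, and not the MXNet code itself):

```cpp
#include <cmath>
#include <cstdio>

// Illustrative host-side equivalents of the two scalar variants:
// floor_divide(x, s)  computes x // s  (scalar is the divisor)
// rfloor_divide(x, s) computes s // x  (scalar is the dividend, the reflected form)
double floor_divide(double x, double s)  { return std::floor(x / s); }
double rfloor_divide(double x, double s) { return std::floor(s / x); }

int main() {
  const double data[] = {1.0, -2.0, -3.0, 4.0, -5.0};
  for (double x : data) {
    std::printf("x = %4.1f   x // 3 = %5.1f   3 // x = %5.1f\n",
                x, floor_divide(x, 3.0), rfloor_divide(x, 3.0));
  }
  return 0;
}
```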

From 876c2fb7d73245242cbcb45f2269234eea12597c Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Fri, 22 Oct 2021 13:47:12 -0700
Subject: [PATCH 14/19] fix lint

---
 src/operator/numpy/np_elemwise_broadcast_op_scalar.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
index e21bac115ab4..c7bbeefb4445 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
+++ b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
@@ -52,10 +52,10 @@ NNVM_REGISTER_OP(_npi_rpower_scalar)
 .set_attr("FCompute", BinaryScalarRTCCompute{"rpow"});
 
 NNVM_REGISTER_OP(_npi_floor_divide_scalar)
-.set_attr("FCompute", BinaryScalarRTCCompute{"floor_divide"});
+    .set_attr("FCompute", BinaryScalarRTCCompute{"floor_divide"});
 
 NNVM_REGISTER_OP(_npi_rfloor_divide_scalar)
-.set_attr("FCompute", BinaryScalarRTCCompute{"rfloor_divide"});
+    .set_attr("FCompute", BinaryScalarRTCCompute{"rfloor_divide"});
 
 } // namespace op
 } // namespace mxnet

From 18549fa6c3081f730135bdaedad5c46301fda81a Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Mon, 25 Oct 2021 09:24:13 -0700
Subject: [PATCH 15/19] fix docstring

---
 python/mxnet/ndarray/numpy/_op.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index 477eb860a29b..22f43e0275eb 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -1200,6 +1200,7 @@ def floor_divide(x1, x2, out=None):
         * If only one of the inputs is floating number type, the result is that type.
         * If both inputs are of integer types (including boolean), the output is the more
           precise type
+
     """
     if isinstance(x1, numeric_types) and isinstance(x2, numeric_types):
         return _np.floor_divide(x1, x2, out=out)

From c1b564ffc7ed41c90c04b39730bc43554226a9bd Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Mon, 25 Oct 2021 12:56:22 -0700
Subject: [PATCH 16/19] Fix docstring

---
 python/mxnet/numpy/multiarray.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index c0ccacb11921..0b6a0c02c688 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -3454,6 +3454,7 @@ def true_divide(x1, x2, out=None):
 @wrap_np_binary_func
 def floor_divide(x1, x2, out=None):
     """Return the largest integer smaller or equal to the division of the inputs.
+
     It is equivalent to the Python // operator and pairs with the Python % (remainder),
     function so that a = a % b + b * (a // b) up to roundoff.
 
@@ -3481,7 +3482,7 @@ def floor_divide(x1, x2, out=None):
         * If both inputs are of floating number types, the output is the more precise type.
         * If only one of the inputs is floating number type, the result is that type.
         * If both inputs are of integer types (including boolean), the output is the more
-          precise type
+          precise type
 
     Examples
     --------

From d6c2f65ca7a8e38e1edcaa3ff137a40a81b947c1 Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Wed, 27 Oct 2021 14:37:13 -0700
Subject: [PATCH 17/19] fix lint

---
 src/operator/mshadow_op.h | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 3d331a3cc112..34f852ddaa02 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -234,9 +234,10 @@ struct rtrue_divide : public mxnet_op::tunable {
 /***** floor_divide ******/
 
 struct floor_divide : public mxnet_op::tunable {
-  template ::value &&
-            std::is_integral::value, int>::type = 0>
+  template <
+      typename DType,
+      typename std::enable_if::value && std::is_integral::value,
+      int>::type = 0>
   MSHADOW_XINLINE static DType Map(DType a, DType b) {
     DType c = static_cast(::floor(a / b));
     if ((c * a != b) && ((a < 0) != (b < 0))) {
@@ -250,9 +251,10 @@ struct floor_divide : public mxnet_op::tunable {
     return static_cast(::floor(a / b));
   }
 
-  template ::value &&
-            !std::is_same::value, int>::type = 0>
+  template <
+      typename DType,
+      typename std::enable_if::value && !std::is_same::value,
+      int>::type = 0>
   MSHADOW_XINLINE static DType Map(DType a, DType b) {
     return ::floor(a / b);
   }
@@ -263,9 +265,10 @@
 };
 
 struct rfloor_divide : public mxnet_op::tunable {
-  template ::value &&
-            std::is_integral::value, int>::type = 0>
+  template <
+      typename DType,
+      typename std::enable_if::value && std::is_integral::value,
+      int>::type = 0>
   MSHADOW_XINLINE static DType Map(DType a, DType b) {
     DType c = static_cast(::floor(b / a));
     if ((c * a != b) && ((a < 0) != (b < 0))) {
@@ -279,9 +282,10 @@
     return static_cast(::floor(b / a));
   }
 
-  template ::value &&
-            !std::is_same::value, int>::type = 0>
+  template <
+      typename DType,
+      typename std::enable_if::value && !std::is_same::value,
+      int>::type = 0>
   MSHADOW_XINLINE static DType Map(DType a, DType b) {
     return ::floor(b / a);
   }
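The `floor_divide` / `rfloor_divide` functors above dispatch between overloads with `std::enable_if`, so integer inputs get the exactness-and-sign correction while other types fall through to a plain `::floor`. The exact `enable_if` conditions are not fully legible in the hunks above, so the sketch below uses `std::is_integral` as a stand-in condition; it shows the dispatch shape, not the precise MXNet predicates:

```cpp
#include <cassert>
#include <cmath>
#include <type_traits>

// Illustrative enable_if dispatch in the same spirit as the functors above.
// The conditions here are assumptions (std::is_integral), not MXNet's exact ones.
template <typename DType,
          typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
DType floor_div(DType a, DType b) {
  DType c = a / b;                              // truncating integer division
  if ((c * b != a) && ((a < 0) != (b < 0))) {
    return c - 1;                               // correct toward negative infinity
  }
  return c;
}

template <typename DType,
          typename std::enable_if<!std::is_integral<DType>::value, int>::type = 0>
DType floor_div(DType a, DType b) {
  return std::floor(a / b);                     // floating point: floor is enough
}

int main() {
  assert(floor_div(-7, 2) == -4);               // integral overload
  assert(floor_div(-7.0, 2.0) == -4.0);         // floating-point overload
  return 0;
}
```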

From 8c94120f9b58af77708b8feeb73f3fa1a0fc40ff Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Wed, 27 Oct 2021 16:47:58 -0700
Subject: [PATCH 18/19] update rtc

---
 src/common/cuda/rtc/forward_functions-inl.h | 26 ++++++++++++++-------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h
index 76fa4dc0234a..cf82e2c70fce 100644
--- a/src/common/cuda/rtc/forward_functions-inl.h
+++ b/src/common/cuda/rtc/forward_functions-inl.h
@@ -262,22 +262,32 @@ rsub(const DType a, const DType2 b) {
 template
 __device__ inline mixed_type
 floor_divide(const DType a, const DType2 b) {
-  mixed_type c = ::floor(a / b);
-  if ((c * b != a) && (a < 0) != (b < 0)) {
-    return mixed_type(c - 1);
+  if (type_util::is_integral::value &&
+      type_util::is_integral::value) {
+    mixed_type c = ::floor(a / b);
+    if ((c * b != a) && (a < 0) != (b < 0)) {
+      return mixed_type(c - 1);
+    } else {
+      return c;
+    }
   } else {
-    return c;
+    return ::floor(a / b);
   }
 }
 
 template
 __device__ inline mixed_type
 rfloor_divide(const DType a, const DType2 b) {
-  mixed_type c = ::floor(b / a);
-  if ((c * a != b) && (a < 0) != (b < 0)) {
-    return mixed_type(c - 1);
+  if (type_util::is_integral::value &&
+      type_util::is_integral::value) {
+    mixed_type c = ::floor(b / a);
+    if ((c * a != b) && (a < 0) != (b < 0)) {
+      return mixed_type(c - 1);
+    } else {
+      return c;
+    }
   } else {
-    return c;
+    return ::floor(b / a);
   }
 }

From c93df5db5cffa3b5031db7a102e2f2290e4a4e85 Mon Sep 17 00:00:00 2001
From: barry-jin
Date: Thu, 28 Oct 2021 11:59:09 -0700
Subject: [PATCH 19/19] fix rtc

---
 src/common/cuda/rtc/forward_functions-inl.h    | 24 +++++--------------
 .../unittest/test_numpy_interoperability.py   |  1 +
 2 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h
index cf82e2c70fce..2b457092b3c8 100644
--- a/src/common/cuda/rtc/forward_functions-inl.h
+++ b/src/common/cuda/rtc/forward_functions-inl.h
@@ -262,32 +262,20 @@ rsub(const DType a, const DType2 b) {
 template
 __device__ inline mixed_type
 floor_divide(const DType a, const DType2 b) {
-  if (type_util::is_integral::value &&
-      type_util::is_integral::value) {
-    mixed_type c = ::floor(a / b);
-    if ((c * b != a) && (a < 0) != (b < 0)) {
-      return mixed_type(c - 1);
-    } else {
-      return c;
-    }
+  if (type_util::has_double_or_integral::value) {
+    return ::floor((double)a / (double)b);
   } else {
-    return ::floor(a / b);
+    return ::floorf((float)a / (float)b);
   }
 }
 
 template
 __device__ inline mixed_type
 rfloor_divide(const DType a, const DType2 b) {
-  if (type_util::is_integral::value &&
-      type_util::is_integral::value) {
-    mixed_type c = ::floor(b / a);
-    if ((c * a != b) && (a < 0) != (b < 0)) {
-      return mixed_type(c - 1);
-    } else {
-      return c;
-    }
+  if (type_util::has_double_or_integral::value) {
+    return ::floor((double)b / (double)a);
   } else {
-    return ::floor(b / a);
+    return ::floorf((float)b / (float)a);
   }
 }
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index 06b8871fc3d5..c8edad6b59f3 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -1594,6 +1594,7 @@ def _add_workload_floor_divide(array_pool):
     OpArgMngr.add_workload('floor_divide', np.array([-1, -2, -3], np.float32), 1.9999)
     OpArgMngr.add_workload('floor_divide', np.array([1000, -200, -3], np.int64), 3)
    OpArgMngr.add_workload('floor_divide', np.array([1, -2, -3, 4, -5], np.int32), 2.0001)
+    OpArgMngr.add_workload('floor_divide', np.array([1, -50, -0.2, 40000, 0], np.float64), -7)
 
 
 def _add_workload_remainder():
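The final revision drops the sign-correction branch from the RTC helpers entirely: once both operands are promoted to floating point, `floor(a / b)` already rounds toward negative infinity, so the kernel only has to choose between double and float precision (`type_util::has_double_or_integral` is the RTC-side helper used above for that choice). A standalone sketch of the idea, checked against the kind of values the new test workload feeds in; the expected results in the comment are hand-computed, not taken from the test suite:

```cpp
#include <cmath>
#include <cstdio>
#include <type_traits>

// Illustrative only: pick the wider precision for the division, then floor.
// The condition below approximates "has a double or an integral operand" with
// standard type traits; it is not MXNet's type_util implementation.
template <typename A, typename B>
double floor_divide(A a, B b) {
  const bool wide = std::is_integral<A>::value || std::is_integral<B>::value ||
                    std::is_same<A, double>::value || std::is_same<B, double>::value;
  if (wide) {
    return std::floor(static_cast<double>(a) / static_cast<double>(b));
  }
  return std::floor(static_cast<float>(a) / static_cast<float>(b));
}

int main() {
  const double x[] = {1, -50, -0.2, 40000, 0};
  for (double v : x) {
    // Hand-computed examples: 1 // -7 == -1, -50 // -7 == 7, 40000 // -7 == -5715
    std::printf("%g // -7 = %g\n", v, floor_divide(v, -7));
  }
  return 0;
}
```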