From 38e78f854d4c4db6dcc07440056f34c71a61b781 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 4 Jun 2022 17:41:26 -0400 Subject: [PATCH 1/3] replace 1./sqrt with rsqrt Per NVIDIA doc: > 11.1.3. Reciprocal Square Root > The reciprocal square root should always be invoked explicitly as rsqrtf() for single precision and rsqrt() for double precision. The compiler optimizes 1.0f/sqrtf(x) into rsqrtf() only when this does not violate IEEE-754 semantics. See https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#reciprocal-square-root --- source/lib/src/cuda/prod_env_mat.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index 85941b9da0..eac94a7ffa 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -7,6 +7,8 @@ __device__ inline double _sqrt(double x) {return sqrt(x);} __device__ inline float _sqrt(float x) {return sqrtf(x);} +__device__ inline double _rsqrt(double x) {return rsqrt(x);} +__device__ inline float _rsqrt(float x) {return rsqrtf(x);} // common part of prod_env_mat template < @@ -408,7 +410,7 @@ __global__ void compute_env_mat_a( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = (FPTYPE)1./_sqrt(nr2); + FPTYPE inr = (FPTYPE)_rsqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; @@ -494,7 +496,7 @@ __global__ void compute_env_mat_r( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = (FPTYPE)1./_sqrt(nr2); + FPTYPE inr = (FPTYPE)_rsqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; From f23e456de88ec8d3f0f254d967ad0cc246aa6519 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 4 Jun 2022 17:42:59 -0400 Subject: [PATCH 2/3] remove redundant (FPTYPE) cast, as _rsqrt already returns FPTYPE --- source/lib/src/cuda/prod_env_mat.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index eac94a7ffa..93a2b6a787 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -410,7 +410,7 @@ __global__ void compute_env_mat_a( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = (FPTYPE)_rsqrt(nr2); + FPTYPE inr = _rsqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; @@ -496,7 +496,7 @@ __global__ void compute_env_mat_r( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = (FPTYPE)_rsqrt(nr2); + FPTYPE inr = _rsqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; From 486fb8c7e6e08f119185ebd918998ce1657abfb3 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 4 Jun 2022 17:56:46 -0400 Subject: [PATCH 3/3] apply the same opt for ROCM --- source/lib/src/rocm/prod_env_mat.hip.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu index 6a437bd3e0..506a844a04 100644 --- a/source/lib/src/rocm/prod_env_mat.hip.cu +++ b/source/lib/src/rocm/prod_env_mat.hip.cu @@ -5,6 +5,8 @@ __device__ inline double _sqrt(double x) {return sqrt(x);} __device__ inline float _sqrt(float x) {return sqrtf(x);} +__device__ inline double _rsqrt(double x) {return rsqrt(x);} +__device__ inline float _rsqrt(float x) {return rsqrtf(x);} // common part of prod_env_mat template < @@ -406,7 +408,7 @@ __global__ void compute_env_mat_a( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = (FPTYPE)1./_sqrt(nr2); + FPTYPE inr = _rsqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; @@ -492,7 +494,7 @@ __global__ void compute_env_mat_r( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = (FPTYPE)1./_sqrt(nr2); + FPTYPE inr = _rsqrt(nr2); 
FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2;