diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index dfca5c4e8e..115abfa7c7 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -1136,6 +1136,97 @@
 template struct cal_force_npw_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
+// CUDA kernel for stress Ewald sincos calculation
+// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
+template <typename FPTYPE>
+__global__ void cal_stress_ewa_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    // Parallelize over G-vectors (ig) instead of atoms (iat)
+    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (ig >= npw) return;
+
+    // Skip G=0 term
+    if (ig == ig_gge0) return;
+
+    // Local accumulation variables for this G-vector
+    FPTYPE local_rhostar_real = 0.0;
+    FPTYPE local_rhostar_imag = 0.0;
+
+    // Load G-vector components to registers
+    const FPTYPE gcar_x = gcar[ig * 3 + 0];
+    const FPTYPE gcar_y = gcar[ig * 3 + 1];
+    const FPTYPE gcar_z = gcar[ig * 3 + 2];
+
+    // Loop over all atoms for this G-vector (matches CPU implementation)
+    for (int iat = 0; iat < nat; iat++) {
+        // Load atom information to registers
+        const FPTYPE tau_x = tau[iat * 3 + 0];
+        const FPTYPE tau_y = tau[iat * 3 + 1];
+        const FPTYPE tau_z = tau[iat * 3 + 2];
+        const FPTYPE zv = zv_facts[iat];
+
+        // Calculate phase: 2π * (G · τ) - same as CPU implementation
+        const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
+                                       gcar_y * tau_y +
+                                       gcar_z * tau_z);
+
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Accumulate structure factor locally (no race conditions)
+        local_rhostar_real += zv * cosp;
+        local_rhostar_imag += zv * sinp;
+    }
+
+    // Store results - each thread writes to unique memory location
+    rhostar_real[ig] = local_rhostar_real;
+    rhostar_imag[ig] = local_rhostar_imag;
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+
+    // Calculate grid configuration for G-vector parallelization
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
+
+    dim3 grid(num_blocks);
+    dim3 block(threads_per_block);
+
+    cal_stress_ewa_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, zv_facts,
+        rhostar_real, rhostar_imag
+    );
+
+    cudaCheckOnDebug();
+}
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;
+
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_GPU>;
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_GPU>;
 } // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index ef138c04cc..85e9744470 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -1117,4 +1117,96 @@
 template struct cal_force_npw_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
+
+// HIP kernel for stress Ewald sincos calculation
+// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
+template <typename FPTYPE>
+__global__ void cal_stress_ewa_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    // Parallelize over G-vectors (ig) instead of atoms (iat)
+    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (ig >= npw) return;
+
+    // Skip G=0 term
+    if (ig == ig_gge0) return;
+
+    // Local accumulation variables for this G-vector
+    FPTYPE local_rhostar_real = 0.0;
+    FPTYPE local_rhostar_imag = 0.0;
+
+    // Load G-vector components to registers
+    const FPTYPE gcar_x = gcar[ig * 3 + 0];
+    const FPTYPE gcar_y = gcar[ig * 3 + 1];
+    const FPTYPE gcar_z = gcar[ig * 3 + 2];
+
+    // Loop over all atoms for this G-vector (matches CPU implementation)
+    for (int iat = 0; iat < nat; iat++) {
+        // Load atom information to registers
+        const FPTYPE tau_x = tau[iat * 3 + 0];
+        const FPTYPE tau_y = tau[iat * 3 + 1];
+        const FPTYPE tau_z = tau[iat * 3 + 2];
+        const FPTYPE zv = zv_facts[iat];
+
+        // Calculate phase: 2π * (G · τ) - same as CPU implementation
+        const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
+                                       gcar_y * tau_y +
+                                       gcar_z * tau_z);
+
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Accumulate structure factor locally (no race conditions)
+        local_rhostar_real += zv * cosp;
+        local_rhostar_imag += zv * sinp;
+    }
+
+    // Store results - each thread writes to unique memory location
+    rhostar_real[ig] = local_rhostar_real;
+    rhostar_imag[ig] = local_rhostar_imag;
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    // Calculate grid configuration for G-vector parallelization
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
+
+    // Use 1D grid since we're parallelizing over G-vectors only
+    dim3 grid(num_blocks);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_ewa_sincos_kernel<FPTYPE>), grid, block, 0, 0,
+        nat, npw, ig_gge0, gcar, tau, zv_facts,
+        rhostar_real, rhostar_imag
+    );
+
+    hipCheckOnDebug();
+}
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;
+
 } // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index 0cd0e1ab96..8477973174 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -759,6 +759,64 @@
 template struct cal_force_npw_op<double, base_device::DEVICE_CPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_CPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_CPU>;
 
+// CPU implementation of Ewald stress sincos operator
+template <typename FPTYPE>
+struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+        // Note: Arrays are already initialized to zero in the calling function
+        // No need to initialize again here to avoid redundant operations
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int ig = 0; ig < npw; ig++) {
+            if (ig == ig_gge0) continue; // Skip G=0
+
+            FPTYPE local_rhostar_real = 0.0;
+            FPTYPE local_rhostar_imag = 0.0;
+
+            // Double loop: iat -> ig (as requested)
+            for (int iat = 0; iat < nat; iat++) {
+                const FPTYPE tau_x = tau[iat * 3 + 0];
+                const FPTYPE tau_y = tau[iat * 3 + 1];
+                const FPTYPE tau_z = tau[iat * 3 + 2];
+                const FPTYPE zv = zv_facts[iat];
+
+                // Calculate phase: 2π * (G · τ) - similar to cal_force_ewa phase
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                               gcar[ig * 3 + 1] * tau_y +
+                                               gcar[ig * 3 + 2] * tau_z);
+
+                // Calculate sincos
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+
+                // Accumulate structure factor
+                local_rhostar_real += zv * cosp;
+                local_rhostar_imag += zv * sinp;
+            }
+
+            // Store results
+            rhostar_real[ig] = local_rhostar_real;
+            rhostar_imag[ig] = local_rhostar_imag;
+        }
+    }
+};
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_CPU>;
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_CPU>;
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_CPU>;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index 7fecd96d75..1154c3c79f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -260,6 +260,35 @@ struct cal_multi_dot_op{
                       const std::complex<FPTYPE>* psi);
 };
 
+template <typename FPTYPE, typename Device>
+struct cal_stress_ewa_sincos_op
+{
+    /// @brief Calculate Ewald stress sincos computation
+    /// Only computes rhostar, other calculations remain in original function
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - total number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param zv_facts - precomputed zv factors for each atom [nat]
+    ///
+    /// Output Parameters
+    /// @param rhostar_real - real part of structure factor [npw]
+    /// @param rhostar_imag - imaginary part of structure factor [npw]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 template <typename FPTYPE>
@@ -434,6 +463,20 @@ struct cal_multi_dot_op{
                       const std::complex<FPTYPE>* psi);
 };
 
+template <typename FPTYPE>
+struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag);
+};
+
 /**
  * The operator is used to compute the auxiliary amount of stress /force
  * in parallel on the GPU. They identify type with the type provided and
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp
index 5dbe2cd6b4..20cb98226e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp
@@ -4,6 +4,7 @@
 #include "module_base/tool_threading.h"
 #include "module_base/libm/libm.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "kernels/stress_op.h"
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -56,6 +57,96 @@ void Stress_Func<FPTYPE, Device>::stress_ewa(const UnitCell& ucell,
     if (PARAM.globalv.gamma_only_pw && is_pw) fact=2.0;
     // else fact=1.0;
 
+    // Prepare data for the sincos op
+    std::vector<FPTYPE> zv_facts_host(ucell.nat);
+    std::vector<FPTYPE> tau_flat(ucell.nat * 3);
+
+    for (int iat = 0; iat < ucell.nat; iat++) {
+        int it = ucell.iat2it[iat];
+        int ia = ucell.iat2ia[iat];
+
+        zv_facts_host[iat] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
+
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+
+    // Allocate result arrays
+    std::vector<FPTYPE> rhostar_real_host(rho_basis->npw);
+    std::vector<FPTYPE> rhostar_imag_host(rho_basis->npw);
+
+    // Device pointers for GPU data transfer
+    FPTYPE* d_gcar = nullptr;
+    FPTYPE* d_tau = nullptr;
+    FPTYPE* d_zv_facts = nullptr;
+    FPTYPE* d_rhostar_real = nullptr;
+    FPTYPE* d_rhostar_imag = nullptr;
+
+    // GPU memory management and data transfer
+    if (this->device == base_device::GpuDevice) {
+        // Allocate GPU memory
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, ucell.nat * 3);
+        resmem_var_op()(this->ctx, d_zv_facts, ucell.nat);
+        resmem_var_op()(this->ctx, d_rhostar_real, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_rhostar_imag, rho_basis->npw);
+
+        // Data transfer H2D
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), ucell.nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_zv_facts, zv_facts_host.data(), ucell.nat);
+
+        // Initialize output arrays to zero on GPU
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_rhostar_real, 0.0, rho_basis->npw);
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_rhostar_imag, 0.0, rho_basis->npw);
+    } else {
+        // CPU case: use host pointers directly and initialize arrays to zero
+        d_gcar = gcar_flat.data();
+        d_tau = tau_flat.data();
+        d_zv_facts = zv_facts_host.data();
+        d_rhostar_real = rhostar_real_host.data();
+        d_rhostar_imag = rhostar_imag_host.data();
+
+        // Initialize output arrays to zero for CPU case
+        std::fill(rhostar_real_host.begin(), rhostar_real_host.end(), static_cast<FPTYPE>(0.0));
+        std::fill(rhostar_imag_host.begin(), rhostar_imag_host.end(), static_cast<FPTYPE>(0.0));
+    }
+
+    // Call sincos op (outside OpenMP parallel region, op has its own parallelization)
+    hamilt::cal_stress_ewa_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        ucell.nat,
+        rho_basis->npw,
+        rho_basis->ig_gge0,
+        d_gcar,
+        d_tau,
+        d_zv_facts,
+        d_rhostar_real,
+        d_rhostar_imag
+    );
+
+    // GPU data transfer D2H and memory cleanup
+    if (this->device == base_device::GpuDevice) {
+        // Transfer results back to host
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, rhostar_real_host.data(), d_rhostar_real, rho_basis->npw);
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, rhostar_imag_host.data(), d_rhostar_imag, rho_basis->npw);
+
+        // Free GPU memory
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_zv_facts);
+        delmem_var_op()(this->ctx, d_rhostar_real);
+        delmem_var_op()(this->ctx, d_rhostar_imag);
+    }
+
 #ifdef _OPENMP
 #pragma omp parallel
 {
@@ -76,27 +167,21 @@
         ig_end = ig + ig_end;
 
         FPTYPE g2,g2a;
-        FPTYPE arg;
-        std::complex<FPTYPE> rhostar;
         FPTYPE sewald;
         for(; ig < ig_end; ig++)
         {
             if(ig == ig0) continue;
             g2 = rho_basis->gg[ig]* ucell.tpiba2;
             g2a = g2 /4.0/alpha;
-            rhostar=std::complex<FPTYPE>(0.0,0.0);
-            for(int it=0; it < ucell.ntype; it++)
-            {
-                for(int i=0; i<ucell.atoms[it].na; i++)
-                {
-                    arg = (rho_basis->gcar[ig] * ucell.atoms[it].tau[i]) * (ModuleBase::TWO_PI);
-                    FPTYPE sinp, cosp;
-                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                    rhostar = rhostar + std::complex<FPTYPE>(ucell.atoms[it].ncpp.zv * cosp,ucell.atoms[it].ncpp.zv * sinp);
-                }
-            }
-            rhostar /= ucell.omega;
-            sewald = fact* (ModuleBase::TWO_PI) * ModuleBase::e2 * ModuleBase::libm::exp(-g2a) / g2 * pow(std::abs(rhostar),2);
+
+            // Use precomputed rhostar values
+            FPTYPE rhostar_real = rhostar_real_host[ig] / ucell.omega;
+            FPTYPE rhostar_imag = rhostar_imag_host[ig] / ucell.omega;
+
+            // Calculate |rhostar|² - mathematically equivalent to pow(std::abs(rhostar), 2)
+            FPTYPE rhostar_abs2 = rhostar_real * rhostar_real + rhostar_imag * rhostar_imag;
+
+            sewald = fact* (ModuleBase::TWO_PI) * ModuleBase::e2 * ModuleBase::libm::exp(-g2a) / g2 * rhostar_abs2;
             local_sdewald -= sewald;
             for(int l=0;l<3;l++)
             {