diff --git a/docs/advanced/acceleration/cuda.md b/docs/advanced/acceleration/cuda.md
index 70e448f4e0..64d4753275 100644
--- a/docs/advanced/acceleration/cuda.md
+++ b/docs/advanced/acceleration/cuda.md
@@ -37,7 +37,8 @@ The ABACUS program will automatically determine whether the current ELPA support
 
 In `INPUT` file we need to set the input parameter [device](../input_files/input-main.md#device) to `gpu`. If this parameter is not set, ABACUS will try to determine if there are available GPUs.
 - Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver`, `cusolvermp` and `elpa` is supported on GPU.
-- **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default launch two GPUs for computation. If you only have one card, this command will only start one GPU. 
+- **single-card**: ABACUS allows for single-GPU acceleration. You can run ABACUS directly with the command `abacus`, without launching it through `mpirun`; `ks_solver cusolver` is recommended for the LCAO basis. *Note: avoid using `mpirun -n 1 abacus`.*
+- **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default use two GPUs for computation. If you only have one card, this command will only use one GPU. *Note: the number of MPI processes SHOULD be equal to the number of GPU cards, unless you are using NVIDIA MPS (Multi-Process Service) on your machine.*
 
 ## Examples
 We provides [examples](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/gpu) of gpu calculations.
diff --git a/examples/36_gpu/si16_lcao/INPUT b/examples/36_gpu/si16_lcao/INPUT
index 1ecbd7e13a..2aea35eb9c 100644
--- a/examples/36_gpu/si16_lcao/INPUT
+++ b/examples/36_gpu/si16_lcao/INPUT
@@ -3,9 +3,8 @@ INPUT_PARAMETERS
 suffix			autotest
 calculation     scf
 device          gpu
-gamma_only      1  # GPU acceleration currently only support gamma_only set to 1. ### Abacus will generate/overwrite a KPT file when gamma_only is set to 1.
 ks_solver		cusolver  # if not set, the default ks_solver is cusolver,
-                          # you can also choose genelpa or scalapack_gvx.
+                          # you can also choose cusolvermp or elpa if ABACUS was compiled with support for them.
 
 #nbands			8
 symmetry		1
@@ -26,7 +25,7 @@ smearing_sigma		0.002
 
 #Parameters (5.Mixing)
 mixing_type		broyden
-mixing_beta		0.3
+mixing_beta		0.4
 
 
 ### [1] Energy cutoff determines the quality of numerical quadratures in your calculations.
diff --git a/examples/36_gpu/si16_lcao/KPT b/examples/36_gpu/si16_lcao/KPT
index da8500ebdb..5acdc7a0f7 100644
--- a/examples/36_gpu/si16_lcao/KPT
+++ b/examples/36_gpu/si16_lcao/KPT
@@ -1,5 +1,7 @@
 K_POINTS
 0
 Gamma
-1 1 1 0 0 0
-###This file will be overwritten by Abacus because either kspacing is used or gamma_only is set to 1
+5 5 5 0 0 0
+### If you are running an energy calculation, please make sure your final energy is
+### converged with respect to the k-point settings, unless you set a loose k-point
+### mesh on purpose.
diff --git a/examples/36_gpu/si16_pw/INPUT b/examples/36_gpu/si16_pw/INPUT
index 67e75edbca..c14489af3b 100644
--- a/examples/36_gpu/si16_pw/INPUT
+++ b/examples/36_gpu/si16_pw/INPUT
@@ -21,7 +21,7 @@ smearing_sigma		0.002
 
 #Parameters (5.Mixing)
 mixing_type		broyden
-mixing_beta		0.3
+mixing_beta		0.4
 
 
 ### [1] Energy cutoff determines the quality of numerical quadratures in your calculations.