deepmodeling · maorz1998 · Jun 11, 2023 · Jun 8, 2023 · Jun 8, 2023 · Jun 8, 2023
diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -1,16 +1,6 @@
 {
     volScalarField& he = thermo.he();
 
-#ifdef GPUSolver_
-        start1 = std::clock();
-        UEqn_GPU.updatePsi(&U[0][0]);
-        K = 0.5*magSqr(U);
-        end1 = std::clock();
-        time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-        time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-        time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
-#endif
-
 #ifdef CPUSolver_
     start1 = std::clock();
     fvScalarMatrix EEqn
@@ -32,7 +22,16 @@
     EEqn.solve();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#endif
+
+#ifdef GPUSolver_
+    start1 = std::clock();
+    UEqn_GPU.updatePsi(&U[0][0]);
+    K = 0.5*magSqr(U);
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 #ifdef GPUSolver_
@@ -41,33 +40,39 @@
     start2 = std::clock();
     const tmp<volScalarField> alphaEff_tmp(turbulence->alphaEff());
     const volScalarField& alphaEff = alphaEff_tmp();
+    end2 = std::clock();
     int eeqn_offset = 0;
+    int patchNum = 0;
     forAll(he.boundaryField(), patchi)
     {
-        const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
+        patchNum++;
         const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi];
         int patchSize = pw.size();
 
         const scalarField& patchK = K.boundaryField()[patchi];
-        const vectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi];
         const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi];
         memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double));
-        memcpy(boundary_hDiffCorrFlux + eeqn_offset * 3, &patchhDiffCorrFlux[0][0], 3 * patchSize*sizeof(double));
         memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double));
 
         eeqn_offset += patchSize;
     }
     end1 = std::clock();
-    time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    fprintf(stderr, "time_monitor_EEqn_mtxAssembly_CPU_prepare: %lf, build alphaEff time: %lf, patchNum: %d\n", // NOTE(review): written to stderr every time this block runs; looks like profiling output — confirm it should stay unconditional
+            time_monitor_EEqn_mtxAssembly_CPU_prepare, // running total accumulated with += just above, not the cost of this pass alone
+            double(end2 - start2) / double(CLOCKS_PER_SEC), patchNum); // end2 - start2 brackets only the alphaEff construction; patchNum is the count of boundary patches visited by the loop
 
     // prepare data on GPU
     start1 = std::clock();
     EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0],
-            &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0],
-            boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff);
-    if (doSync) EEqn_GPU.sync();
+            &dpdt[0], boundary_K, boundary_alphaEff);
+    EEqn_GPU.sync();
     end1 = std::clock();
-    time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly_GPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
     EEqn_GPU.initializeTimeStep();
@@ -78,30 +83,26 @@
     EEqn_GPU.fvc_div_phi_scalar();
     EEqn_GPU.fvc_div_vector();
     EEqn_GPU.add_to_source();
-    if (doSync) EEqn_GPU.sync();
-    end1 = std::clock();
-    time_monitor_EEqn_mtxAssembly_GPU_Run += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
     EEqn_GPU.sync();
-    end2 = std::clock();
-    time_monitor_EEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_mtxAssembly += double(end2 - start2) / double(CLOCKS_PER_SEC);
+    end1 = std::clock();
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     // check value of mtxAssembly, no time monitor
     // EEqn_GPU.checkValue(false);
 
     start1 = std::clock();
     EEqn_GPU.solve();
-    if (doSync) EEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
     EEqn_GPU.updatePsi(&he[0]);
     he.correctBoundaryConditions();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 }
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -1,9 +1,6 @@
 // Solve the Momentum equation
 #ifdef GPUSolver_
     start1 = std::clock();
-    UEqn_GPU.initializeTimeStep();
-    UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
-    start2 = std::clock();
     int offset = 0;
     const tmp<volScalarField> nuEff_tmp(turbulence->nuEff());
     const volScalarField& nuEff = nuEff_tmp();
@@ -27,21 +24,24 @@
         offset += patchSize;
     }
     end1 = std::clock();
-    end2 = std::clock();
-    time_monitor_UEqn_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
+    UEqn_GPU.initializeTimeStep();
+    UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
     UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
     UEqn_GPU.fvc_grad(&p[0]);
     UEqn_GPU.fvc_grad_vector();
     UEqn_GPU.dev2T();
     UEqn_GPU.fvc_div_tensor(&nuEff[0]);
     UEqn_GPU.fvm_laplacian();
+    UEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     // start2 = std::clock();
     // fvVectorMatrix turb_source
@@ -88,15 +88,15 @@
     }
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 // start1 = std::clock();
 // // // std::thread t(&dfMatrix::solve, &UEqn_GPU);
 // UEqn_GPU.solve();
 // end1 = std::clock();
 // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-// time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+// time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
 // start1 = std::clock();
 // // // t.join();

diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -1,4 +1,3 @@
-start = std::clock();
 hDiffCorrFlux = Zero;
 diffAlphaD = Zero;
 sumYDiffError = Zero;
@@ -14,6 +13,7 @@ tmp<fv::convectionScheme<scalar>> mvConvection
     )
 );
 
+#ifdef CPUSolver_
 start1 = std::clock();
 forAll(Y, i)
 {
@@ -22,78 +22,86 @@ forAll(Y, i)
 const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
 start1 = std::clock();
 time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#endif
 
 #ifdef GPUSolver_
     start1 = std::clock();
-    // // std::thread t(&dfMatrix::solve, &UEqn_GPU);
     UEqn_GPU.solve();
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-#endif
+    time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
-#ifdef GPUSolver_
     start1 = std::clock();
-    start2 = std::clock();
-    std::vector<double*> Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size());
-    std::vector<const double*> rhoD_GPU(Y.size());
+    std::vector<double*> Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size());
+    std::vector<const double*> hai(Y.size()), rhoD(Y.size());
     for (size_t i = 0; i < Y.size(); ++i)
     {
         volScalarField& Yi = Y[i];
-        const volScalarField& rhoDi = chemistry->rhoD(i);
         Y_old[i] = &Yi.oldTime()[0];
-        rhoD_GPU[i] = &chemistry->rhoD(i)[0];
-        cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double));
-        cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double));
+        cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double)); // NOTE(review): return status is ignored and no matching cudaFreeHost is visible in this diff — confirm the buffer is released elsewhere
+        const volScalarField& haii = chemistry->hai(i);
+        const volScalarField& rhoDi = chemistry->rhoD(i);
+        hai[i] = &haii[0]; // NOTE(review): raw pointer is used after this loop ends; assumes chemistry->hai(i) returns a reference to long-lived storage rather than a temporary — TODO confirm
+        rhoD[i] = &rhoDi[0]; // pointer to the first cell value of species i's rhoD field
+        cudaMallocHost(&boundary_hai[i], num_boundary_faces*sizeof(double)); // NOTE(review): same allocation/release question as boundary_Y[i]
+        cudaMallocHost(&boundary_rhoD[i], num_boundary_faces*sizeof(double)); // NOTE(review): same allocation/release question as boundary_Y[i]
         int offset = 0;
         forAll(Yi.boundaryField(), patchi)
         {
             const scalarField& patchYi = Yi.boundaryField()[patchi];
+            const scalarField& patchHaii = haii.boundaryField()[patchi];
             const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi];
             int patchSize = patchYi.size();
 
-            memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double));
-            memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double));
+            memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double));
+            memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double));
+            memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double));
             offset += patchSize;
         }
     }
-
     volScalarField mut_sct = turbulence->mut().ref()/Sct;
-    std::vector<double> boundary_mutsct;
+    double *boundary_mutsct = nullptr; // host staging buffer for the boundary values of mut_sct, filled patch by patch below
+    cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double)); // NOTE(review): return status is ignored and no matching cudaFreeHost is visible in this diff — confirm the buffer is released elsewhere
+    int offset = 0; // running write position (in doubles) into boundary_mutsct
     forAll(p.boundaryField(), patchi)
     {
         const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi];
         int patchSize = patchMut_sct.size();
-        boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize);
+        memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double));
+        offset += patchSize;
     }
-    end2 = std::clock();
-    time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC);
+    end1 = std::clock();
+    time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    fprintf(stderr, "time_monitor_YEqn_mtxAssembly_CPU_prepare: %lf\n", time_monitor_YEqn_mtxAssembly_CPU_prepare);
 
+    start1 = std::clock();
+    YEqn_GPU.initializeTimeStep();
     YEqn_GPU.upwindWeight();
-    YEqn_GPU.correctVelocity(Y_old, boundary_Y_init, rhoD_GPU);
+    YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y,
+            hai, boundary_hai, rhoD, boundary_rhoD, &mut_sct[0], boundary_mutsct, &thermo.alpha()[0]);
     YEqn_GPU.fvm_ddt();
     YEqn_GPU.fvm_div_phi();
     YEqn_GPU.fvm_div_phiUc();
-    YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init);
-
+    YEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
     YEqn_GPU.solve();
     end1 = std::clock();
     time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
 label flag_mpi_init;
 MPI_Initialized(&flag_mpi_init);
 if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
-end = std::clock();
-time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
 
 {
     if (!splitting)
@@ -108,52 +116,50 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
         time_monitor_chem += processingTime.count();
     }
 
+#ifdef GPUSolver_
+    start1 = std::clock(); // opens the interval charged to the two YEqn counters below
+    forAll(Y, i)
+    {
+        volScalarField& Yi = Y[i];
+        YEqn_GPU.updatePsi(&Yi[0], i); // NOTE(review): the code this replaces passed speciesIndex and skipped i == inertIndex; confirm updatePsi now expects the plain index i for every species
+        Yi.correctBoundaryConditions();
+    }
+    end1 = std::clock();
+    time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); // the same interval is added to the YEqn total...
+    time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); // ...and to the correctBC counter
+#else
+    start2 = std::clock();
     volScalarField Yt(0.0*Y[0]);
-
-    start = std::clock();
     int speciesIndex = 0;
     forAll(Y, i)
     {
         volScalarField& Yi = Y[i];
         hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError);
         diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi);
-
         if (i != inertIndex)
         {
-            #ifdef GPUSolver_
-                start1 = std::clock();
-                YEqn_GPU.updatePsi(&Yi[0], speciesIndex);
-                Yi.correctBoundaryConditions();
-                end1 = std::clock();
-                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
-            #else
-                start1 = std::clock();
-                tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
-                fvScalarMatrix YiEqn
+            start1 = std::clock();
+            tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
+            fvScalarMatrix YiEqn
                 (
-                    fvm::ddt(rho, Yi)
-                + mvConvection->fvmDiv(phi, Yi)
-                + mvConvection->fvmDiv(phiUc, Yi)
-                ==
-                    (
-                        splitting
-                    ?   fvm::laplacian(DEff(), Yi)
-                    :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
-                    )
+                 fvm::ddt(rho, Yi)
+                 + mvConvection->fvmDiv(phi, Yi)
+                 + mvConvection->fvmDiv(phiUc, Yi)
+                 ==
+                 (
+                  splitting
+                  ?   fvm::laplacian(DEff(), Yi)
+                  :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
+                 )
                 );
-                end1 = std::clock();
-                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                // YiEqn.relax();
+            end1 = std::clock();
+            time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+            // YiEqn.relax();
 
-                start1 = std::clock();
-                YiEqn.solve("Yi");
-                end1 = std::clock();
-                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-            #endif
+            start1 = std::clock();
+            YiEqn.solve("Yi");
+            end1 = std::clock();
+            time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
             Yi.max(0.0);
             Yt += Yi;
@@ -163,7 +169,7 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
 
     Y[inertIndex] = scalar(1) - Yt;
     Y[inertIndex].max(0.0);
-
-    end = std::clock();
-    time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC);
+    end2 = std::clock();
+    time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
+#endif
 }