diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H new file mode 100644 index 000000000..2520485a7 --- /dev/null +++ b/GPUTest/GPUTestBase.H @@ -0,0 +1,646 @@ + +enum initType{ + original, + randomInit +}; + +struct testGPUDataBase { + // some fvm ops don't use d_source; + // some fvm ops don't use d_internal_coeffs and d_boundary_coeffs; + // all the fvc ops only use d_source + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_source = nullptr; + double *d_internal_coeffs = nullptr; + double *d_boundary_coeffs = nullptr; + + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs = nullptr; + double *d_gradient_internal_coeffs = nullptr; + double *d_gradient_boundary_coeffs = nullptr; + + std::vector patch_type; + + // constructor + testGPUDataBase() {} + + // deconstructor + ~testGPUDataBase() { + if (d_lower) checkCudaErrors(cudaFree(d_lower)); + if (d_upper) checkCudaErrors(cudaFree(d_upper)); + if (d_diag) checkCudaErrors(cudaFree(d_diag)); + if (d_source) checkCudaErrors(cudaFree(d_source)); + if (d_internal_coeffs) checkCudaErrors(cudaFree(d_internal_coeffs)); + if (d_boundary_coeffs) checkCudaErrors(cudaFree(d_boundary_coeffs)); + + if (d_value_internal_coeffs) checkCudaErrors(cudaFree(d_value_internal_coeffs)); + if (d_value_boundary_coeffs) checkCudaErrors(cudaFree(d_value_boundary_coeffs)); + if (d_gradient_internal_coeffs) checkCudaErrors(cudaFree(d_gradient_internal_coeffs)); + if (d_gradient_boundary_coeffs) checkCudaErrors(cudaFree(d_gradient_boundary_coeffs)); + } +}; + +template +void getTypeInfo(size_t *stride, size_t *internal_size, size_t *boundary_size) { + size_t s = 1; + bool isVol = false; + if (typeid(T) == typeid(surfaceScalarField)) { + s = 1; + isVol = false; + } else if (typeid(T) == typeid(surfaceVectorField)) { + s = 3; + isVol = false; + } else if (typeid(T) == typeid(surfaceTensorField)) { + s = 9; + isVol = false; + } else if (typeid(T) == 
typeid(volScalarField)) { + s = 1; + isVol = true; + } else if (typeid(T) == typeid(volVectorField)) { + s = 3; + isVol = true; + } else if (typeid(T) == typeid(volTensorField)) { + s = 9; + isVol = true; + } else { + fprintf(stderr, "ERROR! Unsupported field type()!\n"); + exit(EXIT_FAILURE); + } + *stride = s; + *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * s; + *boundary_size = dfDataBase.num_boundary_surfaces * s; +} + + +template +void getFieldPtr(std::queue& fieldPtrQue, T& field){ + fieldPtrQue.push(&field[0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0]); + } +}; + +// template +// void getFieldPtr(std::vector& fieldPtrQue, T& field){ +// fieldPtrQue.push_back(&field[0]); +// forAll(field.boundaryField(), patchi){ +// auto& patchField = field.boundaryFieldRef()[patchi]; +// fieldPtrQue.push_back(&patchField[0]); +// Info << "patchi " << patchi << endl; +// } +// }; + + +template +void randomInitField(T& field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double) * stride; + std::queue fieldPtrQue; + // std::vector fieldPtrQue; + getFieldPtr(fieldPtrQue, field); + + // random init field value to (-0.5, 0.5) + // internal + double *&field_internal_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_internal_ptr = fieldPtrQue[0]; + std::vector init_field_internal; + init_field_internal.resize(internal_size * stride); + for (size_t i = 0; i < internal_size * stride; i++) { + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_internal_ptr, init_field_internal.data(), internal_value_bytes); + // boundary + int ptrIndex = 1; + forAll(field.boundaryField(), patchi) + { + auto& patchField = field.boundaryFieldRef()[patchi]; + size_t patchsize = patchField.size(); + 
double *&field_boundary_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_boundary_ptr = fieldPtrQue[ptrIndex]; + // ptrIndex ++; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize * stride); + for (size_t i = 0; i < patchsize * stride; i++) { + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * stride * sizeof(double)); + } +} + +template +void uploadRegisteredField(dfMatrixDataBase& dfDataBase, const T& field, const char* fieldAlias) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); + size_t boundary_value_bytes = boundary_size * sizeof(double); + + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + + // internal + memcpy(h_internal_field, &field[0], internal_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field + offset * stride, &patchField[0], patchsize * stride * sizeof(double)); + offset += patchsize; + } + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, internal_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, boundary_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +template +void uploadField(dfMatrixDataBase& dfDataBase, const T& field, double *d_field, 
double *d_boundary_field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); + size_t boundary_value_bytes = boundary_size * sizeof(double); + + std::vector h_boundary_field; + h_boundary_field.resize(boundary_size); + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field.data() + offset * stride, &patchField[0], patchsize * stride * sizeof(double)); + offset += patchsize; + } + checkCudaErrors(cudaMemcpyAsync(d_field, &field[0], internal_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), boundary_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +template +void buildTestGPUDataBase(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const T& field, + bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, + bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { + if ((typeid(T) != typeid(volScalarField)) && (typeid(T) != typeid(volVectorField))) { + fprintf(stderr, "ERROR! Unsupported field type()!\n"); + exit(EXIT_FAILURE); + } + bool isVec = (typeid(T) == typeid(volVectorField)); + size_t stride = isVec ? 
3 : 1; + + // ldu + if (lowerFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_lower, 0, dfDataBase.surface_value_bytes)); + } + if (upperFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_upper, 0, dfDataBase.surface_value_bytes)); + } + if (diagFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); + } + if (sourceFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_bytes * stride)); + } + if (internalCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + if (boundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + // boundary coeffs + if (valueInternalCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + if (valueBoundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + if (gradientInternalCoeffsFlag) { + 
checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + if (gradientBoundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + // patch type + testData.patch_type.resize(dfDataBase.num_patches); + forAll(field.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(testData.patch_type[patchi]), field.boundaryField()[patchi].type()); + } +} + +// TODO: It seems that compareResult of scalar and vector can't be merged +void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBase& testData, fvVectorMatrix& dfMatrix, bool printFlag) { + //if ((typeid(T) != typeid(fvScalarMatrix)) && (typeid(T) != typeid(fvVectorMatrix))) { + // fprintf(stderr, "ERROR! Unsupported field type()!\n"); + // exit(EXIT_FAILURE); + //} + //bool isVec = (typeid(T) == typeid(fvVectorMatrix)); + //size_t stride = isVec ? 
3 : 1; + + size_t stride = 3; + if (testData.d_lower) { + std::vector h_lower; + h_lower.resize(dfDataBase.num_surfaces); + checkCudaErrors(cudaMemcpy(h_lower.data(), testData.d_lower, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_surfaces, &dfMatrix.lower()[0], h_lower.data(), 1e-14, printFlag); + } + if (testData.d_upper) { + std::vector h_upper; + h_upper.resize(dfDataBase.num_surfaces); + checkCudaErrors(cudaMemcpy(h_upper.data(), testData.d_upper, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_surfaces, &dfMatrix.upper()[0], h_upper.data(), 1e-14, printFlag); + } + if (testData.d_diag) { + std::vector h_diag; + h_diag.resize(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_diag.data(), testData.d_diag, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &dfMatrix.diag()[0], h_diag.data(), 1e-14, printFlag); + } + if (testData.d_source) { + std::vector h_source; + h_source.resize(dfDataBase.num_cells * stride); + checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_bytes * stride, cudaMemcpyDeviceToHost)); + //void *source_ptr = isVec ? (&dfMatrix.source()[0][0]) : (&dfMatrix.source()[0]); + double *source_ptr = &dfMatrix.source()[0][0]; + checkVectorEqual(dfDataBase.num_cells * stride, source_ptr, h_source.data(), 1e-14, printFlag); + } + if (testData.d_internal_coeffs) { + std::vector h_internal_coeffs; + h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * stride); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride, cudaMemcpyDeviceToHost)); + std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * stride); + int offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + //const void* internal_coeff_ptr = isVec ? 
(&dfMatrix.internalCoeffs()[patchi][0][0]) : (&dfMatrix.internalCoeffs()[patchi][0]); + const void* internal_coeff_ptr = &dfMatrix.internalCoeffs()[patchi][0][0]; + memcpy(cpu_internal_coeffs.data() + offset * stride, internal_coeff_ptr, patchsize * stride * sizeof(double)); + offset += patchsize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * stride, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14, printFlag); + } + if (testData.d_boundary_coeffs) { + std::vector h_boundary_coeffs; + h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * stride); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride, cudaMemcpyDeviceToHost)); + std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * stride); + int offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + //const void* boundary_coeff_ptr = isVec ? (&dfMatrix.boundaryCoeffs()[patchi][0][0]) : (&dfMatrix.boundaryCoeffs()[patchi][0]); + const void* boundary_coeff_ptr = &dfMatrix.boundaryCoeffs()[patchi][0][0]; + memcpy(cpu_boundary_coeffs.data() + offset * stride, boundary_coeff_ptr, patchsize * stride * sizeof(double)); + offset += patchsize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * stride, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14, printFlag); + } +} + +// unittest of fvm::ddt(rho, U) +void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volVectorField& U, initType type) { + if (type == initType::randomInit) { + rho.oldTime(); + randomInitField(rho); + } + + // run CPU + // fvVectorMatrix dfMatrix = fvm::ddt(rho, U); + fvVectorMatrix dfMatrix = EulerDdtSchemeFvmDdt(rho, U); + + // prepare for run GPU + // prepare rho, rho.old, U + uploadRegisteredField(dfDataBase, rho, "rho"); + uploadRegisteredField(dfDataBase, rho.oldTime(), "rho_old"); + 
uploadRegisteredField(dfDataBase, U.oldTime(), "u"); + // prepare testData + testGPUDataBase testData; + // only use diag and source + buildTestGPUDataBase(dfDataBase, testData, U, false, false, true, true, false, false, false, false, false, false); + // run GPU + fvm_ddt_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, + dfDataBase.d_rho, dfDataBase.d_rho_old, dfDataBase.d_u, dfDataBase.d_volume, + testData.d_diag, testData.d_source); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); +} + +// unittest of fvm::div(phi, U) +void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { + if (type == initType::randomInit) { + phi.oldTime(); + randomInitField(phi); + } + + // run CPU + // fvVectorMatrix dfMatrix = fvm::div(phi, U); + fvVectorMatrix dfMatrix = gaussConvectionSchemeFvmDiv(phi, U); + + // prepare for run GPU + // prepare phi field + uploadRegisteredField(dfDataBase, phi, "phi"); + // prepare testData + testGPUDataBase testData; + // not use source + // gradient_internal_coeffs, gradient_boundary_coeffs are not needed actually, but update_boundary_coeffs_vector will access them + buildTestGPUDataBase(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // prepare boundary coeffs + // TODO: updating boundary coeffs should be complemented later + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); + + // run GPU + fvm_div_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_phi, dfDataBase.d_weight, + testData.d_lower, testData.d_upper, testData.d_diag, // end for internal + 
dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_phi, testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_internal_coeffs, testData.d_boundary_coeffs); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); +} + +// unittest of fvm::laplacian(gamma, vf) +void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, + volScalarField& gamma, volVectorField& U, initType type) +{ + if (type == initType::randomInit) { + gamma.oldTime(); + randomInitField(gamma); + } + + // run CPU + // fvVectorMatrix dfMatrix = fvm::laplacian(gamma, U); + fvVectorMatrix dfMatrix = gaussLaplacianSchemeFvmLaplacian(gamma, U); + + // prepare for run GPU + // prepare gamma on GPU + double *d_gamma = nullptr; + double *d_boundary_gamma = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_gamma, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_gamma, dfDataBase.boundary_surface_value_bytes)); + uploadField(dfDataBase, gamma, d_gamma, d_boundary_gamma); + // prepare testData + testGPUDataBase testData; + // not use source + // value_internal_coeffs, value_boundary_coeffs are not needed actually, but update_boundary_coeffs_vector will access them + buildTestGPUDataBase(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // prepare boundary coeffs + // TODO: updating boundary coeffs should be complemented later + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); + + // run GPU + fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_mag_sf, 
dfDataBase.d_delta_coeffs, d_gamma, + testData.d_lower, testData.d_upper, testData.d_diag, // end for internal + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_mag_sf, d_boundary_gamma, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs, + testData.d_internal_coeffs, testData.d_boundary_coeffs); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); + + // free resources + checkCudaErrors(cudaFree(d_gamma)); + checkCudaErrors(cudaFree(d_boundary_gamma)); +} + +// unittest of fvc::ddt(rho, K) +void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volScalarField& K, initType type) { + if (type == initType::randomInit) { + rho.oldTime(); + randomInitField(rho); + K.oldTime(); + randomInitField(K); + } + + // run CPU + // volScalarField fvc_ouput_scalar = fvc::ddt(rho, K); + volScalarField fvc_ouput_scalar = EulerDdtSchemeFvcDdt(rho, K); + + // prepare for run GPU + // prepare rho, rho.old on GPU + uploadRegisteredField(dfDataBase, rho, "rho"); + uploadRegisteredField(dfDataBase, rho.oldTime(), "rho_old"); + // prepare K, K_old on GPU + double *d_K = nullptr; + double *d_K_old = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_K, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_K_old, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_K, &K[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_K_old, &K.oldTime()[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + // there is no need for fvc ops to build testGPUDataBase, just build d_fvc_ouput_scalar directly. 
+ double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + // run GPU + // fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). + fvc_ddt_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, + dfDataBase.d_rho, dfDataBase.d_rho_old, d_K, d_K_old, + d_fvc_ouput_scalar); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar; + h_fvc_ouput_scalar.resize(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); + + // free resources + checkCudaErrors(cudaFree(d_K)); + checkCudaErrors(cudaFree(d_K_old)); + checkCudaErrors(cudaFree(d_fvc_ouput_scalar)); +} + +// unittest of fvc::grad(U) +void test_fvc_grad_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type) { + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volTensorField fvc_ouput_tensor = fvc::grad(U); + volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + + // prepare for run GPU + // prepare U on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_tensor = nullptr, *d_fvc_ouput_boundary_tensor = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_tensor, 0, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + + // only need patch_type + testGPUDataBase testData; + 
buildTestGPUDataBase(dfDataBase, testData, U, false, false, false, false, false, false, false, false, false, false); + + fvc_grad_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_u, d_fvc_ouput_tensor, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_u, dfDataBase.d_boundary_sf, + dfDataBase.d_volume, dfDataBase.d_boundary_mag_sf, d_fvc_ouput_boundary_tensor, dfDataBase.d_boundary_delta_coeffs); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_tensor(dfDataBase.num_cells * 9); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_tensor.data(), d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 9, &fvc_ouput_tensor[0][0], h_fvc_ouput_tensor.data(), 1e-14, printFlag); +} + +// unittest of fvc::div(phi) +void test_fvc_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, initType type) { + if (type == initType::randomInit) { + phi.oldTime(); + randomInitField(phi); + } + + // run CPU + volScalarField fvc_ouput_scalar = fvc::div(phi); + // volScalarField fvc_ouput_scalar = gaussConvectionSchemeFvcDiv(phi); + + // prepare for run GPU + // prepare phi on GPU + uploadRegisteredField(dfDataBase, phi, "phi"); + + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + + fvc_div_surface_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, dfDataBase.num_boundary_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_phi, dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_phi, dfDataBase.d_volume, d_fvc_ouput_scalar); + + // compare result + bool printFlag = 
false; + std::vector h_fvc_ouput_scalar(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); +} + +// unittest of fvc::div(U) +void test_fvc_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type) { + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volScalarField fvc_ouput_scalar = fvc::div(U); + volScalarField fvc_ouput_scalar = gaussDivFvcdiv(U); + + // prepare for run GPU + // prepare phi on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, U, false, false, false, false, false, false, false, false, false, false); + + fvc_div_cell_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_u, d_fvc_ouput_scalar, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_u, dfDataBase.d_boundary_sf, + dfDataBase.d_volume); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); +} + +// unittest of fvc::grad(p) +void test_fvc_grad_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, 
volScalarField& p, initType type) { + if (type == initType::randomInit) { + p.oldTime(); + randomInitField(p); + } + + // run CPU + // volVectorField fvc_ouput_vector = fvc::grad(p); + volVectorField fvc_ouput_vector = gaussGradSchemeGrad(p); + + // prepare for run GPU + // prepare p on GPU + uploadRegisteredField(dfDataBase, p, "p"); + + double *d_fvc_ouput_vector = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_vector, 0, dfDataBase.cell_value_vec_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, p, false, false, false, false, false, false, false, false, false, false); + + fvc_grad_cell_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_p, d_fvc_ouput_vector, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_p, dfDataBase.d_boundary_sf, dfDataBase.d_volume); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_vector(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_vector.data(), d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 3, &fvc_ouput_vector[0][0], h_fvc_ouput_vector.data(), 1e-14, printFlag); +} + + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // +template <> +void getFieldPtr(std::queue& fieldPtrQue, volVectorField& field) { + fieldPtrQue.push(&field[0][0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0][0]); + } +}; + +template <> +void getFieldPtr(std::queue& fieldPtrQue, volTensorField& field) { + fieldPtrQue.push(&field[0][0]); + forAll(field.boundaryField(), 
patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0][0]); + } +}; \ No newline at end of file diff --git a/GPUTest/GPUTestRefBase.H b/GPUTest/GPUTestRefBase.H new file mode 100644 index 000000000..754219e64 --- /dev/null +++ b/GPUTest/GPUTestRefBase.H @@ -0,0 +1,63 @@ + +// unittest of fvc::grad(U) +void test_fvc_grad_vector_orig(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type, + dfMatrixDataBaseOrig* dfDataBaseOrig) +{ + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volTensorField fvc_ouput_tensor = fvc::grad(U); + volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + + // prepare for run GPU + // prepare U on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_tensor = nullptr, *d_fvc_ouput_boundary_tensor = nullptr, *d_fvc_ouput_boundary_tensor_init = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor_init, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_tensor, 0, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor_init, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + + fvc_grad_vector_orig(dfDataBase.stream, dfDataBaseOrig, dfDataBase, d_fvc_ouput_tensor, d_fvc_ouput_boundary_tensor_init, d_fvc_ouput_boundary_tensor); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_tensor(dfDataBase.num_cells * 9); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_tensor.data(), d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes, cudaMemcpyDeviceToHost)); + 
checkVectorEqual(dfDataBase.num_cells * 9, &fvc_ouput_tensor[0][0], h_fvc_ouput_tensor.data(), 1e-14, printFlag); +} + +void test_fvc_grad_scalar_orig(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& p, initType type, + dfMatrixDataBaseOrig* dfDataBaseOrig) +{ + if (type == initType::randomInit) { + p.oldTime(); + randomInitField(p); + } + + // run CPU + // volVectorField fvc_ouput_vector = fvc::grad(p); + volVectorField fvc_ouput_vector = gaussGradSchemeGrad(p); + + // prepare for run GPU + // prepare p on GPU + uploadRegisteredField(dfDataBase, p, "p"); + + double *d_fvc_ouput_vector = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_vector, 0, dfDataBase.cell_value_vec_bytes)); + + fvc_grad_scalar_orig(dfDataBase.stream, dfDataBaseOrig, dfDataBase, d_fvc_ouput_vector); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_vector(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_vector.data(), d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 3, &fvc_ouput_vector[0][0], h_fvc_ouput_vector.data(), 1e-14, printFlag); +} \ No newline at end of file diff --git a/GPUTest/Make/files b/GPUTest/Make/files new file mode 100644 index 000000000..d78085ff8 --- /dev/null +++ b/GPUTest/Make/files @@ -0,0 +1,4 @@ +unittest.C + +EXE = $(DF_APPBIN)/unitTest + diff --git a/GPUTest/Make/options b/GPUTest/Make/options new file mode 100644 index 000000000..e8e07b6a5 --- /dev/null +++ b/GPUTest/Make/options @@ -0,0 +1,50 @@ +-include $(GENERAL_RULES)/mplibType + +EXE_INC = -std=c++14 \ + -g \ + -fopenmp \ + -Wno-unused-variable \ + -Wno-unused-but-set-variable \ + -Wno-old-style-cast \ + $(PFLAGS) $(PINC) \ + $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ + $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + 
-I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + -I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(CANTERA_ROOT)/include \ + -I$(DF_ROOT)/src_gpu \ + -I$(DF_ROOT)/src_gpu_orig \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ + -I/usr/local/cuda-11.6/include \ + -I$(AMGX_DIR)/include + +EXE_LIBS = \ + -lcompressibleTransportModels \ + -lturbulenceModels \ + -lfiniteVolume \ + -lmeshTools \ + -lsampling \ + -L$(DF_LIBBIN) \ + -ldfFluidThermophysicalModels \ + -ldfCompressibleTurbulenceModels \ + -ldfCanteraMixture \ + -ldfChemistryModel \ + -ldfCombustionModels \ + -ldfGenMatrix \ + $(CANTERA_ROOT)/lib/libcantera.so \ + /usr/local/cuda-11.6/lib64/libcudart.so \ + $(AMGX_DIR)/build/libamgxsh.so \ + $(DF_ROOT)/src_gpu/build/libdfMatrix.so \ + $(DF_ROOT)/src_gpu_orig/build/libdfMatrixOrig.so + diff --git a/GPUTest/correctPhi.H b/GPUTest/correctPhi.H new file mode 100644 index 000000000..3cd82d29e --- /dev/null +++ b/GPUTest/correctPhi.H @@ -0,0 +1,12 @@ +CorrectPhi +( + U, + phi, + p, + rho, + psi, + dimensionedScalar("rAUf", dimTime, 1), + divrhoU(), + pimple, + true +); diff --git a/GPUTest/createFields.H b/GPUTest/createFields.H new file mode 100644 index 000000000..9e750c334 --- /dev/null +++ b/GPUTest/createFields.H @@ -0,0 +1,176 @@ +#include "createRDeltaT.H" + +Info<< "Reading thermophysical properties\n" << endl; + +// fluidThermo* pThermo = new hePsiThermo(mesh, word::null); +fluidThermo* pThermo = new heRhoThermo(mesh, word::null); +fluidThermo& thermo = *pThermo; +// thermo.validate(args.executable(), "ha"); + +const volScalarField& psi = 
thermo.psi(); +volScalarField& p = thermo.p(); +volScalarField& T = thermo.T(); +volScalarField rho +( + IOobject + ( + "rho", + runTime.timeName(), + mesh, + IOobject::READ_IF_PRESENT, + IOobject::AUTO_WRITE + ), + thermo.rho() +); + + +Info<< "Reading field U\n" << endl; +volVectorField U +( + IOobject + ( + "U", + runTime.timeName(), + mesh, + IOobject::MUST_READ, + IOobject::AUTO_WRITE + ), + mesh +); + +#include "compressibleCreatePhi.H" + +pressureControl pressureControl(p, rho, pimple.dict(), false); + +mesh.setFluxRequired(p.name()); + +Info<< "Creating turbulence model\n" << endl; +autoPtr turbulence +( + compressible::turbulenceModel::New + ( + rho, + U, + phi, + thermo + ) +); + +Info<< "Creating field dpdt\n" << endl; +volScalarField dpdt +( + IOobject + ( + "dpdt", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar("dpdt",p.dimensions()/dimTime, 0) +); + + +Info<< "Creating reaction model\n" << endl; +autoPtr> combustion +( + CombustionModel::New(thermo, turbulence()) +); +Info<< "end Creating reaction model\n" << endl; + + +const word combModelName(mesh.objectRegistry::lookupObject("combustionProperties").lookup("combustionModel")); +Info << "Combustion Model Name is confirmed as "<< combModelName << endl; + +const word turbName(mesh.objectRegistry::lookupObject("turbulenceProperties").lookup("simulationType")); + +dfChemistryModel* chemistry = combustion->chemistry(); +PtrList& Y = chemistry->Y(); +const word inertSpecie(chemistry->lookup("inertSpecie")); +const label inertIndex(chemistry->species()[inertSpecie]); +chemistry->setEnergyName("ha"); +chemistry->updateEnergy(); + + +chemistry->correctThermo(); +Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; + +//for dpdt + +Info<< "Creating field kinetic energy K\n" << endl; +volScalarField K("K", 0.5*magSqr(U)); + +multivariateSurfaceInterpolationScheme::fieldTable fields; + +if(combModelName!="flareFGM") 
+{ +forAll(Y, i) +{ + fields.add(Y[i]); +} +fields.add(thermo.he()); +} + + +const scalar Sct = chemistry->lookupOrDefault("Sct", 1.); +volScalarField diffAlphaD +( + IOobject + ( + "diffAlphaD", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimEnergy/dimTime/dimVolume, 0) +); +volVectorField hDiffCorrFlux +( + IOobject + ( + "hDiffCorrFlux", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,0,-3,0,0,0,0), Zero) +); +volVectorField sumYDiffError +( + IOobject + ( + "sumYDiffError", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector("sumYDiffError", dimDynamicViscosity/dimLength, Zero) +); + +IOdictionary CanteraTorchProperties +( + IOobject + ( + "CanteraTorchProperties", + runTime.constant(), + mesh, + IOobject::MUST_READ, + IOobject::NO_WRITE + ) +); +const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", false); +#ifdef USE_PYTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif +#ifdef USE_LIBTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H new file mode 100644 index 000000000..516386473 --- /dev/null +++ b/GPUTest/createGPUSolver.H @@ -0,0 +1,114 @@ + +dfMatrixDataBase dfDataBase; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int 
num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; + } + // TODO: get deltaT fomr time API + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); + + // prepare constant indexes: owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; + int offset = 0; + forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); + + int patchsize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); + offset += patchsize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + 
dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); + + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; + } + } + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +}; + + +dfMatrixDataBaseOrig* createGPUBaseOrig(fvMesh& mesh, PtrList& Y, volVectorField& U) { + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + + std::vector boundaryCellIndex; + std::vector boundary_face_vector_init; + std::vector boundary_face_init; + std::vector boundary_deltaCoeffs_init; + std::vector> patchTypes; + std::vector patchTypeU, patchTypeY; + int num_boundary_faces = 0; + int patchSize; + forAll(mesh.boundary(), patchi) + { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + patchSize = sub_boundary.size(); + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + boundaryCellIndex.insert(boundaryCellIndex.end(), &sub_boundary[0], &sub_boundary[0]+patchSize); + boundary_face_vector_init.insert(boundary_face_vector_init.end(), &pSf[0][0], &pSf[0][0]+3*patchSize); + boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); + 
boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); + num_boundary_faces += patchSize; + + constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); + constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); + } + patchTypes.emplace_back(patchTypeU); + patchTypes.emplace_back(patchTypeY); + + int num_boundary_cells; + + dfMatrixDataBaseOrig* dfDataBase = new dfMatrixDataBaseOrig(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, + &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], + &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, + boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); + + return dfDataBase; +} \ No newline at end of file diff --git a/GPUTest/setRDeltaT.H b/GPUTest/setRDeltaT.H new file mode 100644 index 000000000..074d05e3d --- /dev/null +++ b/GPUTest/setRDeltaT.H @@ -0,0 +1,85 @@ +{ + volScalarField& rDeltaT = trDeltaT.ref(); + + const dictionary& pimpleDict = pimple.dict(); + + scalar maxCo + ( + pimpleDict.lookupOrDefault("maxCo", 0.8) + ); + + scalar rDeltaTSmoothingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTSmoothingCoeff", 0.02) + ); + + scalar rDeltaTDampingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTDampingCoeff", 1.0) + ); + + scalar maxDeltaT + ( + pimpleDict.lookupOrDefault("maxDeltaT", great) + ); + + volScalarField rDeltaT0("rDeltaT0", rDeltaT); + + // Set the reciprocal time-step from the local Courant number + rDeltaT.ref() = max + ( + 1/dimensionedScalar(dimTime, maxDeltaT), + fvc::surfaceSum(mag(phi))()() + /((2*maxCo)*mesh.V()*rho()) + ); + + if (pimple.transonic()) + { + surfaceScalarField phid + ( + "phid", + fvc::interpolate(psi)*fvc::flux(U) + ); + + rDeltaT.ref() = max + ( + rDeltaT(), + fvc::surfaceSum(mag(phid))()() + /((2*maxCo)*mesh.V()*psi()) + ); + } + + // Update tho 
boundary values of the reciprocal time-step + rDeltaT.correctBoundaryConditions(); + + Info<< "Flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + if (rDeltaTSmoothingCoeff < 1.0) + { + fvc::smooth(rDeltaT, rDeltaTSmoothingCoeff); + } + + Info<< "Smoothed flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + // Limit rate of change of time scale + // - reduce as much as required + // - only increase at a fraction of old time scale + if + ( + rDeltaTDampingCoeff < 1.0 + && runTime.timeIndex() > runTime.startTimeIndex() + 1 + ) + { + rDeltaT = + rDeltaT0 + *max(rDeltaT/rDeltaT0, scalar(1) - rDeltaTDampingCoeff); + + Info<< "Damped flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + } +} diff --git a/GPUTest/setRootCase2.H b/GPUTest/setRootCase2.H new file mode 100644 index 000000000..45d966e63 --- /dev/null +++ b/GPUTest/setRootCase2.H @@ -0,0 +1,5 @@ +Foam::argList args(argc,argv,true,true,/*initialise=*/false); +if (!args.checkRootCase()) +{ + Foam::FatalError.exit(); +} \ No newline at end of file diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C new file mode 100644 index 000000000..80eafef9d --- /dev/null +++ b/GPUTest/unittest.C @@ -0,0 +1,169 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. 
+ + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +Application + unittest + +Description + GPU unittest + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#include +#include +#include "upwind.H" + +// debug +#include "GenFvMatrix.H" +#include +#include + +#include "dfMatrixDataBase.H" +#include "dfMatrixDataBaseOrig.H" +#include "dfMatrixOpBase.H" +#include "dfMatrixOpBaseOrig.H" +#include "createGPUSolver.H" +#include "GPUTestBase.H" +#include "GPUTestRefBase.H" + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" 
+ #include "setInitialDeltaT.H" + } + + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + { + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + createGPUBase(mesh, Y); + DEBUG_TRACE; + dfMatrixDataBaseOrig* dfDataBaseOrig = createGPUBaseOrig(mesh, Y, U); + DEBUG_TRACE; + + // unittest of fvm::ddt(rho, U) + test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); + DEBUG_TRACE; + test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvm::div(phi, U) + test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); + DEBUG_TRACE; + test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvm::laplacian(gamma, U) + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + volScalarField gamma = rho * nuEff; + test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::original); + DEBUG_TRACE; + test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvc::ddt(rho, K) + K = 0.5*magSqr(U); + test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::original); + DEBUG_TRACE; + test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvc::grad(U) + test_fvc_grad_vector(dfDataBase, mesh, U, initType::original); + DEBUG_TRACE; + // test_fvc_grad_vector(dfDataBase, mesh, U, initType::randomInit); + // DEBUG_TRACE; + test_fvc_grad_vector_orig(dfDataBase, mesh, U, initType::original, dfDataBaseOrig); + DEBUG_TRACE; + + // unittest of fvc::div(phi) + test_fvc_div_scalar(dfDataBase, mesh, phi, initType::original); + DEBUG_TRACE; + test_fvc_div_scalar(dfDataBase, mesh, phi, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvc::div(U) + test_fvc_div_vector(dfDataBase, mesh, U, initType::original); + 
DEBUG_TRACE; + // test_fvc_div_vector(dfDataBase, mesh, U, initType::randomInit); + // DEBUG_TRACE; + + // unittest of fvc::grad(p) + test_fvc_grad_scalar(dfDataBase, mesh, p, initType::original); + DEBUG_TRACE; + test_fvc_grad_scalar(dfDataBase, mesh, p, initType::randomInit); + DEBUG_TRACE; + test_fvc_grad_scalar_orig(dfDataBase, mesh, p, initType::original, dfDataBaseOrig); + DEBUG_TRACE + } + return 0; +} + diff --git a/GPUTestRef/EulerDdtScheme.C b/GPUTestRef/EulerDdtScheme.C new file mode 100644 index 000000000..0875e0033 --- /dev/null +++ b/GPUTestRef/EulerDdtScheme.C @@ -0,0 +1,322 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +\*---------------------------------------------------------------------------*/ + +#include "GenFvMatrix.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// namespace fv +// { + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + rho.dimensions()*vf.dimensions()*dimVol/dimTime + ) + ); + fvMatrix& fvm = tfvm.ref(); + + scalar rDeltaT = 1.0/mesh.time().deltaTValue(); + + fvm.diag() = rDeltaT*rho.primitiveField()*mesh.Vsc(); + + if (mesh.moving()) + { + fvm.source() = rDeltaT + *rho.oldTime().primitiveField() + *vf.oldTime().primitiveField()*mesh.Vsc0(); + } + else + { + fvm.source() = rDeltaT + *rho.oldTime().primitiveField() + *vf.oldTime().primitiveField()*mesh.Vsc(); + } + return tfvm; +} + +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + IOobject ddtIOobject + ( + "ddt("+rho.name()+','+vf.name()+')', + mesh.time().timeName(), + mesh + ); + + if (mesh.moving()) + { + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT* + ( + rho()*vf() + - rho.oldTime()() + *vf.oldTime()()*mesh.Vsc0()/mesh.Vsc() + ), + rDeltaT.value()* + ( + rho.boundaryField()*vf.boundaryField() + - rho.oldTime().boundaryField() + *vf.oldTime().boundaryField() + ) + ) + ); + } + else + { + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT*(rho*vf - rho.oldTime()*vf.oldTime()) + ) + ); + } +} + + +tmp +EulerDdtSchemeFvcDdtCorr +( + const volScalarField& rho, + const volVectorField& U, + const surfaceScalarField& phi, + const autoPtr& Uf +) +{ + Info << 
"EulerDdtSchemeFvcDdtCorr start" << endl; + + const fvMesh& mesh = U.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + GeometricField rhoU0 + ( + rho.oldTime() * U.oldTime() + ); + + surfaceScalarField phiCorr + ( + phi.oldTime() - fvc::dotInterpolate(mesh.Sf(), rhoU0) + ); + + return tmp + ( + new surfaceScalarField + ( + IOobject + ( + "ddtCorr(" + + rho.name() + ',' + U.name() + ',' + phi.name() + ')', + mesh.time().timeName(), + mesh + ), + EulerDdtSchemeFvcDdtPhiCoeff + ( + rhoU0, + phi.oldTime(), + phiCorr, + rho.oldTime() + )*rDeltaT*phiCorr + ) + ); + +} + +tmp +EulerDdtSchemeFvcDdtPhiCoeff +( + const volVectorField& U, + const surfaceScalarField& phi, + const surfaceScalarField& phiCorr, + const volScalarField& rho +) +{ + const fvMesh& mesh = U.mesh(); + tmp tddtCouplingCoeff = scalar(1) - min(mag(phiCorr)/(mag(phi) + dimensionedScalar("small", phi.dimensions(), SMALL)),scalar(1)); + + surfaceScalarField& ddtCouplingCoeff = tddtCouplingCoeff.ref(); + + surfaceScalarField::Boundary& ccbf = ddtCouplingCoeff.boundaryFieldRef(); + + forAll(U.boundaryField(), patchi) + { + if + ( U.boundaryField()[patchi].fixesValue() + || isA(mesh.boundary()[patchi]) + ) + { + ccbf[patchi] = 0.0; + } + } + + return tddtCouplingCoeff; +} + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + vf.dimensions()*dimVol/dimTime + ) + ); + + fvMatrix& fvm = tfvm.ref(); + + scalar rDeltaT = 1.0/mesh.time().deltaTValue(); + + fvm.diag() = rDeltaT*mesh.Vsc(); + + if (mesh.moving()) + { + fvm.source() = rDeltaT*vf.oldTime().primitiveField()*mesh.Vsc0(); + } + else + { + fvm.source() = rDeltaT*vf.oldTime().primitiveField()*mesh.Vsc(); + } + + return tfvm; +} + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + IOobject ddtIOobject + ( + 
"ddt("+vf.name()+')', + mesh.time().timeName(), + mesh + ); + + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT*(vf - vf.oldTime()) + ) + ); +} + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// } // End namespace fv + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/GenFvMatrix.H b/GPUTestRef/GenFvMatrix.H new file mode 100644 index 000000000..d76fa94d9 --- /dev/null +++ b/GPUTestRef/GenFvMatrix.H @@ -0,0 +1,261 @@ +#pragma once + +#include "tmp.H" +#include "dimensionedType.H" +#include "volFieldsFwd.H" +#include "surfaceFieldsFwd.H" +#include "typeInfo.H" +#include "runTimeSelectionTables.H" +#include "fvMatrices.H" +#include "fvMesh.H" +#include "turbulentFluidThermoModel.H" +#include "CombustionModel.H" +#include +#include +#include "PstreamGlobals.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + + +// namespace fv +// { + +// fvm::ddt +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +); + +// fvc::ddt +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +); + +// 
fvc::ddtCorr +tmp +EulerDdtSchemeFvcDdtCorr +( + const volScalarField& rho, + const volVectorField& U, + const surfaceScalarField& phi, + const autoPtr& Uf +); + +tmp +EulerDdtSchemeFvcDdtPhiCoeff +( + const volVectorField& U, + const surfaceScalarField& phi, + const surfaceScalarField& phiCorr, + const volScalarField& rho +); + +template +Foam::tmp> +UEqn_H +( + fvMatrix& UEqn +); + +tmp +rAUConstructor +( + fvMatrix& UEqn +); + +tmp +rhorAUfConstructor +( + const volScalarField& rhorAU, + const surfaceScalarField& linear_weights +); + +tmp +phiHbyAConstructor +( + const volScalarField& rho, + const volVectorField& HbyA, + const surfaceScalarField& rhorAUf, + const surfaceScalarField& tddtCorr, + const surfaceScalarField& linear_weights +); + + +// fvm::div +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +); + +// fvc::div +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +); + +// fvc::grad +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + + +template +tmp> +gaussLaplacianSchemeFvmLaplacianUncorrected +( + const surfaceScalarField& gammaMagSf, + const surfaceScalarField& deltaCoeffs, + const GeometricField& vf +); + +// fvm::laplacian +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& 
vf +); + +// turbulence->divDevRhoReff(U) +tmp +turbulenceModelLinearViscousStressDivDevRhoReff +( + volVectorField& U, + compressible::turbulenceModel& turbulence +); + +tmp +GenMatrix_U( + const volScalarField& rho, + volVectorField& U, + const surfaceScalarField& phi, + const volScalarField& p, + compressible::turbulenceModel& turbulence +); + +tmp +GenMatrix_Y( + const volScalarField& rho, + volScalarField& Yi, + const surfaceScalarField& phi, + const surfaceScalarField& phiUc, + const volScalarField& rhoD, + const volScalarField& mut, + const Switch splitting, + const scalar Sct, + CombustionModel& combustion, + fv::convectionScheme& mvConvection +); + +tmp +GenMatrix_E( + const volScalarField& rho, + volScalarField& he, + const surfaceScalarField& phi, + const volScalarField& K, + const volScalarField& dpdt, + const volScalarField& alphaEff, + const volScalarField& diffAlphaD, + const volVectorField& hDiffCorrFlux, + const surfaceScalarField& linear_weights +); + +tmp +GenMatrix_p( + const volScalarField& rho, + volScalarField& p, + const surfaceScalarField& phiHbyA, + const surfaceScalarField& rhorAUf, + const volScalarField& phi +); + + +void check_fvmatrix_equal(fvScalarMatrix& a,fvScalarMatrix& b); +void check_fvmatrix_equal(fvVectorMatrix& a,fvVectorMatrix& b); + +void check_field_equal(Field& a, Field& b); + + +} // End namespace Foam + + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/Make/files b/GPUTestRef/Make/files new file mode 100644 index 000000000..314f1f495 --- /dev/null +++ b/GPUTestRef/Make/files @@ -0,0 +1,6 @@ +gaussGrad.C +gaussConvectionScheme.C +gaussLaplacianScheme.C +EulerDdtScheme.C + +LIB = $(DF_LIBBIN)/libdfGenMatrix \ No newline at end of file diff --git a/GPUTestRef/Make/options b/GPUTestRef/Make/options new file mode 100644 index 000000000..0523a67e8 --- /dev/null +++ b/GPUTestRef/Make/options @@ -0,0 +1,31 @@ +-include 
$(GENERAL_RULES)/mplibType + +EXE_INC = \ + -g \ + $(PFLAGS) $(PINC) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + -I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(LIB_SRC)/parallel/decompose/decompositionMethods/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/fileFormats/lnInclude \ + -I$(LIB_SRC)/triSurface/lnInclude \ + -I$(LIB_SRC)/surfMesh/lnInclude \ + -I$(LIB_SRC)/dynamicMesh/lnInclude \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(CANTERA_ROOT)/include + +EXE_LIBS = \ + -lOpenFOAM \ + -ltriSurface \ + -lmeshTools \ No newline at end of file diff --git a/GPUTestRef/gaussConvectionScheme.C b/GPUTestRef/gaussConvectionScheme.C new file mode 100644 index 000000000..b8157d2d1 --- /dev/null +++ b/GPUTestRef/gaussConvectionScheme.C @@ -0,0 +1,351 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +\*---------------------------------------------------------------------------*/ + +#include "GenFvMatrix.H" +#include "fvcSurfaceIntegrate.H" +#include "fvMatrices.H" +#include "gaussConvectionScheme.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> cs = fv::convectionScheme::New(mesh,faceFlux,mesh.divScheme(name)); + fv::gaussConvectionScheme& gcs = dynamic_cast&>(cs.ref()); + + tmp tweights = gcs.interpScheme().weights(vf); + const surfaceScalarField& weights = tweights(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + faceFlux.dimensions()*vf.dimensions() + ) + ); + fvMatrix& fvm = tfvm.ref(); + fvm.lower() = -weights.primitiveField()*faceFlux.primitiveField(); + fvm.upper() = fvm.lower() + faceFlux.primitiveField(); + fvm.negSumDiag(); + forAll(vf.boundaryField(), patchi) + { + const fvPatchField& psf = vf.boundaryField()[patchi]; + const fvsPatchScalarField& patchFlux = faceFlux.boundaryField()[patchi]; + const fvsPatchScalarField& pw = weights.boundaryField()[patchi]; + + fvm.internalCoeffs()[patchi] = patchFlux*psf.valueInternalCoeffs(pw); + fvm.boundaryCoeffs()[patchi] = -patchFlux*psf.valueBoundaryCoeffs(pw); + } + if (gcs.interpScheme().corrected()) + { + fvm += 
fvc::surfaceIntegrate(faceFlux*gcs.interpScheme().correction(vf)); + } + return tfvm; +} + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +) +{ + word name("div("+faceFlux.name()+','+vf.name()+')'); + return gaussConvectionSchemeFvmDiv(faceFlux,vf,name); +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +) +{ + word name("div("+faceFlux.name()+','+vf.name()+')'); + return gaussConvectionSchemeFvcDiv(faceFlux, vf, name); +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +) +{ + Info << "gaussConvectionSchemeFvcDiv start" << endl; + + const fvMesh& mesh = vf.mesh(); + + Istream& divIntScheme = mesh.divScheme(name); + word divScheme(divIntScheme); + + tmp> tinterpScheme_ = + surfaceInterpolationScheme::New(mesh, faceFlux, divIntScheme); + + // tmp> tinterpScheme_ = + // tmp> + // ( + // new linear(mesh) + // ); + + + // surfaceInterpolationScheme interpScheme_ = tinterpScheme_.ref(); + + tmp> tConvection + ( + fvc::surfaceIntegrate(gaussConvectionSchemeFlux(faceFlux, vf, tinterpScheme_)) + ); + + tConvection.ref().rename + ( + "convection(" + faceFlux.name() + ',' + vf.name() + ')' + ); + + return tConvection; +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +) +{ + return tmp> + ( + new GeometricField + ( + "div("+ssf.name()+')', + fvcSurfaceIntegrate(ssf) + ) + ); +} + +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + Istream& divIntScheme = mesh.divScheme("div("+vf.name()+')'); + word divScheme(divIntScheme); + + tmp> tinterpScheme_ = + surfaceInterpolationScheme::New(mesh, divIntScheme); + + tmp + < + GeometricField + ::type, fvPatchField, volMesh> + > tDiv + ( + 
fvcSurfaceIntegrate + ( + (tinterpScheme_().dotInterpolate(mesh.Sf(), vf))() + ) + ); + + + return tDiv; +} + +template +tmp> +fvcSurfaceIntegrate +( + const GeometricField& ssf +) +{ + const fvMesh& mesh = ssf.mesh(); + + tmp> tvf + ( + new GeometricField + ( + IOobject + ( + "surfaceIntegrate("+ssf.name()+')', + ssf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensioned + ( + "0", + ssf.dimensions()/dimVol, + Zero + ), + extrapolatedCalculatedFvPatchField::typeName + ) + ); + GeometricField& vf = tvf.ref(); + + fvcSurfaceIntegrate(vf.primitiveFieldRef(), ssf); + vf.correctBoundaryConditions(); + + return tvf; +} + +template +void fvcSurfaceIntegrate +( + Field& ivf, + const GeometricField& ssf +) +{ + const fvMesh& mesh = ssf.mesh(); + + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + + const Field& issf = ssf; + + forAll(owner, facei) + { + ivf[owner[facei]] += issf[facei]; + ivf[neighbour[facei]] -= issf[facei]; + } + + forAll(mesh.boundary(), patchi) + { + const labelUList& pFaceCells = + mesh.boundary()[patchi].faceCells(); + + const fvsPatchField& pssf = ssf.boundaryField()[patchi]; + + forAll(mesh.boundary()[patchi], facei) + { + ivf[pFaceCells[facei]] += pssf[facei]; + } + } + + ivf /= mesh.Vsc(); +} + +template +tmp> +gaussConvectionSchemeFlux +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + tmp> tinterpScheme +) +{ + Info << vf.name() <> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +template +tmp +< + 
GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/gaussGrad.C b/GPUTestRef/gaussGrad.C new file mode 100644 index 000000000..401eab38b --- /dev/null +++ b/GPUTestRef/gaussGrad.C @@ -0,0 +1,332 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +\*---------------------------------------------------------------------------*/ + +#include "gaussGrad.H" +#include "extrapolatedCalculatedFvPatchField.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +) +{ + return gaussGradSchemeGrad(vsf, "grad(" + vsf.name() + ')'); +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf, + const word& name +) +{ + const fvMesh& mesh = vsf.mesh(); + + typedef typename outerProduct::type GradType; + typedef GeometricField GradFieldType; + + if (!mesh.changing() && mesh.cache(name)) + { + if (!mesh.objectRegistry::template foundObject(name)) + { + solution::cachePrintMessage("Calculating and caching", name, vsf); + tmp tgGrad = gaussGradCalcGrad(vsf, name); + regIOobject::store(tgGrad.ptr()); + } + + solution::cachePrintMessage("Retrieving", name, vsf); + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + if (gGrad.upToDate(vsf)) + { + return gGrad; + } + else + { + solution::cachePrintMessage("Deleting", name, vsf); + gGrad.release(); + delete &gGrad; + + solution::cachePrintMessage("Recalculating", name, vsf); + tmp tgGrad = gaussGradCalcGrad(vsf, name); + + solution::cachePrintMessage("Storing", name, vsf); + regIOobject::store(tgGrad.ptr()); + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + return gGrad; + } + } + else + { + if (mesh.objectRegistry::template foundObject(name)) + { + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + if (gGrad.ownedByRegistry()) + { + solution::cachePrintMessage("Deleting", name, vsf); + gGrad.release(); + delete &gGrad; + } + } + + 
solution::cachePrintMessage("Calculating", name, vsf); + return gaussGradCalcGrad(vsf, name); + } +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradCalcGrad +( + const GeometricField& vsf, + const word& name +) +{ + const fvMesh& mesh = vsf.mesh(); + + tmp> tinterpScheme_ = + tmp> + ( + new linear(mesh) + ); + + typedef typename outerProduct::type GradType; + + tmp> tinterpolate = tinterpScheme_().interpolate(vsf); + + tmp> tgGrad + ( + gaussGradGradf(tinterpolate.ref(), name) + ); + GeometricField& gGrad = tgGrad.ref(); + + gaussGradCorrectBoundaryConditions(vsf, gGrad); + + return tgGrad; +} + +template +void gaussGradCorrectBoundaryConditions +( + const GeometricField& vsf, + GeometricField + < + typename outerProduct::type, fvPatchField, volMesh + >& gGrad +) +{ + typename GeometricField + < + typename outerProduct::type, fvPatchField, volMesh + >::Boundary& gGradbf = gGrad.boundaryFieldRef(); + + forAll(vsf.boundaryField(), patchi) + { + if (!vsf.boundaryField()[patchi].coupled()) + { + const vectorField n + ( + vsf.mesh().Sf().boundaryField()[patchi] + / vsf.mesh().magSf().boundaryField()[patchi] + ); + + gGradbf[patchi] += n * + ( + vsf.boundaryField()[patchi].snGrad() + - (n & gGradbf[patchi]) + ); + } + } +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradGradf +( + const GeometricField& ssf, + const word& name +) +{ + typedef typename outerProduct::type GradType; + + const fvMesh& mesh = ssf.mesh(); + + tmp> tgGrad + ( + new GeometricField + ( + IOobject + ( + name, + ssf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensioned + ( + "0", + ssf.dimensions()/dimLength, + Zero + ), + extrapolatedCalculatedFvPatchField::typeName + ) + ); + GeometricField& gGrad = tgGrad.ref(); + + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + const vectorField& Sf = 
mesh.Sf(); + + Field& igGrad = gGrad; + const Field& issf = ssf; + + forAll(owner, facei) + { + GradType Sfssf = Sf[facei]*issf[facei]; + + igGrad[owner[facei]] += Sfssf; + igGrad[neighbour[facei]] -= Sfssf; + } + + forAll(mesh.boundary(), patchi) + { + const labelUList& pFaceCells = + mesh.boundary()[patchi].faceCells(); + + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + + const fvsPatchField& pssf = ssf.boundaryField()[patchi]; + + forAll(mesh.boundary()[patchi], facei) + { + igGrad[pFaceCells[facei]] += pSf[facei]*pssf[facei]; + if (pFaceCells[facei] == 0) + { + // Info << "CPU add = " << pSf[facei]*pssf[facei] << endl; + // Info << "surface CPU = " << pSf[facei] << endl; + // Info << "field CPU = " << pssf[facei] << endl; + } + } + } + + igGrad /= mesh.V(); + + gGrad.correctBoundaryConditions(); + + return tgGrad; +} + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/gaussLaplacianScheme.C b/GPUTestRef/gaussLaplacianScheme.C new file mode 100644 index 000000000..ed321ceda --- /dev/null +++ b/GPUTestRef/gaussLaplacianScheme.C @@ -0,0 +1,273 @@ +/*---------------------------------------------------------------------------* + ========= | + / F ield | OpenFOAM: The Open Source CFD Toolbox + / O peration | Website: https://openfoam.org + / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation +/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. 
+ + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +*---------------------------------------------------------------------------*/ + +#include "gaussLaplacianScheme.H" +#include "surfaceInterpolate.H" +#include "fvcDiv.H" +#include "fvcGrad.H" +#include "fvMatrices.H" +#include "snGradScheme.H" +#include "linear.H" +#include "orthogonalSnGrad.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussLaplacianSchemeFvmLaplacianUncorrected +( + const surfaceScalarField& gammaMagSf, + const surfaceScalarField& deltaCoeffs, + const GeometricField& vf +) +{ + tmp> tfvm + ( + new fvMatrix + ( + vf, + deltaCoeffs.dimensions()*gammaMagSf.dimensions()*vf.dimensions() + ) + ); + fvMatrix& fvm = tfvm.ref(); + + fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField(); + fvm.negSumDiag(); + + forAll(vf.boundaryField(), patchi) + { + const fvPatchField& pvf = vf.boundaryField()[patchi]; + const fvsPatchScalarField& pGamma = gammaMagSf.boundaryField()[patchi]; + const fvsPatchScalarField& pDeltaCoeffs = + deltaCoeffs.boundaryField()[patchi]; + + if (pvf.coupled()) + { + fvm.internalCoeffs()[patchi] = + pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs); + fvm.boundaryCoeffs()[patchi] = + 
-pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs); + } + else + { + fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(); + fvm.boundaryCoeffs()[patchi] = -pGamma*pvf.gradientBoundaryCoeffs(); + } + } + + return tfvm; +} + + +template +tmp> +gaussLaplacianSchemeGammaSnGradCorr +( + const surfaceVectorField& SfGammaCorr, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tgammaSnGradCorr + ( + new GeometricField + ( + IOobject + ( + "gammaSnGradCorr("+vf.name()+')', + vf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + SfGammaCorr.dimensions() + *vf.dimensions()*mesh.deltaCoeffs().dimensions() + ) + ); + + for (direction cmpt = 0; cmpt < pTraits::nComponents; cmpt++) + { + tgammaSnGradCorr.ref().replace + ( + cmpt, + fvc::dotInterpolate(SfGammaCorr, fvc::grad(vf.component(cmpt))) + ); + } + + return tgammaSnGradCorr; +} + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + tmp> tinterpGammaScheme_(new linear(mesh)); + tmp> tsnGradScheme_(new fv::orthogonalSnGrad(mesh)); + + tmp> tgamma = tinterpGammaScheme_().interpolate(gammaScalarVol); + const GeometricField& gamma = tgamma.ref(); + + GeometricField gammaMagSf + ( + gamma*mesh.magSf() + ); + + tmp> tfvm = gaussLaplacianSchemeFvmLaplacianUncorrected + ( + gammaMagSf, + tsnGradScheme_().deltaCoeffs(vf), + vf + ); + fvMatrix& fvm = tfvm.ref(); + + if (tsnGradScheme_().corrected()) + { + if (mesh.fluxRequired(vf.name())) + { + fvm.faceFluxCorrectionPtr() = new + GeometricField + ( + gammaMagSf*tsnGradScheme_().correction(vf) + ); + + fvm.source() -= + mesh.V()* + fvc::div + ( + *fvm.faceFluxCorrectionPtr() + )().primitiveField(); + } + else + { + fvm.source() -= + mesh.V()* + fvc::div + ( + gammaMagSf*tsnGradScheme_().correction(vf) + )().primitiveField(); + } + } + return tfvm; +} + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const 
GeometricField& gamma, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + tmp> tsnGradScheme_(new fv::orthogonalSnGrad(mesh)); + + GeometricField gammaMagSf + ( + gamma*mesh.magSf() + ); + + tmp> tfvm = gaussLaplacianSchemeFvmLaplacianUncorrected + ( + gammaMagSf, + tsnGradScheme_().deltaCoeffs(vf), + vf + ); + fvMatrix& fvm = tfvm.ref(); + + if (tsnGradScheme_().corrected()) + { + if (mesh.fluxRequired(vf.name())) + { + fvm.faceFluxCorrectionPtr() = new + GeometricField + ( + gammaMagSf*tsnGradScheme_().correction(vf) + ); + + fvm.source() -= + mesh.V()* + fvc::div + ( + *fvm.faceFluxCorrectionPtr() + )().primitiveField(); + } + else + { + fvm.source() -= + mesh.V()* + fvc::div + ( + gammaMagSf*tsnGradScheme_().correction(vf) + )().primitiveField(); + } + } + return tfvm; +} + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options index e2a57bd00..bda93210e 100644 --- a/applications/solvers/dfLowMachFoam/Make/options +++ b/applications/solvers/dfLowMachFoam/Make/options @@ -9,7 +9,6 @@ EXE_INC = -std=c++14 \ $(PFLAGS) $(PINC) \ $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ - $(if $(AMGX_DIR),-DGPUSolver_,) \ -I$(LIB_SRC)/transportModels/compressible/lnInclude \ 
-I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ @@ -29,7 +28,8 @@ EXE_INC = -std=c++14 \ $(PYTHON_INC_DIR) \ $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ - $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) + $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ EXE_LIBS = \ -lcompressibleTransportModels \ @@ -43,6 +43,7 @@ EXE_LIBS = \ -ldfCanteraMixture \ -ldfChemistryModel \ -ldfCombustionModels \ + -ldfGenMatrix \ $(CANTERA_ROOT)/lib/libcantera.so \ $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \ $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \ diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index c3ee91068..38934abdb 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -86,6 +86,121 @@ // K = 0.5*magSqr(U); // } // UEqn_GPU.checkValue(true); +#elif defined GPUSolverNew_ + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + + // run CPU, for temp + tmp tUEqn + ( + fvm::ddt(rho, U) + + + fvm::div(phi, U) + + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + // run GPU + // preProcess + // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) + UEqn_GPU.sync(); + double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); + double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); + double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); + memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); + memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); + int offset = 0; + forAll(phi.boundaryField(), patchi) + { + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int 
patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; + } + UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); + DEBUG_TRACE; + + TICK_START; + // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() + double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); + double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); + double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); + double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); + double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); + double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); + double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); + TICK_STOP(get pointer); + + TICK_START; + U.oldTime(); + memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); + memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); + memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); + TICK_STOP(copy to pinned memory); + + TICK_START; + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; + } + TICK_STOP(CPU prepare boundary time); + + TICK_START; + 
UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU preProcess time); + + // process + TICK_START; + UEqn_GPU.process(); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU process time); + + TICK_START; + UEqn_GPU.solve(); + TICK_STOP(GPU solve time); + + // postProcess + TICK_START; + UEqn_GPU.postProcess(h_u); + U.correctBoundaryConditions(); + DEBUG_TRACE; + TICK_STOP(post process time); + + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + bool printFlag = false; + UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); + DEBUG_TRACE; #else start1 = std::clock(); tmp tUEqn diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H new file mode 100644 index 000000000..94fff1125 --- /dev/null +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -0,0 +1,97 @@ +dfMatrixDataBase dfDataBase; +//dfRhoEqn rhoEqn_GPU; +dfUEqn UEqn_GPU(dfDataBase); +//dfYEqn YEqn_GPU; +//dfEEqn EEqn_GPU; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, 
num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; + } + // TODO: get deltaT fomr time API + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); + + // prepare constant indexes: owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; + int offset = 0; + forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); + + int patchsize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); + offset += patchsize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + 
dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); + + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; + } + } + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +} + +void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) { + // prepare mode_string and setting_path + string mode_string = "dDDI"; + string settingPath; + settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + UEqn_GPU.setConstantValues(mode_string, settingPath); + + // prepare patch_type + std::vector patch_type; + patch_type.resize(dfDataBase.num_patches); + forAll(U.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(patch_type[patchi]), U.boundaryField()[patchi].type()); + } + UEqn_GPU.setConstantFields(patch_type); + + // prepare internal and boundary of xxx + UEqn_GPU.createNonConstantFieldsInternal(); + UEqn_GPU.createNonConstantFieldsBoundary(); + UEqn_GPU.createNonConstantLduAndCsrFields(); + // UEqn_GPU has no internal non-constant fields to be init + // UEqn_GPU.initNonConstantFieldsInternal(); + UEqn_GPU.initNonConstantFieldsBoundary(); +} diff --git 
a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index db6b25b18..6ea4251af 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -60,14 +60,34 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -#ifdef GPUSolver_ +#define GPUSolverNew_ +#define TIME + +#ifdef GPUSolverNew_ #include "dfUEqn.H" -#include "dfYEqn.H" -#include "dfRhoEqn.H" -#include "dfEEqn.H" +// #include "dfYEqn.H" +// #include "dfRhoEqn.H" +// #include "dfEEqn.H" +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" #include #include + +#include "createGPUSolver.H" + #include "upwind.H" +#include "GenFvMatrix.H" +#endif + +#ifdef TIME + #define TICK_START \ + start_new = std::clock(); + #define TICK_STOP(prefix) \ + stop_new = std::clock(); \ + Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl; +#else + #define TICK_START + #define TICK_STOP(prefix) #endif // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // @@ -148,6 +168,8 @@ int main(int argc, char *argv[]) label timeIndex = 0; clock_t start, end, start1, end1, start2, end2; + clock_t start_new, stop_new; + double time_new = 0; turbulence->validate(); @@ -158,9 +180,11 @@ int main(int argc, char *argv[]) } start1 = std::clock(); - #ifdef GPUSolver_ - #include "createdfSolver.H" - #endif +#ifdef GPUSolverNew_ + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); +#endif + end1 = std::clock(); time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); @@ -187,7 +211,9 @@ int main(int argc, char *argv[]) runTime++; Info<< "Time = " << runTime.timeName() << nl << endl; - +#ifdef GPUSolverNew_ + dfDataBase.preTimeStep(&rho.oldTime()[0]); +#endif clock_t loop_start = std::clock(); // --- Pressure-velocity PIMPLE corrector loop while (pimple.loop()) @@ -276,6 +302,10 @@ int main(int argc, 
char *argv[]) rho = thermo.rho(); +#ifdef GPUSolverNew_ + dfDataBase.postTimeStep(); +#endif + runTime.write(); Info<< "========Time Spent in diffenet parts========"<< endl; Info<< "loop Time = " << loop_time << " s" << endl; diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H new file mode 100644 index 000000000..41b804a4b --- /dev/null +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -0,0 +1,116 @@ +#ifdef GPUSolver_ +const tmp nuEff_tmp(turbulence->nuEff()); +const volScalarField& nuEff = nuEff_tmp(); + +// run CPU, for temp +tmp tUEqn +( + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) +); +// tmp tUEqn_ref // test turbulence->divDevRhoReff(U) +// ( +// - fvc::div((turbulence->rho()*turbulence->nuEff())*dev2(Foam::T(fvc::grad(U)))) +// - fvm::laplacian(turbulence->rho()*turbulence->nuEff(), U) +// ); + +fvVectorMatrix& UEqn = tUEqn.ref(); + +// run GPU +// preProcess +// TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) +UEqn_GPU.sync(); +double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); +double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); +double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); +memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); +memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); +int offset = 0; +forAll(phi.boundaryField(), patchi) +{ + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; +} +UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); +DEBUG_TRACE; +clock_t start = std::clock(); +// preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() +double *h_u = dfDataBase.getFieldPointer("u", 
location::cpu, position::internal); +double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); +double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); +double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); +double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); +double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); +double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); +double end = std::clock(); +Info << "get pointer" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); +memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); +memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); +memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); +end = std::clock(); +Info << "copy to pinned memory" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); +offset = 0; +forAll(U.boundaryField(), patchi) +{ + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; +} +end = std::clock(); +Info << "CPU prepare boundary time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); +UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); +DEBUG_TRACE; 
+UEqn_GPU.sync(); +end = std::clock(); +Info << "GPU preProcess time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +// process +start = std::clock(); +UEqn_GPU.process(); +end = std::clock(); +DEBUG_TRACE; +UEqn_GPU.sync(); +// end = std::clock(); +Info << "GPU process time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +// postProcess +UEqn_GPU.postProcess(h_u); +DEBUG_TRACE; + +// checkResult +// TODO: for temp, now we compare ldu, finally we compare csr +std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); +std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); +offset = 0; +for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) +{ + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; +} +bool printFlag = true; +UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); +DEBUG_TRACE; +#endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C new file mode 100644 index 000000000..7d867687f --- /dev/null +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -0,0 +1,113 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- 
+License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +Application + unittest + +Description + GPU unittest + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#include +#include +#include "upwind.H" + +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" +#include "GenFvMatrix.H" +#include "dfUEqn.H" +#include "createGPUSolver.H" + +#define GPUSolver_ + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" + #include "setInitialDeltaT.H" + } + + // * * * * * * * * * * * * * * 
* * * * * * * * * * * * * * * * * * * * * // + { + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); + + // for (int timestep = 0; timestep < 10; timestep++) { + dfDataBase.preTimeStep(&rho.oldTime()[0]); + #include "new_UEqn.H" + dfDataBase.postTimeStep(); + // } + } + return 0; +} + + diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 6e4a7efef..03a7fe6db 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -12,6 +12,8 @@ find_package(MPI REQUIRED) find_package(CUDAToolkit REQUIRED) find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) +add_compile_options(-arch=sm_70 -fmad=false) + include_directories( ${MPI_INCLUDE_PATH} ${CUDA_INCLUDE_DIRS} @@ -19,13 +21,11 @@ include_directories( ) add_library(${PROJECT_NAME} - SHARED - dfUEqn.cu - dfRhoEqn.cu - dfYEqn.cu - dfEEqn.cu + SHARED AmgXSolver.cu - dfMatrixDataBase.cu) + dfMatrixDataBase.cu + dfMatrixOpBase.cu + dfUEqn.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 8efb4bf62..cac7264a8 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -12,7 +12,9 @@ #include #include #include +#include +#define DEBUG_TRACE fprintf(stderr, "%s %d\n", __FILE__, __LINE__); static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); @@ -30,17 +32,29 @@ void check(T result, char const *const func, const char *const file, #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) -inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { +inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error, bool print = false) { for (size_t i = 0; i < count; ++i) { double abs_diff = fabs(basevec[i] - 
vec[i]); double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); + if (print) + fprintf(stderr, "index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) if (abs_diff > 1e-15 && rel_diff > max_relative_error) - fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); + fprintf(stderr, "mismatch index %d, cpu data: %.30lf, gpu data: %.30lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); } } +enum location { + cpu, + gpu +}; + +enum position { + internal, + boundary +}; + enum boundaryConditions{ zeroGradient, fixedValue, @@ -48,594 +62,147 @@ enum boundaryConditions{ empty }; -void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); +void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr); struct dfMatrixDataBase { - // - cuda resource + // cuda resource cudaStream_t stream; - // - number of cell size - int num_cells; - // - number of face size - int num_surfaces; - // - number of offdiagnal entry size (2*num_surfaces) - int num_faces; - // - number of boundary cells - int num_boundary_cells; - // - number of boundary faces - int num_boundary_faces; - - int num_species; - - // - mesh variables - // - csr_row_index - int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; - // - csr_col_index - int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; - // - csr_diag_index - int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; - - // - the pre-permutated and post-permutated interpolation weight list - std::vector h_weight_vec_init, h_weight_vec; - // - the pre-permutated and post-permutated flux (phi) list - std::vector h_phi_vec_init, h_phi_vec; - // - the pre-permutated and post-permutated cell face vector list - std::vector 
h_face_vector_vec_init, h_face_vector_vec; - std::vector h_face_vec_init, h_face_vec; - std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; - // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, - *h_pressure = nullptr; - const double *h_volume = nullptr; - // - the host pointer to the pre-permutated and post-permutated interpolation weight list - double *h_weight_init = nullptr, *h_weight = nullptr; - // - the host pointer to the pre-permutated and post-permutated flux (phi) list - double *h_phi_init = nullptr, *h_phi = nullptr; - // - the host pointer to the pre-permutated and post-permutated cell face vector list - double *h_face_vector_init = nullptr, *h_face_vector = nullptr; - double *h_face_init = nullptr, *h_face = nullptr; - double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; - // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, - *d_pressure = nullptr, *d_volume = nullptr; - // - the device pointer to Y(vector Yi) - //std::vector d_Y; - double *d_Y = nullptr; - // - the device pointer to the pre-permutated and post-permutated interpolation weight list - double *d_weight_init = nullptr, *d_weight = nullptr; - double *d_weight_upwind = nullptr; - // - the device pointer to the pre-permutated and post-permutated flux (phi) list - double *d_phi_init = nullptr, *d_phi = nullptr; - // - the device pointer to the pre-permutated and post-permutated cell face vector list - double *d_face_vector_init = nullptr, *d_face_vector = nullptr; - double *d_face_init = nullptr, *d_face = nullptr; - double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; - std::vector d_rhoD_vector; - - double *d_hDiffCorrFlux = nullptr; - double *d_diffAlphaD = nullptr; - double *d_rhoD = nullptr; - double *d_alpha = nullptr; - - double rdelta_t = 
1/1e-6; - - /** - * @brief boundary related variables - */ - int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; - int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; - double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, - *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, - *h_boundary_face = nullptr, *d_boundary_face = nullptr, - *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, - *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, - *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, - *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr, - *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, - *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, - *d_boundary_pressure_init = nullptr, - *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, - *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, - *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, - *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; - std::vector d_boundary_Y_vector; - std::vector d_boundary_Y_init_vector; - std::vector d_internal_coeffs_Y_vector; - std::vector d_boundary_coeffs_Y_vector; - std::vector d_laplac_internal_coeffs_Y_vector; - std::vector d_laplac_boundary_coeffs_Y_vector; - double *d_internal_coeffs_Y = nullptr; - double *d_boundary_coeffs_Y = nullptr; - double *d_laplac_internal_coeffs_Y = nullptr; - double *d_laplac_boundary_coeffs_Y = nullptr; - std::vector d_boundary_rhoD_vector; - double *d_boundary_mut_sct = nullptr; - double *d_boundary_rhoD = nullptr; - double *d_boundary_alpha = nullptr; - - double *d_boundary_hDiffCorrFlux = nullptr; - int *d_boundary_UpatchType = nullptr; - int *d_boundary_YpatchType = nullptr; - - std::vector boundPermutationList; - std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; - std::vector boundary_face_vector; - std::vector 
boundary_pressure; - std::vector boundary_face; - std::vector boundary_deltaCoeffs; - std::vector> patch_type_init; - std::vector> patch_type; - - // - the device pointer to the permutated index list - std::vector permedIndex; - int *d_permedIndex=nullptr; - int *d_bouPermedIndex = nullptr; - - - // bytesize - // - bytes of diagnal entries - size_t cell_bytes; - // - bytes of diagnal entries (vector) - size_t cell_vec_bytes; - // - bytes of diagnal index - size_t cell_index_bytes; - // - bytes of diagnal index - size_t face_bytes; - size_t face_vec_bytes; - size_t face_index_bytes; - - size_t boundary_cell_bytes; - size_t boundary_cell_vec_bytes; - size_t boundary_cell_index_bytes; - - size_t boundary_face_bytes; - size_t boundary_face_vec_bytes; - size_t boundary_face_index_bytes; - - // A_csr has one more element in each row: itself - size_t csr_row_index_bytes; - size_t csr_col_index_bytes; - size_t csr_value_bytes; - size_t csr_value_vec_bytes; - - // extra matrix information - double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; - std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; - std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; - std::vector tmpPermutatedList; - int * d_tmpPermutatedList = nullptr; - - // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; - // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; - - int num_iteration; - - double time_monitor_CPU; - double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; - - double* d_grad = nullptr; - double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; - double* d_nuEff = nullptr; + // constant values -- basic + int num_cells = 0; + int num_surfaces = 0; + int num_boundary_surfaces = 0; + int num_patches = 0; + int num_species = 0; + std::vector patch_size; + double rdelta_t = 0; + + // constant values -- ldu bytesize + size_t cell_value_bytes = 0; + size_t cell_value_vec_bytes = 0; + size_t 
cell_value_tsr_bytes = 0; + size_t cell_index_bytes = 0; + size_t surface_value_bytes = 0; + size_t surface_index_bytes = 0; + size_t surface_value_vec_bytes = 0; + size_t boundary_surface_value_bytes = 0; + size_t boundary_surface_value_vec_bytes = 0; + size_t boundary_surface_value_tsr_bytes = 0; + size_t boundary_surface_index_bytes = 0; + + // constant values -- csr bytesize + size_t csr_row_index_bytes = 0; + size_t csr_col_index_bytes = 0; + size_t csr_value_bytes = 0; + size_t csr_value_vec_bytes = 0; + + // constant indexes + int *d_owner = nullptr; + int *d_neighbor = nullptr; + int *d_lower_to_csr_index = nullptr; + int *d_diag_to_csr_index= nullptr; + int *d_upper_to_csr_index= nullptr; + int *d_csr_row_index= nullptr; + int *d_csr_col_index= nullptr; + + // constant fields - internal + double *d_sf = nullptr; + double *d_mag_sf = nullptr; + double *d_weight = nullptr; + double *d_delta_coeffs = nullptr; + double *d_volume = nullptr; + + // constant fields - boundary + double *d_boundary_sf = nullptr; + double *d_boundary_mag_sf = nullptr; + double *d_boundary_weight = nullptr; + double *d_boundary_delta_coeffs = nullptr; + int *d_boundary_face_cell = nullptr; + + // non-constant fields - internal + // TODO: further estimate + // fields solved by eqns - new + double *d_rho = nullptr; + double *d_u = nullptr; + double *d_y = nullptr; + double *d_he = nullptr; + double *d_p = nullptr; + // fields solved by eqns - old + // TODO: not all fields need to store oldTime + double *d_rho_old = nullptr; + //double *d_u_old = nullptr; + //double *d_y_old = nullptr; + //double *d_he_old = nullptr; + //double *d_p_old = nullptr; + // other shared fields between eqns + double *d_phi = nullptr; + // computed on GPU, used on CPU, need memcpyd2h - host + double *h_rho = nullptr; + double *h_rho_old = nullptr; + double *h_u= nullptr; + double *h_y= nullptr; + double *h_he= nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_p= nullptr; + double 
*h_phi= nullptr; + + // non-constant fields - boundary + // TODO: further estimate + // fields solved by eqns - new + double *d_boundary_rho = nullptr; + double *d_boundary_u = nullptr; + double *d_boundary_y = nullptr; + double *d_boundary_he = nullptr; + double *d_boundary_p = nullptr; + // fields solved by eqns - old + double *d_boundary_rho_old = nullptr; + //double *d_boundary_u_old = nullptr; + //double *d_boundary_y_old = nullptr; + //double *d_boundary_he_old = nullptr; + //double *d_boundary_p_old = nullptr; + // other shared fields between eqns + double *d_boundary_phi = nullptr; + // computed on GPU, used on CPU, need memcpyd2h - host + double *h_boundary_rho = nullptr; + double *h_boundary_rho_old = nullptr; + double *h_boundary_u= nullptr; + double *h_boundary_y= nullptr; + double *h_boundary_he= nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_boundary_p= nullptr; + double *h_boundary_phi= nullptr; + + std::unordered_map fieldPointerMap; // constructor dfMatrixDataBase(); - dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, - const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, - const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, - std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) - : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), - num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) - { - // create cuda stream - checkCudaErrors(cudaStreamCreate(&stream)); - - // allocate field pointer in pin memory - cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); - cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); - - h_weight_vec_init.resize(num_faces); - 
h_weight_vec.resize(num_faces); - h_face_vector_vec_init.resize(num_faces*3); - h_face_vector_vec.resize(num_faces*3); - h_face_vec_init.resize(num_faces); - h_face_vec.resize(num_faces); - h_deltaCoeffs_vec_init.resize(num_faces); - h_deltaCoeffs_vec.resize(num_faces); - h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); - h_turbSrc_init_1mtx.resize(num_faces + num_cells); - h_turbSrc_init_src_vec.resize(3*num_cells); - h_turbSrc_src_vec.resize(3*num_cells); - - // byte sizes - cell_bytes = num_cells * sizeof(double); - cell_vec_bytes = num_cells * 3 * sizeof(double); - cell_index_bytes = num_cells * sizeof(int); - - face_bytes = num_faces * sizeof(double); - face_vec_bytes = num_faces * 3 * sizeof(double); - face_index_bytes = num_faces * sizeof(int); - - // A_csr has one more element in each row: itself - csr_row_index_bytes = (num_cells + 1) * sizeof(int); - csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); - csr_value_bytes = (num_cells + num_faces) * sizeof(double); - csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); - - /************************construct mesh variables****************************/ - /** - * 1. 
h_csr_row_index & h_csr_diag_index - */ - std::vector h_mtxEntry_perRow_vec(num_cells); - std::vector h_csr_diag_index_vec(num_cells); - std::vector h_csr_row_index_vec(num_cells + 1, 0); - - for (int faceI = 0; faceI < num_surfaces; faceI++) - { - h_csr_diag_index_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[owner[faceI]]++; - } - - // - consider diagnal element in each row - std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) - {return n + 1;}); - // - construct h_csr_row_index & h_csr_diag_index - std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); - // - assign h_csr_row_index & h_csr_diag_index - h_A_csr_row_index = h_csr_row_index_vec.data(); - h_A_csr_diag_index = h_csr_diag_index_vec.data(); - - /** - * 2. h_csr_col_index - */ - std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); - std::iota(diagIndex.begin(), diagIndex.end(), 0); - - // initialize the RowIndex (rowIndex of lower + upper + diagnal) - std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); - std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); - // initialize the ColIndex (colIndex of lower + upper + diagnal) - std::copy(owner, owner + num_surfaces, colIndex.begin()); - std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); - - // - construct hashTable for sorting - std::multimap rowColPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); - } - // - sort - std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); - std::sort(globalPerm.begin(), globalPerm.end(), [] - (const std::pair& pair1, 
const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - std::vector h_csr_col_index_vec; - std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] - (const std::pair& pair) { - return pair.second; - }); - h_A_csr_col_index = h_csr_col_index_vec.data(); - - // construct a tmp permutated List for add fvMatrix - std::vector tmp_permutation(2*num_surfaces + num_cells); - std::vector tmp_rowIndex(2*num_surfaces + num_cells); - std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); - std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); - std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); - std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); - std::multimap tmpPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); - } - std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); - std::sort(tmpPerm.begin(), tmpPerm.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] - (const std::pair& pair) { - return pair.second; - }); - - /** - * 3. 
boundary imformations - */ - // get boundPermutation and offset lists - std::vector boundPermutationListInit(num_boundary_faces); - std::vector boundOffsetList; - std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); - - // - construct hashTable for sorting - std::multimap boundPermutation; - for (int i = 0; i < num_boundary_faces; i++) - { - boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); - } - - // - sort - std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); - std::sort(boundPermPair.begin(), boundPermPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - // - construct boundPermedIndex and boundary_cell_id - std::vector boundary_cell_id; - boundPermutationList.clear(); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] - (const std::pair& pair) { - return pair.first; - }); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] - (const std::pair& pair) { - return pair.second; - }); - - // construct boundary_cell_offset - std::map countMap; - std::vector boundaryCellcount; - for (const auto& cellIndex : boundary_cell_id) - ++ countMap[cellIndex]; - for (const auto& [cellIndex, count] : countMap) - boundaryCellcount.push_back(count); - - num_boundary_cells = boundaryCellcount.size(); - num_boundary_cells_output = num_boundary_cells; - - std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); - std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); - - // assign h_boundary_cell_offset & h_boundary_cell_id - h_boundary_cell_offset = boundary_cell_offset.data(); - h_boundary_cell_id = boundary_cell_id.data(); - - // - boundary_cell_bytes = num_boundary_cells * 
sizeof(double); - boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); - boundary_cell_index_bytes = num_boundary_cells * sizeof(int); - - boundary_face_bytes = num_boundary_faces * sizeof(double); - boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); - boundary_face_index_bytes = num_boundary_faces * sizeof(int); - - ueqn_internalCoeffs.resize(3*num_boundary_faces); - ueqn_boundaryCoeffs.resize(3*num_boundary_faces); - - boundary_face_vector.resize(3*num_boundary_faces); - boundary_pressure.resize(num_boundary_faces); - boundary_face.resize(num_boundary_faces); - boundary_deltaCoeffs.resize(num_boundary_faces); - - patch_type.resize(2); - patch_type[0].resize(num_boundary_faces); - patch_type[1].resize(num_boundary_faces); - - /** - * 4. permutation list for field variables - */ - std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); - // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) - std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); - std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); - - // - initialize the permIndex (0, 1, ..., 2*num_surfaces) - std::iota(permIndex.begin(), permIndex.end(), 0); - - // - construct hashTable for sorting - std::multimap permutation; - for (int i = 0; i < 2*num_surfaces; i++) - { - permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); - } - // - sort - std::vector> permPair(permutation.begin(), permutation.end()); - std::sort(permPair.begin(), permPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - // - form permedIndex list - std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] - (const std::pair& pair) { - return pair.second; - }); - - // copy and permutate cell variables - std::copy(weight, weight + 
num_surfaces, h_weight_vec_init.begin()); - std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); - std::copy(face, face + num_surfaces, h_face_vec_init.begin()); - std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); - for (int i = 0; i < num_faces; i++) - { - h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; - h_face_vec[i] = h_face_vec_init[permedIndex[i]]; - h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; - h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; - h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; - h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; - } - h_weight = h_weight_vec.data(); - h_face_vector = h_face_vector_vec.data(); - h_face = h_face_vec.data(); - h_deltaCoeffs = h_deltaCoeffs_vec.data(); - - for (int i = 0; i < num_boundary_faces; i++) - { - boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; - boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; - boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; - boundary_face[i] = boundary_face_init[boundPermutationList[i]]; - boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; - patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; - patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; - } - h_boundary_face_vector = boundary_face_vector.data(); - h_boundary_face = boundary_face.data(); - h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); - - 
/************************allocate memory on device****************************/ - int total_bytes = 0; - - checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); - total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - - //d_Y.resize(num_species); - d_rhoD_vector.resize(num_species); - d_boundary_Y_vector.resize(num_species); - d_boundary_Y_init_vector.resize(num_species); - d_internal_coeffs_Y_vector.resize(num_species); - d_boundary_coeffs_Y_vector.resize(num_species); - d_laplac_internal_coeffs_Y_vector.resize(num_species); - d_laplac_boundary_coeffs_Y_vector.resize(num_species); - d_boundary_rhoD_vector.resize(num_species); - - for (size_t i = 0; i < num_species; ++i){ - //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); - } - checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * 
num_species)); - checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes)); - total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); - total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, 
boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); - for (size_t i = 0; i < num_species; ++i){ - checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); - } - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes)); - - total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); - - // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); - // checkCudaErrors(cudaMalloc((void**)&d_b, 
cell_vec_bytes)); - // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); - total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); - - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); - total_bytes += (2*csr_value_bytes + cell_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); - total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); - total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename - checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); + // deconstructor + ~dfMatrixDataBase(); - fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + // member function + void setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, + int num_patches, std::vector patch_size, + int num_species, double rdelta_t); + void setConstantIndexes(const int *owner, const int *neighbor); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - 
checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + void createConstantFieldsInternal(); + void createConstantFieldsBoundary(); + void initConstantFieldsInternal(const double *sf, const double *mag_sf, + const double *weight, const double *delta_coeffs, const double *volume); + void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, + const double *boundary_delta_coeffs, const int *boundary_face_cell); - checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, 
stream)); + void createNonConstantFieldsInternal(); + void createNonConstantFieldsBoundary(); + void initNonConstantFieldsInternal(const double *y); + void initNonConstantFieldsBoundary(const double *boundary_y); - checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - }; + void preTimeStep(const double *rho_old); + void postTimeStep(); - ~dfMatrixDataBase(){ - std::cout << "Destructor called." << std::endl; - // TODO: free pointers - - }; + // getter + double* getFieldPointer(const char* fieldAlias, location loc, position pos); }; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index d4f5a7ab0..4e49faf99 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -1,8 +1,6 @@ #include "dfMatrixDataBase.H" - -void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, - const int patchSize) +void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr) { boundaryConditions patchCondition; std::vector tmpSelector; @@ -22,27 +20,315 @@ void constructBoundarySelector(std::vector& patchTypeSelector, const std::s switch (patchCondition){ case zeroGradient: { - tmpSelector.resize(patchSize, 0); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + *patchTypeSelector = 0; break; } case fixedValue: { - tmpSelector.resize(patchSize, 1); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + *patchTypeSelector = 1; break; } case empty: { - tmpSelector.resize(patchSize, 2); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), 
tmpSelector.end()); + *patchTypeSelector = 2; break; } case coupled: { - tmpSelector.resize(patchSize, 3); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + *patchTypeSelector = 3; break; } } } + +dfMatrixDataBase::dfMatrixDataBase() { + checkCudaErrors(cudaStreamCreate(&stream)); +} + +dfMatrixDataBase::~dfMatrixDataBase() { + // destroy cuda resources + checkCudaErrors(cudaStreamDestroy(stream)); + // TODO: free pointers +} + +void dfMatrixDataBase::setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, + int num_patches, std::vector patch_size, + int num_species, double rdelta_t) { + // constant values -- basic + this->num_cells = num_cells; + this->num_surfaces = num_surfaces; + this->num_boundary_surfaces = num_boundary_surfaces; + this->num_patches = num_patches; + this->patch_size = patch_size; + this->num_species = num_species; + this->rdelta_t = rdelta_t; + + // constant values -- ldu bytesize + cell_value_bytes = num_cells * sizeof(double); + cell_value_vec_bytes = num_cells * 3 * sizeof(double); + cell_value_tsr_bytes = num_cells * 9 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + surface_value_bytes = num_surfaces * sizeof(double); + surface_index_bytes = num_surfaces * sizeof(int); + surface_value_vec_bytes = num_surfaces * 3 * sizeof(double); + boundary_surface_value_bytes = num_boundary_surfaces * sizeof(double); + boundary_surface_value_vec_bytes = num_boundary_surfaces * 3 * sizeof(double); + boundary_surface_value_tsr_bytes = num_boundary_surfaces * 9 * sizeof(double); + boundary_surface_index_bytes = num_boundary_surfaces * sizeof(int); + + // constant values -- csr bytesize + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + csr_col_index_bytes = (num_cells + num_surfaces * 2) * sizeof(int); + csr_value_bytes = (num_cells + num_surfaces * 2) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_surfaces * 2) * 3 * sizeof(double); +} + +void 
dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) { + // build d_owner, d_neighbor + checkCudaErrors(cudaMalloc((void**)&d_owner, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_neighbor, surface_index_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_owner, owner, surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_neighbor, neighbor, surface_index_bytes, cudaMemcpyHostToDevice, stream)); + + + // build d_lower_to_csr_index, d_diag_to_csr_index, d_upper_to_csr_index + std::vector upperNum(num_cells, 0); + std::vector lowerNum(num_cells, 0); + std::vector lowerPermListInit(num_surfaces); + + int *upperOffset = (int*)calloc(num_cells + 1, sizeof(int)); + int *lowerOffset = (int*)calloc(num_cells + 1, sizeof(int)); + + for(int faceI = 0; faceI < num_surfaces; ++faceI){ + upperNum[owner[faceI]] ++; + lowerNum[neighbor[faceI]] ++; + } + std::partial_sum(upperNum.begin(), upperNum.end(), + upperOffset+1); + std::partial_sum(lowerNum.begin(), lowerNum.end(), + lowerOffset+1); + + std::iota(lowerPermListInit.begin(), lowerPermListInit.end(), 0); + + std::multimap permutation; + for (int i = 0; i < num_surfaces; ++i){ + permutation.insert(std::make_pair(neighbor[i], lowerPermListInit[i])); + } + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + std::vector lowerPermList; + std::transform(permPair.begin(), permPair.end(), std::back_inserter(lowerPermList), [] + (const std::pair& pair) { + return pair.second; + }); + + std::vector lowCSRIndex, uppCSRIndex, diagCSRIndex, CSRRowIndex, CSRColIndex; + int uppIndexInCSR = 0, uppIndexInLdu = 0, lowIndexInCSR = 0, lowIndexInLdu = 0, lowNumInLdu = 0; + CSRColIndex.resize(2 * num_surfaces + num_cells); + 
lowCSRIndex.resize(num_surfaces); + for (int i = 0; i < num_cells; ++i) { + int numUppPerRow = upperOffset[i + 1] - upperOffset[i]; + int numLowPerRow = lowerOffset[i + 1] - lowerOffset[i]; + int numNZBefore = upperOffset[i] + lowerOffset[i] + i; // add diag + // csr row index + CSRRowIndex.push_back(numNZBefore); + // upper + for (int j = 0; j < numUppPerRow; ++j) { + uppIndexInCSR = numNZBefore + numLowPerRow + 1 + j; // 1 means diag + uppCSRIndex.push_back(uppIndexInCSR); + CSRColIndex[uppIndexInCSR] = neighbor[uppIndexInLdu]; // fill upper entry in CSRColIndex + uppIndexInLdu ++; + } + // lower + for (int j = 0; j < numLowPerRow; ++j) { + lowIndexInCSR = numNZBefore + j; + lowIndexInLdu = lowerPermList[lowNumInLdu]; + lowCSRIndex[lowIndexInLdu] = lowIndexInCSR; + CSRColIndex[lowIndexInCSR] = owner[lowIndexInLdu]; // fill lower entry in CSRColIndex + lowNumInLdu ++; + } + // diag + int diagIndexInCSR = numNZBefore + numLowPerRow; + diagCSRIndex.push_back(diagIndexInCSR); + CSRColIndex[diagIndexInCSR] = i; // fill diag entry in CSRColIndex + } + CSRRowIndex.push_back(2 * num_surfaces + num_cells); + + checkCudaErrors(cudaMalloc((void**)&d_lower_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_lower_to_csr_index, lowCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_diag_to_csr_index, diagCSRIndex.data(), cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_upper_to_csr_index, uppCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + + + // build d_csr_row_index, d_csr_col_index + checkCudaErrors(cudaMalloc((void**)&d_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_csr_col_index, csr_col_index_bytes)); + 
checkCudaErrors(cudaMemcpyAsync(d_csr_row_index, CSRRowIndex.data(), csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_csr_col_index, CSRColIndex.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::createConstantFieldsInternal() { + checkCudaErrors(cudaMalloc((void**)&d_sf, surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_mag_sf, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes)); + fieldPointerMap["d_sf"] = d_sf; + fieldPointerMap["d_mag_sf"] = d_mag_sf; + fieldPointerMap["d_weight"] = d_weight; + fieldPointerMap["d_delta_coeffs"] = d_delta_coeffs; + fieldPointerMap["d_volume"] = d_volume; +} + +void dfMatrixDataBase::createConstantFieldsBoundary() { + checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_cell, boundary_surface_index_bytes)); + fieldPointerMap["d_boundary_sf"] = d_boundary_sf; + fieldPointerMap["d_boundary_mag_sf"] = d_boundary_mag_sf; + fieldPointerMap["d_boundary_delta_coeffs"] = d_boundary_delta_coeffs; +} + +void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, + const double *weight, const double *delta_coeffs, const double *volume) { + checkCudaErrors(cudaMemcpyAsync(d_sf, sf, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_mag_sf, mag_sf, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, weight, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + 
checkCudaErrors(cudaMemcpyAsync(d_delta_coeffs, delta_coeffs, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_volume, volume, cell_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, + const double *boundary_delta_coeffs, const int *boundary_face_cell) { + checkCudaErrors(cudaMemcpyAsync(d_boundary_sf, boundary_sf, boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_mag_sf, boundary_mag_sf, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_delta_coeffs, boundary_delta_coeffs, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face_cell, boundary_face_cell, boundary_surface_index_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::createNonConstantFieldsInternal() { + checkCudaErrors(cudaMalloc((void**)&d_rho, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_u, cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_y, cell_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_he, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_p, cell_value_bytes)); + fieldPointerMap["d_rho"] = d_rho; + fieldPointerMap["d_u"] = d_u; + fieldPointerMap["d_y"] = d_y; + fieldPointerMap["d_he"] = d_he; + fieldPointerMap["d_p"] = d_p; + + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + fieldPointerMap["d_rho_old"] = d_rho_old; + // checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); + // checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_phi, 
surface_value_bytes)); + fieldPointerMap["d_phi"] = d_phi; + + // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_rho_old, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species)); + checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); + fieldPointerMap["h_rho"] = h_rho; + fieldPointerMap["h_rho_old"] = h_rho_old; + fieldPointerMap["h_u"] = h_u; + fieldPointerMap["h_y"] = h_y; + fieldPointerMap["h_he"] = h_he; + + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_p, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_phi, surface_value_bytes)); + fieldPointerMap["h_p"] = h_p; + fieldPointerMap["h_phi"] = h_phi; +} + +void dfMatrixDataBase::createNonConstantFieldsBoundary() { + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_u, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_y, boundary_surface_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_he, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_p, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_rho"] = d_boundary_rho; + fieldPointerMap["d_boundary_u"] = d_boundary_u; + fieldPointerMap["d_boundary_y"] = d_boundary_y; + fieldPointerMap["d_boundary_he"] = d_boundary_he; + fieldPointerMap["d_boundary_p"] = d_boundary_p; + + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_rho_old"] = d_boundary_rho_old; + // checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, 
boundary_surface_value_bytes * num_species)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_phi"] = d_boundary_phi; + + // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho_old, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes)); + fieldPointerMap["h_boundary_rho"] = h_boundary_rho; + fieldPointerMap["h_boundary_rho_old"] = h_boundary_rho_old; + fieldPointerMap["h_boundary_u"] = h_boundary_u; + fieldPointerMap["h_boundary_y"] = h_boundary_y; + fieldPointerMap["h_boundary_he"] = h_boundary_he; + + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_boundary_p, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_phi, boundary_surface_value_bytes)); + fieldPointerMap["h_boundary_p"] = h_boundary_p; + fieldPointerMap["h_boundary_phi"] = h_boundary_phi; +} + +void dfMatrixDataBase::initNonConstantFieldsInternal(const double *y) { + checkCudaErrors(cudaMemcpyAsync(d_y, y, cell_value_bytes * num_species, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::initNonConstantFieldsBoundary(const double *boundary_y) { + checkCudaErrors(cudaMemcpyAsync(d_boundary_y, boundary_y, boundary_surface_value_bytes* num_species, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::preTimeStep(const double *rho_old) { + 
checkCudaErrors(cudaMemcpyAsync(d_rho_old, rho_old, cell_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::postTimeStep() {} + +double* dfMatrixDataBase::getFieldPointer(const char* fieldAlias, location loc, position pos) { + char mergedName[256]; + if (pos == position::internal) { + sprintf(mergedName, "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } else if (pos == position::boundary) { + sprintf(mergedName, "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } + + double *pointer = nullptr; + if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { + pointer = fieldPointerMap[std::string(mergedName)]; + } + if (pointer == nullptr) { + fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName); + } + //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer); + + return pointer; +} diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H new file mode 100644 index 000000000..71dd82c38 --- /dev/null +++ b/src_gpu/dfMatrixOpBase.H @@ -0,0 +1,88 @@ +#pragma once +// #define TIME_GPU + +// tools +void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); +void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); + +void field_multiply_scalar(cudaStream_t stream, + int num_cells, const double *input1, const double *input2, double *output, + int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output); + +void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source); + +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, + const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, + const double *lower, const double *upper, 
const double *diag, const double *source, + const double *internal_coeffs, const double *boundary_coeffs, + double *A, double *b, double *diag_vec); + +void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, + const int *patch_size, const int *patch_type, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs); + +// fvm ops + +void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source, double sign = 1.); + +void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_sourfaces, + const int *lowerAddr, const int *upperAddr, + const double *phi, const double *weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs, double sign = 1.); + +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs, double sign = 1.); + +// fvc ops +// fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). 
// Explicit Euler time derivative of a scalar field (adds into output).
void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
        const double *rho, const double *rho_old, const double *vf, const double *vf_old,
        double *output, double sign = 1.);

// Explicit gradient of a vector field (Gauss), incl. boundary correction.
void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
        const double *volume, const double *boundary_mag_Sf, double *boundary_output,
        const double *boundary_deltaCoeffs, double sign = 1.);

// Explicit divergence of a surface scalar flux.
void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
        const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face,
        const double *boundary_ssf, const double *volume, double *output, double sign = 1.);

// Explicit divergence of a cell-centred vector field.
void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
        const double *volume, double sign = 1.);

// Explicit divergence of a cell-centred tensor field.
void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
        const double *volume, double sign = 1.);

// Explicit gradient of a cell-centred scalar field.
void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign = 1.);

// others
void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2,
        int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2);
diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu
new file mode 100644
index 000000000..e3616fac3
--- /dev/null
+++ b/src_gpu/dfMatrixOpBase.cu
@@ -0,0 +1,1286 @@
#include "dfMatrixOpBase.H"
#include "dfMatrixDataBase.H"

// NOTE(review): the bracketed header name was lost in extraction; the event /
// profiler calls below suggest <cuda_runtime.h> — confirm against upstream.
#include <cuda_runtime.h>
#include "cuda_profiler_api.h"

// Optional per-kernel timing via CUDA events (enabled with -DTIME_GPU).
#ifdef TIME_GPU
    #define TICK_INIT_EVENT \
        float time_elapsed_kernel=0;\
        cudaEvent_t start_kernel, stop_kernel;\
        checkCudaErrors(cudaEventCreate(&start_kernel));\
        checkCudaErrors(cudaEventCreate(&stop_kernel));

    #define TICK_START_EVENT \
        checkCudaErrors(cudaEventRecord(start_kernel,0));

    #define TICK_END_EVENT(prefix) \
        checkCudaErrors(cudaEventRecord(stop_kernel,0));\
        checkCudaErrors(cudaEventSynchronize(start_kernel));\
        checkCudaErrors(cudaEventSynchronize(stop_kernel));\
        checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\
        printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel);
#else
    #define TICK_INIT_EVENT
    #define TICK_START_EVENT
    #define TICK_END_EVENT(prefix)
#endif

// No-op kernel used to warm the device up before timed runs.
__global__ void warmup(int num_cells)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;
}

// Repack a vector field from structure-of-arrays (device layout,
// [component][cell]) to array-of-structures ([cell][component]) for the host.
__global__ void permute_vector_d2h_kernel(int num_cells, const double *input, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[index * 3 + 0] = input[num_cells * 0 + index];
    output[index * 3 + 1] = input[num_cells * 1 + index];
    output[index * 3 + 2] = input[num_cells * 2 + index];
}

// Inverse of permute_vector_d2h_kernel: AoS (host) -> SoA (device).
__global__ void permute_vector_h2d_kernel(int num_cells, const double *input, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[num_cells * 0 + index] = input[index * 3 + 0];
    output[num_cells * 1 + index] = input[index * 3 + 1];
    output[num_cells * 2 + index] = input[index * 3 + 2];
}

// Element-wise product of two scalar fields, internal and boundary parts in
// one launch (grid must cover max(num_cells, num_boundary_surfaces)).
__global__ void field_multiply_scalar_kernel(int num_cells, int num_boundary_surfaces,
        const double *input1, const double *input2, double *output,
        const double *boundary_input1, const double *boundary_input2, double *boundary_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < num_cells) {
        output[index] = input1[index] * input2[index];
    }
    if (index < num_boundary_surfaces) {
        boundary_output[index] = boundary_input1[index] * boundary_input2[index];
    }
}

// Accumulate an fvc result (AoS vector) into the matrix source term.
// NOTE(review): the volume-weighted form is commented out upstream; the
// current contract adds fvc_output directly — confirm which is intended.
__global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, const double *fvc_output, double *source)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // source[index * 3 + i] += fvc_output[index * 3 + i] * volume[index];
    source[index * 3 + 0] += fvc_output[index * 3 + 0];
    source[index * 3 + 1] += fvc_output[index * 3 + 1];
    source[index * 3 + 2] += fvc_output[index * 3 + 2];
}

// zeroGradient patch: valueInternal=1, valueBoundary=0, gradInternal=0,
// gradBoundary=0 for all three components (SoA layout over boundary faces).
__global__ void update_boundary_coeffs_zeroGradient_vector(int num_boundary_surfaces, int num, int offset,
        double *value_internal_coeffs, double *value_boundary_coeffs,
        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    value_internal_coeffs[num_boundary_surfaces * 0 + start_index] = 1;
    value_internal_coeffs[num_boundary_surfaces * 1 + start_index] = 1;
    value_internal_coeffs[num_boundary_surfaces * 2 + start_index] = 1;
    value_boundary_coeffs[num_boundary_surfaces * 0 + start_index] = 0;
    value_boundary_coeffs[num_boundary_surfaces * 1 + start_index] = 0;
    value_boundary_coeffs[num_boundary_surfaces * 2 + start_index] = 0;
    gradient_internal_coeffs[num_boundary_surfaces * 0 + start_index] = 0;
    gradient_internal_coeffs[num_boundary_surfaces * 1 + start_index] = 0;
    gradient_internal_coeffs[num_boundary_surfaces * 2 + start_index] = 0;
    gradient_boundary_coeffs[num_boundary_surfaces * 0 + start_index] = 0;
    gradient_boundary_coeffs[num_boundary_surfaces * 1 + start_index] = 0;
    gradient_boundary_coeffs[num_boundary_surfaces * 2 + start_index] = 0;
}

// In-place vf2 := vf1 * dev2(vf2^T): transpose vf2, subtract (2/3)*trace on
// the diagonal and scale every component by vf1 (SoA tensor layout).
__global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf2)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    double scale = vf1[index];
    double val_xx = vf2[num * 0 + index];
    double val_xy = vf2[num * 1 + index];
    double val_xz = vf2[num * 2 + index];
    double val_yx = vf2[num * 3 + index];
    double val_yy = vf2[num * 4 + index];
    double val_yz = vf2[num * 5 + index];
    double val_zx = vf2[num * 6 + index];
    double val_zy = vf2[num * 7 + index];
    double val_zz = vf2[num * 8 + index];
    double trace_coeff = (2. / 3.) * (val_xx + val_yy + val_zz);
    vf2[num * 0 + index] = scale * (val_xx - trace_coeff);
    vf2[num * 1 + index] = scale * val_yx;
    vf2[num * 2 + index] = scale * val_zx;
    vf2[num * 3 + index] = scale * val_xy;
    vf2[num * 4 + index] = scale * (val_yy - trace_coeff);
    vf2[num * 5 + index] = scale * val_zy;
    vf2[num * 6 + index] = scale * val_xz;
    vf2[num * 7 + index] = scale * val_yz;
    vf2[num * 8 + index] = scale * (val_zz - trace_coeff);
}

// fvm::ddt for a vector field: diag += rDeltaT*rho*V,
// source += rDeltaT*rho_old*vf*V (per component, SoA), both scaled by sign.
__global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT,
        const double *rho, const double *rho_old, const double *vf, const double *volume,
        double *diag, double *source, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];
    double rho_old_kernel = rho_old[index];

    diag[index] += rDeltaT * rho[index] * vol * sign;
    // TODO: skip moving
    source[num_cells * 0 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 0 + index] * vol * sign;
    source[num_cells * 1 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 1 + index] * vol * sign;
    source[num_cells * 2 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 2 + index] * vol * sign;
}

// fvm::div internal faces: lower = -w*phi, upper = (1-w)*phi; the negated
// off-diagonals are accumulated onto owner/neighbor diagonals atomically
// (several faces touch the same cell).
__global__ void fvm_div_vector_internal(int num_surfaces,
        const int *lower_index, const int *upper_index,
        const double *phi, const double *weight,
        double *lower, double *upper, double *diag, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double f = phi[index];

    double lower_value = (-w) * f * sign;
    double upper_value = (1 - w) * f * sign;
    lower[index] += lower_value;
    upper[index] += upper_value;

    int owner = lower_index[index];
    int neighbor = upper_index[index];
    atomicAdd(&(diag[owner]), -lower_value);
    atomicAdd(&(diag[neighbor]), -upper_value);
}

// fvm::div patch faces: fold boundary flux through the precomputed
// value coefficients into internal/boundary coeffs (SoA, 3 components).
// TODO: modify the data structure of internal and boundary coeffs
__global__ void fvm_div_vector_boundary(int num_boundary_surfaces, int num, int offset,
        const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs,
        double *internal_coeffs, double *boundary_coeffs, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    double boundary_f = boundary_phi[start_index];
    internal_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 0 + start_index] * sign;
    internal_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 1 + start_index] * sign;
    internal_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 2 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 0 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 1 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 2 + start_index] * sign;
}

// fvm::laplacian internal faces: face gamma is linearly interpolated with w,
// multiplied by |Sf| and the delta coefficient.
__global__ void fvm_laplacian_vector_internal(int num_surfaces,
        const int *lower_index, const int *upper_index,
        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma,
        double *lower, double *upper, double *diag, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double w = weight[index];
    double upper_face_gamma = w * gamma[owner] + (1 - w) * gamma[neighbor];
    double upper_value = upper_face_gamma * mag_sf[index] * delta_coeffs[index];

    // laplacian doesn't use the original lower, but use lower = upper
    //double lower_face_gamma = w * gamma[neighbor] + (1 - w) * gamma[owner];
    //double lower_value = lower_face_gamma * mag_sf[index] * delta_coeffs[index];
    double lower_value = upper_value;

    lower_value = lower_value * sign;
    upper_value = upper_value * sign;

    lower[index] += lower_value;
    upper[index] += upper_value;

    atomicAdd(&(diag[owner]), -lower_value);
    atomicAdd(&(diag[neighbor]), -upper_value);
}

// fvm::laplacian patch faces: gamma*|Sf| folded through the gradient
// coefficients into internal/boundary coeffs (SoA, 3 components).
__global__ void fvm_laplacian_vector_boundary(int num_boundary_surfaces, int num, int offset,
        const double *boundary_mag_sf, const double *boundary_gamma,
        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
        double *internal_coeffs, double *boundary_coeffs, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    double boundary_value = boundary_gamma[start_index] * boundary_mag_sf[start_index];
    internal_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 0 + start_index] * sign;
    internal_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 1 + start_index] * sign;
    internal_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 2 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 0 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 1 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 2 + start_index] * sign;
}

// fvc::ddt scalar: output += rDeltaT * (rho*vf - rho_old*vf_old) * sign.
// Numerical note (from upstream): with rho==rho_old and vf==vf_old the fused
// multiply-add can leave a tiny non-zero residue that rDeltaT then amplifies,
// breaking CPU/GPU comparison. Workarounds tried: printing the intermediates
// (way1) or making them volatile (way2) — both defeat fma fusion. The adopted
// fix (way3) is to build with nvcc -fmad=false.
__global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT,
        const double *rho, const double *rho_old, const double *vf, const double *vf_old,
        double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;
    // workaround way3 (use nvcc option -fmad=false)
    output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]) * sign;
}

// fvc::grad internal faces (Gauss): face value ssf = w*(own-nei)+nei per
// component, outer product with Sf, accumulated +owner/-neighbor atomically
// into the SoA tensor output.
__global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces,
        const int *lower_index, const int *upper_index, const double *face_vector,
        const double *weight, const double *field_vector,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double Sfx = face_vector[index * 3 + 0];
    double Sfy = face_vector[index * 3 + 1];
    double Sfz = face_vector[index * 3 + 2];

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double ssfx = (w * (field_vector[num_cells * 0 + owner] - field_vector[num_cells * 0 + neighbor]) + field_vector[num_cells * 0 + neighbor]);
    double ssfy = (w * (field_vector[num_cells * 1 + owner] - field_vector[num_cells * 1 + neighbor]) + field_vector[num_cells * 1 + neighbor]);
    double ssfz = (w * (field_vector[num_cells * 2 + owner] - field_vector[num_cells * 2 + neighbor]) + field_vector[num_cells * 2 + neighbor]);

    double grad_xx = Sfx * ssfx;
    double grad_xy = Sfx * ssfy;
    double grad_xz = Sfx * ssfz;
    double grad_yx = Sfy * ssfx;
    double grad_yy = Sfy * ssfy;
    double grad_yz = Sfy * ssfz;
    double grad_zx = Sfz * ssfx;
    double grad_zy = Sfz * ssfy;
    double grad_zz = Sfz * ssfz;

    // owner/neighbor updates interleaved per component
    atomicAdd(&(output[num_cells * 0 + owner]), grad_xx);
    atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx);
    atomicAdd(&(output[num_cells * 1 + owner]), grad_xy);
    atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy);
    atomicAdd(&(output[num_cells * 2 + owner]), grad_xz);
    atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz);
    atomicAdd(&(output[num_cells * 3 + owner]), grad_yx);
    atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx);
    atomicAdd(&(output[num_cells * 4 + owner]), grad_yy);
    atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy);
    atomicAdd(&(output[num_cells * 5 + owner]), grad_yz);
    atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz);
    atomicAdd(&(output[num_cells * 6 + owner]), grad_zx);
    atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx);
    atomicAdd(&(output[num_cells * 7 + owner]), grad_zy);
    atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy);
    atomicAdd(&(output[num_cells * 8 + owner]), grad_zz);
    atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz);
}

// update boundary of interpolation field
// calculate the grad field
// TODO: this function is implemented for uncoupled boundary conditions
// so it should use the more specific func name
__global__ void fvc_grad_vector_boundary(int num_cells, int num,
        int offset, const int *face2Cells, const double *boundary_face_vector,
        const double *boundary_field_vector, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    double bouSfx = boundary_face_vector[start_index * 3 + 0];
    double bouSfy = boundary_face_vector[start_index * 3 + 1];
    double bouSfz = boundary_face_vector[start_index * 3 + 2];

    double boussfx = boundary_field_vector[start_index * 3 + 0];
    double boussfy = boundary_field_vector[start_index * 3 + 1];
    double boussfz = boundary_field_vector[start_index * 3 + 2];

    int cellIndex = face2Cells[start_index];

    double grad_xx = bouSfx * boussfx;
    double grad_xy = bouSfx * boussfy;
    double grad_xz = bouSfx * boussfz;
    double grad_yx = bouSfy * boussfx;
    double grad_yy = bouSfy * boussfy;
    double grad_yz = bouSfy * boussfz;
    double grad_zx = bouSfz * boussfx;
    double grad_zy = bouSfz * boussfy;
    double grad_zz = bouSfz * boussfz;

    atomicAdd(&(output[num_cells * 0 + cellIndex]), grad_xx);
    atomicAdd(&(output[num_cells * 1 + cellIndex]), grad_xy);
    atomicAdd(&(output[num_cells * 2 + cellIndex]), grad_xz);
    atomicAdd(&(output[num_cells * 3 + cellIndex]), grad_yx);
    atomicAdd(&(output[num_cells * 4 + cellIndex]), grad_yy);
    atomicAdd(&(output[num_cells * 5 + cellIndex]), grad_yz);
    atomicAdd(&(output[num_cells * 6 + cellIndex]), grad_zx);
    atomicAdd(&(output[num_cells * 7 + cellIndex]), grad_zy);
    atomicAdd(&(output[num_cells * 8 + cellIndex]), grad_zz);
}

// fvc::grad of a scalar, internal faces: interpolated face value times Sf,
// accumulated +owner/-neighbor into the SoA vector output (sign pre-applied).
__global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces,
        const int *lower_index, const int *upper_index, const double *face_vector,
        const double *weight, const double *vf, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double Sfx = face_vector[index * 3 + 0];
    double Sfy = face_vector[index * 3 + 1];
    double Sfz = face_vector[index * 3 + 2];

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double ssf = (w * (vf[owner] - vf[neighbor]) + vf[neighbor]);

    double grad_x = Sfx * ssf * sign;
    double grad_y = Sfy * ssf * sign;
    double grad_z = Sfz * ssf * sign;

    // owner
    atomicAdd(&(output[num_cells * 0 + owner]), grad_x);
    atomicAdd(&(output[num_cells * 1 + owner]), grad_y);
    atomicAdd(&(output[num_cells * 2 + owner]), grad_z);

    // neighbour
    atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x);
    atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y);
    atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z);
}

// fvc::grad of a scalar, patch faces: boundary value times boundary Sf,
// accumulated onto the owning cell.
__global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, const int *face2Cells,
        const double *boundary_face_vector, const double *boundary_vf, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    double bouvf = boundary_vf[start_index];
    double bouSfx = boundary_face_vector[start_index * 3 + 0];
    double bouSfy = boundary_face_vector[start_index * 3 + 1];
    double bouSfz = boundary_face_vector[start_index * 3 + 2];

    int cellIndex = face2Cells[start_index];

    double grad_x = bouSfx * bouvf;
    double grad_y = bouSfy * bouvf;
    double grad_z = bouSfz * bouvf;

    atomicAdd(&(output[num_cells * 0 + cellIndex]), grad_x * sign);
    atomicAdd(&(output[num_cells * 1 + cellIndex]), grad_y * sign);
    atomicAdd(&(output[num_cells * 2 + cellIndex]), grad_z * sign);
}

// Divide a tensor field (SoA layout) by cell volume and apply sign in place.
__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];
    output[num_cells * 0 + index] = output[num_cells * 0 + index] / vol * sign;
    output[num_cells * 1 + index] = output[num_cells * 1 + index] / vol * sign;
    output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol * sign;
    output[num_cells * 3 + index] = output[num_cells * 3 + index] / vol * sign;
    output[num_cells * 4 + index] = output[num_cells * 4 + index] / vol * sign;
    output[num_cells * 5 + index] = output[num_cells * 5 + index] / vol * sign;
    output[num_cells * 6 + index] = output[num_cells * 6 + index] / vol * sign;
    output[num_cells * 7 + index] = output[num_cells * 7 + index] / vol * sign;
    output[num_cells * 8 + index] = output[num_cells * 8 + index] / vol * sign;
}

// Divide a vector field by cell volume and apply sign in place.
// NOTE(review): this variant indexes AoS ([cell*3+c]) while the tensor variant
// above is SoA — looks intentional per caller layout, but verify.
__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];

    output[index * 3 + 0] = output[index * 3 + 0] / vol * sign;
    output[index * 3 + 1] = output[index * 3 + 1] / vol * sign;
    output[index * 3 + 2] = output[index * 3 + 2] / vol * sign;
}

// Divide a scalar field by cell volume and apply sign in place.
__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];

    output[index] = output[index] / vol * sign;
}

// Correct the boundary gradient for zeroGradient patches: replace the
// surface-normal part of the extrapolated cell gradient so the snGrad is 0.
// (Dead loads of vf removed — the zeroGradient correction never uses them.)
__global__ void fvc_grad_vector_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces,
        int num, int offset, const int *face2Cells,
        const double *internal_grad, const double *vf, const double *boundary_sf,
        const double *boundary_mag_sf, double *boundary_grad, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    int cellIndex = face2Cells[start_index];

    double grad_xx = internal_grad[num_cells * 0 + cellIndex];
    double grad_xy = internal_grad[num_cells * 1 + cellIndex];
    double grad_xz = internal_grad[num_cells * 2 + cellIndex];
    double grad_yx = internal_grad[num_cells * 3 + cellIndex];
    double grad_yy = internal_grad[num_cells * 4 + cellIndex];
    double grad_yz = internal_grad[num_cells * 5 + cellIndex];
    double grad_zx = internal_grad[num_cells * 6 + cellIndex];
    double grad_zy = internal_grad[num_cells * 7 + cellIndex];
    double grad_zz = internal_grad[num_cells * 8 + cellIndex];

    double n_x = boundary_sf[start_index * 3 + 0] / boundary_mag_sf[start_index];
    double n_y = boundary_sf[start_index * 3 + 1] / boundary_mag_sf[start_index];
    double n_z = boundary_sf[start_index * 3 + 2] / boundary_mag_sf[start_index];

    double grad_correction_x = - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); // sn_grad_x = 0
    double grad_correction_y = - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy);
    double grad_correction_z = - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz);

    boundary_grad[num_boundary_surfaces * 0 + start_index] = (grad_xx + n_x * grad_correction_x) * sign;
    boundary_grad[num_boundary_surfaces * 1 + start_index] = (grad_xy + n_x * grad_correction_y) * sign;
    boundary_grad[num_boundary_surfaces * 2 + start_index] = (grad_xz + n_x * grad_correction_z) * sign;
    boundary_grad[num_boundary_surfaces * 3 + start_index] = (grad_yx + n_y * grad_correction_x) * sign;
    boundary_grad[num_boundary_surfaces * 4 + start_index] = (grad_yy + n_y * grad_correction_y) * sign;
    boundary_grad[num_boundary_surfaces * 5 + start_index] = (grad_yz + n_y * grad_correction_z) * sign;
    boundary_grad[num_boundary_surfaces * 6 + start_index] = (grad_zx + n_z * grad_correction_x) * sign;
    boundary_grad[num_boundary_surfaces * 7 + start_index] = (grad_zy + n_z * grad_correction_y) * sign;
    boundary_grad[num_boundary_surfaces * 8 + start_index] = (grad_zz + n_z * grad_correction_z) * sign;
}

// Correct the boundary gradient for fixedValue patches: the surface-normal
// gradient is reconstructed from deltaCoeffs*(boundary value - cell value).
// (Dead loads of vfx/vfy/vfz removed — vf is read directly below.)
// NOTE(review): this kernel indexes internal_grad/boundary_grad as AoS
// ([cell*9+c] / [face*9+c]) while the zeroGradient variant and
// fvc_grad_vector_internal use SoA ([num_cells*c+cell]) — if both consume the
// same buffers one of the two layouts is wrong; confirm with the callers.
__global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const int *face2Cells,
        const double *internal_grad, const double *vf, const double *boundary_sf,
        const double *boundary_mag_sf, double *boundary_grad,
        const double *boundary_deltaCoeffs, const double *boundary_vf, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    int cellIndex = face2Cells[start_index];

    double grad_xx = internal_grad[cellIndex * 9 + 0];
    double grad_xy = internal_grad[cellIndex * 9 + 1];
    double grad_xz = internal_grad[cellIndex * 9 + 2];
    double grad_yx = internal_grad[cellIndex * 9 + 3];
    double grad_yy = internal_grad[cellIndex * 9 + 4];
    double grad_yz = internal_grad[cellIndex * 9 + 5];
    double grad_zx = internal_grad[cellIndex * 9 + 6];
    double grad_zy = internal_grad[cellIndex * 9 + 7];
    double grad_zz = internal_grad[cellIndex * 9 + 8];

    double n_x = boundary_sf[start_index * 3 + 0] / boundary_mag_sf[start_index];
    double n_y = boundary_sf[start_index * 3 + 1] / boundary_mag_sf[start_index];
    double n_z = boundary_sf[start_index * 3 + 2] / boundary_mag_sf[start_index];

    // sn_grad: solving according to fixedValue BC
    double sn_grad_x = boundary_deltaCoeffs[start_index] * (boundary_vf[start_index * 3 + 0] - vf[cellIndex * 3 + 0]);
    double sn_grad_y = boundary_deltaCoeffs[start_index] * (boundary_vf[start_index * 3 + 1] - vf[cellIndex * 3 + 1]);
    double sn_grad_z = boundary_deltaCoeffs[start_index] * (boundary_vf[start_index * 3 + 2] - vf[cellIndex * 3 + 2]);

    double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx);
    double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy);
    double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz);

    boundary_grad[start_index * 9 + 0] = (grad_xx + n_x * grad_correction_x) * sign;
    boundary_grad[start_index * 9 + 1] = (grad_xy + n_x * grad_correction_y) * sign;
    boundary_grad[start_index * 9 + 2] = (grad_xz + n_x * grad_correction_z) * sign;
    boundary_grad[start_index * 9 + 3] = (grad_yx + n_y * grad_correction_x) * sign;
    boundary_grad[start_index * 9 + 4] = (grad_yy + n_y * grad_correction_y) * sign;
    boundary_grad[start_index * 9 + 5] = (grad_yz + n_y * grad_correction_z) * sign;
    boundary_grad[start_index * 9 + 6] = (grad_zx + n_z * grad_correction_x) * sign;
    boundary_grad[start_index * 9 + 7] = (grad_zy + n_z * grad_correction_y) * sign;
    boundary_grad[start_index * 9 + 8] = (grad_zz + n_z * grad_correction_z) * sign;
}

// fvc::div of a surface flux, internal faces: +ssf to owner, -ssf to neighbor.
__global__ void fvc_div_surface_scalar_internal(int num_surfaces,
        const int *lower_index, const int *upper_index, const double *ssf,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double issf = ssf[index];

    // owner
    atomicAdd(&(output[owner]), issf);

    // neighbor
    atomicAdd(&(output[neighbor]), -issf);
}

// fvc::div of a surface flux, boundary faces: add each face's flux onto its
// owning cell.
__global__ void fvc_div_surface_scalar_boundary(int num_boundary_face, const int *face2Cells,
        const double *boundary_ssf, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_face)
        return;

    int cellIndex = face2Cells[index];

    atomicAdd(&(output[cellIndex]), boundary_ssf[index]);
}

// fvc::div of a cell vector, internal faces: dot the interpolated face value
// (AoS layout) with Sf; +owner / -neighbor.
__global__ void fvc_div_cell_vector_internal(int num_surfaces,
        const int *lower_index, const int *upper_index,
        const double *field_vector, const double *weight, const double *face_vector,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double Sfx = face_vector[index * 3 + 0];
    double Sfy = face_vector[index * 3 + 1];
    double Sfz = face_vector[index * 3 + 2];

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double ssfx = (w * (field_vector[owner * 3 + 0] - field_vector[neighbor * 3 + 0]) + field_vector[neighbor * 3 + 0]);
    double ssfy = (w * (field_vector[owner * 3 + 1] - field_vector[neighbor * 3 + 1]) + field_vector[neighbor * 3 + 1]);
    double ssfz = (w * (field_vector[owner * 3 + 2] - field_vector[neighbor * 3 + 2]) + field_vector[neighbor * 3 + 2]);

    double div = Sfx * ssfx + Sfy * ssfy + Sfz * ssfz;

    // owner
    atomicAdd(&(output[owner]), div);

    // neighbour
    atomicAdd(&(output[neighbor]), -div);
}

// fvc::div of a cell vector, boundary faces: boundary value dotted with
// boundary Sf, added to the owning cell.
__global__ void fvc_div_cell_vector_boundary(int num, int offset, const int *face2Cells,
        const double *boundary_face_vector, const double *boundary_field_vector, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    double bouSfx = boundary_face_vector[start_index * 3 + 0];
    double bouSfy = boundary_face_vector[start_index * 3 + 1];
    double bouSfz = boundary_face_vector[start_index * 3 + 2];

    double boussfx = boundary_field_vector[start_index * 3 + 0];
    double boussfy = boundary_field_vector[start_index * 3 + 1];
    double boussfz = boundary_field_vector[start_index * 3 + 2];

    int cellIndex = face2Cells[start_index];

    double bouDiv = bouSfx * boussfx + bouSfy * boussfy + bouSfz * boussfz;

    atomicAdd(&(output[cellIndex]), bouDiv);
}

// fvc::div of a cell tensor, internal faces: interpolate all 9 components
// (SoA layout), contract with Sf row-wise; +owner / -neighbor on the SoA
// vector output, sign pre-applied.
__global__ void fvc_div_cell_tensor_internal(int num_cells, int num_surfaces,
        const int *lower_index, const int *upper_index,
        const double *vf, const double *weight, const double *face_vector,
        double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double Sfx = face_vector[index * 3 + 0];
    double Sfy = face_vector[index * 3 + 1];
    double Sfz = face_vector[index * 3 + 2];
    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double ssf_xx = (w * (vf[num_cells * 0 + owner] - vf[num_cells * 0 + neighbor]) + vf[num_cells * 0 + neighbor]);
    double ssf_xy = (w * (vf[num_cells * 1 + owner] - vf[num_cells * 1 + neighbor]) + vf[num_cells * 1 + neighbor]);
    double ssf_xz = (w * (vf[num_cells * 2 + owner] - vf[num_cells * 2 + neighbor]) + vf[num_cells * 2 + neighbor]);
    double ssf_yx = (w * (vf[num_cells * 3 + owner] - vf[num_cells * 3 + neighbor]) + vf[num_cells * 3 + neighbor]);
    double ssf_yy = (w * (vf[num_cells * 4 + owner] - vf[num_cells * 4 + neighbor]) + vf[num_cells * 4 + neighbor]);
    double ssf_yz = (w * (vf[num_cells * 5 + owner] - vf[num_cells * 5 + neighbor]) + vf[num_cells * 5 + neighbor]);
    double ssf_zx = (w * (vf[num_cells * 6 + owner] - vf[num_cells * 6 + neighbor]) + vf[num_cells * 6 + neighbor]);
    double ssf_zy = (w * (vf[num_cells * 7 + owner] - vf[num_cells * 7 + neighbor]) + vf[num_cells * 7 + neighbor]);
    double ssf_zz = (w * (vf[num_cells * 8 + owner] - vf[num_cells * 8 + neighbor]) + vf[num_cells * 8 + neighbor]);
    double div_x = (Sfx * ssf_xx + Sfy * ssf_yx + Sfz * ssf_zx) * sign;
    double div_y = (Sfx * ssf_xy + Sfy * ssf_yy + Sfz * ssf_zy) * sign;
    double div_z = (Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz) * sign;

    // owner
    atomicAdd(&(output[num_cells * 0 + owner]), div_x);
    atomicAdd(&(output[num_cells * 1 + owner]), div_y);
    atomicAdd(&(output[num_cells * 2 + owner]), div_z);

    // neighbour
    atomicAdd(&(output[num_cells * 0 + neighbor]), -div_x);
    atomicAdd(&(output[num_cells * 1 + neighbor]), -div_y);
    atomicAdd(&(output[num_cells * 2 + neighbor]), -div_z);
}

__global__ void fvc_div_cell_tensor_boundary(int num_cells, int num_boundary_faces, int num, int offset, const int *face2Cells,
        const double *boundary_face_vector, const double *boundary_vf, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int 
start_index = offset + index; + + double bouSfx = boundary_face_vector[start_index * 3 + 0]; + double bouSfy = boundary_face_vector[start_index * 3 + 1]; + double bouSfz = boundary_face_vector[start_index * 3 + 2]; + + double boussf_xx = boundary_vf[num_boundary_faces * 0 + start_index]; + double boussf_xy = boundary_vf[num_boundary_faces * 1 + start_index]; + double boussf_xz = boundary_vf[num_boundary_faces * 2 + start_index]; + double boussf_yx = boundary_vf[num_boundary_faces * 3 + start_index]; + double boussf_yy = boundary_vf[num_boundary_faces * 4 + start_index]; + double boussf_yz = boundary_vf[num_boundary_faces * 5 + start_index]; + double boussf_zx = boundary_vf[num_boundary_faces * 6 + start_index]; + double boussf_zy = boundary_vf[num_boundary_faces * 7 + start_index]; + double boussf_zz = boundary_vf[num_boundary_faces * 8 + start_index]; + int cellIndex = face2Cells[start_index]; + + double bouDiv_x = (bouSfx * boussf_xx + bouSfy * boussf_yx + bouSfz * boussf_zx) * sign; + double bouDiv_y = (bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy) * sign; + double bouDiv_z = (bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz) * sign; + + atomicAdd(&(output[num_cells * 0 + cellIndex]), bouDiv_x); + atomicAdd(&(output[num_cells * 1 + cellIndex]), bouDiv_y); + atomicAdd(&(output[num_cells * 2 + cellIndex]), bouDiv_z); + + // if (cellIndex == 0) + // { + // // printf("gpu output[0] = %.5e, %.5e, %.5e\n", output[0], output[1], output[2]); + // // printf("gpu output[0] += %.5e, %.5e, %.5e\n", bouDiv_x, bouDiv_y, bouDiv_z); + // printf("gpu bouvf[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", + // boussf_xx, boussf_xy, boussf_xz, boussf_yx, boussf_yy, boussf_yz, boussf_zx, boussf_zy, boussf_zz); + // printf("gpu bouSf[0] = (%.5e, %.5e, %.5e)\n", bouSfx, bouSfy, bouSfz); + // printf("gpu boufinal[0] = (%.5e, %.5e, %.5e)\n", bouDiv_x, bouDiv_y, bouDiv_z); + // printf("bouIndex = %d\n\n", start_index); + // } + + // if (index 
== 0) + // { + // printf("bou_grad_U = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)", vf2[0], vf2[1], vf2[2], + // vf2[3], vf2[4], vf2[5], vf2[6], vf2[7], vf2[8]); + // } +} + +__global__ void constructVecDiag(int num_cells, const double *diag, double *diag_vec, + const double *source, double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + diag_vec[num_cells * 0 + index] = diag[index]; + diag_vec[num_cells * 1 + index] = diag[index]; + diag_vec[num_cells * 2 + index] = diag[index]; + + b[num_cells * 0 + index] = source[num_cells * 0 + index]; + b[num_cells * 1 + index] = source[num_cells * 1 + index]; + b[num_cells * 2 + index] = source[num_cells * 2 + index]; +} + +__global__ void addBoundaryDiagSrc(int num_cells, int num_boundary_surfaces, const int *face2Cells, + const double *internal_coeffs, const double *boundary_coeffs, double *diag_vec, double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_surfaces) + return; + + int cellIndex = face2Cells[index]; + + double internalCoeffx = internal_coeffs[num_boundary_surfaces * 0 + index]; + double internalCoeffy = internal_coeffs[num_boundary_surfaces * 1 + index]; + double internalCoeffz = internal_coeffs[num_boundary_surfaces * 2 + index]; + + double boundaryCoeffx = boundary_coeffs[num_boundary_surfaces * 0 + index]; + double boundaryCoeffy = boundary_coeffs[num_boundary_surfaces * 1 + index]; + double boundaryCoeffz = boundary_coeffs[num_boundary_surfaces * 2 + index]; + + atomicAdd(&diag_vec[num_cells * 0 + cellIndex], internalCoeffx); + atomicAdd(&diag_vec[num_cells * 1 + cellIndex], internalCoeffy); + atomicAdd(&diag_vec[num_cells * 2 + cellIndex], internalCoeffz); + + atomicAdd(&b[num_cells * 0 + cellIndex], boundaryCoeffx); + atomicAdd(&b[num_cells * 1 + cellIndex], boundaryCoeffy); + atomicAdd(&b[num_cells * 2 + cellIndex], boundaryCoeffz); +} + +__global__ void ldu_to_csr_offDiag(int num_cells, int 
num_surfaces,
+        const int *lowCSRIndex, const int *uppCSRIndex,
+        const double *lower, const double *upper,
+        double *A_csr)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_surfaces)
+        return;
+
+    int uppIndex = uppCSRIndex[index];
+    int lowIndex = lowCSRIndex[index];
+    // lower/upper are double coefficients: load them as double (previously declared
+    // `int`, which truncated) and reuse the loaded value for all three components.
+    double upp = upper[index];
+    double low = lower[index];
+    A_csr[(num_cells + 2 * num_surfaces) * 0 + uppIndex] = upp;
+    A_csr[(num_cells + 2 * num_surfaces) * 1 + uppIndex] = upp;
+    A_csr[(num_cells + 2 * num_surfaces) * 2 + uppIndex] = upp;
+
+    A_csr[(num_cells + 2 * num_surfaces) * 0 + lowIndex] = low;
+    A_csr[(num_cells + 2 * num_surfaces) * 1 + lowIndex] = low;
+    A_csr[(num_cells + 2 * num_surfaces) * 2 + lowIndex] = low;
+}
+
+// Scatter the (vectorized) diagonal into the CSR matrix, one thread per cell.
+__global__ void ldu_to_csr_Diag(int num_cells, int num_surfaces,
+        const int *diagCSRIndex, const double *diag_vec,
+        double *A_csr)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    int diagIndex = diagCSRIndex[index];
+    A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diag_vec[num_cells * 0 + index];
+    A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diag_vec[num_cells * 1 + index];
+    A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diag_vec[num_cells * 2 + index];
+}
+
+
+// Launch helper: permute a vector field from device (SoA) to host (AoS) layout.
+void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output)
+{
+    size_t threads_per_block = 256;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    permute_vector_d2h_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, input, output);
+}
+
+// Launch helper: permute a vector field from host (AoS) to device (SoA) layout.
+void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output)
+{
+    size_t threads_per_block = 256;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    permute_vector_h2d_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, input, output);
+}
+
+void field_multiply_scalar(cudaStream_t stream,
+        int num_cells, const double *input1, const double 
*input2, double *output, + int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 256; + size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, + input1, input2, output, boundary_input1, boundary_input2, boundary_output); + TICK_END_EVENT(field_multiply_scalar_kernel); +} + +void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_to_source_vector_kernel<<>>(num_cells, + volume, fvc_output, source); +} + +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, + const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, + const double *lower, const double *upper, const double *diag, const double *source, + const double *internal_coeffs, const double *boundary_coeffs, + double *A, double *b, double *diag_vec) +{ + // construct new diag with size of 3*num_cells + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + constructVecDiag<<>>(num_cells, diag, diag_vec, source, b); + + // add coeff to source and diagnal + blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; + addBoundaryDiagSrc<<>>(num_cells, num_boundary_surface, + boundary_cell_face, internal_coeffs, boundary_coeffs, diag_vec, b); + + // convert offdiag + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + ldu_to_csr_offDiag<<>>(num_cells, num_surfaces, + lower_to_csr_index, upper_to_csr_index, lower, upper, A); + + // convert diag + 
blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + ldu_to_csr_Diag<<>>(num_cells, num_surfaces, + diag_to_csr_index, diag_vec, A); + +} + +void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, + const int *patch_size, const int *patch_type, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = 1; + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + // TODO: just vector version now + if (patch_type[i] == boundaryConditions::zeroGradient) { + update_boundary_coeffs_zeroGradient_vector<<>>(num_boundary_surfaces, patch_size[i], offset, + value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } +} + +void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source, double sign) +{ +#ifdef TIME_GPU + printf("#############kernel profile#############\n"); +#endif + TICK_INIT_EVENT; + size_t threads_per_block = 64; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; +#ifdef TIME_GPU + printf("warm up ...\n"); + warmup<<>>(num_cells); +#endif + TICK_START_EVENT; + fvm_ddt_vector_kernel<<>>(num_cells, + rDeltaT, rho, rho_old, vf, volume, diag, source, sign); + TICK_END_EVENT(fvm_ddt_vector_kernel); +} + +void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *phi, const double 
*weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs, double sign) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; +#ifdef TIME_GPU + printf("warm up ...\n"); + warmup<<>>(num_surfaces); +#endif + TICK_START_EVENT; + fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, + phi, weight, lower, upper, diag, sign); + TICK_END_EVENT(fvm_div_vector_internal); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + TICK_START_EVENT; + fvm_div_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, + boundary_phi, value_internal_coeffs, value_boundary_coeffs, + internal_coeffs, boundary_coeffs, sign); + TICK_END_EVENT(fvm_div_vector_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } +} + +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double 
*boundary_coeffs, double sign) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, + weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); + TICK_END_EVENT(fvm_laplacian_vector_internal); + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + TICK_START_EVENT; + fvm_laplacian_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, + boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, + internal_coeffs, boundary_coeffs, sign); + TICK_END_EVENT(fvm_laplacian_vector_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } +} + +void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *vf_old, + double *output, double sign) +{ + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_ddt_scalar_kernel<<>>(num_cells, + rDeltaT, rho, rho_old, vf, vf_old, output, sign); +} + +void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int 
*boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, const double *boundary_mag_Sf, double *boundary_output, + const double *boundary_deltaCoeffs, double sign) +{ + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); + TICK_INIT_EVENT; + size_t threads_per_block = 32; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, + Sf, weight, vf, output); + TICK_END_EVENT(fvc_grad_vector_internal); + + int offset = 0; + // finish conctruct grad field except dividing cell volume + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + TICK_START_EVENT; + fvc_grad_vector_boundary<<>>(num_cells, + patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + TICK_END_EVENT(fvc_grad_vector_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // divide cell volume + threads_per_block = 512; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + divide_cell_volume_tsr<<>>(num_cells, volume, output, sign); + TICK_END_EVENT(divide_cell_volume_tsr); + + // correct boundary conditions + offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient) { + // TODO: just vector version now + TICK_START_EVENT; + 
fvc_grad_vector_correctBC_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surfaces,
+                    patch_size[i], offset, boundary_cell_face,
+                    output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, sign);
+            TICK_END_EVENT(fvc_grad_vector_correctBC_zeroGradient);
+        } else if (patch_type[i] == boundaryConditions::fixedValue) {
+            // The fixedValue kernel indexes its `vf` argument by CELL index to form
+            // snGrad = deltaCoeffs * (boundary_vf - vf[cell]); pass the internal field
+            // `vf` here (previously `boundary_vf` was passed twice, reading the
+            // boundary array with cell indices).
+            fvc_grad_vector_correctBC_fixedValue<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, boundary_cell_face,
+                    output, vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf, sign);
+        } else if (0) {
+            // xxx
+            fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n");
+        }
+        offset += patch_size[i];
+    }
+}
+
+// Scale a tensor field by dev2(T) semantics on both internal and boundary storage.
+void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2,
+        int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2)
+{
+    TICK_INIT_EVENT;
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    TICK_START_EVENT;
+    scale_dev2t_tensor_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, vf1, vf2);
+    TICK_END_EVENT(scale_dev2t_tensor_kernel);
+
+    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block;
+    scale_dev2t_tensor_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, boundary_vf1, boundary_vf2);
+}
+
+// fvc::div of a surface scalar field: accumulate face fluxes into cells
+// (internal faces via owner/neighbour, boundary faces via face2cell), then
+// divide by the cell volume.
+void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face,
+        const double *boundary_ssf, const double *volume, double *output, double sign)
+{
+    checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream));
+
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    fvc_div_surface_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces, lowerAddr, upperAddr, ssf, output);
+
+    threads_per_block = 1024;
+    blocks_per_grid = (num_boundary_surfaces + 
threads_per_block - 1) / threads_per_block; + fvc_div_surface_scalar_boundary<<>>(num_boundary_surfaces, boundary_cell_face, + boundary_ssf, output); + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); +} + +void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, double sign) +{ + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvc_div_cell_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvc_div_cell_vector_boundary<<>>(patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); +} + +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + 
const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, double sign) +{ + // checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); + TICK_INIT_EVENT; + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); + TICK_END_EVENT(fvc_div_cell_tensor_internal); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + TICK_START_EVENT; + fvc_div_cell_tensor_boundary<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output, sign); + TICK_END_EVENT(fvc_div_cell_tensor_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // // divide cell volume + // threads_per_block = 1024; + // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); +} + +void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int 
*boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, + Sf, weight, vf, output, sign); + TICK_END_EVENT(fvc_grad_scalar_internal); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just non-coupled patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + TICK_START_EVENT; + fvc_grad_scalar_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output, sign); + TICK_END_EVENT(fvc_grad_scalar_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // // divide cell volume + // threads_per_block = 1024; + // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); +} diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index ec739db5e..49edc1b7a 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -3,60 +3,114 @@ #include "AmgXSolver.H" #include #include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" class dfUEqn { private: - dfMatrixDataBase &dataBase_; - cudaStream_t stream; - AmgXSolver *UxSolver, *UySolver, *UzSolver = nullptr; - int num_iteration; - - // common variables - int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; - int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; - - // Matrix variables - double *d_A_csr, *d_b, *d_psi, *d_psi_permute, *d_H, *d_H_permute, *d_A; 
- double *h_A_csr, *h_b, *h_psi, *h_H, *h_A = nullptr; - - double *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr; + dfMatrixDataBase &dataBase_; + + // cuda resource + // one graph for one eqn before using self-developed solver + cudaGraph_t graph; + cudaGraphExec_t graph_instance; + bool graph_created=false; + + // constant values -- basic + std::string mode_string; + std::string setting_path; + + // constant values -- amgx solvers + AmgXSolver *UxSolver = nullptr; + AmgXSolver *UySolver = nullptr; + AmgXSolver *UzSolver = nullptr; + int num_iteration = 0; + + // constant fields - internal + // 无 + + // constant fields - boundary + std::vector patch_type; + + // non-constant fields - internal + // thermophysical fields + double *d_nu_eff = nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_nu_eff = nullptr; + // intermediate fields + double *d_grad_u = nullptr; + double *d_rho_nueff = nullptr; + double *d_permute = nullptr; + double *d_fvc_output = nullptr; // TODO: no need anymore + + // non-constant fields - boundary + // thermophysical fields + double *d_boundary_nu_eff = nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_boundary_nu_eff = nullptr; + // intermediate fields + double *d_boundary_grad_u = nullptr; + double *d_boundary_rho_nueff = nullptr; + // boundary coeff fields + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs= nullptr; + double *d_gradient_internal_coeffs= nullptr; + double *d_gradient_boundary_coeffs= nullptr; + + // non-constant fields - ldu + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_source = nullptr; + double *d_internal_coeffs = nullptr; + double *d_boundary_coeffs = nullptr; + double *d_diag_vector = nullptr; + + // non-constant fields - csr + double *d_A = nullptr; + double *d_b = nullptr; // TODO: needless + + // field pointer map + std::unordered_map fieldPointerMap; public: - dfUEqn(); - 
dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile); - ~dfUEqn(); - - void checkValue(bool print); - - void fvm_ddt(double *vector_old); - - void fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, - double *boundary_nuEff_init, double *boundary_rho_init); - - void fvc_grad(double *pressure); - - void fvc_grad_vector(); - - void dev2T(); - - void fvc_div_tensor(const double *nuEff); - - void fvm_laplacian(); - - void A(double *Psi); - - void H(double *Psi); + // 构造函数 + dfUEqn(dfMatrixDataBase &dataBase) + : dataBase_(dataBase) {} + + // 析构函数 + ~dfUEqn(){ + if (graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance)); + checkCudaErrors(cudaGraphDestroy(graph)); + } + } + + // 成员函数 + + // getter函数 + double* getFieldPointer(const char* fieldAlias, location loc, position pos); + + // 初始化构建 + void setConstantValues(const std::string &mode_string, const std::string &setting_path); + void setConstantFields(const std::vector patch_type); + void createNonConstantFieldsInternal(); + void createNonConstantFieldsBoundary(); + void createNonConstantLduAndCsrFields(); + // dfUEqn has no internal non-constant fields to be init + //void initNonConstantFieldsInternal(xxx); + void initNonConstantFieldsBoundary(); + + // 方程运行 + void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi); + void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho); + void process(); + void postProcess(double *h_u); void solve(); + void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, + // const double *tmpVal, + bool printFlag); void sync(); - - void updatePsi(double *Psi); - - void correctBoundaryConditions(); - - void correctPsi(double *Psi); - 
- void initializeTimeStep(); }; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 56983e038..d30c06131 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -1,1481 +1,306 @@ #include "dfUEqn.H" -// kernel functions -__global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta_t, - const int *csr_row_index, const int *csr_diag_index, - const double *rho_old, const double *rho_new, const double *volume, const double *velocity_old, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int diag_index = csr_diag_index[index]; - - int csr_dim = num_cells + num_faces; - int csr_index = row_index + diag_index; - double ddt_diag = rdelta_t * rho_new[index] * volume[index]; - A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + ddt_diag; - A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + ddt_diag; - A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + ddt_diag; - - double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + ddt_part_term * velocity_old[index * 3 + 0]; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + ddt_part_term * velocity_old[index * 3 + 1]; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + ddt_part_term * velocity_old[index * 3 + 2]; - - psi[num_cells * 0 + index] = velocity_old[index * 3 + 0]; - psi[num_cells * 1 + index] = velocity_old[index * 3 + 1]; - psi[num_cells * 2 + index] = velocity_old[index * 3 + 2]; -} - -__global__ void fvm_div_internal(int num_cells, int num_faces, - const int *csr_row_index, const int *csr_diag_index, - const double *weight, const double *phi, - const double 
*A_csr_input, const double *b_input, double *A_csr_output, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - int csr_dim = num_cells + num_faces; - - double div_diag = 0; - for (int i = row_index; i < next_row_index; i++) - { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) - { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (-w) * f; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f; - // lower neighbors contribute to sum of -1 - div_diag += (w - 1) * f; - } - // upper - if (inner_index > diag_index) - { - // upper, index - 1, consider of diag - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f; - // upper neighbors contribute to sum of 1 - div_diag += w * f; - } - } - A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag - A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag - A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + div_diag; // diag -} - -__global__ void fvm_div_boundary(int num_cells, int 
num_faces, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *internal_coeffs, const double *boundary_coeffs, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, - double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int cell_index = boundary_cell_id[cell_offset]; - int loop_size = boundary_cell_offset[index + 1] - cell_offset; - - int row_index = csr_row_index[cell_index]; - int diag_index = csr_diag_index[cell_index]; - int csr_dim = num_cells + num_faces; - int csr_index = row_index + diag_index; - - // construct internalCoeffs & boundaryCoeffs - double internal_coeffs_x = 0; - double internal_coeffs_y = 0; - double internal_coeffs_z = 0; - double boundary_coeffs_x = 0; - double boundary_coeffs_y = 0; - double boundary_coeffs_z = 0; - for (int i = 0; i < loop_size; i++) - { - internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0]; - internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1]; - internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2]; - boundary_coeffs_x += boundary_coeffs[(cell_offset + i) * 3 + 0]; - boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1]; - boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2]; - } - ueqn_internal_coeffs[cell_index * 3 + 0] = internal_coeffs_x; - ueqn_internal_coeffs[cell_index * 3 + 1] = internal_coeffs_y; - ueqn_internal_coeffs[cell_index * 3 + 2] = internal_coeffs_z; - ueqn_boundary_coeffs[cell_index * 3 + 0] = boundary_coeffs_x; - ueqn_boundary_coeffs[cell_index * 3 + 1] = boundary_coeffs_y; - ueqn_boundary_coeffs[cell_index * 3 + 2] = boundary_coeffs_z; - - A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x; - 
A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y; - A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z; -} - -__global__ void fvc_grad_internal_face(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *face_vector, const double *weight, const double *pressure, - const double *b_input, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - - double own_cell_p = pressure[index]; - double grad_bx = 0; - double grad_by = 0; - double grad_bz = 0; - double grad_bx_low = 0; - double grad_bx_upp = 0; - double grad_by_low = 0; - double grad_by_upp = 0; - double grad_bz_low = 0; - double grad_bz_upp = 0; - for (int i = row_index; i < next_row_index; i++) - { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) - { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - int neighbor_cell_id = csr_col_index[row_index + inner_index]; - double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; - grad_bx_low -= face_p * sfx; - grad_by_low -= face_p * sfy; - grad_bz_low -= face_p * 
sfz; - } - // upper - if (inner_index > diag_index) - { - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - int neighbor_cell_id = csr_col_index[row_index + inner_index]; - double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; - grad_bx_upp += face_p * sfx; - grad_by_upp += face_p * sfy; - grad_bz_upp += face_p * sfz; - } - } - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] - grad_bx_low - grad_bx_upp; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] - grad_by_low - grad_by_upp; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] - grad_bz_low - grad_bz_upp; -} - -__global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_face_vector, const double *boundary_pressure, - const double *b_input, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // compute boundary gradient - double grad_bx = 0; - double grad_by = 0; - double grad_bz = 0; - for (int i = cell_offset; i < next_cell_offset; i++) - { - double sfx = boundary_face_vector[i * 3 + 0]; - double sfy = boundary_face_vector[i * 3 + 1]; - double sfz = boundary_face_vector[i * 3 + 2]; - double face_p = boundary_pressure[i]; - grad_bx += face_p * sfx; - grad_by += face_p * sfy; - grad_bz += face_p * sfz; - } - - //// correct the boundary gradient - // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; - // double ny = boundary_face_vector[face_index * 3 + 
1] / magSf[face_index]; - // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; - // double sn_grad = 0; - // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); - // grad_bx += nx * grad_correction; - // grad_by += ny * grad_correction; - // grad_bz += nz * grad_correction; - - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] - grad_bx; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] - grad_by; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] - grad_bz; +void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->mode_string = mode_string; + this->setting_path = setting_path; + UxSolver = new AmgXSolver(mode_string, setting_path); + UySolver = new AmgXSolver(mode_string, setting_path); + UzSolver = new AmgXSolver(mode_string, setting_path); } -__global__ void add_fvMatrix_kernel(int num_cells, int num_faces, - const int *csr_row_index, - const double *turbSrc_A, const double *turbSrc_b, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int csr_dim = num_cells + num_faces; - double A_entry; - - for (int i = row_index; i < next_row_index; i++) - { - A_entry = turbSrc_A[i]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry; - } - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[index * 3 + 0]; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[index * 3 + 1]; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + 
index] + turbSrc_b[index * 3 + 2]; -} - -__global__ void offdiagPermutation(const int num_faces, const int *permedIndex, - const double *d_phi_init, double *d_phi) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_faces) - return; - - int p = permedIndex[index]; - d_phi[index] = d_phi_init[p]; -} - -__global__ void boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, - const double *boundary_pressure_init, const double *boundary_velocity_init, - double *boundary_pressure, double *boundary_velocity, - double *boundary_nuEff_init, double *boundary_nuEff, - double *boundary_rho_init, double *boundary_rho) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_faces) - return; - - int p = bouPermedIndex[index]; - boundary_velocity[3 * index] = boundary_velocity_init[3 * p]; - boundary_velocity[3 * index + 1] = boundary_velocity_init[3 * p + 1]; - boundary_velocity[3 * index + 2] = boundary_velocity_init[3 * p + 2]; - boundary_pressure[index] = boundary_pressure_init[p]; - boundary_rho[index] = boundary_rho_init[p]; - boundary_nuEff[index] = boundary_nuEff_init[p]; +void dfUEqn::setConstantFields(const std::vector patch_type) { + this->patch_type = patch_type; } -__global__ void fvc_grad_vector_internal(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *sf, const double *vf, const double *tlambdas, const double *volume, - double *grad) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int row_elements = csr_row_index[index + 1] - row_index; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - - double own_vf_x = vf[index * 3 + 0]; - double own_vf_y = vf[index * 3 + 1]; - double own_vf_z = vf[index * 3 + 2]; - double grad_xx = 0; - double grad_xy = 0; - 
double grad_xz = 0; - double grad_yx = 0; - double grad_yy = 0; - double grad_yz = 0; - double grad_zx = 0; - double grad_zy = 0; - double grad_zz = 0; - // lower - for (int i = 0; i < diag_index; i++) - { - int neighbor_index = neighbor_offset + i; - int neighbor_cell_id = csr_col_index[row_index + i]; - double w = tlambdas[neighbor_index]; - double sf_x = sf[neighbor_index * 3 + 0]; - double sf_y = sf[neighbor_index * 3 + 1]; - double sf_z = sf[neighbor_index * 3 + 2]; - double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; - double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; - double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; - double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x; - double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y; - double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z; - grad_xx -= sf_x * face_x; - grad_xy -= sf_x * face_y; - grad_xz -= sf_x * face_z; - grad_yx -= sf_y * face_x; - grad_yy -= sf_y * face_y; - grad_yz -= sf_y * face_z; - grad_zx -= sf_z * face_x; - grad_zy -= sf_z * face_y; - grad_zz -= sf_z * face_z; - } - // upper - for (int i = diag_index + 1; i < row_elements; i++) - { - int neighbor_index = neighbor_offset + i - 1; - int neighbor_cell_id = csr_col_index[row_index + i]; - double w = tlambdas[neighbor_index]; - double sf_x = sf[neighbor_index * 3 + 0]; - double sf_y = sf[neighbor_index * 3 + 1]; - double sf_z = sf[neighbor_index * 3 + 2]; - double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; - double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; - double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; - double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x; - double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y; - double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z; - grad_xx += sf_x * face_x; - grad_xy += sf_x * face_y; - grad_xz += sf_x * face_z; - grad_yx += sf_y * face_x; - grad_yy += sf_y * face_y; - grad_yz += sf_y * face_z; - grad_zx += sf_z * face_x; - grad_zy += sf_z * face_y; - grad_zz += sf_z * 
face_z; - // if (index == 0) - // { - // printf("grad_xx = %.20lf\n", grad_xx); - // // printf("sf_x = %.20lf\n", sf_x); - // // printf("face_x = %.20lf\n", face_x); - // } - } - double vol = volume[index]; - grad[index * 9 + 0] = grad_xx / vol; - grad[index * 9 + 1] = grad_xy / vol; - grad[index * 9 + 2] = grad_xz / vol; - grad[index * 9 + 3] = grad_yx / vol; - grad[index * 9 + 4] = grad_yy / vol; - grad[index * 9 + 5] = grad_yz / vol; - grad[index * 9 + 6] = grad_zx / vol; - grad[index * 9 + 7] = grad_zy / vol; - grad[index * 9 + 8] = grad_zz / vol; -} - -__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_sf, const double *boundary_vf, const double *volume, - double *grad, double *grad_boundary_init) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - double grad_xx = 0; - double grad_xy = 0; - double grad_xz = 0; - double grad_yx = 0; - double grad_yy = 0; - double grad_yz = 0; - double grad_zx = 0; - double grad_zy = 0; - double grad_zz = 0; - for (int i = cell_offset; i < next_cell_offset; i++) - { - double sf_x = boundary_sf[i * 3 + 0]; - double sf_y = boundary_sf[i * 3 + 1]; - double sf_z = boundary_sf[i * 3 + 2]; - double vf_x = boundary_vf[i * 3 + 0]; - double vf_y = boundary_vf[i * 3 + 1]; - double vf_z = boundary_vf[i * 3 + 2]; - grad_xx += sf_x * vf_x; - grad_xy += sf_x * vf_y; - grad_xz += sf_x * vf_z; - grad_yx += sf_y * vf_x; - grad_yy += sf_y * vf_y; - grad_yz += sf_y * vf_z; - grad_zx += sf_z * vf_x; - grad_zy += sf_z * vf_y; - grad_zz += sf_z * vf_z; - } - - double vol = volume[cell_index]; - - grad[cell_index * 9 + 0] += grad_xx / vol; - grad[cell_index * 9 + 1] += grad_xy / vol; - grad[cell_index * 9 + 2] += grad_xz / 
vol; - grad[cell_index * 9 + 3] += grad_yx / vol; - grad[cell_index * 9 + 4] += grad_yy / vol; - grad[cell_index * 9 + 5] += grad_yz / vol; - grad[cell_index * 9 + 6] += grad_zx / vol; - grad[cell_index * 9 + 7] += grad_zy / vol; - grad[cell_index * 9 + 8] += grad_zz / vol; +void dfUEqn::createNonConstantFieldsInternal() { + // thermophysical fields + checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes)); + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); + // intermediate fields + checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes)); - grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0]; - grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1]; - grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2]; - grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3]; - grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4]; - grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5]; - grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6]; - grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7]; - grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8]; - // if (index == 1) - // { - // printf("grad[1] = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], - // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); - // } + // getter for h_nu_eff + fieldPointerMap["h_nu_eff"] = h_nu_eff; } - -__global__ void correct_boundary_conditions(int num_boundary_cells, - const int *boundary_cell_offset, const int 
*boundary_cell_id, - const double *boundary_sf, const double *mag_sf, - double *boundary_grad_init, double *boundary_grad, const double *boundary_deltaCoeffs, - const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // initialize boundary_grad - double grad_xx = boundary_grad_init[index * 9 + 0]; - double grad_xy = boundary_grad_init[index * 9 + 1]; - double grad_xz = boundary_grad_init[index * 9 + 2]; - double grad_yx = boundary_grad_init[index * 9 + 3]; - double grad_yy = boundary_grad_init[index * 9 + 4]; - double grad_yz = boundary_grad_init[index * 9 + 5]; - double grad_zx = boundary_grad_init[index * 9 + 6]; - double grad_zy = boundary_grad_init[index * 9 + 7]; - double grad_zz = boundary_grad_init[index * 9 + 8]; - - double internal_U_x = internal_velocity[cell_index * 3 + 0]; - double internal_U_y = internal_velocity[cell_index * 3 + 1]; - double internal_U_z = internal_velocity[cell_index * 3 + 2]; - - for (int i = cell_offset; i < next_cell_offset; i++) - { - // OpenFoam code - // const vectorField n - // ( - // vsf.mesh().Sf().boundaryField()[patchi] - // / vsf.mesh().magSf().boundaryField()[patchi] - // ); - // gGradbf[patchi] += n * - // ( - // vsf.boundaryField()[patchi].snGrad() - // - (n & gGradbf[patchi]) - // ); - // template // fixedValue - // Foam::tmp> Foam::fvPatchField::snGrad() const - // { - // return patch_.deltaCoeffs()*(*this - patchInternalField()); - // } - - double n_x = boundary_sf[i * 3 + 0] / mag_sf[i]; - double n_y = boundary_sf[i * 3 + 1] / mag_sf[i]; - double n_z = boundary_sf[i * 3 + 2] / mag_sf[i]; - - double sn_grad_x, sn_grad_y, sn_grad_z; - int patchIndex = U_patch_type[i]; - if (patchIndex == 0) { // zeroGradient - sn_grad_x = 
0; - sn_grad_y = 0; - sn_grad_z = 0; - } else if (patchIndex == 1) { // fixedValue - sn_grad_x = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 0] - internal_velocity[cell_index * 3 + 0]); - sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 1] - internal_velocity[cell_index * 3 + 1]); - sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 2] - internal_velocity[cell_index * 3 + 2]); - // if (index == 1) - // { - // printf("cell_index = %d\n", cell_index); - // printf("boundary_velocity = %e\n", boundary_velocity[i * 3 + 1]); - // printf("internal_velocity = %e\n", internal_velocity[cell_index * 3 + 0]); - // } - - } - // TODO: implement other BCs - double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); - double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); - double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); - boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x; - boundary_grad[i * 9 + 1] = grad_xy + n_x * grad_correction_y; - boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z; - boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x; - boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y; - boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z; - boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x; - boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y; - boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z; - // if (index == 1) - // { - // printf("boundary_grad = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", boundary_grad[i * 9 + 0], boundary_grad[i * 9 + 1], boundary_grad[i * 9 + 2], - // boundary_grad[i * 9 + 3], boundary_grad[i * 9 + 4], boundary_grad[i * 9 + 5], boundary_grad[i * 9 + 6], boundary_grad[i * 9 + 7], boundary_grad[i * 9 + 8]); - // } +void dfUEqn::createNonConstantFieldsBoundary() { + // thermophysical fields + 
checkCudaErrors(cudaMalloc((void**)&d_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); + // intermediate fields + checkCudaErrors(cudaMalloc((void**)&d_boundary_grad_u, dataBase_.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_nueff, dataBase_.boundary_surface_value_bytes)); + // boundary coeff fields + checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + + // getter for h_boundary_nu_eff + fieldPointerMap["h_boundary_nu_eff"] = h_boundary_nu_eff; +} + +void dfUEqn::createNonConstantLduAndCsrFields() { + checkCudaErrors(cudaMalloc((void**)&d_lower, dataBase_.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag_vector, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes)); +} + +void dfUEqn::initNonConstantFieldsBoundary() { + update_boundary_coeffs_vector(dataBase_.stream, 
dataBase_.num_boundary_surfaces, dataBase_.num_patches, + dataBase_.patch_size.data(), patch_type.data(), + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); +} + +void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho, h_rho, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); +} + +void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, + const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, h_u, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_u, h_boundary_u, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_p, h_p, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_p, h_boundary_p, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_nu_eff, h_nu_eff, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_nu_eff, h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho, h_boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + + 
checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag_vector, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); // TODO: maybe a better way +} + +void dfUEqn::process() { + //使用event计算时间 + float time_elapsed=0; + cudaEvent_t start,stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start,0)); + +#ifndef TIME_GPU + if(!graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif + + permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); + fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, + dataBase_.d_rho, dataBase_.d_rho_old, d_permute, dataBase_.d_volume, + d_diag, d_source, 1.); + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_phi, dataBase_.d_weight, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, + d_internal_coeffs, 
d_boundary_coeffs, 1.); + field_multiply_scalar(dataBase_.stream, + dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal + dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); + fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + d_internal_coeffs, d_boundary_coeffs, -1); + fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, d_permute, d_grad_u, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, + dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); + scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal + dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); + fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, 
dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); + // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + +#ifndef TIME_GPU + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + graph_created = true; } -} + DEBUG_TRACE; + checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); +#endif -__global__ void dev2_t_tensor(int num, double *tensor) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num) - return; + checkCudaErrors(cudaEventRecord(stop,0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); + fprintf(stderr, "ueqn process time:%f(ms)\n",time_elapsed); - double t_xx = tensor[index * 9 + 0]; - double t_xy = tensor[index * 9 + 1]; - double t_xz = tensor[index * 9 + 2]; - double t_yx = tensor[index * 9 + 3]; - double t_yy = tensor[index * 9 + 4]; - double t_yz = tensor[index * 9 + 5]; - double t_zx = tensor[index * 9 + 6]; - double t_zy = tensor[index * 9 + 7]; - double t_zz = tensor[index * 9 + 8]; - double trace_coeff = (2. / 3.) 
* (t_xx + t_yy + t_zz); - tensor[index * 9 + 0] = t_xx - trace_coeff; - tensor[index * 9 + 1] = t_yx; - tensor[index * 9 + 2] = t_zx; - tensor[index * 9 + 3] = t_xy; - tensor[index * 9 + 4] = t_yy - trace_coeff; - tensor[index * 9 + 5] = t_zy; - tensor[index * 9 + 6] = t_xz; - tensor[index * 9 + 7] = t_yz; - tensor[index * 9 + 8] = t_zz - trace_coeff; + //solve(); } -__global__ void fvc_div_tensor_internal(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *scalar0, const double *scalar1, - const double *sf, const double *vf, const double *tlambdas, const double *volume, - const double sign, const double *b_input, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int row_elements = csr_row_index[index + 1] - row_index; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - - double coeff_own = scalar0[index] * scalar1[index]; - - double own_vf_xx = vf[index * 9 + 0]; - double own_vf_xy = vf[index * 9 + 1]; - double own_vf_xz = vf[index * 9 + 2]; - double own_vf_yx = vf[index * 9 + 3]; - double own_vf_yy = vf[index * 9 + 4]; - double own_vf_yz = vf[index * 9 + 5]; - double own_vf_zx = vf[index * 9 + 6]; - double own_vf_zy = vf[index * 9 + 7]; - double own_vf_zz = vf[index * 9 + 8]; - double sum_x = 0; - double sum_y = 0; - double sum_z = 0; - - // lower - for (int i = 0; i < diag_index; i++) - { - int neighbor_index = neighbor_offset + i; - int neighbor_cell_id = csr_col_index[row_index + i]; - double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id]; - double w = tlambdas[neighbor_index]; - double sf_x = sf[neighbor_index * 3 + 0]; - double sf_y = sf[neighbor_index * 3 + 1]; - double sf_z = sf[neighbor_index * 3 + 2]; - double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0]; - double 
neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1]; - double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2]; - double neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3]; - double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4]; - double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5]; - double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6]; - double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7]; - double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8]; - double face_xx = (1 - w) * own_vf_xx * coeff_own + w * neighbor_vf_xx * coeff_nei; - double face_xy = (1 - w) * own_vf_xy * coeff_own + w * neighbor_vf_xy * coeff_nei; - double face_xz = (1 - w) * own_vf_xz * coeff_own + w * neighbor_vf_xz * coeff_nei; - double face_yx = (1 - w) * own_vf_yx * coeff_own + w * neighbor_vf_yx * coeff_nei; - double face_yy = (1 - w) * own_vf_yy * coeff_own + w * neighbor_vf_yy * coeff_nei; - double face_yz = (1 - w) * own_vf_yz * coeff_own + w * neighbor_vf_yz * coeff_nei; - double face_zx = (1 - w) * own_vf_zx * coeff_own + w * neighbor_vf_zx * coeff_nei; - double face_zy = (1 - w) * own_vf_zy * coeff_own + w * neighbor_vf_zy * coeff_nei; - double face_zz = (1 - w) * own_vf_zz * coeff_own + w * neighbor_vf_zz * coeff_nei; - sum_x -= sf_x * face_xx + sf_y * face_yx + sf_z * face_zx; - sum_y -= sf_x * face_xy + sf_y * face_yy + sf_z * face_zy; - sum_z -= sf_x * face_xz + sf_y * face_yz + sf_z * face_zz; - } - // upper - for (int i = diag_index + 1; i < row_elements; i++) - { - int neighbor_index = neighbor_offset + i - 1; - int neighbor_cell_id = csr_col_index[row_index + i]; - double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id]; - double w = tlambdas[neighbor_index]; - double sf_x = sf[neighbor_index * 3 + 0]; - double sf_y = sf[neighbor_index * 3 + 1]; - double sf_z = sf[neighbor_index * 3 + 2]; - double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0]; - double neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1]; - double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2]; - double 
neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3]; - double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4]; - double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5]; - double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6]; - double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7]; - double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8]; - double face_xx = w * own_vf_xx * coeff_own + (1 - w) * neighbor_vf_xx * coeff_nei; - double face_xy = w * own_vf_xy * coeff_own + (1 - w) * neighbor_vf_xy * coeff_nei; - double face_xz = w * own_vf_xz * coeff_own + (1 - w) * neighbor_vf_xz * coeff_nei; - double face_yx = w * own_vf_yx * coeff_own + (1 - w) * neighbor_vf_yx * coeff_nei; - double face_yy = w * own_vf_yy * coeff_own + (1 - w) * neighbor_vf_yy * coeff_nei; - double face_yz = w * own_vf_yz * coeff_own + (1 - w) * neighbor_vf_yz * coeff_nei; - double face_zx = w * own_vf_zx * coeff_own + (1 - w) * neighbor_vf_zx * coeff_nei; - double face_zy = w * own_vf_zy * coeff_own + (1 - w) * neighbor_vf_zy * coeff_nei; - double face_zz = w * own_vf_zz * coeff_own + (1 - w) * neighbor_vf_zz * coeff_nei; - sum_x += sf_x * face_xx + sf_y * face_yx + sf_z * face_zx; - sum_y += sf_x * face_xy + sf_y * face_yy + sf_z * face_zy; - sum_z += sf_x * face_xz + sf_y * face_yz + sf_z * face_zz; - } - double vol = volume[index]; - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + sum_x * sign; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + sum_y * sign; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + sum_z * sign; -} - -__global__ void fvc_div_tensor_boundary(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_scalar0, const double *boundary_scalar1, - const double *boundary_sf, const double *boundary_vf, const double *volume, - const double sign, const double *b_input, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= 
num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // OpenFoam code - // Foam::surfaceInterpolationScheme::dotInterpolate - // if (vf.boundaryField()[pi].coupled()) - // { - // psf = - // pSf - // & ( - // pLambda*vf.boundaryField()[pi].patchInternalField() - // + (1.0 - pLambda)*vf.boundaryField()[pi].patchNeighbourField() - // ); - // } - // else - // { - // psf = pSf & vf.boundaryField()[pi]; - // } - // tmp> surfaceIntegrate - // forAll(mesh.boundary()[patchi], facei) - // { - // ivf[pFaceCells[facei]] += pssf[facei]; - // } - double sum_x = 0; - double sum_y = 0; - double sum_z = 0; - for (int i = cell_offset; i < next_cell_offset; i++) - { - double sf_x = boundary_sf[i * 3 + 0]; - double sf_y = boundary_sf[i * 3 + 1]; - double sf_z = boundary_sf[i * 3 + 2]; - double face_xx = boundary_vf[i * 9 + 0]; - double face_xy = boundary_vf[i * 9 + 1]; - double face_xz = boundary_vf[i * 9 + 2]; - double face_yx = boundary_vf[i * 9 + 3]; - double face_yy = boundary_vf[i * 9 + 4]; - double face_yz = boundary_vf[i * 9 + 5]; - double face_zx = boundary_vf[i * 9 + 6]; - double face_zy = boundary_vf[i * 9 + 7]; - double face_zz = boundary_vf[i * 9 + 8]; - - // if not coupled - double coeff = boundary_scalar0[i] * boundary_scalar1[i]; - sum_x += (sf_x * face_xx + sf_y * face_yx + sf_z * face_zx) * coeff; - sum_y += (sf_x * face_xy + sf_y * face_yy + sf_z * face_zy) * coeff; - sum_z += (sf_x * face_xz + sf_y * face_yz + sf_z * face_zz) * coeff; - } - double vol = volume[cell_index]; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + sum_x * sign; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + sum_y * sign; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + sum_z * sign; -} - -__global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int 
num_faces, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *scalar0, const double *scalar1, const double *weight, - const double *magsf, const double *distance, - const double sign, const double *A_csr_input, double *A_csr_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int row_elements = csr_row_index[index + 1] - row_index; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - int csr_dim = num_cells + num_faces; - - double own_scalar0 = scalar0[index]; - double own_scalar1 = scalar1[index]; - double own_coeff = own_scalar0 * own_scalar1; - - // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField(); - // fvm.negSumDiag(); - double sum_diag = 0; - // lower - for (int i = 0; i < diag_index; i++) - { - int neighbor_index = neighbor_offset + i; - int neighbor_cell_id = csr_col_index[i + row_index]; - double w = weight[neighbor_index]; - double nei_scalar0 = scalar0[neighbor_cell_id]; - double nei_scalar1 = scalar1[neighbor_cell_id]; - double nei_coeff = nei_scalar0 * nei_scalar1; - double gamma = w * (nei_coeff - own_coeff) + own_coeff; - double gamma_magsf = gamma * magsf[neighbor_index]; - double coeff = gamma_magsf * distance[neighbor_index]; - A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign; - A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign; - A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign; - - sum_diag += (-coeff); - } - // upper - for (int i = diag_index + 1; i < row_elements; i++) - { - int neighbor_index = neighbor_offset + i - 1; - int neighbor_cell_id = csr_col_index[i + row_index]; - double w = weight[neighbor_index]; - double nei_scalar0 = 
scalar0[neighbor_cell_id]; - double nei_scalar1 = scalar1[neighbor_cell_id]; - double nei_coeff = nei_scalar0 * nei_scalar1; - double gamma = w * (own_coeff - nei_coeff) + nei_coeff; - double gamma_magsf = gamma * magsf[neighbor_index]; - double coeff = gamma_magsf * distance[neighbor_index]; - A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign; - A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign; - A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign; - sum_diag += (-coeff); - } - A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + sum_diag * sign; // diag - A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + sum_diag * sign; // diag - A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + sum_diag * sign; // diag -} - -__global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num_faces, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_scalar0, const double *boundary_scalar1, - const double *boundary_magsf, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, - double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - int row_index = csr_row_index[cell_index]; - int diag_index = csr_diag_index[cell_index]; - int csr_dim = num_cells + 
num_faces; - int csr_index = row_index + diag_index; - - // OpenFoam code - // if (pvf.coupled()) - // { - // fvm.internalCoeffs()[patchi] = - // pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs); - // fvm.boundaryCoeffs()[patchi] = - // -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs); - // } - // else - // { - // fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(); - // fvm.boundaryCoeffs()[patchi] = - - // pGamma*pvf.gradientBoundaryCoeffs(); - // } - double internal_coeffs_x = 0; - double internal_coeffs_y = 0; - double internal_coeffs_z = 0; - double boundary_coeffs_x = 0; - double boundary_coeffs_y = 0; - double boundary_coeffs_z = 0; - for (int i = cell_offset; i < next_cell_offset; i++) - { - double gamma = boundary_scalar0[i] * boundary_scalar1[i]; - double gamma_magsf = gamma * boundary_magsf[i]; - internal_coeffs_x += gamma_magsf * gradient_internal_coeffs[i * 3 + 0]; - internal_coeffs_y += gamma_magsf * gradient_internal_coeffs[i * 3 + 1]; - internal_coeffs_z += gamma_magsf * gradient_internal_coeffs[i * 3 + 2]; - boundary_coeffs_x -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 0]; - boundary_coeffs_y -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 1]; - boundary_coeffs_z -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 2]; - } - - ueqn_internal_coeffs[cell_index * 3 + 0] += internal_coeffs_x * sign; - ueqn_internal_coeffs[cell_index * 3 + 1] += internal_coeffs_y * sign; - ueqn_internal_coeffs[cell_index * 3 + 2] += internal_coeffs_z * sign; - ueqn_boundary_coeffs[cell_index * 3 + 0] += boundary_coeffs_x * sign; - ueqn_boundary_coeffs[cell_index * 3 + 1] += boundary_coeffs_y * sign; - ueqn_boundary_coeffs[cell_index * 3 + 2] += boundary_coeffs_z * sign; - - A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x * sign; - A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y * sign; - A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + 
csr_index] + internal_coeffs_z * sign; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x * sign; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y * sign; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign; -} - -__global__ void addBoundaryDiag(int num_cells, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, - const double *psi, double *H) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs - // boundaryDiagCmpt.negate(); - double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0]; - double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1]; - double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2]; - - // addCmptAvBoundaryDiag(boundaryDiagCmpt); - double ave_internal = (internal_x + internal_y + internal_z) / 3; - - H[num_cells * 0 + cell_index] = (-internal_x + ave_internal) * psi[num_cells * 0 + cell_index]; - H[num_cells * 1 + cell_index] = (-internal_y + ave_internal) * psi[num_cells * 1 + cell_index]; - H[num_cells * 2 + cell_index] = (-internal_z + ave_internal) * psi[num_cells * 2 + cell_index]; -} - -__global__ void permute_psi_d2h(int num_cells, const double *input, double *output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - output[index * 3 + 0] = input[num_cells * 0 + index]; - output[index * 3 + 1] = input[num_cells * 1 + index]; - output[index * 3 + 2] = input[num_cells * 2 + index]; -} - -__global__ void 
permute_psi_h2d(int num_cells, const double *input, double *output) +void dfUEqn::sync() { - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - output[num_cells * 0 + index] = input[index * 3 + 0]; - output[num_cells * 1 + index] = input[index * 3 + 1]; - output[num_cells * 2 + index] = input[index * 3 + 2]; + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); } -__global__ void lduMatrix_H(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *volume, const double *psi, const double *A_csr, const double *b, - const double *ueqn_boundary_coeffs, double *H) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; +void dfUEqn::solve() { + ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_face_cell, + dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, + d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b, d_diag_vector); - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int row_elements = csr_row_index[index + 1] - row_index; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; + int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries + sync(); - double APsi_x = 0.; - double APsi_y = 0.; - double APsi_z = 0.; - // lower - for (int i = 0; i < diag_index; i++) + if (num_iteration == 0) // first interation { - int neighbor_cell_id = csr_col_index[i + row_index]; - APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id]; - APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id]; - APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id]; - } - // upper - for (int i = diag_index + 1; i < row_elements; i++) - { - int neighbor_cell_id 
= csr_col_index[i + row_index]; - APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id]; - APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id]; - APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id]; - } - - H[num_cells * 0 + index] = H[num_cells * 0 + index] - APsi_x + b[num_cells * 0 + index]; - H[num_cells * 1 + index] = H[num_cells * 1 + index] - APsi_y + b[num_cells * 1 + index]; - H[num_cells * 2 + index] = H[num_cells * 2 + index] - APsi_z + b[num_cells * 2 + index]; - - double vol = volume[index]; - H[num_cells * 0 + index] = H[num_cells * 0 + index] / vol; - H[num_cells * 1 + index] = H[num_cells * 1 + index] / vol; - H[num_cells * 2 + index] = H[num_cells * 2 + index] / vol; -} - -__global__ void addBoundarySource(int num_cells, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, - const double *volume, double *H) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int cell_index = boundary_cell_id[cell_offset]; - - double vol = volume[index]; - - H[num_cells * 0 + index] = H[num_cells * 0 + index] + ueqn_boundary_coeffs[cell_index * 3 + 0] / vol; - H[num_cells * 1 + index] = H[num_cells * 1 + index] + ueqn_boundary_coeffs[cell_index * 3 + 1] / vol; - H[num_cells * 2 + index] = H[num_cells * 2 + index] + ueqn_boundary_coeffs[cell_index * 3 + 2] / vol; -} - -__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, double *A) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int 
cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0]; - double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1]; - double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2]; - - double ave_internal = (internal_x + internal_y + internal_z) / 3; - - A[cell_index] = ave_internal; -} - -__global__ void addDiagDivVolume(int num_cells, const int *csr_row_index, - const int *csr_diag_index, const double *A_csr, const double *volume, - double *ueqn_internal_coeffs, const double *A_input, double *A_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - int row_index = csr_row_index[index]; - int diag_index = csr_diag_index[index]; - int csr_index = row_index + diag_index; - - double vol = volume[index]; - - A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol; -} - -__global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs, - double *boundary_coeffs, double *laplac_internal_coeffs, - double *laplac_boundary_coeffs, const int *U_patch_type, - const double *boundary_velocity, const double *boundary_deltaCoeffs) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_faces) - return; - - int patchIndex = U_patch_type[index]; - if (patchIndex == 0) { // zeroGradient - double bouPhi = boundary_phi[index]; - internal_coeffs[index * 3 + 0] = bouPhi * 1.; // valueInternalCoeffs = 1. - internal_coeffs[index * 3 + 1] = bouPhi * 1.; - internal_coeffs[index * 3 + 2] = bouPhi * 1.; - boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0. - boundary_coeffs[index * 3 + 1] = -bouPhi * 0.; - boundary_coeffs[index * 3 + 2] = -bouPhi * 0.; - laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0. 
- laplac_internal_coeffs[index * 3 + 1] = 0.; - laplac_internal_coeffs[index * 3 + 2] = 0.; - laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0. - laplac_boundary_coeffs[index * 3 + 1] = 0.; - laplac_boundary_coeffs[index * 3 + 2] = 0.; - } else if (patchIndex == 1) { // fixedValue - double bouDeltaCoeffs = boundary_deltaCoeffs[index]; - double bouPhi = boundary_phi[index]; - internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0. - internal_coeffs[index * 3 + 1] = bouPhi * 0.; - internal_coeffs[index * 3 + 2] = bouPhi * 0.; - boundary_coeffs[index * 3 + 0] = -bouPhi * boundary_velocity[index * 3 + 0]; // valueBoundaryCoeffs = boundaryValue - boundary_coeffs[index * 3 + 1] = -bouPhi * boundary_velocity[index * 3 + 1]; - boundary_coeffs[index * 3 + 2] = -bouPhi * boundary_velocity[index * 3 + 2]; - laplac_internal_coeffs[index * 3 + 0] = -1 * bouDeltaCoeffs; // gradientInternalCoeffs = -1 * boundaryDeltaCoeffs - laplac_internal_coeffs[index * 3 + 1] = -1 * bouDeltaCoeffs; - laplac_internal_coeffs[index * 3 + 2] = -1 * bouDeltaCoeffs; - laplac_boundary_coeffs[index * 3 + 0] = bouDeltaCoeffs * boundary_velocity[index * 3 + 0]; // gradientBoundaryCoeffs = boundaryDeltaCoeffs * boundaryValue - laplac_boundary_coeffs[index * 3 + 1] = bouDeltaCoeffs * boundary_velocity[index * 3 + 1]; - laplac_boundary_coeffs[index * 3 + 2] = bouDeltaCoeffs * boundary_velocity[index * 3 + 2]; - } else if (patchIndex == 2) { // empty - double bouPhi = boundary_phi[index]; - internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0. - internal_coeffs[index * 3 + 1] = bouPhi * 0.; - internal_coeffs[index * 3 + 2] = bouPhi * 0.; - boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0. - boundary_coeffs[index * 3 + 1] = -bouPhi * 0.; - boundary_coeffs[index * 3 + 2] = -bouPhi * 0.; - laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0. 
- laplac_internal_coeffs[index * 3 + 1] = 0.; - laplac_internal_coeffs[index * 3 + 2] = 0.; - laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0. - laplac_boundary_coeffs[index * 3 + 1] = 0.; - laplac_boundary_coeffs[index * 3 + 2] = 0.; + printf("Initializing AmgX Linear Solver\n"); + UxSolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); + UySolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + nNz); + UzSolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 2 * nNz); } - // TODO implement coupled conditions -} - -__global__ void ueqn_correct_BoundaryConditions_kernel(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *velocity, double *boundary_velocity, const int *U_patch_type) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - for (int i = cell_offset; i < next_cell_offset; i++) + else { - int patchIndex = U_patch_type[i]; - switch (patchIndex) - { - case 0: // zeroGradient - { - boundary_velocity[i * 3 + 0] = velocity[cell_index]; - boundary_velocity[i * 3 + 1] = velocity[num_cells * 1 + cell_index]; - boundary_velocity[i * 3 + 2] = velocity[num_cells * 2 + cell_index]; - break; - } - case 1: - break; - case 2: - break; - // TODO implement coupled conditions - } + UxSolver->updateOperator(dataBase_.num_cells, nNz, d_A); + UySolver->updateOperator(dataBase_.num_cells, nNz, d_A + nNz); + UzSolver->updateOperator(dataBase_.num_cells, nNz, d_A + 2 * nNz); } + UxSolver->solve(dataBase_.num_cells, d_permute, d_b); + UySolver->solve(dataBase_.num_cells, d_permute + dataBase_.num_cells, d_b + dataBase_.num_cells); + 
UzSolver->solve(dataBase_.num_cells, d_permute + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + num_iteration++; } -// constructor -dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) - : dataBase_(dataBase) -{ - stream = dataBase_.stream; - - UxSolver = new AmgXSolver(modeStr, cfgFile); - UySolver = new AmgXSolver(modeStr, cfgFile); - UzSolver = new AmgXSolver(modeStr, cfgFile); - - num_cells = dataBase_.num_cells; - cell_bytes = dataBase_.cell_bytes; - num_faces = dataBase_.num_faces; - cell_vec_bytes = dataBase_.cell_vec_bytes; - csr_value_vec_bytes = dataBase_.csr_value_vec_bytes; - num_boundary_cells = dataBase_.num_boundary_cells; - num_surfaces = dataBase_.num_surfaces; - - d_A_csr_row_index = dataBase_.d_A_csr_row_index; - d_A_csr_diag_index = dataBase_.d_A_csr_diag_index; - d_A_csr_col_index = dataBase_.d_A_csr_col_index; - - h_A_csr = new double[(num_cells + num_faces) * 3]; - h_b = new double[num_cells * 3]; - cudaMallocHost(&h_psi, cell_vec_bytes); - cudaMallocHost(&h_H, cell_vec_bytes); - cudaMallocHost(&h_A, cell_bytes); - - checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_psi_permute, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_H_permute, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes)); -} - -void dfUEqn::fvm_ddt(double *vector_old) -{ - // Copy the host input array in host memory to the device input array in device memory - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); - 
size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_ddt_kernel<<>>(num_cells, num_faces, dataBase_.rdelta_t, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); -} - -void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, - double *boundary_nuEff_init, double *boundary_rho_init) -{ - // copy and permutate boundary variable - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - - size_t threads_per_block = 1024; - size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; - boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init, - dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, - dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); - - // initialize boundary coeffs (must after the update of d_boundary_velocity) - threads_per_block = 1024; - blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; - ueqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, - dataBase_.d_internal_coeffs, 
dataBase_.d_boundary_coeffs, - dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, - dataBase_.d_boundary_UpatchType, dataBase_.d_boundary_velocity, dataBase_.d_boundary_deltaCoeffs); - - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_div_internal<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b, - d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); -} - -void dfUEqn::fvc_grad(double *pressure) -{ - // Copy the host input array in host memory to the device input array in device memory - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); - - // launch cuda kernel - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_internal_face<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b); -} - -void dfUEqn::fvc_grad_vector() -{ - size_t threads_per_block = 512; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_vector_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, 
dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad); - - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_vector_boundary<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity, - dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init); - - correct_boundary_conditions<<>>(num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, - dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary, dataBase_.d_boundary_deltaCoeffs, dataBase_.d_velocity_old, - dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); -} - -void dfUEqn::dev2T() -{ - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - dev2_t_tensor<<>>(num_cells, dataBase_.d_grad); - - blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; - dev2_t_tensor<<>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary); -} - -void dfUEqn::fvc_div_tensor(const double *nuEff) -{ - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream)); - size_t threads_per_block = 512; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_div_tensor_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, - dataBase_.d_volume, 1., d_b, d_b); +void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug + permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_permute, dataBase_.d_u); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, 
dataBase_.stream)); + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_div_tensor_boundary<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, - dataBase_.d_volume, 1., d_b, d_b); + // some boundary conditions may also need vf.boundary, deltaCoeffs.boundary, and weight.boundary + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches, + dataBase_.patch_size.data(), patch_type.data(), + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); } -void dfUEqn::fvm_laplacian() -{ - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_laplacian_uncorrected_vector_internal<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight, - dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr); - - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, - dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); -} - -void dfUEqn::A(double *Psi) -{ - checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream)); - - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - 
addAveInternaltoDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, d_A); - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - addDiagDivVolume<<>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr, - dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A); - - checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - memcpy(Psi, h_A, cell_bytes); -} - -void dfUEqn::H(double *Psi) -{ - checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_bytes * 3, stream)); - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - addBoundaryDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, - d_psi, d_H); - - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - lduMatrix_H<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H); - - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - permute_psi_d2h<<>>(num_cells, d_H, d_H_permute); - - checkCudaErrors(cudaMemcpyAsync(h_H, d_H_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - memcpy(Psi, h_H, cell_vec_bytes); -} - -void dfUEqn::initializeTimeStep() -{ - // initialize matrix value - checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); -} - -void dfUEqn::checkValue(bool print) -{ - checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); - 
checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - - // Synchronize stream - checkCudaErrors(cudaStreamSynchronize(stream)); - if (print) - { - for (int i = 0; i < (num_faces + num_cells); i++) - fprintf(stderr, "h_A_csr[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_A_csr[i], h_A_csr[i + (num_faces + num_cells)], h_A_csr[i + 2 * (num_faces + num_cells)]); - for (int i = 0; i < num_cells; i++) - fprintf(stderr, "h_b[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_b[i], h_b[i + num_cells], h_b[i + 2 * num_cells]); +double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) { + char mergedName[256]; + if (pos == position::internal) { + sprintf(mergedName, "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } else if (pos == position::boundary) { + sprintf(mergedName, "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); } - char *input_file = "of_output.txt"; - FILE *fp = fopen(input_file, "rb+"); - if (fp == NULL) - { - fprintf(stderr, "Failed to open input file: %s!\n", input_file); + double *pointer = nullptr; + if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { + pointer = fieldPointerMap[std::string(mergedName)]; } - int readfile = 0; - double *of_b = new double[3 * num_cells]; - double *of_A = new double[3 * (num_faces + num_cells)]; - readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp); - readfile = fread(of_A, (num_faces + num_cells) * sizeof(double) * 3, 1, fp); - - std::vector h_A_of_init_vec(3 * (num_cells + num_faces)); - std::copy(of_A, of_A + (num_cells + num_faces) * 3, h_A_of_init_vec.begin()); - - std::vector h_A_of_vec_perm(3 * (num_faces + num_cells), 0); - for (int i = 0; i < num_faces + num_cells; i++) - { - h_A_of_vec_perm[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]]; - h_A_of_vec_perm[i + num_faces + num_cells] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + num_faces + num_cells]; - h_A_of_vec_perm[i + 2 * (num_faces + 
num_cells)] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + 2 * (num_faces + num_cells)]; + if (pointer == nullptr) { + fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName); } - - // b - std::vector h_b_of_init_vec(3 * num_cells); - std::copy(of_b, of_b + 3 * num_cells, h_b_of_init_vec.begin()); - std::vector h_b_of_vec; - for (int i = 0; i < 3 * num_cells; i += 3) - { - h_b_of_vec.push_back(h_b_of_init_vec[i]); + //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer); + + return pointer; +} + +void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, + const double *source, const double *internal_coeffs, const double *boundary_coeffs, + // const double *tmpVal, + bool printFlag) +{ + DEBUG_TRACE; + std::vector h_lower; + h_lower.resize(dataBase_.num_surfaces); + checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_lower"); + checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_upper; + h_upper.resize(dataBase_.num_surfaces); + checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_upper"); + checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_diag; + h_diag.resize(dataBase_.num_cells); + checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_diag"); + checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_source, h_source_ref; + h_source.resize(dataBase_.num_cells * 3); + h_source_ref.resize(dataBase_.num_cells * 3); + for (int i = 0; i < dataBase_.num_cells; i++) { + h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0]; + 
h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1]; + h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2]; } - // fill RHS_y - for (int i = 1; i < 3 * num_cells; i += 3) - { - h_b_of_vec.push_back(h_b_of_init_vec[i]); + checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_source"); + checkVectorEqual(dataBase_.num_cells * 3, h_source_ref.data(), h_source.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_internal_coeffs, h_internal_coeffs_ref; + h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + h_internal_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3); + for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { + h_internal_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 0]; + h_internal_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 1]; + h_internal_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 2]; } - // fill RHS_z - for (int i = 2; i < 3 * num_cells; i += 3) - { - h_b_of_vec.push_back(h_b_of_init_vec[i]); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_internal_coeffs_ref.data(), h_internal_coeffs.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_boundary_coeffs, h_boundary_coeffs_ref; + h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + h_boundary_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3); + for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { + h_boundary_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 0]; + h_boundary_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 1]; + h_boundary_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 2]; } + 
checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_boundary_coeffs_ref.data(), h_boundary_coeffs.data(), 1e-14, printFlag); + DEBUG_TRACE; - if (print) - { - for (int i = 0; i < (num_faces + num_cells); i++) - printf("h_A_of_vec[%d]:(%.10lf, %.10lf, %.10lf)\n", i, h_A_of_vec_perm[i], h_A_of_vec_perm[i + (num_faces + num_cells)], h_A_of_vec_perm[i + (num_faces + num_cells) * 2]); - for (int i = 0; i < num_cells; i++) - printf("h_b_of_vec[%d]: (%.10lf, %.10lf, %.10lf)\n", i, of_b[i * 3], of_b[i * 3 + 1], of_b[i * 3 + 2]); - } - - // check - // fprintf(stderr, "check of h_A_csr\n"); - // checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); - // fprintf(stderr, "check of h_b\n"); - // checkVectorEqual(3 * num_cells, h_b_of_vec.data(), h_b, 1e-5); -} - -void dfUEqn::solve() -{ - // for (size_t i = 0; i < num_cells; i++) - // fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i], - // h_velocity_old[3*i + 1], h_velocity_old[3*i + 2]); - // constructor AmgXSolver at first interation - // Synchronize stream - // checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - // checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - - checkCudaErrors(cudaStreamSynchronize(stream)); - - // nvtxRangePush("solve"); - - int nNz = num_cells + num_faces; // matrix entries - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - UxSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr); - UySolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + nNz); - UzSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 2 * nNz); - } - else - { - UxSolver->updateOperator(num_cells, nNz, 
d_A_csr); - UySolver->updateOperator(num_cells, nNz, d_A_csr + nNz); - UzSolver->updateOperator(num_cells, nNz, d_A_csr + 2 * nNz); - } - UxSolver->solve(num_cells, d_psi, d_b); - UySolver->solve(num_cells, d_psi + num_cells, d_b + num_cells); - UzSolver->solve(num_cells, d_psi + 2 * num_cells, d_b + 2 * num_cells); - num_iteration++; - - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - permute_psi_d2h<<>>(num_cells, d_psi, d_psi_permute); - checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - // for (size_t i = 0; i < num_cells; i++) - // fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], - // h_psi[num_cells + i], h_psi[num_cells*2 + i]); + // std::vector h_tmpVal; + // h_tmpVal.resize(dataBase_.num_cells * 3); + // checkCudaErrors(cudaMemcpy(h_tmpVal.data(), d_fvc_output, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + // checkVectorEqual(dataBase_.num_cells * 3, tmpVal, h_tmpVal.data(), 1e-14, printFlag); + // DEBUG_TRACE; } -void dfUEqn::sync() -{ - checkCudaErrors(cudaStreamSynchronize(stream)); -} - -void dfUEqn::updatePsi(double *Psi) -{ - checkCudaErrors(cudaStreamSynchronize(stream)); - memcpy(Psi, h_psi, cell_vec_bytes); -} - -void dfUEqn::correctBoundaryConditions() -{ - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - ueqn_correct_BoundaryConditions_kernel<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - d_psi, dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); -} - -// correct volecity in pEqn -void dfUEqn::correctPsi(double *Psi) -{ - memcpy(h_psi, Psi, cell_vec_bytes); - checkCudaErrors(cudaMemcpyAsync(d_psi_permute, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - - size_t threads_per_block = 1024; - size_t blocks_per_grid = 
(num_cells + threads_per_block - 1) / threads_per_block; - permute_psi_h2d<<>>(num_cells, d_psi_permute, d_psi); -} - -dfUEqn::~dfUEqn() -{ -} diff --git a/src_gpu_orig/AmgXSolver.H b/src_gpu_orig/AmgXSolver.H new file mode 100644 index 000000000..190808934 --- /dev/null +++ b/src_gpu_orig/AmgXSolver.H @@ -0,0 +1,310 @@ +/** + * \file AmgXSolver.hpp + * \brief Definition of class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. + */ + + +#ifndef __AMGX_SOLVER_H__ +#define __AMGX_SOLVER_H__ + +// CUDA +#include + +// STL +# include +# include +# include + +// AmgX +# include + +// PETSc +// # include + + +/** \brief A macro to check the returned CUDA error code. + * + * \param call [in] Function call to CUDA API. + */ +# define CHECK(call) \ +do \ +{ \ + const cudaError_t error_code = call; \ + if (error_code != cudaSuccess) \ + { \ + printf("CUDA Error:\n"); \ + printf(" File: %s\n", __FILE__); \ + printf(" Line: %d\n", __LINE__); \ + printf(" Error code: %d\n", error_code); \ + printf(" Error text: %s\n", \ + cudaGetErrorString(error_code)); \ + exit(1); \ + } \ +} while (0) + + + + + + +/** \brief A wrapper class for coupling PETSc and AmgX. + * + * This class is a wrapper of AmgX library for PETSc. PETSc users only need to + * pass a PETSc matrix and vectors into an AmgXSolver instance to solve their + * linear systems. The class is designed specifically for the situation where + * the number of MPI processes is more than the number of GPU devices. + * + * Eaxmple usage: + * \code + * int main(int argc, char **argv) + * { + * // initialize matrix A, RHS, etc using PETSc + * ... 
+ * + * // create an instance of the solver wrapper + * AmgXSolver solver; + * // initialize the instance with communicator, executation mode, and config file + * solver.initialize(comm, mode, file); + * // set matrix A. Currently it only accept PETSc AIJ matrix + * solver.setA(A); + * // solve. x and rhs are PETSc vectors. unkns will be the final result in the end + * solver.solve(unks, rhs); + * // get number of iterations + * int iters; + * solver.getIters(iters); + * // get residual at the last iteration + * double res; + * solver.getResidual(iters, res); + * // finalization + * solver.finalize(); + * + * // other codes + * .... + * + * return 0; + * } + * \endcode + */ +class AmgXSolver +{ + public: + + /** \brief Default constructor. */ + AmgXSolver() = default; + + /** \brief Construct a AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to AmgX configuration file. + */ + AmgXSolver + ( + const std::string &modeStr, + const std::string &cfgFile + ); + + /** \brief Destructor. */ + ~AmgXSolver(); + + /** \brief Initialize a AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to AmgX configuration file. + * + */ + void initialize + ( + const std::string &modeStr, + const std::string &cfgFile + ); + + + /** \brief Finalize this instance. + * + * This function destroys AmgX data. When there are more than one + * AmgXSolver instances, the last one destroyed is also in charge of + * destroying the shared resource object and finalizing AmgX. + * + */ + void finalize(); + + /** \brief Set up the matrix used by AmgX. + * + * This function sets up the AmgX matrix from the provided CSR data + * structures and partition data. + * + * \param nGlobalRows [in] The number of global rows. 
+ * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non zero entries locally. + * \param rowOffsets [in] The local CSR matrix row offsets. + * \param colIndicesGlobal [in] The global CSR matrix column indices. + * \param values [in] The local CSR matrix values. + * id of the owning rank for each row. + * + */ + void setOperator + ( + const int nRows, + const int nNz, + const int *rowIndex, + const int *colIndex, + const double *value + ); + + /** \brief Re-sets up an existing AmgX matrix. + * + * Replaces the matrix coefficients with the provided values and performs + * a resetup for the AmgX matrix. + * + * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non zero entries locally. + * \param values [in] The local CSR matrix values. + * + */ + void updateOperator + ( + const int nRows, + const int nNz, + const double *value + ); + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after the solving. + * + * \param nLocalRows [in] The number of rows owned by this rank. + * \param pscalar [in, out] The unknown array. + * \param bscalar [in] The RHS array. + * \param matrix [in,out] The AmgX CSR matrix, A. + * + */ + void solve + ( + int nRows, + double* psi, + const double* rhs + ); + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after the solving. + * + * \param nLocalRows [in] The number of rows owned by this rank. 
+ * \param p [in, out] The unknown vector. + * \param b [in] The RHS vector. + * \param matrix [in,out] The AmgX CSR matrix, A. + * + */ + // void solve + // ( + // int nLocalRows, + // Vec& p, + // Vec& b, + // AmgXCSRMatrix& matrix + // ); + + + /** \brief Get the number of iterations of the last solving. + * + * \param iter [out] Number of iterations. + * + */ + void getIters + ( + int &iter + ); + + /** \brief Get the residual at a specific iteration during the last solving. + * + * \param iter [in] Target iteration. + * \param res [out] Returned residual. + * + */ + void getResidual + ( + const int &iter, + double &res + ); + + + private: + + /** \brief Current count of AmgXSolver instances. + * + * This static variable is used to count the number of instances. The + * fisrt instance is responsable for initializing AmgX library and the + * resource instance. + */ + static int count; + + /** \brief A flag indicating if this instance has been initialized. */ + bool isInitialised = false; + + /** \brief A parameter used by AmgX. */ + int ring; + + /** \brief AmgX solver mode. */ + AMGX_Mode mode; + + /** \brief AmgX config object. */ + AMGX_config_handle cfg = nullptr; + + /** \brief AmgX matrix object. */ + AMGX_matrix_handle AmgXA = nullptr; + + /** \brief AmgX vector object representing unknowns. */ + AMGX_vector_handle AmgXP = nullptr; + + /** \brief AmgX vector object representing RHS. */ + AMGX_vector_handle AmgXRHS = nullptr; + + /** \brief AmgX solver object. */ + AMGX_solver_handle solver = nullptr; + + /** \brief AmgX resource object. + * + * Due to the design of AmgX library, using more than one resource + * instance may cause some problems. So we make the resource instance + * as a static member to keep only one instance. + */ + static AMGX_resources_handle rsrc; + + /** \brief Set AmgX solver mode based on the user-provided string. + * + * Available modes are: dDDI, dDFI, dFFI, hDDI, hDFI, hFFI. + * + * \param modeStr [in] a std::string. 
+ */ + void setMode(const std::string &modeStr); + + /** \brief Perform necessary initialization of AmgX. + * + * This function initializes AmgX for current instance. Based on + * \ref AmgXSolver::count "count", only the instance initialized first + * is in charge of initializing AmgX and the resource instance. + * + * \param cfgFile [in] Path to AmgX solver configuration file. + */ + void initAmgX(const std::string &cfgFile); +}; + +#endif + diff --git a/src_gpu_orig/AmgXSolver.cu b/src_gpu_orig/AmgXSolver.cu new file mode 100644 index 000000000..b0076e5c3 --- /dev/null +++ b/src_gpu_orig/AmgXSolver.cu @@ -0,0 +1,296 @@ +/** + * \file AmgXSolver.cpp + * \brief Definition of member functions of the class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. 
+ */ + +// AmgXWrapper +#include "AmgXSolver.H" +#include +#include + +// initialize AmgXSolver::count to 0 +int AmgXSolver::count = 0; + +// initialize AmgXSolver::rsrc to nullptr; +AMGX_resources_handle AmgXSolver::rsrc = nullptr; + + +/* \implements AmgXSolver::AmgXSolver */ +AmgXSolver::AmgXSolver(const std::string &modeStr, const std::string &cfgFile) +{ + initialize(modeStr, cfgFile); +} + + +/* \implements AmgXSolver::~AmgXSolver */ +AmgXSolver::~AmgXSolver() +{ + if (isInitialised) finalize(); +} + + +/* \implements AmgXSolver::initialize */ +void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFile) +{ + + // if this instance has already been initialized, skip + if (isInitialised) { + fprintf(stderr, + "This AmgXSolver instance has been initialized on this process.\n"); + exit(0); + } + + // increase the number of AmgXSolver instances + count += 1; + + // get the mode of AmgX solver + setMode(modeStr); + + // initialize AmgX + initAmgX(cfgFile); + + // a bool indicating if this instance is initialized + isInitialised = true; + + return; +} + +/* \implements AmgXSolver::setMode */ +void AmgXSolver::setMode(const std::string &modeStr) +{ + if (modeStr == "dDDI") + mode = AMGX_mode_dDDI; + else if (modeStr == "dDFI") + mode = AMGX_mode_dDFI; + else if (modeStr == "dFFI") + mode = AMGX_mode_dFFI; + else if (modeStr[0] == 'h') { + printf("CPU mode, %s, is not supported in this wrapper!", + modeStr.c_str()); + exit(0); + } + else { + printf("%s is not an available mode! 
Available modes are: " + "dDDI, dDFI, dFFI.\n", modeStr.c_str()); + exit(0); + } +} + + +/* \implements AmgXSolver::initAmgX */ + void AmgXSolver::initAmgX(const std::string &cfgFile) +{ + // only the first instance (AmgX solver) is in charge of initializing AmgX + if (count == 1) + { + // initialize AmgX + AMGX_SAFE_CALL(AMGX_initialize()); + + // intialize AmgX plugings + AMGX_SAFE_CALL(AMGX_initialize_plugins()); + + // let AmgX to handle errors returned + AMGX_SAFE_CALL(AMGX_install_signal_handler()); + } + + // create an AmgX configure object + AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, cfgFile.c_str())); + + // let AmgX handle returned error codes internally + AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1")); + + // create an AmgX resource object, only the first instance is in charge + if (count == 1) AMGX_resources_create_simple(&rsrc, cfg); + + // create AmgX vector object for unknowns and RHS + AMGX_vector_create(&AmgXP, rsrc, mode); + AMGX_vector_create(&AmgXRHS, rsrc, mode); + + // create AmgX matrix object for unknowns and RHS + AMGX_matrix_create(&AmgXA, rsrc, mode); + + // create an AmgX solver object + AMGX_solver_create(&solver, rsrc, mode, cfg); + + // obtain the default number of rings based on current configuration + AMGX_config_get_default_number_of_rings(cfg, &ring); +} + +/* \implements AmgXSolver::finalize */ +void AmgXSolver::finalize() +{ + // skip if this instance has not been initialised + if (!isInitialised) + { + fprintf(stderr, + "This AmgXWrapper has not been initialised. 
" + "Please initialise it before finalization.\n"); + exit(0); + } + + // destroy solver instance + AMGX_solver_destroy(solver); + + // destroy matrix instance + AMGX_matrix_destroy(AmgXA); + + // destroy RHS and unknown vectors + AMGX_vector_destroy(AmgXP); + AMGX_vector_destroy(AmgXRHS); + + // only the last instance need to destroy resource and finalizing AmgX + if (count == 1) + { + AMGX_resources_destroy(rsrc); + AMGX_SAFE_CALL(AMGX_config_destroy(cfg)); + + AMGX_SAFE_CALL(AMGX_finalize_plugins()); + AMGX_SAFE_CALL(AMGX_finalize()); + } + else + { + AMGX_config_destroy(cfg); + } + + // decrease the number of instances + count -= 1; + + // change status + isInitialised = false; +} + +/* \implements AmgXSolver::setOperator */ +void AmgXSolver::setOperator +( + const int nRows, + const int nNz, + const int *rowIndex, + const int *colIndex, + const double *value +) +{ + + // Check the matrix size is not larger than tolerated by AmgX + if(nRows > std::numeric_limits::max()) + { + fprintf(stderr, + "AmgX does not support a global number of rows greater than " + "what can be stored in 32 bits (nGlobalRows = %d).\n", + nRows); + exit(0); + } + + if (nNz > std::numeric_limits::max()) + { + fprintf(stderr, + "AmgX does not support non-zeros per (consolidated) rank greater than" + "what can be stored in 32 bits (nLocalNz = %d).\n", + nNz); + exit(0); + } + + // upload matrix A to AmgX + AMGX_matrix_upload_all( + AmgXA, nRows, nNz, 1, 1, rowIndex, colIndex, value, nullptr); + + // bind the matrix A to the solver + AMGX_solver_setup(solver, AmgXA); + + // connect (bind) vectors to the matrix + AMGX_vector_bind(AmgXP, AmgXA); + AMGX_vector_bind(AmgXRHS, AmgXA); +} + + +/* \implements AmgXSolver::updateOperator */ +void AmgXSolver::updateOperator +( + const int nRows, + const int nNz, + const double *value +) +{ + + // Replace the coefficients for the CSR matrix A within AmgX + AMGX_matrix_replace_coefficients(AmgXA, nRows, nNz, value, nullptr); + + // Re-setup the solver (a 
reduced overhead setup that accounts for consistent matrix structure) + AMGX_solver_resetup(solver, AmgXA); +} + +/* \implements AmgXSolver::solve */ +// void AmgXSolver::solve( +// int nLocalRows, Vec& p, Vec& b, AmgXCSRMatrix& matrix) +// { +// double* pscalar; +// double* bscalar; + +// // get pointers to the raw data of local vectors +// VecGetArray(p, &pscalar); +// VecGetArray(b, &bscalar); + +// solve(nLocalRows, pscalar, bscalar, matrix); + +// VecRestoreArray(p, &pscalar); +// VecRestoreArray(b, &bscalar); +// } + + +/* \implements AmgXSolver::solve */ +void AmgXSolver::solve( + int nRows, double* psi, const double* rhs) +{ + // Upload potentially consolidated vectors to AmgX + AMGX_vector_upload(AmgXP, nRows, 1, psi); + AMGX_vector_upload(AmgXRHS, nRows, 1, rhs); + + // Solve + AMGX_solver_solve(solver, AmgXRHS, AmgXP); + + // Get the status of the solver + AMGX_SOLVE_STATUS status; + AMGX_solver_get_status(solver, &status); + + // Check whether the solver successfully solved the problem + if (status != AMGX_SOLVE_SUCCESS) + { + fprintf(stderr, "AmgX solver failed to solve the system! 
" + "The error code is %d.\n", + status); + } + + // Download data from device + AMGX_vector_download(AmgXP, psi); + + // get norm and iteration number + double irnorm = 0., rnorm = 0.; + int nIters = 0; + getResidual(0, irnorm); + getIters(nIters); + getResidual(nIters, rnorm); + printf("Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters); + +} + + +/* \implements AmgXSolver::getIters */ +void AmgXSolver::getIters(int &iter) +{ + // only processes using AmgX will try to get # of iterations + AMGX_solver_get_iterations_number(solver, &iter); +} + + +/* \implements AmgXSolver::getResidual */ +void AmgXSolver::getResidual(const int &iter, double &res) +{ + // only processes using AmgX will try to get residual + AMGX_solver_get_iteration_residual(solver, iter, 0, &res); +} + diff --git a/src_gpu_orig/CMakeLists.txt b/src_gpu_orig/CMakeLists.txt new file mode 100644 index 000000000..3a6d59825 --- /dev/null +++ b/src_gpu_orig/CMakeLists.txt @@ -0,0 +1,38 @@ +# +# dfMatrix CMake configuration +# +cmake_minimum_required(VERSION 3.5) + +project(dfMatrixOrig LANGUAGES CXX CUDA) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(CUDA REQUIRED) +find_package(MPI REQUIRED) +find_package(CUDAToolkit REQUIRED) +find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) + +add_compile_options(-arch=sm_70 -fmad=false) + +include_directories( + ${MPI_INCLUDE_PATH} + ${CUDA_INCLUDE_DIRS} + $ENV{AMGX_DIR}/include + $ENV{DF_ROOT}/src_gpu +) + +add_library(${PROJECT_NAME} + SHARED + dfMatrixDataBaseOrig.cu + dfMatrixOpBaseOrig.cu) + +target_link_libraries(${PROJECT_NAME} + ${MPI_LIBRARIES} + ${CUDA_LIBRARIES} + ${LIBAMGXSH} +) +target_compile_options(dfMatrixOrig PUBLIC -g) +option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF) +if (DFMATRIX_ENABLE_DETAILED_DEBUG) + target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG) +endif() diff --git a/src_gpu/GPUMesh.H b/src_gpu_orig/GPUMesh.H similarity 
index 100% rename from src_gpu/GPUMesh.H rename to src_gpu_orig/GPUMesh.H diff --git a/src_gpu/GPUfield.H b/src_gpu_orig/GPUfield.H similarity index 100% rename from src_gpu/GPUfield.H rename to src_gpu_orig/GPUfield.H diff --git a/src_gpu/GPUfield.cpp b/src_gpu_orig/GPUfield.cpp similarity index 100% rename from src_gpu/GPUfield.cpp rename to src_gpu_orig/GPUfield.cpp diff --git a/src_gpu/dfEEqn.H b/src_gpu_orig/dfEEqn.H similarity index 100% rename from src_gpu/dfEEqn.H rename to src_gpu_orig/dfEEqn.H diff --git a/src_gpu/dfEEqn.cu b/src_gpu_orig/dfEEqn.cu similarity index 100% rename from src_gpu/dfEEqn.cu rename to src_gpu_orig/dfEEqn.cu diff --git a/src_gpu_orig/dfMatrixDataBaseOrig.H b/src_gpu_orig/dfMatrixDataBaseOrig.H new file mode 100644 index 000000000..e4a06d861 --- /dev/null +++ b/src_gpu_orig/dfMatrixDataBaseOrig.H @@ -0,0 +1,607 @@ +#pragma once + +#include +#include +#include "cuda_profiler_api.h" +#include +#include "nvtx3/nvToolsExt.h" +#include +#include +#include +#include +#include +#include +#include +#include "dfMatrixDataBase.H" + + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); + +struct dfMatrixDataBaseOrig +{ + // - cuda resource + cudaStream_t stream; + + // - number of cell size + int num_cells; + // - number of face size + int num_surfaces; + // - number of offdiagnal entry size (2*num_surfaces) + int num_faces; + // - number of boundary cells + int num_boundary_cells; + // - number of boundary faces + int num_boundary_faces; + + int num_species; + + // - mesh variables + // - csr_row_index + int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; + // - csr_col_index + int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; + // - csr_diag_index + int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; + + // - the pre-permutated and post-permutated interpolation weight list + std::vector h_weight_vec_init, h_weight_vec; + // - the pre-permutated and 
post-permutated flux (phi) list + std::vector h_phi_vec_init, h_phi_vec; + // - the pre-permutated and post-permutated cell face vector list + std::vector h_face_vector_vec_init, h_face_vector_vec; + std::vector h_face_vec_init, h_face_vec; + std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; + // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, + *h_pressure = nullptr; + const double *h_volume = nullptr; + // - the host pointer to the pre-permutated and post-permutated interpolation weight list + double *h_weight_init = nullptr, *h_weight = nullptr; + // - the host pointer to the pre-permutated and post-permutated flux (phi) list + double *h_phi_init = nullptr, *h_phi = nullptr; + // - the host pointer to the pre-permutated and post-permutated cell face vector list + double *h_face_vector_init = nullptr, *h_face_vector = nullptr; + double *h_face_init = nullptr, *h_face = nullptr; + double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; + // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, + *d_pressure = nullptr, *d_volume = nullptr; + // - the device pointer to Y(vector Yi) + //std::vector d_Y; + double *d_Y = nullptr; + // - the device pointer to the pre-permutated and post-permutated interpolation weight list + double *d_weight_init = nullptr, *d_weight = nullptr; + double *d_weight_upwind = nullptr; + // - the device pointer to the pre-permutated and post-permutated flux (phi) list + double *d_phi_init = nullptr, *d_phi = nullptr; + // - the device pointer to the pre-permutated and post-permutated cell face vector list + double *d_face_vector_init = nullptr, *d_face_vector = nullptr; + double *d_face_init = nullptr, *d_face = nullptr; + double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; + std::vector d_rhoD_vector; 
+ + double *d_hDiffCorrFlux = nullptr; + double *d_diffAlphaD = nullptr; + double *d_rhoD = nullptr; + double *d_alpha = nullptr; + + double rdelta_t = 1/1e-6; + + /** + * @brief boundary related variables + */ + int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; + int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; + double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, + *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, + *h_boundary_face = nullptr, *d_boundary_face = nullptr, + *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, + *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, + *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, + *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr, + *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, + *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, + *d_boundary_pressure_init = nullptr, + *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, + *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, + *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, + *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; + std::vector d_boundary_Y_vector; + std::vector d_boundary_Y_init_vector; + std::vector d_internal_coeffs_Y_vector; + std::vector d_boundary_coeffs_Y_vector; + std::vector d_laplac_internal_coeffs_Y_vector; + std::vector d_laplac_boundary_coeffs_Y_vector; + double *d_internal_coeffs_Y = nullptr; + double *d_boundary_coeffs_Y = nullptr; + double *d_laplac_internal_coeffs_Y = nullptr; + double *d_laplac_boundary_coeffs_Y = nullptr; + std::vector d_boundary_rhoD_vector; + double *d_boundary_mut_sct = nullptr; + double *d_boundary_rhoD = nullptr; + double *d_boundary_alpha = nullptr; + + double *d_boundary_hDiffCorrFlux = nullptr; + int *d_boundary_UpatchType = nullptr; + int *d_boundary_YpatchType = nullptr; 
+ + std::vector boundPermutationList; + std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; + std::vector boundary_face_vector; + std::vector boundary_pressure; + std::vector boundary_face; + std::vector boundary_deltaCoeffs; + std::vector> patch_type_init; + std::vector> patch_type; + + // - the device pointer to the permutated index list + std::vector permedIndex; + int *d_permedIndex=nullptr; + int *d_bouPermedIndex = nullptr; + + + // bytesize + // - bytes of diagnal entries + size_t cell_bytes; + // - bytes of diagnal entries (vector) + size_t cell_vec_bytes; + // - bytes of diagnal index + size_t cell_index_bytes; + // - bytes of diagnal index + size_t face_bytes; + size_t face_vec_bytes; + size_t face_index_bytes; + + size_t boundary_cell_bytes; + size_t boundary_cell_vec_bytes; + size_t boundary_cell_index_bytes; + + size_t boundary_face_bytes; + size_t boundary_face_vec_bytes; + size_t boundary_face_index_bytes; + + // A_csr has one more element in each row: itself + size_t csr_row_index_bytes; + size_t csr_col_index_bytes; + size_t csr_value_bytes; + size_t csr_value_vec_bytes; + + // extra matrix information + double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; + std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; + std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; + std::vector tmpPermutatedList; + int * d_tmpPermutatedList = nullptr; + + // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; + // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; + + int num_iteration; + + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + double* d_grad = nullptr; + double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; + double* d_nuEff = nullptr; + + // constructor + dfMatrixDataBaseOrig(); + dfMatrixDataBaseOrig(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, + const 
int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, + const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, + std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) + : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), + num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) + { + // create cuda stream + checkCudaErrors(cudaStreamCreate(&stream)); + + // allocate field pointer in pin memory + cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); + cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); + + h_weight_vec_init.resize(num_faces); + h_weight_vec.resize(num_faces); + h_face_vector_vec_init.resize(num_faces*3); + h_face_vector_vec.resize(num_faces*3); + h_face_vec_init.resize(num_faces); + h_face_vec.resize(num_faces); + h_deltaCoeffs_vec_init.resize(num_faces); + h_deltaCoeffs_vec.resize(num_faces); + h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); + h_turbSrc_init_1mtx.resize(num_faces + num_cells); + h_turbSrc_init_src_vec.resize(3*num_cells); + h_turbSrc_src_vec.resize(3*num_cells); + + // byte sizes + cell_bytes = num_cells * sizeof(double); + cell_vec_bytes = num_cells * 3 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + + face_bytes = num_faces * sizeof(double); + face_vec_bytes = num_faces * 3 * sizeof(double); + face_index_bytes = num_faces * sizeof(int); + + // A_csr has one more element in each row: itself + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); + csr_value_bytes = (num_cells + num_faces) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); + + /************************construct mesh variables****************************/ + /** + * 1. 
h_csr_row_index & h_csr_diag_index + */ + std::vector h_mtxEntry_perRow_vec(num_cells); + std::vector h_csr_diag_index_vec(num_cells); + std::vector h_csr_row_index_vec(num_cells + 1, 0); + + for (int faceI = 0; faceI < num_surfaces; faceI++) + { + h_csr_diag_index_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[owner[faceI]]++; + } + + // - consider diagnal element in each row + std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) + {return n + 1;}); + // - construct h_csr_row_index & h_csr_diag_index + std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); + // - assign h_csr_row_index & h_csr_diag_index + h_A_csr_row_index = h_csr_row_index_vec.data(); + h_A_csr_diag_index = h_csr_diag_index_vec.data(); + + /** + * 2. h_csr_col_index + */ + std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); + std::iota(diagIndex.begin(), diagIndex.end(), 0); + + // initialize the RowIndex (rowIndex of lower + upper + diagnal) + std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); + std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); + // initialize the ColIndex (colIndex of lower + upper + diagnal) + std::copy(owner, owner + num_surfaces, colIndex.begin()); + std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); + + // - construct hashTable for sorting + std::multimap rowColPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); + } + // - sort + std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); + std::sort(globalPerm.begin(), globalPerm.end(), [] + (const std::pair& pair1, 
const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + std::vector h_csr_col_index_vec; + std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] + (const std::pair& pair) { + return pair.second; + }); + h_A_csr_col_index = h_csr_col_index_vec.data(); + + // construct a tmp permutated List for add fvMatrix + std::vector tmp_permutation(2*num_surfaces + num_cells); + std::vector tmp_rowIndex(2*num_surfaces + num_cells); + std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); + std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); + std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); + std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); + std::multimap tmpPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); + } + std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); + std::sort(tmpPerm.begin(), tmpPerm.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] + (const std::pair& pair) { + return pair.second; + }); + + /** + * 3. 
boundary imformations + */ + // get boundPermutation and offset lists + std::vector boundPermutationListInit(num_boundary_faces); + std::vector boundOffsetList; + std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); + + // - construct hashTable for sorting + std::multimap boundPermutation; + for (int i = 0; i < num_boundary_faces; i++) + { + boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); + } + + // - sort + std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); + std::sort(boundPermPair.begin(), boundPermPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + // - construct boundPermedIndex and boundary_cell_id + std::vector boundary_cell_id; + boundPermutationList.clear(); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] + (const std::pair& pair) { + return pair.first; + }); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] + (const std::pair& pair) { + return pair.second; + }); + + // construct boundary_cell_offset + std::map countMap; + std::vector boundaryCellcount; + for (const auto& cellIndex : boundary_cell_id) + ++ countMap[cellIndex]; + for (const auto& [cellIndex, count] : countMap) + boundaryCellcount.push_back(count); + + num_boundary_cells = boundaryCellcount.size(); + num_boundary_cells_output = num_boundary_cells; + + std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); + std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); + + // assign h_boundary_cell_offset & h_boundary_cell_id + h_boundary_cell_offset = boundary_cell_offset.data(); + h_boundary_cell_id = boundary_cell_id.data(); + + // + boundary_cell_bytes = num_boundary_cells * 
sizeof(double); + boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); + boundary_cell_index_bytes = num_boundary_cells * sizeof(int); + + boundary_face_bytes = num_boundary_faces * sizeof(double); + boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); + boundary_face_index_bytes = num_boundary_faces * sizeof(int); + + ueqn_internalCoeffs.resize(3*num_boundary_faces); + ueqn_boundaryCoeffs.resize(3*num_boundary_faces); + + boundary_face_vector.resize(3*num_boundary_faces); + boundary_pressure.resize(num_boundary_faces); + boundary_face.resize(num_boundary_faces); + boundary_deltaCoeffs.resize(num_boundary_faces); + + patch_type.resize(2); + patch_type[0].resize(num_boundary_faces); + patch_type[1].resize(num_boundary_faces); + + /** + * 4. permutation list for field variables + */ + std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); + // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) + std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); + std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); + + // - initialize the permIndex (0, 1, ..., 2*num_surfaces) + std::iota(permIndex.begin(), permIndex.end(), 0); + + // - construct hashTable for sorting + std::multimap permutation; + for (int i = 0; i < 2*num_surfaces; i++) + { + permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); + } + // - sort + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + // - form permedIndex list + std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] + (const std::pair& pair) { + return pair.second; + }); + + // copy and permutate cell variables + std::copy(weight, weight + 
num_surfaces, h_weight_vec_init.begin()); + std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); + std::copy(face, face + num_surfaces, h_face_vec_init.begin()); + std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); + for (int i = 0; i < num_faces; i++) + { + h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; + h_face_vec[i] = h_face_vec_init[permedIndex[i]]; + h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; + h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; + h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; + h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; + } + h_weight = h_weight_vec.data(); + h_face_vector = h_face_vector_vec.data(); + h_face = h_face_vec.data(); + h_deltaCoeffs = h_deltaCoeffs_vec.data(); + + for (int i = 0; i < num_boundary_faces; i++) + { + boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; + boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; + boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; + boundary_face[i] = boundary_face_init[boundPermutationList[i]]; + boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; + patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; + patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; + } + h_boundary_face_vector = boundary_face_vector.data(); + h_boundary_face = boundary_face.data(); + h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); + + 
/************************allocate memory on device****************************/ + int total_bytes = 0; + + checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); + total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); + + //d_Y.resize(num_species); + d_rhoD_vector.resize(num_species); + d_boundary_Y_vector.resize(num_species); + d_boundary_Y_init_vector.resize(num_species); + d_internal_coeffs_Y_vector.resize(num_species); + d_boundary_coeffs_Y_vector.resize(num_species); + d_laplac_internal_coeffs_Y_vector.resize(num_species); + d_laplac_boundary_coeffs_Y_vector.resize(num_species); + d_boundary_rhoD_vector.resize(num_species); + + for (size_t i = 0; i < num_species; ++i){ + //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); + } + checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * 
num_species)); + checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes)); + total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); + total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, 
boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); + for (size_t i = 0; i < num_species; ++i){ + checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); + } + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes)); + + total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); + + // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_b, 
cell_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); + total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); + + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); + total_bytes += (2*csr_value_bytes + cell_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); + total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); + total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename + + checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); + + fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + + checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); + 
checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + }; + + ~dfMatrixDataBaseOrig(){ + std::cout << "Destructor called." 
<< std::endl; + // TODO: free pointers + }; +}; + diff --git a/src_gpu_orig/dfMatrixDataBaseOrig.cu b/src_gpu_orig/dfMatrixDataBaseOrig.cu new file mode 100644 index 000000000..7eb0ba593 --- /dev/null +++ b/src_gpu_orig/dfMatrixDataBaseOrig.cu @@ -0,0 +1,48 @@ +#include "dfMatrixDataBaseOrig.H" + + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, + const int patchSize) +{ + boundaryConditions patchCondition; + std::vector tmpSelector; + static std::map BCMap = { + {"zeroGradient", zeroGradient}, + {"fixedValue", fixedValue}, + {"empty", empty}, + {"coupled", coupled} + }; + auto iter = BCMap.find(patchTypeStr); + if (iter != BCMap.end()) { + patchCondition = iter->second; + } else { + throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); + } + // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 + switch (patchCondition){ + case zeroGradient: + { + tmpSelector.resize(patchSize, 0); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case fixedValue: + { + tmpSelector.resize(patchSize, 1); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case empty: + { + tmpSelector.resize(patchSize, 2); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case coupled: + { + tmpSelector.resize(patchSize, 3); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + } +} diff --git a/src_gpu_orig/dfMatrixOpBaseOrig.H b/src_gpu_orig/dfMatrixOpBaseOrig.H new file mode 100644 index 000000000..0f61b558b --- /dev/null +++ b/src_gpu_orig/dfMatrixOpBaseOrig.H @@ -0,0 +1,9 @@ +#pragma once + +#include "dfMatrixDataBaseOrig.H" +#include "dfMatrixDataBase.H" + +void fvc_grad_vector_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad, + 
double *d_grad_boundary_init, double *d_grad_boundary); + +void fvc_grad_scalar_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad); \ No newline at end of file diff --git a/src_gpu_orig/dfMatrixOpBaseOrig.cu b/src_gpu_orig/dfMatrixOpBaseOrig.cu new file mode 100644 index 000000000..95737ab12 --- /dev/null +++ b/src_gpu_orig/dfMatrixOpBaseOrig.cu @@ -0,0 +1,460 @@ +#include "dfMatrixOpBaseOrig.H" + + +__global__ void fvc_grad_vector_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *sf, const double *vf, const double *tlambdas, const double *volume, + double *grad) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_vf_x = vf[index * 3 + 0]; + double own_vf_y = vf[index * 3 + 1]; + double own_vf_z = vf[index * 3 + 2]; + double grad_xx = 0; + double grad_xy = 0; + double grad_xz = 0; + double grad_yx = 0; + double grad_yy = 0; + double grad_yz = 0; + double grad_zx = 0; + double grad_zy = 0; + double grad_zz = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x; + double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y; + double face_z = (1 - w) * own_vf_z + w 
* neighbor_vf_z; + grad_xx -= sf_x * face_x; + grad_xy -= sf_x * face_y; + grad_xz -= sf_x * face_z; + grad_yx -= sf_y * face_x; + grad_yy -= sf_y * face_y; + grad_yz -= sf_y * face_z; + grad_zx -= sf_z * face_x; + grad_zy -= sf_z * face_y; + grad_zz -= sf_z * face_z; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x; + double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y; + double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z; + grad_xx += sf_x * face_x; + grad_xy += sf_x * face_y; + grad_xz += sf_x * face_z; + grad_yx += sf_y * face_x; + grad_yy += sf_y * face_y; + grad_yz += sf_y * face_z; + grad_zx += sf_z * face_x; + grad_zy += sf_z * face_y; + grad_zz += sf_z * face_z; + // if (index == 0) + // { + // printf("grad_xx = %.20lf\n", grad_xx); + // // printf("sf_x = %.20lf\n", sf_x); + // // printf("face_x = %.20lf\n", face_x); + // } + } + double vol = volume[index]; + grad[index * 9 + 0] = grad_xx / vol; + grad[index * 9 + 1] = grad_xy / vol; + grad[index * 9 + 2] = grad_xz / vol; + grad[index * 9 + 3] = grad_yx / vol; + grad[index * 9 + 4] = grad_yy / vol; + grad[index * 9 + 5] = grad_yz / vol; + grad[index * 9 + 6] = grad_zx / vol; + grad[index * 9 + 7] = grad_zy / vol; + grad[index * 9 + 8] = grad_zz / vol; + + + // if (index == 2257) + // { + // printf("grad[2257] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], + // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 
9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); + // } +} + +__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_vf, const double *volume, + double *grad, double *grad_boundary_init) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + double grad_xx = 0; + double grad_xy = 0; + double grad_xz = 0; + double grad_yx = 0; + double grad_yy = 0; + double grad_yz = 0; + double grad_zx = 0; + double grad_zy = 0; + double grad_zz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int p = bouPermedIndex[i]; + double sf_x = boundary_sf[i * 3 + 0]; + double sf_y = boundary_sf[i * 3 + 1]; + double sf_z = boundary_sf[i * 3 + 2]; + double vf_x = boundary_vf[p * 3 + 0]; + double vf_y = boundary_vf[p * 3 + 1]; + double vf_z = boundary_vf[p * 3 + 2]; + grad_xx += sf_x * vf_x; + grad_xy += sf_x * vf_y; + grad_xz += sf_x * vf_z; + grad_yx += sf_y * vf_x; + grad_yy += sf_y * vf_y; + grad_yz += sf_y * vf_z; + grad_zx += sf_z * vf_x; + grad_zy += sf_z * vf_y; + grad_zz += sf_z * vf_z; + } + + double vol = volume[cell_index]; + + grad[cell_index * 9 + 0] += grad_xx / vol; + grad[cell_index * 9 + 1] += grad_xy / vol; + grad[cell_index * 9 + 2] += grad_xz / vol; + grad[cell_index * 9 + 3] += grad_yx / vol; + grad[cell_index * 9 + 4] += grad_yy / vol; + grad[cell_index * 9 + 5] += grad_yz / vol; + grad[cell_index * 9 + 6] += grad_zx / vol; + grad[cell_index * 9 + 7] += grad_zy / vol; + grad[cell_index * 9 + 8] += grad_zz / vol; + + grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0]; + grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1]; + 
grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2]; + grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3]; + grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4]; + grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5]; + grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6]; + grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7]; + grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8]; + + // if (index == 0) + // { + // printf("grad[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], + // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); + // } +} + +__global__ void correct_boundary_conditions(int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *mag_sf, + double *boundary_grad_init, double *boundary_grad, const double *boundary_deltaCoeffs, + const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // initialize boundary_grad + double grad_xx = boundary_grad_init[index * 9 + 0]; + double grad_xy = boundary_grad_init[index * 9 + 1]; + double grad_xz = boundary_grad_init[index * 9 + 2]; + double grad_yx = boundary_grad_init[index * 9 + 3]; + double grad_yy = boundary_grad_init[index * 9 + 4]; + double grad_yz = boundary_grad_init[index * 9 + 5]; + double grad_zx = boundary_grad_init[index * 9 + 6]; + double grad_zy = boundary_grad_init[index * 9 + 7]; + double grad_zz = boundary_grad_init[index * 9 + 8]; + + double internal_U_x = internal_velocity[cell_index * 3 + 
0]; + double internal_U_y = internal_velocity[cell_index * 3 + 1]; + double internal_U_z = internal_velocity[cell_index * 3 + 2]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + // OpenFoam code + // const vectorField n + // ( + // vsf.mesh().Sf().boundaryField()[patchi] + // / vsf.mesh().magSf().boundaryField()[patchi] + // ); + // gGradbf[patchi] += n * + // ( + // vsf.boundaryField()[patchi].snGrad() + // - (n & gGradbf[patchi]) + // ); + // template // fixedValue + // Foam::tmp> Foam::fvPatchField::snGrad() const + // { + // return patch_.deltaCoeffs()*(*this - patchInternalField()); + // } + + double n_x = boundary_sf[i * 3 + 0] / mag_sf[i]; + double n_y = boundary_sf[i * 3 + 1] / mag_sf[i]; + double n_z = boundary_sf[i * 3 + 2] / mag_sf[i]; + + int p = bouPermedIndex[i]; + + double sn_grad_x, sn_grad_y, sn_grad_z; + int patchIndex = U_patch_type[i]; + if (patchIndex == 0) { // zeroGradient + sn_grad_x = 0; + sn_grad_y = 0; + sn_grad_z = 0; + } else if (patchIndex == 1) { // fixedValue + sn_grad_x = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 0] - internal_velocity[cell_index * 3 + 0]); + sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 1] - internal_velocity[cell_index * 3 + 1]); + sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 2] - internal_velocity[cell_index * 3 + 2]); + // if (index == 1) + // { + // printf("cell_index = %d\n", cell_index); + // printf("boundary_velocity = %e\n", boundary_velocity[i * 3 + 1]); + // printf("internal_velocity = %e\n", internal_velocity[cell_index * 3 + 0]); + // } + + } + // TODO: implement other BCs + double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); + double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); + double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); + boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x; + boundary_grad[i * 9 + 1] = grad_xy + 
n_x * grad_correction_y; + boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z; + boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x; + boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y; + boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z; + boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x; + boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y; + boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z; + + } +} + +__global__ void fvc_grad_scalar_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *pressure, const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_p = pressure[index]; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + double grad_bx_low = 0; + double grad_bx_upp = 0; + double grad_by_low = 0; + double grad_by_upp = 0; + double grad_bz_low = 0; + double grad_bz_upp = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_low -= face_p * sfx; + grad_by_low -= face_p * sfy; + grad_bz_low 
-= face_p * sfz; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; + grad_bx_upp += face_p * sfx; + grad_by_upp += face_p * sfy; + grad_bz_upp += face_p * sfz; + } + } + double vol = volume[index]; + b_output[index * 3 + 0] = b_input[index * 3 + 0] + (grad_bx_low + grad_bx_upp) / vol; + b_output[index * 3 + 1] = b_input[index * 3 + 1] + (grad_by_low + grad_by_upp) / vol; + b_output[index * 3 + 2] = b_input[index * 3 + 2] + (grad_bz_low + grad_bz_upp) / vol; + // b_output[index * 3 + 0] = b_input[index * 3 + 0] + grad_bx_low + grad_bx_upp; + // b_output[index * 3 + 1] = b_input[index * 3 + 1] + grad_by_low + grad_by_upp; + // b_output[index * 3 + 2] = b_input[index * 3 + 2] + grad_bz_low + grad_bz_upp; + +} + +__global__ void fvc_grad_scalar_boundary(int num_cells, int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_face_vector, const double *boundary_pressure, const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // compute boundary gradient + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int p = bouPermedIndex[i]; + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 
1]; + double sfz = boundary_face_vector[i * 3 + 2]; + double face_p = boundary_pressure[p]; + grad_bx += face_p * sfx; + grad_by += face_p * sfy; + grad_bz += face_p * sfz; + } + + //// correct the boundary gradient + // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + // double sn_grad = 0; + // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + // grad_bx += nx * grad_correction; + // grad_by += ny * grad_correction; + // grad_bz += nz * grad_correction; + + double vol = volume[cell_index]; + b_output[cell_index * 3 + 0] = b_input[cell_index * 3 + 0] + grad_bx / vol; + b_output[cell_index * 3 + 1] = b_input[cell_index * 3 + 1] + grad_by / vol; + b_output[cell_index * 3 + 2] = b_input[cell_index * 3 + 2] + grad_bz / vol; +} + + +void fvc_grad_vector_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad, + double *d_grad_boundary_init, double *d_grad_boundary) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase.num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_internal<<>>(dataBase.num_cells, + dataBaseOrig->d_A_csr_row_index, dataBaseOrig->d_A_csr_col_index, dataBaseOrig->d_A_csr_diag_index, + dataBaseOrig->d_face_vector, dataBase.d_u, dataBaseOrig->d_weight, dataBaseOrig->d_volume, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_vector_orig internal 
执行时间:%f(ms)\n", time_elapsed); + + + checkCudaErrors(cudaEventRecord(start, 0)); + blocks_per_grid = (dataBaseOrig->num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_boundary<<>>(dataBase.num_cells, dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, dataBaseOrig->d_boundary_face_vector, + dataBase.d_boundary_u, dataBase.d_volume, d_grad, d_grad_boundary_init); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_orig boundary1 执行时间:%f(ms)\n", time_elapsed); + + + checkCudaErrors(cudaEventRecord(start, 0)); + correct_boundary_conditions<<>>(dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, dataBaseOrig->d_boundary_face_vector, + dataBaseOrig->d_boundary_face, d_grad_boundary_init, d_grad_boundary, dataBaseOrig->d_boundary_deltaCoeffs, + dataBase.d_u, dataBase.d_boundary_u, dataBaseOrig->d_boundary_UpatchType); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_orig boundary2 执行时间:%f(ms)\n", time_elapsed); +} + +void fvc_grad_scalar_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase.num_cells + threads_per_block - 1) / threads_per_block; + 
fvc_grad_scalar_internal<<>>(dataBase.num_cells, + dataBaseOrig->d_A_csr_row_index, dataBaseOrig->d_A_csr_col_index, dataBaseOrig->d_A_csr_diag_index, + dataBaseOrig->d_face_vector, dataBaseOrig->d_weight, dataBase.d_p, dataBaseOrig->d_volume, d_grad, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_scalar_orig internal 执行时间:%f(ms)\n", time_elapsed); + + checkCudaErrors(cudaEventRecord(start, 0)); + + blocks_per_grid = (dataBaseOrig->num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_scalar_boundary<<>>(dataBase.num_cells, dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, + dataBaseOrig->d_boundary_face_vector, dataBase.d_boundary_p, dataBaseOrig->d_volume, d_grad, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_scalar_orig boundary 执行时间:%f(ms)\n", time_elapsed); +} \ No newline at end of file diff --git a/src_gpu/dfRhoEqn.H b/src_gpu_orig/dfRhoEqn.H similarity index 100% rename from src_gpu/dfRhoEqn.H rename to src_gpu_orig/dfRhoEqn.H diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu_orig/dfRhoEqn.cu similarity index 100% rename from src_gpu/dfRhoEqn.cu rename to src_gpu_orig/dfRhoEqn.cu diff --git a/src_gpu_orig/dfUEqn.H b/src_gpu_orig/dfUEqn.H new file mode 100644 index 000000000..ec739db5e --- /dev/null +++ b/src_gpu_orig/dfUEqn.H @@ -0,0 +1,62 @@ +#pragma once + +#include "AmgXSolver.H" +#include +#include "dfMatrixDataBase.H" + +class dfUEqn +{ +private: + dfMatrixDataBase &dataBase_; + cudaStream_t stream; + AmgXSolver *UxSolver, *UySolver, *UzSolver = nullptr; + int 
num_iteration; + + // common variables + int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; + int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; + + // Matrix variables + double *d_A_csr, *d_b, *d_psi, *d_psi_permute, *d_H, *d_H_permute, *d_A; + double *h_A_csr, *h_b, *h_psi, *h_H, *h_A = nullptr; + + double *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr; + +public: + dfUEqn(); + dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile); + ~dfUEqn(); + + void checkValue(bool print); + + void fvm_ddt(double *vector_old); + + void fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, + double *boundary_nuEff_init, double *boundary_rho_init); + + void fvc_grad(double *pressure); + + void fvc_grad_vector(); + + void dev2T(); + + void fvc_div_tensor(const double *nuEff); + + void fvm_laplacian(); + + void A(double *Psi); + + void H(double *Psi); + + void solve(); + + void sync(); + + void updatePsi(double *Psi); + + void correctBoundaryConditions(); + + void correctPsi(double *Psi); + + void initializeTimeStep(); +}; diff --git a/src_gpu_orig/dfUEqn.cu b/src_gpu_orig/dfUEqn.cu new file mode 100644 index 000000000..56983e038 --- /dev/null +++ b/src_gpu_orig/dfUEqn.cu @@ -0,0 +1,1481 @@ +#include "dfUEqn.H" + +// kernel functions +__global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta_t, + const int *csr_row_index, const int *csr_diag_index, + const double *rho_old, const double *rho_new, const double *volume, const double *velocity_old, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int diag_index = csr_diag_index[index]; + + int csr_dim = num_cells + num_faces; + int 
csr_index = row_index + diag_index; + double ddt_diag = rdelta_t * rho_new[index] * volume[index]; + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + ddt_diag; + A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + ddt_diag; + A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + ddt_diag; + + double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + ddt_part_term * velocity_old[index * 3 + 0]; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + ddt_part_term * velocity_old[index * 3 + 1]; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + ddt_part_term * velocity_old[index * 3 + 2]; + + psi[num_cells * 0 + index] = velocity_old[index * 3 + 0]; + psi[num_cells * 1 + index] = velocity_old[index * 3 + 1]; + psi[num_cells * 2 + index] = velocity_old[index * 3 + 2]; +} + +__global__ void fvm_div_internal(int num_cells, int num_faces, + const int *csr_row_index, const int *csr_diag_index, + const double *weight, const double *phi, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + int csr_dim = num_cells + num_faces; + + double div_diag = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f; + A_csr_output[csr_dim * 1 + i] = 
A_csr_input[csr_dim * 1 + i] + (-w) * f; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f; + // lower neighbors contribute to sum of -1 + div_diag += (w - 1) * f; + } + // upper + if (inner_index > diag_index) + { + // upper, index - 1, consider of diag + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f; + // upper neighbors contribute to sum of 1 + div_diag += w * f; + } + } + A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag + A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag + A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + div_diag; // diag +} + +__global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *internal_coeffs, const double *boundary_coeffs, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, + double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int cell_index = boundary_cell_id[cell_offset]; + int loop_size = boundary_cell_offset[index + 1] - cell_offset; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_dim = num_cells + num_faces; + int csr_index = row_index + diag_index; + + // construct internalCoeffs & 
boundaryCoeffs + double internal_coeffs_x = 0; + double internal_coeffs_y = 0; + double internal_coeffs_z = 0; + double boundary_coeffs_x = 0; + double boundary_coeffs_y = 0; + double boundary_coeffs_z = 0; + for (int i = 0; i < loop_size; i++) + { + internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0]; + internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1]; + internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2]; + boundary_coeffs_x += boundary_coeffs[(cell_offset + i) * 3 + 0]; + boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1]; + boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2]; + } + ueqn_internal_coeffs[cell_index * 3 + 0] = internal_coeffs_x; + ueqn_internal_coeffs[cell_index * 3 + 1] = internal_coeffs_y; + ueqn_internal_coeffs[cell_index * 3 + 2] = internal_coeffs_z; + ueqn_boundary_coeffs[cell_index * 3 + 0] = boundary_coeffs_x; + ueqn_boundary_coeffs[cell_index * 3 + 1] = boundary_coeffs_y; + ueqn_boundary_coeffs[cell_index * 3 + 2] = boundary_coeffs_z; + + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x; + A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y; + A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z; + b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x; + b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y; + b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z; +} + +__global__ void fvc_grad_internal_face(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *pressure, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr 
has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_p = pressure[index]; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + double grad_bx_low = 0; + double grad_bx_upp = 0; + double grad_by_low = 0; + double grad_by_upp = 0; + double grad_bz_low = 0; + double grad_bz_upp = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_low -= face_p * sfx; + grad_by_low -= face_p * sfy; + grad_bz_low -= face_p * sfz; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; + grad_bx_upp += face_p * sfx; + grad_by_upp += face_p * sfy; + grad_bz_upp += face_p * sfz; + } + } + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] - grad_bx_low - grad_bx_upp; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] - grad_by_low - grad_by_upp; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] - 
grad_bz_low - grad_bz_upp; +} + +__global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_face_vector, const double *boundary_pressure, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // compute boundary gradient + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 1]; + double sfz = boundary_face_vector[i * 3 + 2]; + double face_p = boundary_pressure[i]; + grad_bx += face_p * sfx; + grad_by += face_p * sfy; + grad_bz += face_p * sfz; + } + + //// correct the boundary gradient + // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + // double sn_grad = 0; + // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + // grad_bx += nx * grad_correction; + // grad_by += ny * grad_correction; + // grad_bz += nz * grad_correction; + + b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] - grad_bx; + b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] - grad_by; + b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] - grad_bz; +} + +__global__ void add_fvMatrix_kernel(int num_cells, int num_faces, + const int *csr_row_index, + const double *turbSrc_A, const double *turbSrc_b, + const double *A_csr_input, const double *b_input, double *A_csr_output, double 
*b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int csr_dim = num_cells + num_faces; + double A_entry; + + for (int i = row_index; i < next_row_index; i++) + { + A_entry = turbSrc_A[i]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry; + } + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[index * 3 + 0]; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[index * 3 + 1]; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + turbSrc_b[index * 3 + 2]; +} + +__global__ void offdiagPermutation(const int num_faces, const int *permedIndex, + const double *d_phi_init, double *d_phi) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_faces) + return; + + int p = permedIndex[index]; + d_phi[index] = d_phi_init[p]; +} + +__global__ void boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, + const double *boundary_pressure_init, const double *boundary_velocity_init, + double *boundary_pressure, double *boundary_velocity, + double *boundary_nuEff_init, double *boundary_nuEff, + double *boundary_rho_init, double *boundary_rho) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + int p = bouPermedIndex[index]; + boundary_velocity[3 * index] = boundary_velocity_init[3 * p]; + boundary_velocity[3 * index + 1] = boundary_velocity_init[3 * p + 1]; + boundary_velocity[3 * index + 2] = boundary_velocity_init[3 * p + 2]; + boundary_pressure[index] = boundary_pressure_init[p]; + boundary_rho[index] = boundary_rho_init[p]; + boundary_nuEff[index] = boundary_nuEff_init[p]; +} + +__global__ void 
fvc_grad_vector_internal(int num_cells,
        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
        const double *sf, const double *vf, const double *tlambdas, const double *volume,
        double *grad)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // A_csr has one more element in each row: itself
    int row_index = csr_row_index[index];
    int row_elements = csr_row_index[index + 1] - row_index;
    int diag_index = csr_diag_index[index];
    // face-indexed arrays (sf, tlambdas) skip the per-row diagonal entry,
    // hence the "- index" when mapping CSR positions to face positions
    int neighbor_offset = csr_row_index[index] - index;

    double own_vf_x = vf[index * 3 + 0];
    double own_vf_y = vf[index * 3 + 1];
    double own_vf_z = vf[index * 3 + 2];
    double grad_xx = 0;
    double grad_xy = 0;
    double grad_xz = 0;
    double grad_yx = 0;
    double grad_yy = 0;
    double grad_yz = 0;
    double grad_zx = 0;
    double grad_zy = 0;
    double grad_zz = 0;
    // lower: faces where this cell is the neighbour, so the face value is
    // interpolated with weights swapped and Sf points away (contribution
    // subtracted)
    for (int i = 0; i < diag_index; i++)
    {
        int neighbor_index = neighbor_offset + i;
        int neighbor_cell_id = csr_col_index[row_index + i];
        double w = tlambdas[neighbor_index];
        double sf_x = sf[neighbor_index * 3 + 0];
        double sf_y = sf[neighbor_index * 3 + 1];
        double sf_z = sf[neighbor_index * 3 + 2];
        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
        double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x;
        double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y;
        double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z;
        grad_xx -= sf_x * face_x;
        grad_xy -= sf_x * face_y;
        grad_xz -= sf_x * face_z;
        grad_yx -= sf_y * face_x;
        grad_yy -= sf_y * face_y;
        grad_yz -= sf_y * face_z;
        grad_zx -= sf_z * face_x;
        grad_zy -= sf_z * face_y;
        grad_zz -= sf_z * face_z;
    }
    // upper: faces owned by this cell (contribution added); the extra "- 1"
    // accounts for the diagonal entry sitting between lower and upper parts
    for (int i = diag_index + 1; i < row_elements; i++)
    {
        int neighbor_index = neighbor_offset + i - 1;
        int neighbor_cell_id = csr_col_index[row_index + i];
        double w = tlambdas[neighbor_index];
        double sf_x = sf[neighbor_index * 3 + 0];
        double sf_y = sf[neighbor_index * 3 + 1];
        double sf_z = sf[neighbor_index * 3 + 2];
        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
        double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x;
        double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y;
        double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z;
        grad_xx += sf_x * face_x;
        grad_xy += sf_x * face_y;
        grad_xz += sf_x * face_z;
        grad_yx += sf_y * face_x;
        grad_yy += sf_y * face_y;
        grad_yz += sf_y * face_z;
        grad_zx += sf_z * face_x;
        grad_zy += sf_z * face_y;
        grad_zz += sf_z * face_z;
        // if (index == 0)
        // {
        //     printf("grad_xx = %.20lf\n", grad_xx);
        //     // printf("sf_x = %.20lf\n", sf_x);
        //     // printf("face_x = %.20lf\n", face_x);
        // }
    }
    // divide the surface integral by the cell volume (Gauss theorem)
    double vol = volume[index];
    grad[index * 9 + 0] = grad_xx / vol;
    grad[index * 9 + 1] = grad_xy / vol;
    grad[index * 9 + 2] = grad_xz / vol;
    grad[index * 9 + 3] = grad_yx / vol;
    grad[index * 9 + 4] = grad_yy / vol;
    grad[index * 9 + 5] = grad_yz / vol;
    grad[index * 9 + 6] = grad_zx / vol;
    grad[index * 9 + 7] = grad_zy / vol;
    grad[index * 9 + 8] = grad_zz / vol;
}

// One thread per boundary cell: add the boundary-face part of grad(U) to the
// cell tensor, then snapshot the completed tensor into grad_boundary_init
// (indexed by boundary-cell position) for the later boundary correction pass.
__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *boundary_sf, const double *boundary_vf, const double *volume,
        double *grad, double *grad_boundary_init)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    double grad_xx = 0;
    double grad_xy = 0;
    double grad_xz = 0;
    double grad_yx = 0;
    double grad_yy = 0;
    double grad_yz = 0;
    double grad_zx = 0;
    double grad_zy = 0;
    double grad_zz = 0;
    for (int i = cell_offset; i < next_cell_offset; i++)
    {
        double sf_x = boundary_sf[i * 3 + 0];
        double sf_y = boundary_sf[i * 3 + 1];
        double sf_z = boundary_sf[i * 3 + 2];
        double vf_x = boundary_vf[i * 3 + 0];
        double vf_y = boundary_vf[i * 3 + 1];
        double vf_z = boundary_vf[i * 3 + 2];
        grad_xx += sf_x * vf_x;
        grad_xy += sf_x * vf_y;
        grad_xz += sf_x * vf_z;
        grad_yx += sf_y * vf_x;
        grad_yy += sf_y * vf_y;
        grad_yz += sf_y * vf_z;
        grad_zx += sf_z * vf_x;
        grad_zy += sf_z * vf_y;
        grad_zz += sf_z * vf_z;
    }

    double vol = volume[cell_index];

    // accumulate onto the internal-face result; each boundary cell appears
    // once in this kernel, so the read-modify-write is race-free
    grad[cell_index * 9 + 0] += grad_xx / vol;
    grad[cell_index * 9 + 1] += grad_xy / vol;
    grad[cell_index * 9 + 2] += grad_xz / vol;
    grad[cell_index * 9 + 3] += grad_yx / vol;
    grad[cell_index * 9 + 4] += grad_yy / vol;
    grad[cell_index * 9 + 5] += grad_yz / vol;
    grad[cell_index * 9 + 6] += grad_zx / vol;
    grad[cell_index * 9 + 7] += grad_zy / vol;
    grad[cell_index * 9 + 8] += grad_zz / vol;

    grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0];
    grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1];
    grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2];
    grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3];
    grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4];
    grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5];
    grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6];
    grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7];
    grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8];
    // if (index == 1)
    // {
    //     printf("grad[1] = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2],
    //            grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]);
    // }
}

// Correct the boundary-patch grad(U) tensors with the normal component of the
// surface-normal gradient implied by each patch's boundary condition.
__global__ void correct_boundary_conditions(int num_boundary_cells,
        const int
*boundary_cell_offset, const int *boundary_cell_id,
        const double *boundary_sf, const double *mag_sf,
        double *boundary_grad_init, double *boundary_grad, const double *boundary_deltaCoeffs,
        const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    // initialize boundary_grad from the uncorrected per-boundary-cell tensor
    double grad_xx = boundary_grad_init[index * 9 + 0];
    double grad_xy = boundary_grad_init[index * 9 + 1];
    double grad_xz = boundary_grad_init[index * 9 + 2];
    double grad_yx = boundary_grad_init[index * 9 + 3];
    double grad_yy = boundary_grad_init[index * 9 + 4];
    double grad_yz = boundary_grad_init[index * 9 + 5];
    double grad_zx = boundary_grad_init[index * 9 + 6];
    double grad_zy = boundary_grad_init[index * 9 + 7];
    double grad_zz = boundary_grad_init[index * 9 + 8];

    // owner-cell velocity (AoS layout for internal_velocity here)
    double internal_U_x = internal_velocity[cell_index * 3 + 0];
    double internal_U_y = internal_velocity[cell_index * 3 + 1];
    double internal_U_z = internal_velocity[cell_index * 3 + 2];

    for (int i = cell_offset; i < next_cell_offset; i++)
    {
        // OpenFOAM reference:
        //   const vectorField n(Sf.boundaryField()[patchi] / magSf.boundaryField()[patchi]);
        //   gGradbf[patchi] += n * (vsf.boundaryField()[patchi].snGrad() - (n & gGradbf[patchi]));
        // with fixedValue snGrad() = deltaCoeffs * (*this - patchInternalField())

        double n_x = boundary_sf[i * 3 + 0] / mag_sf[i];
        double n_y = boundary_sf[i * 3 + 1] / mag_sf[i];
        double n_z = boundary_sf[i * 3 + 2] / mag_sf[i];

        // BUG FIX: previously left uninitialized when the patch type was
        // neither 0 nor 1 (e.g. type 2 "empty" handled by sibling kernels),
        // which fed garbage into the correction. Zero is the zeroGradient
        // snGrad and a safe default for unimplemented BCs.
        double sn_grad_x = 0;
        double sn_grad_y = 0;
        double sn_grad_z = 0;
        int patchIndex = U_patch_type[i];
        if (patchIndex == 0) { // zeroGradient: snGrad stays zero
        } else if (patchIndex == 1) { // fixedValue
            sn_grad_x = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 0] - internal_U_x);
            sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 1] - internal_U_y);
            sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 2] - internal_U_z);
        }
        // TODO: implement other BCs
        double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx);
        double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy);
        double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz);
        boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x;
        boundary_grad[i * 9 + 1] = grad_xy + n_x * grad_correction_y;
        boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z;
        boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x;
        boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y;
        boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z;
        boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x;
        boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y;
        boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z;
    }
}

// In-place dev2 of the transpose: T := T^T - (2/3) tr(T) I, one tensor/thread.
__global__ void dev2_t_tensor(int num, double *tensor)
{
    int
index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    double t_xx = tensor[index * 9 + 0];
    double t_xy = tensor[index * 9 + 1];
    double t_xz = tensor[index * 9 + 2];
    double t_yx = tensor[index * 9 + 3];
    double t_yy = tensor[index * 9 + 4];
    double t_yz = tensor[index * 9 + 5];
    double t_zx = tensor[index * 9 + 6];
    double t_zy = tensor[index * 9 + 7];
    double t_zz = tensor[index * 9 + 8];
    double trace_coeff = (2. / 3.) * (t_xx + t_yy + t_zz);
    // store the transpose minus (2/3)*trace on the diagonal
    tensor[index * 9 + 0] = t_xx - trace_coeff;
    tensor[index * 9 + 1] = t_yx;
    tensor[index * 9 + 2] = t_zx;
    tensor[index * 9 + 3] = t_xy;
    tensor[index * 9 + 4] = t_yy - trace_coeff;
    tensor[index * 9 + 5] = t_zy;
    tensor[index * 9 + 6] = t_xz;
    tensor[index * 9 + 7] = t_yz;
    tensor[index * 9 + 8] = t_zz - trace_coeff;
}

// Explicit divergence of (scalar0*scalar1)*vf (vf: tensor per cell, AoS 9
// doubles) over internal faces, accumulated into the SoA source vector b with
// the given sign. Boundary faces are handled by fvc_div_tensor_boundary.
__global__ void fvc_div_tensor_internal(int num_cells,
        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
        const double *scalar0, const double *scalar1,
        const double *sf, const double *vf, const double *tlambdas, const double *volume,
        const double sign, const double *b_input, double *b_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // A_csr has one more element in each row: itself
    int row_index = csr_row_index[index];
    int row_elements = csr_row_index[index + 1] - row_index;
    int diag_index = csr_diag_index[index];
    // face arrays skip the diagonal entries, hence "- index"
    int neighbor_offset = csr_row_index[index] - index;

    double coeff_own = scalar0[index] * scalar1[index];

    double own_vf_xx = vf[index * 9 + 0];
    double own_vf_xy = vf[index * 9 + 1];
    double own_vf_xz = vf[index * 9 + 2];
    double own_vf_yx = vf[index * 9 + 3];
    double own_vf_yy = vf[index * 9 + 4];
    double own_vf_yz = vf[index * 9 + 5];
    double own_vf_zx = vf[index * 9 + 6];
    double own_vf_zy = vf[index * 9 + 7];
    double own_vf_zz = vf[index * 9 + 8];
    double sum_x = 0;
    double sum_y = 0;
    double sum_z = 0;

    // lower: this cell is the neighbour; interpolation weights swapped,
    // contribution subtracted
    for (int i = 0; i < diag_index; i++)
    {
        int neighbor_index = neighbor_offset + i;
        int neighbor_cell_id = csr_col_index[row_index + i];
        double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id];
        double w = tlambdas[neighbor_index];
        double sf_x = sf[neighbor_index * 3 + 0];
        double sf_y = sf[neighbor_index * 3 + 1];
        double sf_z = sf[neighbor_index * 3 + 2];
        double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0];
        double neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1];
        double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2];
        double neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3];
        double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4];
        double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5];
        double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6];
        double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7];
        double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8];
        double face_xx = (1 - w) * own_vf_xx * coeff_own + w * neighbor_vf_xx * coeff_nei;
        double face_xy = (1 - w) * own_vf_xy * coeff_own + w * neighbor_vf_xy * coeff_nei;
        double face_xz = (1 - w) * own_vf_xz * coeff_own + w * neighbor_vf_xz * coeff_nei;
        double face_yx = (1 - w) * own_vf_yx * coeff_own + w * neighbor_vf_yx * coeff_nei;
        double face_yy = (1 - w) * own_vf_yy * coeff_own + w * neighbor_vf_yy * coeff_nei;
        double face_yz = (1 - w) * own_vf_yz * coeff_own + w * neighbor_vf_yz * coeff_nei;
        double face_zx = (1 - w) * own_vf_zx * coeff_own + w * neighbor_vf_zx * coeff_nei;
        double face_zy = (1 - w) * own_vf_zy * coeff_own + w * neighbor_vf_zy * coeff_nei;
        double face_zz = (1 - w) * own_vf_zz * coeff_own + w * neighbor_vf_zz * coeff_nei;
        sum_x -= sf_x * face_xx + sf_y * face_yx + sf_z * face_zx;
        sum_y -= sf_x * face_xy + sf_y * face_yy + sf_z * face_zy;
        sum_z -= sf_x * face_xz + sf_y * face_yz + sf_z * face_zz;
    }
    // upper: faces owned by this cell; contribution added
    for (int i = diag_index + 1; i < row_elements; i++)
    {
        int neighbor_index = neighbor_offset + i - 1;
        int neighbor_cell_id = csr_col_index[row_index + i];
        double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id];
        double w = tlambdas[neighbor_index];
        double sf_x = sf[neighbor_index * 3 + 0];
        double sf_y = sf[neighbor_index * 3 + 1];
        double sf_z = sf[neighbor_index * 3 + 2];
        double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0];
        double neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1];
        double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2];
        double neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3];
        double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4];
        double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5];
        double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6];
        double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7];
        double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8];
        double face_xx = w * own_vf_xx * coeff_own + (1 - w) * neighbor_vf_xx * coeff_nei;
        double face_xy = w * own_vf_xy * coeff_own + (1 - w) * neighbor_vf_xy * coeff_nei;
        double face_xz = w * own_vf_xz * coeff_own + (1 - w) * neighbor_vf_xz * coeff_nei;
        double face_yx = w * own_vf_yx * coeff_own + (1 - w) * neighbor_vf_yx * coeff_nei;
        double face_yy = w * own_vf_yy * coeff_own + (1 - w) * neighbor_vf_yy * coeff_nei;
        double face_yz = w * own_vf_yz * coeff_own + (1 - w) * neighbor_vf_yz * coeff_nei;
        double face_zx = w * own_vf_zx * coeff_own + (1 - w) * neighbor_vf_zx * coeff_nei;
        double face_zy = w * own_vf_zy * coeff_own + (1 - w) * neighbor_vf_zy * coeff_nei;
        double face_zz = w * own_vf_zz * coeff_own + (1 - w) * neighbor_vf_zz * coeff_nei;
        sum_x += sf_x * face_xx + sf_y * face_yx + sf_z * face_zx;
        sum_y += sf_x * face_xy + sf_y * face_yy + sf_z * face_zy;
        sum_z += sf_x * face_xz + sf_y * face_yz + sf_z * face_zz;
    }
    // NOTE(review): vol is computed but never used — confirm whether division
    // by the cell volume was intentionally omitted here (b may be
    // volume-integrated elsewhere) or is missing.
    double vol = volume[index];
    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + sum_x * sign;
    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + sum_y * sign;
    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + sum_z * sign;
}

// Boundary-face part of the explicit tensor divergence: per boundary cell,
// accumulate (Sf & vf_face) * (scalar0*scalar1) into the SoA source vector b.
__global__ void fvc_div_tensor_boundary(int num_cells, int num_boundary_cells,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *boundary_scalar0, const double *boundary_scalar1,
        const double *boundary_sf, const double *boundary_vf, const double *volume,
        const double sign, const double *b_input, double *b_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    // OpenFOAM reference:
    // Foam::surfaceInterpolationScheme::dotInterpolate
    //   if (vf.boundaryField()[pi].coupled())
    //       psf = pSf & (pLambda*patchInternalField() + (1-pLambda)*patchNeighbourField());
    //   else
    //       psf = pSf & vf.boundaryField()[pi];
    // surfaceIntegrate:
    //   forAll(mesh.boundary()[patchi], facei)
    //       ivf[pFaceCells[facei]] += pssf[facei];
    double sum_x = 0;
    double sum_y = 0;
    double sum_z = 0;
    for (int i = cell_offset; i < next_cell_offset; i++)
    {
        double sf_x = boundary_sf[i * 3 + 0];
        double sf_y = boundary_sf[i * 3 + 1];
        double sf_z = boundary_sf[i * 3 + 2];
        double face_xx = boundary_vf[i * 9 + 0];
        double face_xy = boundary_vf[i * 9 + 1];
        double face_xz = boundary_vf[i * 9 + 2];
        double face_yx = boundary_vf[i * 9 + 3];
        double face_yy = boundary_vf[i * 9 + 4];
        double face_yz = boundary_vf[i * 9 + 5];
        double face_zx = boundary_vf[i * 9 + 6];
        double face_zy = boundary_vf[i * 9 + 7];
        double face_zz = boundary_vf[i * 9 + 8];

        // if not coupled
        double coeff = boundary_scalar0[i] * boundary_scalar1[i];
        sum_x += (sf_x * face_xx + sf_y * face_yx + sf_z * face_zx) * coeff;
        sum_y += (sf_x * face_xy + sf_y * face_yy + sf_z * face_zy) * coeff;
        sum_z += (sf_x * face_xz + sf_y * face_yz + sf_z * face_zz) * coeff;
    }
    // NOTE(review): vol is unused, mirroring fvc_div_tensor_internal — confirm
    // whether division by the cell volume was intentionally omitted.
    double vol = volume[cell_index];
    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + sum_x * sign;
    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + sum_y * sign;
    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + sum_z * sign;
}

// Implicit (fvm) uncorrected Laplacian of gamma = scalar0*scalar1 on a vector
// field: off-diagonal coeffs are gamma_face * |Sf| * deltaCoeffs, diagonal is
// the negated row sum (negSumDiag). All three components share one sparsity
// pattern, stored csr_dim apart.
__global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int num_faces,
        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
        const double *scalar0, const double *scalar1, const double *weight,
        const double *magsf, const double *distance,
        const double sign, const double *A_csr_input, double *A_csr_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // A_csr has one more element in each row: itself
    int row_index = csr_row_index[index];
    int row_elements = csr_row_index[index + 1] - row_index;
    int diag_index = csr_diag_index[index];
    int neighbor_offset = csr_row_index[index] - index;
    int csr_dim = num_cells + num_faces;

    double own_scalar0 = scalar0[index];
    double own_scalar1 = scalar1[index];
    double own_coeff = own_scalar0 * own_scalar1;

    // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField();
    // fvm.negSumDiag();
    double sum_diag = 0;
    // lower
    for (int i = 0; i < diag_index; i++)
    {
        int neighbor_index = neighbor_offset + i;
        int neighbor_cell_id = csr_col_index[i + row_index];
        double w = weight[neighbor_index];
        double nei_scalar0 = scalar0[neighbor_cell_id];
        double nei_scalar1 = scalar1[neighbor_cell_id];
        double nei_coeff = nei_scalar0 * nei_scalar1;
        // linear interpolation of gamma to the face
        double gamma = w * (nei_coeff - own_coeff) + own_coeff;
        double gamma_magsf = gamma * magsf[neighbor_index];
        double coeff = gamma_magsf * distance[neighbor_index];
        A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign;
        A_csr_output[csr_dim * 1 + row_index + i] =
A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign;
        A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign;

        sum_diag += (-coeff);
    }
    // upper
    for (int i = diag_index + 1; i < row_elements; i++)
    {
        int neighbor_index = neighbor_offset + i - 1;
        int neighbor_cell_id = csr_col_index[i + row_index];
        double w = weight[neighbor_index];
        double nei_scalar0 = scalar0[neighbor_cell_id];
        double nei_scalar1 = scalar1[neighbor_cell_id];
        double nei_coeff = nei_scalar0 * nei_scalar1;
        double gamma = w * (own_coeff - nei_coeff) + nei_coeff;
        double gamma_magsf = gamma * magsf[neighbor_index];
        double coeff = gamma_magsf * distance[neighbor_index];
        A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign;
        A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign;
        A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign;
        sum_diag += (-coeff);
    }
    // negSumDiag: diagonal takes the negated sum of the off-diagonal coeffs
    A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + sum_diag * sign; // diag
    A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + sum_diag * sign; // diag
    A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + sum_diag * sign; // diag
}

// Boundary contribution of the implicit Laplacian: per boundary cell, fold
// gamma*|Sf|*gradientInternalCoeffs into the diagonal and
// -gamma*|Sf|*gradientBoundaryCoeffs into the RHS, while also accumulating
// the raw internal/boundary coeffs for later H/A assembly.
__global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num_faces, int num_boundary_cells,
        const int *csr_row_index, const int *csr_diag_index,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *boundary_scalar0, const double *boundary_scalar1,
        const double *boundary_magsf, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
        const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output,
        double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    int row_index = csr_row_index[cell_index];
    int diag_index = csr_diag_index[cell_index];
    int csr_dim = num_cells + num_faces;
    int csr_index = row_index + diag_index;

    // OpenFOAM reference:
    // if (pvf.coupled())
    // {
    //     fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs);
    //     fvm.boundaryCoeffs()[patchi] = -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs);
    // }
    // else
    // {
    //     fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs();
    //     fvm.boundaryCoeffs()[patchi] = -pGamma*pvf.gradientBoundaryCoeffs();
    // }
    double internal_coeffs_x = 0;
    double internal_coeffs_y = 0;
    double internal_coeffs_z = 0;
    double boundary_coeffs_x = 0;
    double boundary_coeffs_y = 0;
    double boundary_coeffs_z = 0;
    for (int i = cell_offset; i < next_cell_offset; i++)
    {
        double gamma = boundary_scalar0[i] * boundary_scalar1[i];
        double gamma_magsf = gamma * boundary_magsf[i];
        internal_coeffs_x += gamma_magsf * gradient_internal_coeffs[i * 3 + 0];
        internal_coeffs_y += gamma_magsf * gradient_internal_coeffs[i * 3 + 1];
        internal_coeffs_z += gamma_magsf * gradient_internal_coeffs[i * 3 + 2];
        boundary_coeffs_x -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 0];
        boundary_coeffs_y -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 1];
        boundary_coeffs_z -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 2];
    }

    // each boundary cell appears once, so these accumulations are race-free
    ueqn_internal_coeffs[cell_index * 3 + 0] += internal_coeffs_x * sign;
    ueqn_internal_coeffs[cell_index * 3 + 1] += internal_coeffs_y * sign;
    ueqn_internal_coeffs[cell_index * 3 + 2] += internal_coeffs_z * sign;
    ueqn_boundary_coeffs[cell_index * 3 + 0] += boundary_coeffs_x * sign;
    ueqn_boundary_coeffs[cell_index * 3 + 1] += boundary_coeffs_y * sign;
    ueqn_boundary_coeffs[cell_index * 3 + 2] += boundary_coeffs_z * sign;

    A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x * sign;
    A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y * sign;
    A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z * sign;
    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x * sign;
    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y * sign;
    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign;
}

// Seed H for boundary cells with -(internalCoeffs - cmptAv(internalCoeffs))*psi,
// mirroring OpenFOAM's addBoundaryDiag + negate + addCmptAvBoundaryDiag.
__global__ void addBoundaryDiag(int num_cells, int num_boundary_cells,
        const int *csr_row_index, const int *csr_diag_index,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs,
        const double *psi, double *H)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs
    // boundaryDiagCmpt.negate();
    double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0];
    double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1];
    double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2];

    // addCmptAvBoundaryDiag(boundaryDiagCmpt);
    double ave_internal = (internal_x + internal_y + internal_z) / 3;

    H[num_cells * 0 + cell_index] = (-internal_x + ave_internal) * psi[num_cells * 0 + cell_index];
    H[num_cells * 1 + cell_index] = (-internal_y + ave_internal) * psi[num_cells * 1 + cell_index];
    H[num_cells * 2 + cell_index] =
(-internal_z + ave_internal) * psi[num_cells * 2 + cell_index];
}

// Repack psi from SoA (component-major) to AoS (xyz per cell) before D2H copy.
__global__ void permute_psi_d2h(int num_cells, const double *input, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[index * 3 + 0] = input[num_cells * 0 + index];
    output[index * 3 + 1] = input[num_cells * 1 + index];
    output[index * 3 + 2] = input[num_cells * 2 + index];
}

// Repack psi from AoS (xyz per cell) to SoA (component-major) after H2D copy.
__global__ void permute_psi_h2d(int num_cells, const double *input, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[num_cells * 0 + index] = input[index * 3 + 0];
    output[num_cells * 1 + index] = input[index * 3 + 1];
    output[num_cells * 2 + index] = input[index * 3 + 2];
}

// H := (H - offdiag(A)*psi + b) / V, per cell and per component (SoA layout).
// Only the off-diagonal CSR entries are applied (diagonal skipped).
// NOTE(review): parameter ueqn_boundary_coeffs is unused here — its
// contribution is applied by addBoundarySource; confirm that is intended.
__global__ void lduMatrix_H(int num_cells,
        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
        const double *volume, const double *psi, const double *A_csr, const double *b,
        const double *ueqn_boundary_coeffs, double *H)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // A_csr has one more element in each row: itself
    int row_index = csr_row_index[index];
    int row_elements = csr_row_index[index + 1] - row_index;
    int diag_index = csr_diag_index[index];
    int neighbor_offset = csr_row_index[index] - index;

    double APsi_x = 0.;
    double APsi_y = 0.;
    double APsi_z = 0.;
    // lower
    for (int i = 0; i < diag_index; i++)
    {
        int neighbor_cell_id = csr_col_index[i + row_index];
        APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id];
        APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id];
        APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id];
    }
    // upper
    for (int i = diag_index + 1; i < row_elements; i++)
    {
        int neighbor_cell_id = csr_col_index[i + row_index];
        APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id];
        APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id];
        APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id];
    }

    H[num_cells * 0 + index] = H[num_cells * 0 + index] - APsi_x + b[num_cells * 0 + index];
    H[num_cells * 1 + index] = H[num_cells * 1 + index] - APsi_y + b[num_cells * 1 + index];
    H[num_cells * 2 + index] = H[num_cells * 2 + index] - APsi_z + b[num_cells * 2 + index];

    double vol = volume[index];
    H[num_cells * 0 + index] = H[num_cells * 0 + index] / vol;
    H[num_cells * 1 + index] = H[num_cells * 1 + index] / vol;
    H[num_cells * 2 + index] = H[num_cells * 2 + index] / vol;
}

// Add the volume-normalized boundary coeffs into H for each boundary cell.
__global__ void addBoundarySource(int num_cells, int num_boundary_cells,
        const int *csr_row_index, const int *csr_diag_index,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs,
        const double *volume, double *H)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int cell_index = boundary_cell_id[cell_offset];

    // BUG FIX: volume and H are per-cell arrays and must be indexed by the
    // mesh cell id (cell_index), not by the boundary-cell list position
    // (index), consistent with addBoundaryDiag and addAveInternaltoDiag.
    double vol = volume[cell_index];

    H[num_cells * 0 + cell_index] = H[num_cells * 0 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 0] / vol;
    H[num_cells * 1 + cell_index] = H[num_cells * 1 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 1] / vol;
    H[num_cells * 2 + cell_index] = H[num_cells * 2 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 2] / vol;
}

// Store the component-average of the boundary internal coeffs into A for each
// boundary cell.
__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_cells,
        const int *csr_row_index, const int *csr_diag_index,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, double *A)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index =
boundary_cell_id[cell_offset];

    double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0];
    double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1];
    double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2];

    double ave_internal = (internal_x + internal_y + internal_z) / 3;

    // NOTE(review): assigns rather than accumulates despite the "add" in the
    // kernel name — confirm A is expected to be zero/overwritable here.
    A[cell_index] = ave_internal;
}

// A := (A + diag(A_csr) - internalCoeffs) / V, per cell.
__global__ void addDiagDivVolume(int num_cells, const int *csr_row_index,
        const int *csr_diag_index, const double *A_csr, const double *volume,
        double *ueqn_internal_coeffs, const double *A_input, double *A_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    int row_index = csr_row_index[index];
    int diag_index = csr_diag_index[index];
    int csr_index = row_index + diag_index;

    double vol = volume[index];

    // NOTE(review): only the x component (index * 3) of the internal coeffs is
    // subtracted — confirm whether a per-component A was intended instead.
    A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol;
}

// Per boundary face, fill the value/gradient internal & boundary coefficient
// tables used by fvm::div and fvm::laplacian, according to the patch type:
//   0 = zeroGradient, 1 = fixedValue, 2 = empty.
__global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs,
        double *boundary_coeffs, double *laplac_internal_coeffs,
        double *laplac_boundary_coeffs, const int *U_patch_type,
        const double *boundary_velocity, const double *boundary_deltaCoeffs)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_faces)
        return;

    int patchIndex = U_patch_type[index];
    if (patchIndex == 0) { // zeroGradient
        double bouPhi = boundary_phi[index];
        internal_coeffs[index * 3 + 0] = bouPhi * 1.; // valueInternalCoeffs = 1.
        internal_coeffs[index * 3 + 1] = bouPhi * 1.;
        internal_coeffs[index * 3 + 2] = bouPhi * 1.;
        boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0.
        boundary_coeffs[index * 3 + 1] = -bouPhi * 0.;
        boundary_coeffs[index * 3 + 2] = -bouPhi * 0.;
        laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0.
        laplac_internal_coeffs[index * 3 + 1] = 0.;
        laplac_internal_coeffs[index * 3 + 2] = 0.;
        laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0.
        laplac_boundary_coeffs[index * 3 + 1] = 0.;
        laplac_boundary_coeffs[index * 3 + 2] = 0.;
    } else if (patchIndex == 1) { // fixedValue
        double bouDeltaCoeffs = boundary_deltaCoeffs[index];
        double bouPhi = boundary_phi[index];
        internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0.
        internal_coeffs[index * 3 + 1] = bouPhi * 0.;
        internal_coeffs[index * 3 + 2] = bouPhi * 0.;
        boundary_coeffs[index * 3 + 0] = -bouPhi * boundary_velocity[index * 3 + 0]; // valueBoundaryCoeffs = boundaryValue
        boundary_coeffs[index * 3 + 1] = -bouPhi * boundary_velocity[index * 3 + 1];
        boundary_coeffs[index * 3 + 2] = -bouPhi * boundary_velocity[index * 3 + 2];
        laplac_internal_coeffs[index * 3 + 0] = -1 * bouDeltaCoeffs; // gradientInternalCoeffs = -1 * boundaryDeltaCoeffs
        laplac_internal_coeffs[index * 3 + 1] = -1 * bouDeltaCoeffs;
        laplac_internal_coeffs[index * 3 + 2] = -1 * bouDeltaCoeffs;
        laplac_boundary_coeffs[index * 3 + 0] = bouDeltaCoeffs * boundary_velocity[index * 3 + 0]; // gradientBoundaryCoeffs = boundaryDeltaCoeffs * boundaryValue
        laplac_boundary_coeffs[index * 3 + 1] = bouDeltaCoeffs * boundary_velocity[index * 3 + 1];
        laplac_boundary_coeffs[index * 3 + 2] = bouDeltaCoeffs * boundary_velocity[index * 3 + 2];
    } else if (patchIndex == 2) { // empty
        double bouPhi = boundary_phi[index];
        internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0.
        internal_coeffs[index * 3 + 1] = bouPhi * 0.;
        internal_coeffs[index * 3 + 2] = bouPhi * 0.;
        boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0.
        boundary_coeffs[index * 3 + 1] = -bouPhi * 0.;
        boundary_coeffs[index * 3 + 2] = -bouPhi * 0.;
        laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0.
+ laplac_internal_coeffs[index * 3 + 1] = 0.; + laplac_internal_coeffs[index * 3 + 2] = 0.; + laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0. + laplac_boundary_coeffs[index * 3 + 1] = 0.; + laplac_boundary_coeffs[index * 3 + 2] = 0.; + } + // TODO implement coupled conditions +} + +__global__ void ueqn_correct_BoundaryConditions_kernel(int num_cells, int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *velocity, double *boundary_velocity, const int *U_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + int patchIndex = U_patch_type[i]; + switch (patchIndex) + { + case 0: // zeroGradient + { + boundary_velocity[i * 3 + 0] = velocity[cell_index]; + boundary_velocity[i * 3 + 1] = velocity[num_cells * 1 + cell_index]; + boundary_velocity[i * 3 + 2] = velocity[num_cells * 2 + cell_index]; + break; + } + case 1: + break; + case 2: + break; + // TODO implement coupled conditions + } + } +} + +// constructor +dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) + : dataBase_(dataBase) +{ + stream = dataBase_.stream; + + UxSolver = new AmgXSolver(modeStr, cfgFile); + UySolver = new AmgXSolver(modeStr, cfgFile); + UzSolver = new AmgXSolver(modeStr, cfgFile); + + num_cells = dataBase_.num_cells; + cell_bytes = dataBase_.cell_bytes; + num_faces = dataBase_.num_faces; + cell_vec_bytes = dataBase_.cell_vec_bytes; + csr_value_vec_bytes = dataBase_.csr_value_vec_bytes; + num_boundary_cells = dataBase_.num_boundary_cells; + num_surfaces = dataBase_.num_surfaces; + + d_A_csr_row_index = dataBase_.d_A_csr_row_index; + d_A_csr_diag_index = dataBase_.d_A_csr_diag_index; + 
d_A_csr_col_index = dataBase_.d_A_csr_col_index; + + h_A_csr = new double[(num_cells + num_faces) * 3]; + h_b = new double[num_cells * 3]; + cudaMallocHost(&h_psi, cell_vec_bytes); + cudaMallocHost(&h_H, cell_vec_bytes); + cudaMallocHost(&h_A, cell_bytes); + + checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_psi_permute, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_H_permute, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes)); +} + +void dfUEqn::fvm_ddt(double *vector_old) +{ + // Copy the host input array in host memory to the device input array in device memory + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_ddt_kernel<<>>(num_cells, num_faces, dataBase_.rdelta_t, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); +} + +void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, + double *boundary_nuEff_init, double *boundary_rho_init) +{ + // copy and permutate boundary variable + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, 
stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init, + dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, + dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); + + // initialize boundary coeffs (must after the update of d_boundary_velocity) + threads_per_block = 1024; + blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + ueqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, + dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, + dataBase_.d_boundary_UpatchType, dataBase_.d_boundary_velocity, dataBase_.d_boundary_deltaCoeffs); + + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_div_internal<<>>(num_cells, num_faces, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); +} + +void dfUEqn::fvc_grad(double 
*pressure) +{ + // Copy the host input array in host memory to the device input array in device memory + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); + + // launch cuda kernel + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_internal_face<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b); +} + +void dfUEqn::fvc_grad_vector() +{ + size_t threads_per_block = 512; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_boundary<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity, + dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init); + + correct_boundary_conditions<<>>(num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, + dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary, dataBase_.d_boundary_deltaCoeffs, dataBase_.d_velocity_old, + dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); +} + +void dfUEqn::dev2T() +{ + size_t threads_per_block = 1024; + 
size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + dev2_t_tensor<<>>(num_cells, dataBase_.d_grad); + + blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + dev2_t_tensor<<>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary); +} + +void dfUEqn::fvc_div_tensor(const double *nuEff) +{ + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream)); + size_t threads_per_block = 512; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_div_tensor_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, + dataBase_.d_volume, 1., d_b, d_b); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_div_tensor_boundary<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, + dataBase_.d_volume, 1., d_b, d_b); +} + +void dfUEqn::fvm_laplacian() +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_vector_internal<<>>(num_cells, num_faces, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight, + dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, 
dataBase_.d_laplac_internal_coeffs, + dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); +} + +void dfUEqn::A(double *Psi) +{ + checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + addAveInternaltoDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, d_A); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + addDiagDivVolume<<>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr, + dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A); + + checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + memcpy(Psi, h_A, cell_bytes); +} + +void dfUEqn::H(double *Psi) +{ + checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_bytes * 3, stream)); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + addBoundaryDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, + d_psi, d_H); + + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + lduMatrix_H<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H); + + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_d2h<<>>(num_cells, d_H, d_H_permute); + + checkCudaErrors(cudaMemcpyAsync(h_H, d_H_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + memcpy(Psi, h_H, 
cell_vec_bytes); +} + +void dfUEqn::initializeTimeStep() +{ + // initialize matrix value + checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); +} + +void dfUEqn::checkValue(bool print) +{ + checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + + // Synchronize stream + checkCudaErrors(cudaStreamSynchronize(stream)); + if (print) + { + for (int i = 0; i < (num_faces + num_cells); i++) + fprintf(stderr, "h_A_csr[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_A_csr[i], h_A_csr[i + (num_faces + num_cells)], h_A_csr[i + 2 * (num_faces + num_cells)]); + for (int i = 0; i < num_cells; i++) + fprintf(stderr, "h_b[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_b[i], h_b[i + num_cells], h_b[i + 2 * num_cells]); + } + + char *input_file = "of_output.txt"; + FILE *fp = fopen(input_file, "rb+"); + if (fp == NULL) + { + fprintf(stderr, "Failed to open input file: %s!\n", input_file); + } + int readfile = 0; + double *of_b = new double[3 * num_cells]; + double *of_A = new double[3 * (num_faces + num_cells)]; + readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp); + readfile = fread(of_A, (num_faces + num_cells) * sizeof(double) * 3, 1, fp); + + std::vector h_A_of_init_vec(3 * (num_cells + num_faces)); + std::copy(of_A, of_A + (num_cells + num_faces) * 3, h_A_of_init_vec.begin()); + + std::vector h_A_of_vec_perm(3 * (num_faces + num_cells), 0); + for (int i = 0; i < num_faces + num_cells; i++) + { + h_A_of_vec_perm[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]]; + h_A_of_vec_perm[i + num_faces + num_cells] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + num_faces + num_cells]; + h_A_of_vec_perm[i + 2 * (num_faces + num_cells)] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + 2 * (num_faces + num_cells)]; + } + + // b + std::vector 
h_b_of_init_vec(3 * num_cells); + std::copy(of_b, of_b + 3 * num_cells, h_b_of_init_vec.begin()); + std::vector h_b_of_vec; + for (int i = 0; i < 3 * num_cells; i += 3) + { + h_b_of_vec.push_back(h_b_of_init_vec[i]); + } + // fill RHS_y + for (int i = 1; i < 3 * num_cells; i += 3) + { + h_b_of_vec.push_back(h_b_of_init_vec[i]); + } + // fill RHS_z + for (int i = 2; i < 3 * num_cells; i += 3) + { + h_b_of_vec.push_back(h_b_of_init_vec[i]); + } + + if (print) + { + for (int i = 0; i < (num_faces + num_cells); i++) + printf("h_A_of_vec[%d]:(%.10lf, %.10lf, %.10lf)\n", i, h_A_of_vec_perm[i], h_A_of_vec_perm[i + (num_faces + num_cells)], h_A_of_vec_perm[i + (num_faces + num_cells) * 2]); + for (int i = 0; i < num_cells; i++) + printf("h_b_of_vec[%d]: (%.10lf, %.10lf, %.10lf)\n", i, of_b[i * 3], of_b[i * 3 + 1], of_b[i * 3 + 2]); + } + + // check + // fprintf(stderr, "check of h_A_csr\n"); + // checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); + // fprintf(stderr, "check of h_b\n"); + // checkVectorEqual(3 * num_cells, h_b_of_vec.data(), h_b, 1e-5); +} + +void dfUEqn::solve() +{ + // for (size_t i = 0; i < num_cells; i++) + // fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i], + // h_velocity_old[3*i + 1], h_velocity_old[3*i + 2]); + // constructor AmgXSolver at first interation + // Synchronize stream + // checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + // checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + + checkCudaErrors(cudaStreamSynchronize(stream)); + + // nvtxRangePush("solve"); + + int nNz = num_cells + num_faces; // matrix entries + if (num_iteration == 0) // first interation + { + printf("Initializing AmgX Linear Solver\n"); + UxSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr); + UySolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 
nNz); + UzSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 2 * nNz); + } + else + { + UxSolver->updateOperator(num_cells, nNz, d_A_csr); + UySolver->updateOperator(num_cells, nNz, d_A_csr + nNz); + UzSolver->updateOperator(num_cells, nNz, d_A_csr + 2 * nNz); + } + UxSolver->solve(num_cells, d_psi, d_b); + UySolver->solve(num_cells, d_psi + num_cells, d_b + num_cells); + UzSolver->solve(num_cells, d_psi + 2 * num_cells, d_b + 2 * num_cells); + num_iteration++; + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_d2h<<>>(num_cells, d_psi, d_psi_permute); + checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + // for (size_t i = 0; i < num_cells; i++) + // fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], + // h_psi[num_cells + i], h_psi[num_cells*2 + i]); +} + +void dfUEqn::sync() +{ + checkCudaErrors(cudaStreamSynchronize(stream)); +} + +void dfUEqn::updatePsi(double *Psi) +{ + checkCudaErrors(cudaStreamSynchronize(stream)); + memcpy(Psi, h_psi, cell_vec_bytes); +} + +void dfUEqn::correctBoundaryConditions() +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + ueqn_correct_BoundaryConditions_kernel<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_psi, dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); +} + +// correct volecity in pEqn +void dfUEqn::correctPsi(double *Psi) +{ + memcpy(h_psi, Psi, cell_vec_bytes); + checkCudaErrors(cudaMemcpyAsync(d_psi_permute, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_h2d<<>>(num_cells, d_psi_permute, d_psi); +} + +dfUEqn::~dfUEqn() +{ +} 
diff --git a/src_gpu/dfYEqn.H b/src_gpu_orig/dfYEqn.H similarity index 100% rename from src_gpu/dfYEqn.H rename to src_gpu_orig/dfYEqn.H diff --git a/src_gpu/dfYEqn.cu b/src_gpu_orig/dfYEqn.cu similarity index 100% rename from src_gpu/dfYEqn.cu rename to src_gpu_orig/dfYEqn.cu