diff --git a/src_gpu/ueqn.cu b/src_gpu/ueqn.cu new file mode 100644 index 000000000..23401eef5 --- /dev/null +++ b/src_gpu/ueqn.cu @@ -0,0 +1,1097 @@ +#include +#include +#include +#include +#include + +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +static void checkVectorEqual(int count, double* basevec, double* vec, double max_relative_error) { + for (size_t i = 0; i < count; ++i) + { + double abs_diff = fabs(basevec[i] - vec[i]); + double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); + if (abs_diff > 1e-16 && rel_diff > max_relative_error) { + fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); + } + } +} + +__global__ void fvm_ddt(int num_cells, int num_faces, const double rdelta_t, + const int* csr_row_index, const int* csr_diag_index, + const double* rho_old, const double* rho_new, const double* volume, const double* velocity_old, + const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int diag_index = csr_diag_index[index]; + + int csr_dim = num_cells + num_faces; + int csr_index = row_index + diag_index; + double ddt_diag = rdelta_t * rho_new[index] * volume[index]; + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + ddt_diag; + A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + ddt_diag; + A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + ddt_diag; + + double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + ddt_part_term * velocity_old[index * 3 + 0]; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + ddt_part_term * velocity_old[index * 3 + 1]; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + ddt_part_term * velocity_old[index * 3 + 2]; +} + +__global__ void fvm_div_internal(int num_cells, int num_faces, + const int* csr_row_index, const int* csr_diag_index, + const double* weight, const double* phi, + const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + int csr_dim = num_cells + num_faces; + + double div_diag = 0; + for (int i = row_index; i < next_row_index; i++) { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (-w) * f; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f; + // lower neighbors contribute to sum of -1 + div_diag += (w - 1) * f; + } + // upper + if (inner_index > diag_index) { + // upper, index - 1, consider of diag + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f; + // upper neighbors contribute to sum of 1 + div_diag += w * f; + } + } + A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag + A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag + A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + div_diag; // diag +} + +__global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_cells, + const int* csr_row_index, const int* csr_diag_index, + const int* boundary_cell_offset, const int* boundary_cell_id, + const double* internal_coeffs, const double* boundary_coeffs, + const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) return; + + int cell_offset = boundary_cell_offset[index]; // face index + int cell_index = boundary_cell_id[cell_offset]; + int loop_size = boundary_cell_offset[index + 1] - cell_offset; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_dim = num_cells + num_faces; + int csr_index = row_index + diag_index; + + // construct internalCoeffs & boundaryCoeffs + double internal_coeffs_x = 0; + double internal_coeffs_y = 0; + double internal_coeffs_z = 0; + double boundary_coeffs_x = 0; + double boundary_coeffs_y = 0; + double boundary_coeffs_z = 0; + for (int i = 0; i < loop_size; i++) { + internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0]; + internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1]; + internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2]; + boundary_coeffs_x += boundary_coeffs[(cell_offset + i) * 3 + 0]; + boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1]; + boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2]; + } + + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x; + A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y; + A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z; + b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x; + b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y; + b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z; +} + +__global__ void fvc_grad_internal_face(int num_cells, + const int* csr_row_index, const int* csr_col_index, const int* csr_diag_index, + const double* face_vector, const double* weight, const double* pressure, + const double* b_input, double* b_output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_p = pressure[index]; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + double grad_bx_low = 0; + double grad_bx_upp = 0; + double grad_by_low = 0; + double grad_by_upp = 0; + double grad_bz_low = 0; + double grad_bz_upp = 0; + for (int i = row_index; i < next_row_index; i++) { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_low -= face_p * sfx; + grad_by_low -= face_p * sfy; + grad_bz_low -= face_p * sfz; + } + // upper + if (inner_index > diag_index) { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index + 1]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_upp += face_p * sfx; + grad_by_upp += face_p * sfy; + grad_bz_upp += face_p * sfz; + } + } + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + grad_bx_low + grad_bx_upp; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + grad_by_low + grad_by_upp; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + grad_bz_low + grad_bz_upp; +} + +__global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, + const int* boundary_cell_offset, const int* boundary_cell_id, + const double* boundary_face_vector, const double* boundary_pressure, + const double* b_input, double* b_output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // compute boundary gradient + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) { + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 1]; + double sfz = boundary_face_vector[i * 3 + 2]; + double face_p = boundary_pressure[i]; + grad_bx += face_p * sfx; + grad_by += face_p * sfy; + grad_bz += face_p * sfz; + } + + //// correct the boundary gradient + //double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + //double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + //double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + //double sn_grad = 0; + //double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + //grad_bx += nx * grad_correction; + //grad_by += ny * grad_correction; + //grad_bz += nz * grad_correction; + + b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + grad_bx; + b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + grad_by; + b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + grad_bz; +} + +__global__ void add_fvMatrix(int num_cells, int num_faces, + const int* csr_row_index, + const double* turbSrc_A, const double* turbSrc_b, + const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) return; + + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int csr_dim = num_cells + num_faces; + double A_entry; + + for (int i = row_index; i < next_row_index; i++) + { + A_entry = turbSrc_A[i]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry; + } + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[num_cells * 0 + index]; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[num_cells * 1 + index]; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + turbSrc_b[num_cells * 2 + index]; +} + +/* +// the implement fo assemble kernel is outdated. +__global__ void assemble(int num_cells, int num_faces, const double rdelta_t, const int* csr_row_index, const int* csr_diag_index, + const double* rho_old, const double* rho_new, const double* volume, const double* velocity_old, + const double* weight, const double* phi, const double* face_vector, const double* pressure, + double* A_csr, double* b) { + + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + int csr_dim = num_cells + num_faces; + + double div_diag = 0; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = row_index; i < next_row_index; i++) { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + double p = pressure[neighbor_index]; + A_csr[csr_dim * 0 + i] += (-w) * f; + A_csr[csr_dim * 1 + i] += (-w) * f; + A_csr[csr_dim * 2 + i] += (-w) * f; + // lower neighbors contribute to sum of -1 + div_diag += (w - 1) * f; + grad_bx -= p * sfx; + grad_by -= p * sfy; + grad_bz -= p * sfz; + } + // upper + if (inner_index > diag_index) { + // upper, index - 1, consider of diag + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + double p = pressure[neighbor_index]; + A_csr[csr_dim * 0 + i] += (1 - w) * f; + A_csr[csr_dim * 1 + i] += (1 - w) * f; + A_csr[csr_dim * 2 + i] += (1 - w) * f; + // upper neighbors contribute to sum of 1 + div_diag += w * f; + grad_bx += p * sfx; + grad_by += p * sfy; + grad_bz += p * sfz; + } + } + + double ddt_diag = rdelta_t * rho_new[index] * volume[index]; + A_csr[csr_dim * 0 + row_index + diag_index] = ddt_diag + div_diag; // diag of A + A_csr[csr_dim * 1 + row_index + diag_index] = ddt_diag + div_diag; // diag of A + A_csr[csr_dim * 2 + row_index + diag_index] = ddt_diag + div_diag; // diag of A + + double ddt_bx = rdelta_t * rho_old[index] * velocity_old[index * 3 + 0] * volume[index]; + double ddt_by = rdelta_t * rho_old[index] * velocity_old[index * 3 + 1] * volume[index]; + double ddt_bz = rdelta_t * rho_old[index] * velocity_old[index * 3 + 2] * volume[index]; + b[num_cells * 0 + index] = ddt_bx + grad_bx; + b[num_cells * 1 + index] = ddt_by + grad_by; + b[num_cells * 2 + index] = ddt_bz + grad_bz; +} +*/ +int main(int argc, char* argv[]) { + // Print the array length to be used, and compute its size + bool validation_mode = false; + int w = 128; + int h = 128; + int d = 128; + char* input_file = NULL; + + if (argc > 1) { + validation_mode = true; + w = atoi(argv[1]); + h = atoi(argv[2]); + d = atoi(argv[3]); + input_file = argv[4]; + } + + fprintf(stderr, "mesh(w, h, d) is (%d, %d, %d)\n", w, h, d); + + // Get num_cells, num_faces, num_boundary_cells, num_boundary_faces + int num_cells = w * h * d; + int num_faces = num_cells * 6; + int num_boundary_cells = 0; + int num_boundary_faces = 0; + for (int i = 0; i < num_cells; ++i) { + int ww = i % w; + int hh = (i % (w * h)) / w; + int dd = i / (w * h); + + bool boundary_flag = false; + if (ww == 0) { + num_boundary_faces++; + boundary_flag = true; + } + if (ww == (w - 1)) { + num_boundary_faces++; + boundary_flag = true; + } + if (hh == 0) { + num_boundary_faces++; + boundary_flag = true; + } + if (hh == (h - 1)) { + num_boundary_faces++; + boundary_flag = true; + } + if (dd == 0) { + num_boundary_faces++; + boundary_flag = true; + } + if (dd == (d - 1)) { + num_boundary_faces++; + boundary_flag = true; + } + + if (boundary_flag) + num_boundary_cells++; + } + num_faces -= num_boundary_faces; + + // Load num_cells, num_faces, num_boundary_cells and num_boundary_faces + FILE *fp = NULL; + if (validation_mode) { + fp = fopen(input_file, "rb+"); + if (fp == NULL) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + + int readfile = 0; + int input_num_cells = 0; + int input_num_faces = 0; + int input_num_boundary_cells = 0; + int input_num_boundary_faces = 0; + readfile = fread(&input_num_cells, sizeof(int), 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + readfile = fread(&input_num_faces, sizeof(int), 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + readfile = fread(&input_num_boundary_cells, sizeof(int), 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + readfile = fread(&input_num_boundary_faces, sizeof(int), 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + //if ((input_num_cells != num_cells) || (input_num_faces != num_faces) || (input_num_boundary_cells != num_boundary_cells)) { + // temp + if ((input_num_cells != num_cells) || (input_num_faces * 2 != num_faces + || (input_num_boundary_cells != num_boundary_cells) || (input_num_boundary_faces != num_boundary_faces))) { + fprintf(stderr, "Input info mismatched:\n"); + fprintf(stderr, "input_num_cells: %d, num_cells: %d\n", input_num_cells, num_cells); + fprintf(stderr, "input_num_faces: %d, num_faces: %d\n", input_num_faces, num_faces); + fprintf(stderr, "input_num_boundary_cells: %d, num_boundary_cells: %d\n", input_num_boundary_cells, num_boundary_cells); + fprintf(stderr, "input_num_boundary_faces: %d, num_boundary_faces: %d\n", input_num_boundary_faces, num_boundary_faces); + exit(EXIT_FAILURE); + } + num_faces = input_num_faces * 2; + } + + size_t cell_bytes = num_cells * sizeof(double); + size_t cell_vec_bytes = num_cells * 3 * sizeof(double); + size_t cell_index_bytes = num_cells * sizeof(int); + + size_t face_bytes = num_faces * sizeof(double); + size_t face_vec_bytes = num_faces * 3 * sizeof(double); + + size_t boundary_cell_bytes = num_boundary_cells * sizeof(double); + size_t boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); + size_t boundary_cell_index_bytes = num_boundary_cells * sizeof(int); + + size_t boundary_face_bytes = num_boundary_faces * sizeof(double); + size_t boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); + size_t boundary_face_index_bytes = num_boundary_faces * sizeof(int); + + // A_csr has one more element in each row: itself + size_t csr_row_index_bytes = (num_cells + 1) * sizeof(int); + size_t csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); + size_t csr_value_bytes = (num_cells + num_faces) * sizeof(double); + size_t csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); + + // csr_row_index, csr_col_index, csr_diag_index are pre-computed on cpu, not computed on gpu. + // values of each row are stored continued. + // row_index stores the start offset of each row. + // col_index stores the column number of each row + // diag_index stores the serial number(in a row) of the diag element. + int* h_A_csr_row_index = (int*)malloc(csr_row_index_bytes); + int* h_A_csr_col_index = (int*)malloc(csr_col_index_bytes); + int* h_A_csr_diag_index = (int*)malloc(cell_index_bytes); + + double* h_rho_old = (double*)malloc(cell_bytes); + double* h_rho_new = (double*)malloc(cell_bytes); + double* h_volume = (double*)malloc(cell_bytes); + double* h_pressure = (double*)malloc(cell_bytes); + double* h_velocity_old = (double*)malloc(cell_vec_bytes); + double* h_weight = (double*)malloc(face_bytes); + double* h_phi = (double*)malloc(face_bytes); + double* h_face_vector = (double*)malloc(face_vec_bytes); + + int* h_boundary_cell_offset = (int*)malloc((num_boundary_cells+1) * sizeof(int)); + int* h_boundary_cell_id = (int*)malloc(boundary_face_bytes); + + double* h_boundary_pressure = (double*)malloc(boundary_face_bytes); + double* h_boundary_face_vector = (double*)malloc(boundary_face_vec_bytes); + double* h_internal_coeffs = (double*)malloc(boundary_face_vec_bytes); + double* h_boundary_coeffs = (double*)malloc(boundary_face_vec_bytes); + + double* h_A_csr = (double*)malloc(csr_value_vec_bytes); + double* h_b = (double*)malloc(cell_vec_bytes); + double* h_A_csr_validation = (double*)malloc(csr_value_vec_bytes); + double* h_b_validation = (double*)malloc(cell_vec_bytes); + double* h_grad_validation = (double*)malloc(cell_vec_bytes); + + double* h_turbSrc_A = (double*)malloc(csr_value_bytes); + double* h_turbSrc_b = (double*)malloc(cell_vec_bytes); + + double *b_openfoam = (double*)malloc(cell_vec_bytes); + double *A_openfoam = (double*)malloc(csr_value_vec_bytes); + + // Verify that allocations succeeded + if (h_A_csr_row_index == NULL || h_A_csr_col_index == NULL || h_A_csr_diag_index == NULL + || h_rho_old == NULL || h_rho_new == NULL || h_volume == NULL || h_pressure == NULL || h_velocity_old == NULL + || h_weight == NULL || h_phi == NULL || h_face_vector == NULL + || h_boundary_cell_offset == NULL || h_boundary_cell_id == NULL + || h_boundary_pressure == NULL || h_boundary_face_vector == NULL + || h_internal_coeffs == NULL || h_boundary_coeffs == NULL + || h_A_csr == NULL || h_b == NULL || h_A_csr_validation == NULL || h_b_validation == NULL) { + fprintf(stderr, "Failed to allocate host arrays!\n"); + exit(EXIT_FAILURE); + } + + // Load input data + double rdelta_t = 1.0 / 0.0001; + if (validation_mode) { + int readfile = 0; + readfile = fread((void*)&rdelta_t, sizeof(double), 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + fprintf(stderr, "rdelta_t: %lf\n", rdelta_t); + + readfile = fread(h_A_csr_row_index, csr_row_index_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_cells + 1; i++) + // fprintf(stderr, "h_A_csr_row_index[%d]: %d\n", i, h_A_csr_row_index[i]); + + readfile = fread(h_A_csr_col_index, csr_col_index_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_cells + num_faces; i++) + // fprintf(stderr, "h_A_csr_col_index[%d]: %d\n", i, h_A_csr_col_index[i]); + + readfile = fread(h_A_csr_diag_index, cell_index_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_cells; i++) + // fprintf(stderr, "h_A_csr_diag_index[%d]: %d\n", i, h_A_csr_diag_index[i]); + + readfile = fread(h_rho_old, cell_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + //for (int i = 0; i < num_cells; i++) + // fprintf(stderr, "h_rho_old[%d]: %lf\n", i, h_rho_old[i]); + + readfile = fread(h_rho_new, cell_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + //for (int i = 0; i < num_cells; i++) + // fprintf(stderr, "h_rho_new[%d]: %lf\n", i, h_rho_new[i]); + + readfile = fread(h_volume, cell_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + //for (int i = 0; i < num_cells; i++) + // fprintf(stderr, "h_volume[%d]: %.16lf\n", i, h_volume[i]); + + readfile = fread(h_pressure, cell_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_cells; i++) + // fprintf(stderr, "h_pressure[%d]: %.30lf\n", i, h_pressure[i]); + + readfile = fread(h_velocity_old, cell_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + //for (int i = 0; i < num_cells; i++) + // fprintf(stderr, "h_velocity[%d]: (%lf, %lf, %lf)\n", i, h_volume[i * 3 + 0], h_volume[i * 3 + 1], h_volume[i * 3 + 2]); + + readfile = fread(h_weight, face_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_faces; i++) + // fprintf(stderr, "h_weight[%d]: %.20lf\n", i, h_weight[i]); + + readfile = fread(h_phi, face_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + //for (int i = 0; i < num_faces; i++) + // fprintf(stderr, "h_phi[%d]: %lf\n", i, h_phi[i]); + + readfile = fread(h_face_vector, face_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_faces; i++) + // fprintf(stderr, "h_face_vector[%d]: (%.20lf, %.20lf, %.20lf)\n", i, h_face_vector[i * 3 + 0], h_face_vector[i * 3 + 1], h_face_vector[i * 3 + 2]); + + readfile = fread(h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_boundary_cells; i++) + // fprintf(stderr, "h_boundary_cell_offset[%d]: %d\n", i, h_boundary_cell_offset[i]); + + readfile = fread(h_boundary_cell_id, boundary_face_index_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_boundary_faces; i++) + // fprintf(stderr, "h_boundary_cell_id[%d]: %d\n", i, h_boundary_cell_id[i]); + + readfile = fread(h_boundary_pressure, boundary_face_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_boundary_faces; i++) + // fprintf(stderr, "h_boundary_pressure[%d]: %lf\n", i, h_boundary_pressure[i]); + + readfile = fread(h_boundary_face_vector, boundary_face_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_boundary_faces; i++) + // fprintf(stderr, "h_boundary_face_vector[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_boundary_face_vector[i * 3 + 0], h_boundary_face_vector[i * 3 + 1], h_boundary_face_vector[i * 3 + 2]); + + readfile = fread(h_internal_coeffs, boundary_face_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_boundary_faces; i++) + // fprintf(stderr, "h_internal_coeffs[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_internal_coeffs[i * 3 + 0], h_internal_coeffs[i * 3 + 1], h_internal_coeffs[i * 3 + 2]); + + readfile = fread(h_boundary_coeffs, boundary_face_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + //for (int i = 0; i < num_boundary_faces; i++) + // fprintf(stderr, "h_boundary_coeffs[%d]: (%lf, %lf, %lf)\n", i, h_boundary_coeffs[i * 3 + 0], h_boundary_coeffs[i * 3 + 1], h_boundary_coeffs[i * 3 + 2]); + + readfile = fread(h_A_csr_validation, csr_value_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + //for (int i = 0; i < (num_cells + num_faces) * 3; i++) + // fprintf(stderr, "h_A_csr_validation[%d]: %.30lf\n", i, h_A_csr_validation[i]); + + readfile = fread(h_b_validation, cell_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_cells * 3; i++) + // fprintf(stderr, "h_b_validation[%d]: %lf\n", i, h_b_validation[i]); + + readfile = fread(h_grad_validation, cell_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_cells * 3; i++) + // fprintf(stderr, "h_grad_validation[%d]: %lf\n", i, h_grad_validation[i]); + + readfile = fread(h_turbSrc_A, csr_value_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < (num_cells + num_faces); i++) + // fprintf(stderr, "h_turbSrc_A[%d]: %.30lf\n", i, h_turbSrc_A[i]); + + readfile = fread(h_turbSrc_b, cell_vec_bytes, 1, fp); + if(readfile == 0) { + fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); + exit(EXIT_FAILURE); + } + // for (int i = 0; i < num_cells * 3; i++) + // fprintf(stderr, "h_turbSrc_b[%d]: %lf\n", i, h_turbSrc_b[i]); + + // compare with of results + FILE *fp1 = NULL; + fp1 = fopen("of_finalresult.txt", "rb+"); + readfile = fread(A_openfoam, csr_value_vec_bytes, 1, fp1); + readfile = fread(b_openfoam, cell_vec_bytes, 1, fp1); + + } else { + // Initialize row index + h_A_csr_row_index[0] = 0; + for (int i = 0; i < num_cells; ++i) { + int ww = i % w; + int hh = (i % (w * h)) / w; + int dd = i / (w * h); + int neighbor_number = 6; + + if (ww == 0) { + neighbor_number -= 1; + } + if (ww == (w - 1)) { + neighbor_number -= 1; + } + if (hh == 0) { + neighbor_number -= 1; + } + if (hh == (h - 1)) { + neighbor_number -= 1; + } + if (dd == 0) { + neighbor_number -= 1; + } + if (dd == (d - 1)) { + neighbor_number -= 1; + } + + h_A_csr_row_index[i + 1] = h_A_csr_row_index[i] + neighbor_number; + } + + // Initialize cell fields + for (int i = 0; i < num_cells; ++i) { + h_rho_old[i] = rand() / (double)RAND_MAX; + h_rho_new[i] = rand() / (double)RAND_MAX; + h_volume[i] = rand() / (double)RAND_MAX; + h_pressure[i] = rand() / (double)RAND_MAX; + h_velocity_old[i * 3 + 0] = rand() / (double)RAND_MAX; + h_velocity_old[i * 3 + 1] = rand() / (double)RAND_MAX; + h_velocity_old[i * 3 + 2] = rand() / (double)RAND_MAX; + } + + // Initialize A_csr_col_index, A_csr_diag_index, and the face fields. + for (int i = 0; i < num_cells; ++i) { + int row_offset = h_A_csr_row_index[i]; + int neighbor_offset = row_offset - i; + + double weight = 0.5; + + // Make sure that the values are placed in each row with the following order. + // (x, y, z - 1)--> column: (i - w * h) + // (x, y - 1, z)--> column: (i - w) + // (x - 1, y, z)--> column: (i - 1) + // (x, y, z) --> column: (i) //only in A_csr_index and A_csr_diag_index + // (x + 1, y, z)--> column: (i + 1) + // (x, y + 1, z)--> column: (i + w) + // (x, y, z + 1)--> column: (i + w * h) + + // (x, y, z - 1) + // (x, y - 1, z) + // (x - 1, y, z) + int num_lower = 0; + if (i - w * h >= 0) { + num_lower++; + h_A_csr_col_index[row_offset++] = i - w * h; + + int face_index = neighbor_offset++; + h_weight[face_index] = weight; + h_phi[face_index] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; + } + if (i - w >= 0) { + num_lower++; + h_A_csr_col_index[row_offset++] = i - w; + + int face_index = neighbor_offset++; + h_weight[face_index] = weight; + h_phi[face_index] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; + } + if (i - 1 >= 0) { + num_lower++; + h_A_csr_col_index[row_offset++] = i - 1; + + int face_index = neighbor_offset++; + h_weight[face_index] = weight; + h_phi[face_index] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; + } + + // (x, y, z) + h_A_csr_col_index[row_offset++] = i; + h_A_csr_diag_index[i] = num_lower; + + // (x + 1, y, z) + // (x, y + 1, z) + // (x, y, z + 1) + if (i + 1 < num_cells) { + h_A_csr_col_index[row_offset++] = i + 1; + + int face_index = neighbor_offset++; + h_weight[face_index] = weight; + h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; + h_face_vector[face_index] = rand() / (double)RAND_MAX; + } + if (i + w < num_cells) { + h_A_csr_col_index[row_offset++] = i + w; + + int face_index = neighbor_offset++; + h_weight[face_index] = weight; + h_phi[face_index] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; + } + if (i + w * h < num_cells) { + h_A_csr_col_index[row_offset++] = i + w * h; + + int face_index = neighbor_offset++; + h_weight[face_index] = weight; + h_phi[face_index] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; + h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; + } + } + } + + if (fp) { + fclose(fp); + } + + int total_bytes = 0; + int copy_bytes = 0; + + // Allocate the device arrays + int* d_A_csr_row_index = NULL; + int* d_A_csr_col_index = NULL; + int* d_A_csr_diag_index = NULL; + checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); + total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); + + double* d_rho_old = NULL; + double* d_rho_new = NULL; + double* d_volume = NULL; + double* d_pressure = NULL; + double* d_velocity_old = NULL; + double* d_weight = NULL; + double* d_phi = NULL; + double* d_face_vector = NULL; + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); + total_bytes += (cell_bytes * 4 + face_bytes * 2 + cell_vec_bytes + face_vec_bytes); + + int* d_boundary_cell_offset = NULL; + int* d_boundary_cell_id = NULL; + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_bytes)); + total_bytes += (boundary_cell_index_bytes + (num_boundary_cells+1) * sizeof(int)); + + double* d_boundary_pressure = NULL; + double* d_boundary_face_vector = NULL; + double* d_internal_coeffs = NULL; + double* d_boundary_coeffs = NULL; + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); + total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); + + double* d_A_csr = NULL; + double* d_b = NULL; + checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes)); + total_bytes += (csr_value_vec_bytes + cell_vec_bytes); + + double* d_turbSrc_A = NULL; + double* d_turbSrc_b = NULL; + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); + total_bytes += (csr_value_bytes + cell_vec_bytes); + + fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + + // Create cuda stream + cudaStream_t stream; + checkCudaErrors(cudaStreamCreate(&stream)); + + struct timeval start, end; + float duration = 0; + gettimeofday(&start, NULL); + + // Copy the host input array in host memory to the device input array in device memory + printf("Copy input arrays from the host memory to the CUDA device\n"); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); + copy_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); + + checkCudaErrors(cudaMemcpyAsync(d_rho_old, h_rho_old, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_rho_new, h_rho_new, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_pressure, h_pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_velocity_old, h_velocity_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_phi, h_phi, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); + copy_bytes += (cell_bytes * 4 + cell_vec_bytes + face_bytes * 2 + face_vec_bytes); + + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + copy_bytes += ((num_boundary_cells+1) * sizeof(int) + boundary_face_index_bytes); + + checkCudaErrors(cudaMemcpyAsync(d_boundary_pressure, h_boundary_pressure, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_internal_coeffs, h_internal_coeffs, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_coeffs, h_boundary_coeffs, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + copy_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); + + checkCudaErrors(cudaMemcpyAsync(d_turbSrc_A, h_turbSrc_A, csr_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_turbSrc_b, h_turbSrc_b, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); + copy_bytes += (csr_value_bytes + cell_vec_bytes); + + // Synchronize stream + checkCudaErrors(cudaStreamSynchronize(stream)); + gettimeofday(&end, NULL); + duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001; + fprintf(stderr, "memcpy time cost %lf ms.\n", duration); + fprintf(stderr, "bandwidth %lf GB/s.\n", (copy_bytes * 1.0 / 1024 / 1024 / 1024) / (duration / 1000)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + printf("CUDA kernel launch with %d blocks of %d threads\n", blocks_per_grid, threads_per_block); + + /** + * start term-level + */ + gettimeofday(&start, NULL); + + // Memset A and b to zero + checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); + + // Launch the fvm_ddt Kernel + fvm_ddt<<>>(num_cells, num_faces, rdelta_t, + d_A_csr_row_index, d_A_csr_diag_index, + d_rho_old, d_rho_new, d_volume, d_velocity_old, d_A_csr, d_b, d_A_csr, d_b); + // Launch the fvm_div Kernel + fvm_div_internal<<>>(num_cells, num_faces, + d_A_csr_row_index, d_A_csr_diag_index, + d_weight, d_phi, d_A_csr, d_b, d_A_csr, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, + d_boundary_cell_offset, d_boundary_cell_id, + d_internal_coeffs, d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b); + // Launch the fvc_grad Kernel + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_internal_face<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + d_face_vector, d_weight, d_pressure, d_b, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, + d_boundary_cell_offset, d_boundary_cell_id, + d_boundary_face_vector, d_boundary_pressure, d_b, d_b); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + add_fvMatrix<<>>(num_cells, num_faces, + d_A_csr_row_index, d_turbSrc_A, d_turbSrc_b, d_A_csr, d_b, d_A_csr, d_b); + + // Synchronize stream + checkCudaErrors(cudaStreamSynchronize(stream)); + gettimeofday(&end, NULL); + duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001; + fprintf(stderr, "assemble on term level, time cost %lf ms.\n", duration); + /** + * end term-level + */ + + /** + * start equation-level + */ + //gettimeofday(&start, NULL); + + //// Memset A and b to zero + //checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); + //checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); + + //// Note: the implement fo assemble kernel is outdated. + //// Launch the assemble Kernel + //assemble<<>>(num_cells, num_faces, rdelta_t, d_A_csr_row_index, d_A_csr_diag_index, + // d_rho_old, d_rho_new, d_volume, d_velocity_old, d_weight, d_phi, d_face_vector, d_pressure, d_A_csr, d_b); + + //// Synchronize stream + //checkCudaErrors(cudaStreamSynchronize(stream)); + //gettimeofday(&end, NULL); + //duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001; + //fprintf(stderr, "assemble on equation level, time cost %lf ms.\n", duration); + /** + * end equation-level + */ + + // Sleep 10 seconds, so that user can check gpu memory usage through nvidia-smi. + //sleep(10); + + // Check result + if (validation_mode) { + checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + // fprintf(stderr, "check of h_A_csr\n"); + // checkVectorEqual((num_cells + num_faces) * 3, h_A_csr_validation, h_A_csr, 1e-5); + // fprintf(stderr, "check of h_b\n"); + // checkVectorEqual(num_cells * 3, h_b_validation, h_b, 1e-5); + fprintf(stderr, "check of h_A_csr\n"); + checkVectorEqual((num_cells + num_faces) * 3, A_openfoam, h_A_csr, 1e-5); + fprintf(stderr, "check of h_b\n"); + checkVectorEqual(num_cells * 3, b_openfoam, h_b, 1e-5); + } + + // Free device global memory + checkCudaErrors(cudaFree(d_A_csr_row_index)); + checkCudaErrors(cudaFree(d_A_csr_col_index)); + checkCudaErrors(cudaFree(d_A_csr_diag_index)); + checkCudaErrors(cudaFree(d_rho_old)); + checkCudaErrors(cudaFree(d_rho_new)); + checkCudaErrors(cudaFree(d_volume)); + checkCudaErrors(cudaFree(d_pressure)); + checkCudaErrors(cudaFree(d_velocity_old)); + checkCudaErrors(cudaFree(d_weight)); + checkCudaErrors(cudaFree(d_phi)); + checkCudaErrors(cudaFree(d_face_vector)); + checkCudaErrors(cudaFree(d_boundary_cell_offset)); + checkCudaErrors(cudaFree(d_boundary_cell_id)); + checkCudaErrors(cudaFree(d_boundary_pressure)); + checkCudaErrors(cudaFree(d_boundary_face_vector)); + checkCudaErrors(cudaFree(d_internal_coeffs)); + checkCudaErrors(cudaFree(d_boundary_coeffs)); + checkCudaErrors(cudaFree(d_A_csr)); + checkCudaErrors(cudaFree(d_b)); + checkCudaErrors(cudaStreamDestroy(stream)); + + // Free host memory + free(h_A_csr_row_index); + free(h_A_csr_col_index); + free(h_A_csr_diag_index); + free(h_rho_old); + free(h_rho_new); + free(h_volume); + free(h_pressure); + free(h_velocity_old); + free(h_weight); + free(h_phi); + free(h_face_vector); + free(h_boundary_cell_offset); + free(h_boundary_cell_id); + free(h_boundary_pressure); + free(h_boundary_face_vector); + free(h_internal_coeffs); + free(h_boundary_coeffs); + free(h_A_csr); + free(h_b); + free(h_A_csr_validation); + free(h_b_validation); + + printf("Done\n"); + return 0; +} +