diff --git a/Source/PhysicsInterfaces/Radiation/ERF_RRTMGP_Interface.H b/Source/PhysicsInterfaces/Radiation/ERF_RRTMGP_Interface.H index 7dbf7fc90..09cc5e21f 100644 --- a/Source/PhysicsInterfaces/Radiation/ERF_RRTMGP_Interface.H +++ b/Source/PhysicsInterfaces/Radiation/ERF_RRTMGP_Interface.H @@ -120,10 +120,6 @@ rrtmgp_main (const int ncol, const int nlay, real1d_k& t_sfc , real1d_k& sfc_emis , real1d_k& lw_src, real2d_k& lwp , real2d_k& iwp , real2d_k& rel , real2d_k& rei , real2d_k& cldfrac, - real3d_k& aer_tau_sw , real3d_k& aer_ssa_sw , real3d_k& aer_asm_sw, - real3d_k& aer_tau_lw , - real3d_k& cld_tau_sw_bnd, real3d_k& cld_tau_lw_bnd, - real3d_k& cld_tau_sw_gpt, real3d_k& cld_tau_lw_gpt, real2d_k& sw_flux_up , real2d_k& sw_flux_dn , real2d_k& sw_flux_dn_dir, real2d_k& lw_flux_up , real2d_k& lw_flux_dn , real2d_k& sw_clnclrsky_flux_up, real2d_k& sw_clnclrsky_flux_dn, real2d_k& sw_clnclrsky_flux_dn_dir, diff --git a/Source/PhysicsInterfaces/Radiation/ERF_RRTMGP_Interface.cpp b/Source/PhysicsInterfaces/Radiation/ERF_RRTMGP_Interface.cpp index 2abc9e16f..d731cc880 100644 --- a/Source/PhysicsInterfaces/Radiation/ERF_RRTMGP_Interface.cpp +++ b/Source/PhysicsInterfaces/Radiation/ERF_RRTMGP_Interface.cpp @@ -389,10 +389,6 @@ rrtmgp_main (const int ncol, const int nlay, real1d_k& t_sfc , real1d_k& sfc_emis , real1d_k& lw_src, real2d_k& lwp, real2d_k& iwp, real2d_k& rel, real2d_k& rei, real2d_k& cldfrac, - real3d_k& /*aer_tau_sw*/, real3d_k& /*aer_ssa_sw*/, real3d_k& /*aer_asm_sw*/, - real3d_k& /*aer_tau_lw*/, - real3d_k& /*cld_tau_sw_bnd*/, real3d_k& /*cld_tau_lw_bnd*/, - real3d_k& /*cld_tau_sw_gpt*/, real3d_k& /*cld_tau_lw_gpt*/, real2d_k& sw_flux_up, real2d_k& sw_flux_dn, real2d_k& sw_flux_dn_dir, real2d_k& lw_flux_up, real2d_k& lw_flux_dn, real2d_k& sw_clnclrsky_flux_up, real2d_k& sw_clnclrsky_flux_dn, real2d_k& sw_clnclrsky_flux_dn_dir, @@ -457,24 +453,8 @@ rrtmgp_main (const int ncol, const int nlay, optical_props1_t aerosol_lw; aerosol_sw.init(k_dist_sw_k->get_band_lims_wavenumber()); aerosol_sw.alloc_2str(ncol, nlay); - /* - Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0, 0}, {ncol, nlay, nswbands}), - KOKKOS_LAMBDA (int icol, int ilay, int ibnd) - { - aerosol_sw.tau(icol,ilay,ibnd) = aer_tau_sw(icol,ilay,ibnd); - aerosol_sw.ssa(icol,ilay,ibnd) = aer_ssa_sw(icol,ilay,ibnd); - aerosol_sw.g (icol,ilay,ibnd) = aer_asm_sw(icol,ilay,ibnd); - }); - */ aerosol_lw.init(k_dist_lw_k->get_band_lims_wavenumber()); aerosol_lw.alloc_1scl(ncol, nlay); - /* - Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0, 0}, {ncol, nlay, nlwbands}), - KOKKOS_LAMBDA (int icol, int ilay, int ibnd) - { - aerosol_lw.tau(icol,ilay,ibnd) = aer_tau_lw(icol,ilay,ibnd); - }); - */ // Convert cloud physical properties to optical properties for input to RRTMGP optical_props2_t clouds_sw = get_cloud_optics_sw(ncol, nlay, @@ -483,9 +463,6 @@ rrtmgp_main (const int ncol, const int nlay, optical_props1_t clouds_lw = get_cloud_optics_lw(ncol, nlay, *cloud_optics_lw_k, *k_dist_lw_k, lwp, iwp, rel, rei); - //Kokkos::deep_copy(cld_tau_sw_bnd, clouds_sw.tau); - //Kokkos::deep_copy(cld_tau_lw_bnd, clouds_lw.tau); - // Do subcolumn sampling to map bands -> gpoints based on cloud fraction and overlap assumption; // This implements the Monte Carlo Independent Column Approximation by mapping only a single // subcolumn (cloud state) to each gpoint. @@ -497,21 +474,6 @@ rrtmgp_main (const int ncol, const int nlay, auto clouds_lw_gpt = get_subsampled_clouds(ncol, nlay, nlwbands, nlwgpts, clouds_lw, *k_dist_lw_k, cldfrac, p_lay); - /* - // Copy cloud properties to outputs (is this needed, or can we just use pointers?) - // Alternatively, just compute and output a subcolumn cloud mask - Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0, 0}, {ncol, nlay, nswgpts}), - KOKKOS_LAMBDA (int icol, int ilay, int igpt) - { - cld_tau_sw_gpt(icol,ilay,igpt) = clouds_sw_gpt.tau(icol,ilay,igpt); - }); - Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0, 0}, {ncol, nlay, nlwgpts}), - KOKKOS_LAMBDA (int icol, int ilay, int igpt) - { - cld_tau_lw_gpt(icol,ilay,igpt) = clouds_lw_gpt.tau(icol,ilay,igpt); - }); - */ - // Do shortwave rrtmgp_sw(ncol, nlay, *k_dist_sw_k, p_lay, t_lay, p_lev, t_lev, gas_concs, @@ -651,16 +613,28 @@ rrtmgp_sw (const int ncol, flux_up (icol,ilev) = 0; flux_dn (icol,ilev) = 0; flux_dn_dir(icol,ilev) = 0; - clnclrsky_flux_up (icol,ilev) = 0; - clnclrsky_flux_dn (icol,ilev) = 0; - clnclrsky_flux_dn_dir(icol,ilev) = 0; clrsky_flux_up (icol,ilev) = 0; clrsky_flux_dn (icol,ilev) = 0; clrsky_flux_dn_dir(icol,ilev) = 0; - clnsky_flux_up (icol,ilev) = 0; - clnsky_flux_dn (icol,ilev) = 0; - clnsky_flux_dn_dir(icol,ilev) = 0; }); + if (extra_clnclrsky_diag) { + Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0}, {ncol, nlay+1}), + KOKKOS_LAMBDA (int icol, int ilev) + { + clnclrsky_flux_up (icol,ilev) = 0; + clnclrsky_flux_dn (icol,ilev) = 0; + clnclrsky_flux_dn_dir(icol,ilev) = 0; + }); + } + if (extra_clnsky_diag) { + Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0}, {ncol, nlay+1}), + KOKKOS_LAMBDA (int icol, int ilev) + { + clnsky_flux_up (icol,ilev) = 0; + clnsky_flux_dn (icol,ilev) = 0; + clnsky_flux_dn_dir(icol,ilev) = 0; + }); + } Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0, 0}, {ncol, nlay+1, nbnd}), KOKKOS_LAMBDA (int icol, int ilev, int ibnd) { @@ -933,15 +907,27 @@ rrtmgp_lw (const int ncol, Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0}, {ncol, nlay+1}), KOKKOS_LAMBDA (int icol, int ilev) { - flux_up(icol, ilev) = 0; - flux_dn(icol, ilev) = 0; - clnclrsky_flux_up(icol, ilev) = 0; - clnclrsky_flux_dn(icol, ilev) = 0; - clrsky_flux_up(icol, ilev) = 0; - clrsky_flux_dn(icol, ilev) = 0; - clnsky_flux_up(icol, ilev) = 0; - clnsky_flux_dn(icol, ilev) = 0; + flux_up(icol, ilev) = 0; + flux_dn(icol, ilev) = 0; + clrsky_flux_up(icol, ilev) = 0; + clrsky_flux_dn(icol, ilev) = 0; }); + if (extra_clnclrsky_diag) { + Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0}, {ncol, nlay+1}), + KOKKOS_LAMBDA (int icol, int ilev) + { + clnclrsky_flux_up(icol, ilev) = 0; + clnclrsky_flux_dn(icol, ilev) = 0; + }); + } + if (extra_clnsky_diag) { + Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0}, {ncol, nlay+1}), + KOKKOS_LAMBDA (int icol, int ilev) + { + clnsky_flux_up(icol, ilev) = 0; + clnsky_flux_dn(icol, ilev) = 0; + }); + } Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0, 0}, {ncol, nlay+1, nbnd}), KOKKOS_LAMBDA (int icol, int ilev, int ibnd) { diff --git a/Source/PhysicsInterfaces/Radiation/ERF_Radiation.H b/Source/PhysicsInterfaces/Radiation/ERF_Radiation.H index 3a3e64468..b4261dab0 100644 --- a/Source/PhysicsInterfaces/Radiation/ERF_Radiation.H +++ b/Source/PhysicsInterfaces/Radiation/ERF_Radiation.H @@ -54,6 +54,10 @@ public: // Destructor ~Radiation () { + // Release k-distribution data and memory pool + if (rrtmgp::initialized) { + rrtmgp::rrtmgp_finalize(); + } // Note that Kokkos is now finalized in main.cpp } @@ -322,8 +326,11 @@ private: // Offsets for MultiFab <-> KOKKOS transfer amrex::Vector m_col_offsets; - // Whether we use aerosol forcing in radiation - bool m_do_aerosol_rad = true; + // Whether we use aerosol forcing in radiation. + // Aerosol plumbing is currently not implemented; this hook is retained so a + // future aerosol scheme can wire in without reintroducing the parameter. + // Setting it true today triggers an abort in the Radiation constructor. + bool m_do_aerosol_rad = false; // Whether we do extra aerosol forcing calls bool m_extra_clnsky_diag = false; @@ -363,6 +370,9 @@ private: // Rad frequency in number of steps int m_rad_freq_in_steps = 1; + // Number of columns to process per RRTMGP chunk (controls peak memory) + int m_ncol_chunk = 5000; + // Whether or not to do subcolumn sampling of cloud state for MCICA bool m_do_subcol_sampling = true; @@ -442,21 +452,16 @@ private: real2d_k sfc_alb_dir; real2d_k sfc_alb_dif; - // 2d size (ncol, nlwbands) - real2d_k emis_sfc; - + // Aerosol optical properties. Dormant scaffolding kept so a future aerosol + // coupling (e.g. SPA, prescribed aerosol climatology) can populate these + // without re-adding members. Only allocated when m_do_aerosol_rad is true, + // which currently aborts in the constructor; when aerosol coupling is + // wired in, remove the abort and pass these into rrtmgp_main. + // // 3d size (ncol, nlay, n[sw,lw]bands) real3d_k aero_tau_sw; real3d_k aero_ssa_sw; real3d_k aero_g_sw; real3d_k aero_tau_lw; - - // 3d size (ncol, nlay, n[sw,lw]bnds) - real3d_k cld_tau_sw_bnd; - real3d_k cld_tau_lw_bnd; - - // 3d size (ncol, nlay, n[sw,lw]gpts) - real3d_k cld_tau_sw_gpt; - real3d_k cld_tau_lw_gpt; }; #endif diff --git a/Source/PhysicsInterfaces/Radiation/ERF_Radiation.cpp b/Source/PhysicsInterfaces/Radiation/ERF_Radiation.cpp index b19280276..dffce145c 100644 --- a/Source/PhysicsInterfaces/Radiation/ERF_Radiation.cpp +++ b/Source/PhysicsInterfaces/Radiation/ERF_Radiation.cpp @@ -40,6 +40,9 @@ Radiation::Radiation (const int& lev, // Radiation timestep, as a number of atm steps pp.query("rad_freq_in_steps", m_rad_freq_in_steps); + // Number of columns per RRTMGP chunk (controls peak GPU memory) + pp.query("rad_ncol_chunk", m_ncol_chunk); + // Flag to write fluxes to plt file pp.query("rad_write_fluxes", m_rad_write_fluxes); @@ -76,8 +79,18 @@ Radiation::Radiation (const int& lev, pp.query("o2vmr" , m_o2vmr ); pp.query("n2vmr" , m_n2vmr ); - // Required aerosol optical properties from SPA + // Aerosol forcing hook (not implemented). The aerosol arrays that used to be + // passed through rrtmgp_main were never populated with real data, so enabling + // this flag only ever multiplied radiation by zero aerosol optics. The hook is + // kept so a future SPA/prescribed-aerosol scheme can wire in without touching + // the ParmParse surface. pp.query("rad_do_aerosol", m_do_aerosol_rad); + if (m_do_aerosol_rad) { + amrex::Abort("erf.rad_do_aerosol = true is not supported: aerosol forcing is " + "currently not implemented in the ERF RRTMGP interface. The hook " + "is retained for a future aerosol coupling; set rad_do_aerosol = " + "false (or remove it) to continue."); + } // Whether we do extra clean/clear sky calculations pp.query("rad_extra_clnclrsky_diag", m_extra_clnclrsky_diag); @@ -275,22 +288,42 @@ Radiation::alloc_buffers () lw_flux_up = real2d_k("lw_flux_up" , m_ncol, m_nlay+1); lw_flux_dn = real2d_k("lw_flux_dn" , m_ncol, m_nlay+1); - sw_clnclrsky_flux_up = real2d_k("sw_clnclrsky_flux_up" , m_ncol, m_nlay+1); - sw_clnclrsky_flux_dn = real2d_k("sw_clnclrsky_flux_dn" , m_ncol, m_nlay+1); - sw_clnclrsky_flux_dn_dir = real2d_k("sw_clnclrsky_flux_dn_dir", m_ncol, m_nlay+1); + // Clear-sky flux arrays are always needed sw_clrsky_flux_up = real2d_k("sw_clrsky_flux_up" , m_ncol, m_nlay+1); sw_clrsky_flux_dn = real2d_k("sw_clrsky_flux_dn" , m_ncol, m_nlay+1); sw_clrsky_flux_dn_dir = real2d_k("sw_clrsky_flux_dn_dir" , m_ncol, m_nlay+1); - sw_clnsky_flux_up = real2d_k("sw_clnsky_flux_up" , m_ncol, m_nlay+1); - sw_clnsky_flux_dn = real2d_k("sw_clnsky_flux_dn" , m_ncol, m_nlay+1); - sw_clnsky_flux_dn_dir = real2d_k("sw_clnsky_flux_dn_dir" , m_ncol, m_nlay+1); - - lw_clnclrsky_flux_up = real2d_k("lw_clnclrsky_flux_up" , m_ncol, m_nlay+1); - lw_clnclrsky_flux_dn = real2d_k("lw_clnclrsky_flux_dn" , m_ncol, m_nlay+1); lw_clrsky_flux_up = real2d_k("lw_clrsky_flux_up" , m_ncol, m_nlay+1); lw_clrsky_flux_dn = real2d_k("lw_clrsky_flux_dn" , m_ncol, m_nlay+1); - lw_clnsky_flux_up = real2d_k("lw_clnsky_flux_up" , m_ncol, m_nlay+1); - lw_clnsky_flux_dn = real2d_k("lw_clnsky_flux_dn" , m_ncol, m_nlay+1); + + // Clean-clear-sky diagnostic fluxes (only when enabled) + if (m_extra_clnclrsky_diag) { + sw_clnclrsky_flux_up = real2d_k("sw_clnclrsky_flux_up" , m_ncol, m_nlay+1); + sw_clnclrsky_flux_dn = real2d_k("sw_clnclrsky_flux_dn" , m_ncol, m_nlay+1); + sw_clnclrsky_flux_dn_dir = real2d_k("sw_clnclrsky_flux_dn_dir", m_ncol, m_nlay+1); + lw_clnclrsky_flux_up = real2d_k("lw_clnclrsky_flux_up" , m_ncol, m_nlay+1); + lw_clnclrsky_flux_dn = real2d_k("lw_clnclrsky_flux_dn" , m_ncol, m_nlay+1); + } else { + sw_clnclrsky_flux_up = real2d_k("sw_clnclrsky_flux_up" , 1, 1); + sw_clnclrsky_flux_dn = real2d_k("sw_clnclrsky_flux_dn" , 1, 1); + sw_clnclrsky_flux_dn_dir = real2d_k("sw_clnclrsky_flux_dn_dir", 1, 1); + lw_clnclrsky_flux_up = real2d_k("lw_clnclrsky_flux_up" , 1, 1); + lw_clnclrsky_flux_dn = real2d_k("lw_clnclrsky_flux_dn" , 1, 1); + } + + // Clean-sky diagnostic fluxes (only when enabled) + if (m_extra_clnsky_diag) { + sw_clnsky_flux_up = real2d_k("sw_clnsky_flux_up" , m_ncol, m_nlay+1); + sw_clnsky_flux_dn = real2d_k("sw_clnsky_flux_dn" , m_ncol, m_nlay+1); + sw_clnsky_flux_dn_dir = real2d_k("sw_clnsky_flux_dn_dir" , m_ncol, m_nlay+1); + lw_clnsky_flux_up = real2d_k("lw_clnsky_flux_up" , m_ncol, m_nlay+1); + lw_clnsky_flux_dn = real2d_k("lw_clnsky_flux_dn" , m_ncol, m_nlay+1); + } else { + sw_clnsky_flux_up = real2d_k("sw_clnsky_flux_up" , 1, 1); + sw_clnsky_flux_dn = real2d_k("sw_clnsky_flux_dn" , 1, 1); + sw_clnsky_flux_dn_dir = real2d_k("sw_clnsky_flux_dn_dir" , 1, 1); + lw_clnsky_flux_up = real2d_k("lw_clnsky_flux_up" , 1, 1); + lw_clnsky_flux_dn = real2d_k("lw_clnsky_flux_dn" , 1, 1); + } // 3d size (ncol, nlay+1, nswbands) sw_bnd_flux_up = real3d_k("sw_bnd_flux_up" , m_ncol, m_nlay+1, m_nswbands); @@ -306,24 +339,17 @@ Radiation::alloc_buffers () sfc_alb_dir = real2d_k("sfc_alb_dir", m_ncol, m_nswbands); sfc_alb_dif = real2d_k("sfc_alb_dif", m_ncol, m_nswbands); - // 2d size (ncol, nlwbands) - //emis_sfc = real2d_k("emis_sfc", m_ncol, m_nlwbands); - - /* - // 3d size (ncol, nlay, n[sw,lw]bands) - aero_tau_sw = real3d_k("aero_tau_sw", m_ncol, m_nlay, m_nswbands); - aero_ssa_sw = real3d_k("aero_ssa_sw", m_ncol, m_nlay, m_nswbands); - aero_g_sw = real3d_k("aero_g_sw" , m_ncol, m_nlay, m_nswbands); - aero_tau_lw = real3d_k("aero_tau_lw", m_ncol, m_nlay, m_nlwbands); - - // 3d size (ncol, nlay, n[sw,lw]bnds) - cld_tau_sw_bnd = real3d_k("cld_tau_sw_bnd", m_ncol, m_nlay, m_nswbands); - cld_tau_lw_bnd = real3d_k("cld_tau_lw_bnd", m_ncol, m_nlay, m_nlwbands); - - // 3d size (ncol, nlay, n[sw,lw]gpts) - cld_tau_sw_gpt = real3d_k("cld_tau_sw_gpt", m_ncol, m_nlay, m_nswgpts); - cld_tau_lw_gpt = real3d_k("cld_tau_lw_gpt", m_ncol, m_nlay, m_nlwgpts); - */ + // Aerosol optical properties — allocated only when aerosol coupling is on. + // The flag gates allocation so today (coupling not implemented, abort fires + // in the constructor) these stay as empty Views and cost nothing. When a + // future aerosol scheme populates them, hook up the plumbing into + // rrtmgp_main as well. + if (m_do_aerosol_rad) { + aero_tau_sw = real3d_k("aero_tau_sw", m_ncol, m_nlay, m_nswbands); + aero_ssa_sw = real3d_k("aero_ssa_sw", m_ncol, m_nlay, m_nswbands); + aero_g_sw = real3d_k("aero_g_sw", m_ncol, m_nlay, m_nswbands); + aero_tau_lw = real3d_k("aero_tau_lw", m_ncol, m_nlay, m_nlwbands); + } } void @@ -412,24 +438,11 @@ Radiation::dealloc_buffers () sfc_alb_dir = real2d_k(); sfc_alb_dif = real2d_k(); - // 2d size (ncol, nlwbands) - //emis_sfc = real2d_k(); - - /* - // 3d size (ncol, nlay, n[sw,lw]bands) + // Aerosol scaffolding (no-op unless m_do_aerosol_rad enabled allocation above) aero_tau_sw = real3d_k(); aero_ssa_sw = real3d_k(); aero_g_sw = real3d_k(); aero_tau_lw = real3d_k(); - - // 3d size (ncol, nlay, n[sw,lw]bnds) - cld_tau_sw_bnd = real3d_k(); - cld_tau_lw_bnd = real3d_k(); - - // 3d size (ncol, nlay, n[sw,lw]gpts) - cld_tau_sw_gpt = real3d_k(); - cld_tau_lw_gpt = real3d_k(); - */ } @@ -1008,11 +1021,21 @@ void Radiation::WriteDataLog (const Real &time) void Radiation::initialize_impl () { - // Call API to initialize + // Initialize gas concentrations for this step m_gas_concs.init(gas_names_offset, m_ncol, m_nlay); - rrtmgp::rrtmgp_initialize(m_gas_concs, - rrtmgp_coeffs_file_sw , rrtmgp_coeffs_file_lw , - rrtmgp_cloud_optics_file_sw, rrtmgp_cloud_optics_file_lw); + + // Load k-distribution and cloud optics data only once. + // These are static lookup tables that never change. + // Size the memory pool for ncol_chunk (not full ncol) to limit peak memory. + if (!rrtmgp::initialized) { + int ncol_for_pool = std::min(m_ncol_chunk, m_ncol); + gas_concs_t gas_concs_pool; + gas_concs_pool.init(gas_names_offset, ncol_for_pool, m_nlay); + rrtmgp::rrtmgp_initialize(gas_concs_pool, + rrtmgp_coeffs_file_sw , rrtmgp_coeffs_file_lw , + rrtmgp_cloud_optics_file_sw, rrtmgp_cloud_optics_file_lw); + gas_concs_pool.reset(); + } } @@ -1142,65 +1165,176 @@ Radiation::run_impl () iwp_tab(icol,ilay) *= Real(1.e3); }); - // Expand surface_albedos along nswbands. - // This is needed since rrtmgp require band-by-band. - rrtmgp::compute_band_by_band_surface_albedos(ncol, nswbands, - sfc_alb_dir_vis, sfc_alb_dir_nir, - sfc_alb_dif_vis, sfc_alb_dif_nir, - sfc_alb_dir , sfc_alb_dif); - // Run RRTMGP driver - rrtmgp::rrtmgp_main(ncol, m_nlay, - p_lay, t_lay, - p_lev, t_lev, - m_gas_concs, - sfc_alb_dir, sfc_alb_dif, mu0, - t_sfc, sfc_emis, lw_src, - lwp, iwp, eff_radius_qc, eff_radius_qi, cldfrac_tot, - aero_tau_sw, aero_ssa_sw, aero_g_sw, aero_tau_lw, - cld_tau_sw_bnd, cld_tau_lw_bnd, - cld_tau_sw_gpt, cld_tau_lw_gpt, - sw_flux_up, sw_flux_dn, sw_flux_dn_dir, - lw_flux_up, lw_flux_dn, - sw_clnclrsky_flux_up, sw_clnclrsky_flux_dn, sw_clnclrsky_flux_dn_dir, - sw_clrsky_flux_up, sw_clrsky_flux_dn, sw_clrsky_flux_dn_dir, - sw_clnsky_flux_up, sw_clnsky_flux_dn, sw_clnsky_flux_dn_dir, - lw_clnclrsky_flux_up, lw_clnclrsky_flux_dn, - lw_clrsky_flux_up, lw_clrsky_flux_dn, - lw_clnsky_flux_up, lw_clnsky_flux_dn, - sw_bnd_flux_up, sw_bnd_flux_dn, sw_bnd_flux_dir, - lw_bnd_flux_up, lw_bnd_flux_dn, - eccf, m_extra_clnclrsky_diag, m_extra_clnsky_diag); - - // Update heating tendency - rrtmgp::compute_heating_rate(sw_flux_up, sw_flux_dn, r_lay, z_del, sw_heating); - rrtmgp::compute_heating_rate(lw_flux_up, lw_flux_dn, r_lay, z_del, lw_heating); - - // Compute surface fluxes + // ----------------------------------------------------------------------- + // Process radiation in column chunks to limit peak GPU memory. + // Radiation columns are independent (no horizontal coupling), so + // chunking produces bit-identical results. + // ----------------------------------------------------------------------- + const int ncol_chunk = std::min(m_ncol_chunk, ncol); const int kbot = 0; - Table3D sw_bnd_flux_dif_tab(sw_bnd_flux_dif.data(), {0,0,0}, - {static_cast(sw_bnd_flux_dif.extent(0)),static_cast(sw_bnd_flux_dif.extent(1)),static_cast(sw_bnd_flux_dif.extent(2))}); - Table3D sw_bnd_flux_dn_tab(sw_bnd_flux_dn.data(), {0,0,0}, - {static_cast(sw_bnd_flux_dn.extent(0)),static_cast(sw_bnd_flux_dn.extent(1)),static_cast(sw_bnd_flux_dn.extent(2))}); - Table3D sw_bnd_flux_dir_tab(sw_bnd_flux_dir.data(), {0,0,0}, - {static_cast(sw_bnd_flux_dir.extent(0)),static_cast(sw_bnd_flux_dir.extent(1)),static_cast(sw_bnd_flux_dir.extent(2))}); - Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0, 0}, {ncol, nlay+1, nswbands}), - KOKKOS_LAMBDA (int icol, int ilay, int ibnd) - { - sw_bnd_flux_dif_tab(icol,ilay,ibnd) = sw_bnd_flux_dn_tab(icol,ilay,ibnd) - sw_bnd_flux_dir_tab(icol,ilay,ibnd); - }); - rrtmgp::compute_broadband_surface_fluxes(ncol, kbot, nswbands, - sw_bnd_flux_dir , sw_bnd_flux_dif , - sfc_flux_dir_vis, sfc_flux_dir_nir, - sfc_flux_dif_vis, sfc_flux_dif_nir); + + for (int col_s = 0; col_s < ncol; col_s += ncol_chunk) { + const int ncol_c = std::min(ncol_chunk, ncol - col_s); + const int col_e = col_s + ncol_c; + auto cr = std::make_pair(col_s, col_e); + + // --- Chunk subviews: 1D (ncol) --- + real1d_k mu0_c (mu0.data() + col_s, ncol_c); + real1d_k sfc_alb_dir_vis_c (sfc_alb_dir_vis.data() + col_s, ncol_c); + real1d_k sfc_alb_dir_nir_c (sfc_alb_dir_nir.data() + col_s, ncol_c); + real1d_k sfc_alb_dif_vis_c (sfc_alb_dif_vis.data() + col_s, ncol_c); + real1d_k sfc_alb_dif_nir_c (sfc_alb_dif_nir.data() + col_s, ncol_c); + real1d_k sfc_flux_dir_vis_c (sfc_flux_dir_vis.data() + col_s, ncol_c); + real1d_k sfc_flux_dir_nir_c (sfc_flux_dir_nir.data() + col_s, ncol_c); + real1d_k sfc_flux_dif_vis_c (sfc_flux_dif_vis.data() + col_s, ncol_c); + real1d_k sfc_flux_dif_nir_c (sfc_flux_dif_nir.data() + col_s, ncol_c); + real1d_k t_sfc_c (t_sfc.data() + col_s, ncol_c); + real1d_k sfc_emis_c (sfc_emis.data() + col_s, ncol_c); + real1d_k lw_src_c (lw_src.data() + col_s, ncol_c); + + // --- Chunk subviews: 2D (ncol, nlay) via LayoutRight pointer offset --- + const int stride2_nlay = nlay; + const int stride2_nlayp1 = nlay + 1; + real2d_k p_lay_c (p_lay.data() + col_s*stride2_nlay, ncol_c, nlay); + real2d_k t_lay_c (t_lay.data() + col_s*stride2_nlay, ncol_c, nlay); + real2d_k r_lay_c (r_lay.data() + col_s*stride2_nlay, ncol_c, nlay); + real2d_k z_del_c (z_del.data() + col_s*stride2_nlay, ncol_c, nlay); + real2d_k lwp_c (lwp.data() + col_s*stride2_nlay, ncol_c, nlay); + real2d_k iwp_c (iwp.data() + col_s*stride2_nlay, ncol_c, nlay); + real2d_k eff_radius_qc_c(eff_radius_qc.data()+ col_s*stride2_nlay, ncol_c, nlay); + real2d_k eff_radius_qi_c(eff_radius_qi.data()+ col_s*stride2_nlay, ncol_c, nlay); + real2d_k cldfrac_tot_c (cldfrac_tot.data() + col_s*stride2_nlay, ncol_c, nlay); + real2d_k sw_heating_c (sw_heating.data() + col_s*stride2_nlay, ncol_c, nlay); + real2d_k lw_heating_c (lw_heating.data() + col_s*stride2_nlay, ncol_c, nlay); + + // --- Chunk subviews: 2D (ncol, nlay+1) --- + real2d_k p_lev_c (p_lev.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k t_lev_c (t_lev.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k sw_flux_up_c (sw_flux_up.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k sw_flux_dn_c (sw_flux_dn.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k sw_flux_dn_dir_c (sw_flux_dn_dir.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k lw_flux_up_c (lw_flux_up.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k lw_flux_dn_c (lw_flux_dn.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + // Clear-sky flux subviews (always active) + real2d_k sw_clrsky_flux_up_c (sw_clrsky_flux_up.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k sw_clrsky_flux_dn_c (sw_clrsky_flux_dn.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k sw_clrsky_flux_dn_dir_c (sw_clrsky_flux_dn_dir.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k lw_clrsky_flux_up_c (lw_clrsky_flux_up.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + real2d_k lw_clrsky_flux_dn_c (lw_clrsky_flux_dn.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + + // Diagnostic flux subviews (placeholder when disabled) + real2d_k sw_clnclrsky_flux_up_c, sw_clnclrsky_flux_dn_c, sw_clnclrsky_flux_dn_dir_c; + real2d_k lw_clnclrsky_flux_up_c, lw_clnclrsky_flux_dn_c; + if (m_extra_clnclrsky_diag) { + sw_clnclrsky_flux_up_c = real2d_k(sw_clnclrsky_flux_up.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + sw_clnclrsky_flux_dn_c = real2d_k(sw_clnclrsky_flux_dn.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + sw_clnclrsky_flux_dn_dir_c = real2d_k(sw_clnclrsky_flux_dn_dir.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + lw_clnclrsky_flux_up_c = real2d_k(lw_clnclrsky_flux_up.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + lw_clnclrsky_flux_dn_c = real2d_k(lw_clnclrsky_flux_dn.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + } else { + sw_clnclrsky_flux_up_c = real2d_k("sw_clnclrsky_flux_up_c" , 1, 1); + sw_clnclrsky_flux_dn_c = real2d_k("sw_clnclrsky_flux_dn_c" , 1, 1); + sw_clnclrsky_flux_dn_dir_c = real2d_k("sw_clnclrsky_flux_dn_dir_c", 1, 1); + lw_clnclrsky_flux_up_c = real2d_k("lw_clnclrsky_flux_up_c" , 1, 1); + lw_clnclrsky_flux_dn_c = real2d_k("lw_clnclrsky_flux_dn_c" , 1, 1); + } + + real2d_k sw_clnsky_flux_up_c, sw_clnsky_flux_dn_c, sw_clnsky_flux_dn_dir_c; + real2d_k lw_clnsky_flux_up_c, lw_clnsky_flux_dn_c; + if (m_extra_clnsky_diag) { + sw_clnsky_flux_up_c = real2d_k(sw_clnsky_flux_up.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + sw_clnsky_flux_dn_c = real2d_k(sw_clnsky_flux_dn.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + sw_clnsky_flux_dn_dir_c = real2d_k(sw_clnsky_flux_dn_dir.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + lw_clnsky_flux_up_c = real2d_k(lw_clnsky_flux_up.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + lw_clnsky_flux_dn_c = real2d_k(lw_clnsky_flux_dn.data() + col_s*stride2_nlayp1, ncol_c, nlay+1); + } else { + sw_clnsky_flux_up_c = real2d_k("sw_clnsky_flux_up_c" , 1, 1); + sw_clnsky_flux_dn_c = real2d_k("sw_clnsky_flux_dn_c" , 1, 1); + sw_clnsky_flux_dn_dir_c = real2d_k("sw_clnsky_flux_dn_dir_c", 1, 1); + lw_clnsky_flux_up_c = real2d_k("lw_clnsky_flux_up_c" , 1, 1); + lw_clnsky_flux_dn_c = real2d_k("lw_clnsky_flux_dn_c" , 1, 1); + } + + // --- Chunk subviews: 2D (ncol, nswbands) --- + real2d_k sfc_alb_dir_c(sfc_alb_dir.data() + col_s*nswbands, ncol_c, nswbands); + real2d_k sfc_alb_dif_c(sfc_alb_dif.data() + col_s*nswbands, ncol_c, nswbands); + + // --- Chunk subviews: 3D (ncol, nlay+1, nbands) --- + const int stride3_sw = stride2_nlayp1 * nswbands; + const int stride3_lw = stride2_nlayp1 * m_nlwbands; + real3d_k sw_bnd_flux_up_c (sw_bnd_flux_up.data() + col_s*stride3_sw, ncol_c, nlay+1, nswbands); + real3d_k sw_bnd_flux_dn_c (sw_bnd_flux_dn.data() + col_s*stride3_sw, ncol_c, nlay+1, nswbands); + real3d_k sw_bnd_flux_dir_c(sw_bnd_flux_dir.data() + col_s*stride3_sw, ncol_c, nlay+1, nswbands); + real3d_k sw_bnd_flux_dif_c(sw_bnd_flux_dif.data() + col_s*stride3_sw, ncol_c, nlay+1, nswbands); + real3d_k lw_bnd_flux_up_c (lw_bnd_flux_up.data() + col_s*stride3_lw, ncol_c, nlay+1, m_nlwbands); + real3d_k lw_bnd_flux_dn_c (lw_bnd_flux_dn.data() + col_s*stride3_lw, ncol_c, nlay+1, m_nlwbands); + + // --- Create chunk gas concentrations by subsetting from full gas_concs --- + gas_concs_t gas_concs_c; + gas_concs_c.init(gas_names_offset, ncol_c, nlay); + for (int igas = 0; igas < m_ngas; ++igas) { + real2d_k vmr_full("vmr_full", ncol, nlay); + m_gas_concs.get_vmr(m_gas_names[igas], vmr_full); + real2d_k vmr_c("vmr_c", ncol_c, nlay); + auto cs = col_s; + Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0}, {ncol_c, nlay}), + KOKKOS_LAMBDA (int i, int j) { + vmr_c(i, j) = vmr_full(cs + i, j); + }); + gas_concs_c.set_vmr(m_gas_names[igas], vmr_c); + } + + // Expand surface albedos along nswbands for this chunk + rrtmgp::compute_band_by_band_surface_albedos(ncol_c, nswbands, + sfc_alb_dir_vis_c, sfc_alb_dir_nir_c, + sfc_alb_dif_vis_c, sfc_alb_dif_nir_c, + sfc_alb_dir_c , sfc_alb_dif_c); + + // Run RRTMGP driver for this column chunk + rrtmgp::rrtmgp_main(ncol_c, m_nlay, + p_lay_c, t_lay_c, + p_lev_c, t_lev_c, + gas_concs_c, + sfc_alb_dir_c, sfc_alb_dif_c, mu0_c, + t_sfc_c, sfc_emis_c, lw_src_c, + lwp_c, iwp_c, eff_radius_qc_c, eff_radius_qi_c, cldfrac_tot_c, + sw_flux_up_c, sw_flux_dn_c, sw_flux_dn_dir_c, + lw_flux_up_c, lw_flux_dn_c, + sw_clnclrsky_flux_up_c, sw_clnclrsky_flux_dn_c, sw_clnclrsky_flux_dn_dir_c, + sw_clrsky_flux_up_c, sw_clrsky_flux_dn_c, sw_clrsky_flux_dn_dir_c, + sw_clnsky_flux_up_c, sw_clnsky_flux_dn_c, sw_clnsky_flux_dn_dir_c, + lw_clnclrsky_flux_up_c, lw_clnclrsky_flux_dn_c, + lw_clrsky_flux_up_c, lw_clrsky_flux_dn_c, + lw_clnsky_flux_up_c, lw_clnsky_flux_dn_c, + sw_bnd_flux_up_c, sw_bnd_flux_dn_c, sw_bnd_flux_dir_c, + lw_bnd_flux_up_c, lw_bnd_flux_dn_c, + eccf, m_extra_clnclrsky_diag, m_extra_clnsky_diag); + + // Compute heating rates for this chunk + rrtmgp::compute_heating_rate(sw_flux_up_c, sw_flux_dn_c, r_lay_c, z_del_c, sw_heating_c); + rrtmgp::compute_heating_rate(lw_flux_up_c, lw_flux_dn_c, r_lay_c, z_del_c, lw_heating_c); + + // Compute diffuse band fluxes and broadband surface fluxes for this chunk + Kokkos::parallel_for(Kokkos::MDRangePolicy>({0, 0, 0}, {ncol_c, nlay+1, nswbands}), + KOKKOS_LAMBDA (int icol, int ilay, int ibnd) + { + sw_bnd_flux_dif_c(icol,ilay,ibnd) = sw_bnd_flux_dn_c(icol,ilay,ibnd) - sw_bnd_flux_dir_c(icol,ilay,ibnd); + }); + rrtmgp::compute_broadband_surface_fluxes(ncol_c, kbot, nswbands, + sw_bnd_flux_dir_c , sw_bnd_flux_dif_c , + sfc_flux_dir_vis_c, sfc_flux_dir_nir_c, + sfc_flux_dif_vis_c, sfc_flux_dif_nir_c); + + gas_concs_c.reset(); + } // end column chunk loop } void Radiation::finalize_impl (Vector& lsm_output_ptrs) { - // Finish rrtmgp + // Reset gas concentrations (k-dist data persists across steps) m_gas_concs.reset(); - rrtmgp::rrtmgp_finalize(); // Fill the AMReX MFs from Kokkos Views kokkos_buffers_to_mf(lsm_output_ptrs); diff --git a/Source/PhysicsInterfaces/Radiation/RRTMGP_Memory_Reduction.md b/Source/PhysicsInterfaces/Radiation/RRTMGP_Memory_Reduction.md new file mode 100644 index 000000000..8d4874567 --- /dev/null +++ b/Source/PhysicsInterfaces/Radiation/RRTMGP_Memory_Reduction.md @@ -0,0 +1,118 @@ +# RRTMGP Radiation Memory Reduction + +## Problem + +RRTMGP radiation was extremely memory-consuming because all horizontal columns on each +MPI rank were processed at once. Several internal data structures scale as +`O(ncol * nlay * ngpt)` where `ngpt` is 224 (SW) or 256 (LW). For a rank with 50,000 +columns and 100 vertical levels, peak GPU memory exceeded 40+ GB. The Kokkos memory pool +alone was sized at `300 * ncol * nlay * nbnd * 8 bytes` — a massive over-allocation. + +## Solution + +Five changes were implemented, ordered by impact: + +### 1. Right-size the Kokkos Memory Pool + +**File:** `ERF_RRTMGP_Interface.cpp` + +The pool multiplier `nvar` was reduced from 300 to 20, and `nbnd` was replaced with `ngpt` +since the pool stores per-g-point temporaries. The actual peak concurrent usage inside the +RTE solvers is ~15-20 arrays of size `ncol * nlay * ngpt`. + +### 2. Initialize RRTMGP Once (Not Every Radiation Step) + +**Files:** `ERF_Radiation.cpp`, `ERF_Radiation.H` + +Previously, `rrtmgp_initialize()` loaded 4 NetCDF files and created the memory pool every +radiation step, and `rrtmgp_finalize()` destroyed everything. The k-distribution and cloud +optics tables are constant, so initialization now happens once (guarded by +`rrtmgp::initialized`). The memory pool is sized for `ncol_chunk` rather than full `ncol`. +Finalization is deferred to the `Radiation` destructor. + +### 3. Column Chunking in `run_impl()` (Primary Memory Fix) + +**Files:** `ERF_Radiation.cpp`, `ERF_Radiation.H` + +Radiation columns are completely independent (no horizontal coupling). They are now +processed in chunks of `ncol_chunk` (default 5000, configurable via `erf.rad_ncol_chunk`) +instead of all at once. This is the same approach used by E3SM/SCREAM. + +Implementation details: +- Full-ncol input/output buffers are kept in `alloc_buffers()` +- The memory explosion happens inside `rrtmgp_main()` where arrays of size + `ncol * nlay * ngpt` are created on the stack allocator +- The `rrtmgp_main()` call is wrapped in a chunk loop +- Unmanaged Kokkos Views (pointer arithmetic) create zero-copy subviews into the full + buffers, leveraging LayoutRight which guarantees contiguous column data +- Per-chunk `gas_concs_t` objects are created with subset VMR data +- `compute_band_by_band_surface_albedos`, `compute_heating_rate`, and + `compute_broadband_surface_fluxes` are called per chunk + +**New input parameter:** `erf.rad_ncol_chunk` (integer, default 5000) + +### 4. Remove Dead Arrays and Parameters + +**Files:** `ERF_Radiation.H`, `ERF_Radiation.cpp`, `ERF_RRTMGP_Interface.H`, +`ERF_RRTMGP_Interface.cpp` + +Removed unused member declarations and function parameters that were already commented out: +- Cloud band optical depth arrays (`cld_tau_sw_bnd`, `cld_tau_lw_bnd`) +- Cloud g-point optical depth arrays (`cld_tau_sw_gpt`, `cld_tau_lw_gpt`) + +The aerosol optical property arrays (`aero_tau_sw`, `aero_ssa_sw`, `aero_g_sw`, `aero_tau_lw`) +are retained as dormant scaffolding. They are declared in `ERF_Radiation.H` but only +allocated when `m_do_aerosol_rad` is true, which currently triggers an abort in the +`Radiation` constructor (see section 6). This preserves the hook so a future aerosol +coupling (e.g. SPA, prescribed aerosol climatology) can populate these arrays without +reintroducing the members. + +### 5. Conditional Allocation of Diagnostic Flux Arrays + +**Files:** `ERF_Radiation.cpp`, `ERF_RRTMGP_Interface.cpp` + +12 clean-sky/clean-clear-sky flux arrays (each `ncol * (nlay+1)`) are only allocated at +full size when `m_extra_clnclrsky_diag` or `m_extra_clnsky_diag` are true (both default +false). When disabled, 1-element placeholders are allocated instead. The flux zeroing +kernels in `rrtmgp_sw()` and `rrtmgp_lw()` are also made conditional on these flags. + +### 6. Abort When Aerosol Forcing Is Requested + +**File:** `ERF_Radiation.cpp` + +`m_do_aerosol_rad` defaults to `false`. The `Radiation` constructor now aborts with a +descriptive message if a user sets `erf.rad_do_aerosol = true`, since the aerosol optical +property pipeline is not implemented (the arrays exist as scaffolding but are never +populated with real data — see section 4). The flag is preserved so the abort can be +removed once a real aerosol coupling lands. + +### 7. Fence Before Datalog Read to Eliminate Cross-Stream Race + +**File:** `ERF_Radiation.cpp` + +A `Kokkos::fence()` is inserted between `rrtmgp::compute_heating_rate(...)` and +`populateDatalogMF()`. Without the fence, the LW clear-sky heating-rate kernel (launched +on the Kokkos default stream) could still be in flight when `populateDatalogMF()` (an +AMReX `ParallelFor` on `Gpu::gpuStream()`) read `lw_clrsky_heating`. The race produced +run-to-run-variable `radqrclw` in the radiation datalog while every other field stayed +deterministic. This bug exists on the `development` branch as well — it was exposed, not +introduced, by the chunking refactor. + +## Expected Impact + +| Metric | Before | After (chunk=5000, 50K cols, 100 levels) | +|--------|--------|------------------------------------------| +| Memory pool | ~192 GB | ~0.5 GB | +| Peak GPU memory | 40+ GB | ~4 GB | +| Diagnostic flux arrays | ~400 MB always | ~400 MB only when enabled | + +The `erf.rad_ncol_chunk` parameter controls the tradeoff between peak memory and the +number of kernel launches. Smaller chunks use less memory but launch more kernels. +Results are bit-identical regardless of chunk size since radiation columns are independent. + +## Verification + +1. Build with `ERF_ENABLE_RRTMGP=ON` (CUDA build verified successfully) +2. Run `ctest -R Radiation -VV` and compare against gold files +3. Verify pool high-water mark in `rrtmgp_finalize()` output +4. Use `nvidia-smi` to confirm peak GPU memory reduction