Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmake/macros/macro_setup_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ macro(setup_test namel)
${TORCH_API_INCLUDE_DIR})

target_link_libraries(${namel}.${buildl}
PRIVATE pyharp::harp gtest_main)
PRIVATE pyharp::harp
$<IF:$<BOOL:${CUDAToolkit_FOUND}>,pyharp::harp_cu,>
gtest_main)

add_test(NAME ${namel}.${buildl} COMMAND ${namel}.${buildl})
endmacro()
2 changes: 2 additions & 0 deletions python/csrc/pyharp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ void bind_opacity(py::module &m);
void bind_math(py::module &m);
void bind_constants(py::module &m);
void bind_integrator(py::module &);
void bind_rtsolver(py::module &);

PYBIND11_MODULE(pyharp, m) {
m.attr("__name__") = "pyharp";
Expand All @@ -31,6 +32,7 @@ PYBIND11_MODULE(pyharp, m) {
bind_math(m);
bind_constants(m);
bind_integrator(m);
bind_rtsolver(m);

m.def(
"species_names",
Expand Down
2 changes: 2 additions & 0 deletions python/csrc/pyradiation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ void bind_radiation(py::module &m) {
.ADD_OPTION(std::string, harp::RadiationBandOptionsImpl, outdirs)
.ADD_OPTION(std::string, harp::RadiationBandOptionsImpl, solver_name)
.ADD_OPTION(disort::DisortOptions, harp::RadiationBandOptionsImpl, disort)
.ADD_OPTION(harp::ToonMcKay89Options, harp::RadiationBandOptionsImpl,
toon)
.ADD_OPTION(std::vector<double>, harp::RadiationBandOptionsImpl,
wavenumber)
.ADD_OPTION(std::vector<double>, harp::RadiationBandOptionsImpl, weight)
Expand Down
65 changes: 65 additions & 0 deletions python/csrc/pyrtsolver.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// torch
#include <torch/extension.h>

// fmt
#include <fmt/format.h>

// harp
#include <harp/rtsolver/toon_mckay89.hpp>

// python
#include "pyoptions.hpp"

namespace py = pybind11;

//! \brief Bind radiative-transfer solver types to the python module.
/*!
 * Exposes ToonMcKay89Options (option struct with wave_lower / wave_upper /
 * zenith_correction accessors) and the ToonMcKay89 torch module, whose
 * python-side forward() accepts boundary conditions as keyword arguments
 * and broadcasts them to the shapes the C++ solver expects.
 */
void bind_rtsolver(py::module &m) {
  auto pyToonMcKay89Options =
      py::class_<harp::ToonMcKay89OptionsImpl, harp::ToonMcKay89Options>(
          m, "ToonMcKay89Options");

  pyToonMcKay89Options.def(py::init<>())
      .def("__repr__",
           [](const harp::ToonMcKay89Options &a) {
             std::stringstream ss;
             a->report(ss);
             return fmt::format("ToonMcKay89Options(\n{})", ss.str());
           })
      .ADD_OPTION(std::vector<double>, harp::ToonMcKay89OptionsImpl, wave_lower)
      .ADD_OPTION(std::vector<double>, harp::ToonMcKay89OptionsImpl, wave_upper)
      .ADD_OPTION(bool, harp::ToonMcKay89OptionsImpl, zenith_correction);

  torch::python::bind_module<harp::ToonMcKay89Impl>(m, "ToonMcKay89")
      .def(py::init<>())
      .def(py::init<harp::ToonMcKay89Options>(), py::arg("options"))
      .def_readonly("options", &harp::ToonMcKay89Impl::options)
      .def(
          "forward",
          [](harp::ToonMcKay89Impl &self, torch::Tensor prop, std::string bname,
             torch::optional<torch::Tensor> temf, const py::kwargs &kwargs) {
            // collect boundary conditions passed as python keyword arguments
            std::map<std::string, torch::Tensor> bc;
            for (auto item : kwargs) {
              auto key = py::cast<std::string>(item.first);
              auto value = py::cast<torch::Tensor>(item.second);
              bc.emplace(std::move(key), std::move(value));
            }

            // keys whose tensors must be broadcast to (nwave, ncol);
            // hoisted out of the loop — it is loop-invariant
            static const std::vector<std::string> items = {"fbeam", "albedo",
                                                           "umu0"};
            for (auto &[key, value] : bc) {
              if (std::find(items.begin(), items.end(), key) != items.end()) {
                while (value.dim() < 2) {
                  value = value.unsqueeze(0);
                }
              }
            }

            // broadcast dimensions to (nwave, ncol, nlyr, nprop)
            while (prop.dim() < 4) {
              prop = prop.unsqueeze(0);
            }

            return self.forward(prop, &bc, bname, temf);
          },
          py::arg("prop"), py::arg("bname") = "", py::arg("temf") = py::none());
}
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ file(GLOB src_files
utils/*.cpp
opacity/*.cpp
radiation/*.cpp
#rtsolver/*.cpp
rtsolver/*.cpp
integrator/*.cpp
)

Expand Down Expand Up @@ -65,6 +65,7 @@ add_library(pyharp::harp ALIAS ${namel}_${buildl})
if (CUDAToolkit_FOUND)
file(GLOB cu_src_files
integrator/*.cu
rtsolver/*.cu
)

add_library(${namel}_cuda_${buildl}
Expand Down
97 changes: 97 additions & 0 deletions src/loops.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#pragma once

// torch
#include <ATen/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>

namespace harp {
namespace native {

//! One-thread-per-element kernel: thread with global id g runs f(g, work)
//! for every g < numel; surplus threads in the last block do nothing.
template <typename func_t>
__global__ void element_kernel(int64_t numel, func_t f, char *work) {
  int const gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= numel) return;  // guard the tail of the final block
  f(gid, work);
}

//! \brief Launch an elementwise functor over a TensorIterator on the GPU.
/*!
 * The functor is invoked as f(data_ptrs, byte_offsets) for every linear
 * index of the iterator, via at::native::launch_legacy_kernel<128, 1>
 * (a PyTorch-internal launcher — NOTE(review): assumed to mean 128 threads
 * per block, one element per thread; confirm against the ATen header).
 *
 * \tparam Arity total number of operands (inputs + outputs) of `iter`
 * \param iter tensor iterator describing operand buffers and strides
 * \param f device functor applied once per element
 */
template <int Arity, typename func_t>
void gpu_kernel(at::TensorIterator& iter, const func_t& f) {
  // the compile-time arity must match the iterator's operand count
  TORCH_CHECK(iter.ninputs() + iter.noutputs() == Arity);

  // raw byte pointers of each operand; captured by value into the lambda
  std::array<char*, Arity> data;
  for (int i = 0; i < Arity; i++) {
    data[i] = reinterpret_cast<char*>(iter.data_ptr(i));
  }

  // maps a linear element index to per-operand byte offsets
  auto offset_calc = ::make_offset_calculator<Arity>(iter);
  int64_t numel = iter.numel();

  at::native::launch_legacy_kernel<128, 1>(numel,
      [=] __device__(int idx) {
        auto offsets = offset_calc.get(idx);
        f(data.data(), offsets.data());
      });
}

//! \brief Launch `f` over `iter`, giving each element `work_size` bytes of
//!        device scratch memory.
/*!
 * The iteration space is divided into at most `Chunks` sequential parts so
 * the scratch pool only ever holds one chunk's worth of workspace, reducing
 * peak device-memory usage. Each launch is synchronized before the next so
 * the workspace can be reused safely.
 *
 * \tparam Chunks maximum number of sequential chunks
 * \tparam Arity total number of operands (inputs + outputs) of `iter`
 * \param iter tensor iterator describing operand buffers and strides
 * \param work_size scratch bytes required per element
 * \param f device functor called as f(data_ptrs, offsets, element_workspace)
 */
template <int Chunks, int Arity, typename func_t>
void gpu_chunk_kernel(at::TensorIterator& iter, int work_size, const func_t& f) {
  TORCH_CHECK(iter.ninputs() + iter.noutputs() == Arity);

  std::array<char*, Arity> data;
  for (int i = 0; i < Arity; i++) {
    data[i] = reinterpret_cast<char*>(iter.data_ptr(i));
  }

  auto offset_calc = ::make_offset_calculator<Arity>(iter);
  int64_t numel = iter.numel();
  if (numel == 0) return;  // nothing to do; avoid a zero-byte allocation

  // divide numel into chunks to reduce peak workspace memory usage
  int chunks = Chunks > numel ? static_cast<int>(numel) : Chunks;
  int64_t base = numel / chunks;
  int64_t rem = numel % chunks;

  // workspace sized for the largest chunk (the first `rem` chunks carry one
  // extra element); widen before multiplying to avoid int overflow
  size_t workspace_bytes =
      static_cast<size_t>(work_size) * (base + (rem > 0 ? 1 : 0));

  // allocate working memory pool, shared by all chunks
  char* d_workspace = nullptr;
  C10_CUDA_CHECK(cudaMalloc(&d_workspace, workspace_bytes));

  int64_t chunk_start = 0;

  for (int n = 0; n < chunks; n++) {
    int64_t chunk_numel = base + (n < rem ? 1 : 0);

    dim3 block(64);
    dim3 grid((chunk_numel + block.x - 1) / block.x);

    // offsets use the global element index; the workspace slot is indexed
    // locally within the chunk so the pool can be reused across chunks
    auto device_lambda = [=] __device__(int idx, char* work) {
      auto offsets = offset_calc.get(static_cast<int>(idx + chunk_start));
      f(data.data(), offsets.data(),
        work + static_cast<int64_t>(idx) * work_size);
    };

    element_kernel<<<grid, block>>>(chunk_numel, device_lambda, d_workspace);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    // wait for completion: the next chunk reuses the same workspace
    cudaDeviceSynchronize();

    chunk_start += chunk_numel;
  }

  // free working memory pool
  C10_CUDA_CHECK(cudaFree(d_workspace));
}

} // namespace native
} // namespace harp
70 changes: 70 additions & 0 deletions src/radiation/bbflux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,76 @@ torch::Tensor bbflux_wavenumber(double wn1, double wn2, torch::Tensor temp) {
return ans * sigdpi * torch::pow(temp, 4);
}

//! \brief Integrated blackbody flux between two wavenumber tensors.
/*!
 * Elementwise integral of the Planck function between wn1 and wn2 [cm^-1]
 * at temperature temp [K], in the style of DISORT's `plkavg`: a power
 * series for small v = C2 * wn / T and an exponential series otherwise.
 *
 * \param wn1 lower wavenumbers [cm^-1], broadcastable with temp
 * \param wn2 upper wavenumbers [cm^-1], must satisfy wn2 >= wn1 >= 0
 * \param temp temperatures [K], must be strictly positive
 * \return integrated flux [W/m^2]; when wn1 == wn2 everywhere, the Planck
 *         intensity at that single wavenumber is returned instead
 */
torch::Tensor bbflux_wavenumber(torch::Tensor wn1, torch::Tensor wn2,
                                torch::Tensor temp) {
  if ((wn2 < wn1).any().item<bool>() || (wn1 < 0.0).any().item<bool>()) {
    TORCH_CHECK(false, "bbflux_wavenumber: Invalid wavenumbers");
  }

  TORCH_CHECK(temp.min().item<double>() > 0.0,
              "bbflux_wavenumber: Temperature must be positive");

  const double C2 = 1.438786;       // h * c / k in units cm * K
  const double SIGMA = 5.67032e-8;  // Stefan-Boltzmann constant in W/m²K⁴
  const double VCUT = 1.5;          // series crossover for v = C2 * wn / T
  const double sigdpi = SIGMA / M_PI;
  const double conc = 15.0 / std::pow(M_PI, 4);  // normalization 15 / pi^4
  const double c1 = 1.1911e-18;  // h * c^2, in units W/(m² * sr * cm⁻⁴)
  // coefficients of the small-argument power series
  const double A1 = 1.0 / 3.0;
  const double A2 = -1.0 / 8.0;
  const double A3 = 1.0 / 60.0;
  const double A4 = -1.0 / 5040.0;
  const double A5 = 1.0 / 272160.0;
  const double A6 = -1.0 / 13305600.0;

  // Degenerate interval (wn1 == wn2 everywhere): return the Planck
  // intensity at wn rather than a zero-width integral.
  if ((wn1 == wn2).all().item<bool>()) {
    auto wn = wn1;
    auto arg = torch::exp(-C2 * wn / temp);
    return c1 * wn.pow(3) * arg / (1.0 - arg);
  }

  // dimensionless integration bounds v = C2 * wn / T (v[0] <= v[1])
  torch::Tensor v[2] = {C2 * wn1 / temp, C2 * wn2 / temp};
  // per-element count (0, 1 or 2) of bounds in the small-argument regime
  torch::Tensor smallv = torch::zeros_like(temp);
  torch::Tensor p[2];  // power-series integral from 0 to v[i]
  torch::Tensor d[2];  // exponential-series integral from v[i] to infinity

  for (int i = 0; i <= 1; ++i) {
    smallv = smallv + torch::where(v[i] < VCUT, torch::ones_like(temp),
                                   torch::zeros_like(temp));

    // power series, valid for v < VCUT (zeroed elsewhere)
    auto vsq = v[i] * v[i];
    p[i] =
        conc * vsq * v[i] *
        (A1 + v[i] * (A2 + v[i] * (A3 + vsq * (A4 + vsq * (A5 + vsq * A6)))));
    p[i] = torch::where(v[i] < VCUT, p[i], torch::zeros_like(temp));

    // exponential series, valid for v > VCUT (zeroed elsewhere)
    auto ex = torch::exp(-v[i]);
    auto exm = torch::ones_like(temp);
    d[i] = torch::zeros_like(temp);

    for (int m = 1; m <= 6; ++m) {
      auto mv = static_cast<double>(m) * v[i];
      exm = exm * ex;
      d[i] = d[i] + exm * (6.0 + mv * (6.0 + mv * (3.0 + mv))) / (m * m);
    }
    d[i] *= conc;

    d[i] = torch::where(v[i] > VCUT, d[i], torch::zeros_like(temp));
  }

  // Combine: both bounds small -> p[1] - p[0]; both large -> d[0] - d[1];
  // mixed (v[0] small, v[1] large) -> whole integral minus the two tails.
  auto ans =
      torch::where(smallv == 2, p[1] - p[0],
                   torch::where(smallv == 1, 1.0 - p[0] - d[1], d[0] - d[1]));

  return ans * sigdpi * torch::pow(temp, 4);
}

torch::Tensor bbflux_wavelength(torch::Tensor wave, double temp, int ncol) {
// Check if wave is a 1D tensor
TORCH_CHECK(wave.dim() == 1, "wavelength must be a 1D tensor");
Expand Down
4 changes: 4 additions & 0 deletions src/radiation/bbflux.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ torch::Tensor bbflux_wavenumber(torch::Tensor wave, double temp, int ncol = 1);
*/
torch::Tensor bbflux_wavenumber(double wn1, double wn2, torch::Tensor temp);

//! \brief calculate integrated blackbody flux using wavenumber
torch::Tensor bbflux_wavenumber(torch::Tensor wn1, torch::Tensor wn2,
torch::Tensor temp);

//! \brief calculate blackbody flux using wavelength
/*!
* Formula:
Expand Down
6 changes: 6 additions & 0 deletions src/radiation/radiation_band.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ RadiationBandOptions RadiationBandOptionsImpl::from_yaml(
if (op->verbose()) {
std::cout << " Solver flags: " << op->disort()->flags() << std::endl;
}
} else if (op->solver_name() == "toon") {
op->toon() = ToonMcKay89OptionsImpl::create();
op->toon()->wave_lower(std::vector<double>(op->nwave(), wmin));
op->toon()->wave_upper(std::vector<double>(op->nwave(), wmax));
} else if (op->solver_name() == "twostr") {
TORCH_CHECK(false, "twostr solver not implemented");
} else {
Expand Down Expand Up @@ -173,6 +177,8 @@ void RadiationBandImpl::reset() {
} else if (options->solver_name() == "disort") {
rtsolver = torch::nn::AnyModule(disort::Disort(options->disort()));
register_module("solver", rtsolver.ptr());
} else if (options->solver_name() == "toon") {
rtsolver = torch::nn::AnyModule(ToonMcKay89(options->toon()));
} else {
TORCH_CHECK(false, "Unknown solver: ", options->solver_name());
}
Expand Down
2 changes: 2 additions & 0 deletions src/radiation/radiation_band.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

// harp
#include <harp/opacity/opacity_options.hpp>
#include <harp/rtsolver/toon_mckay89.hpp>

// arg
#include <harp/add_arg.h>
Expand Down Expand Up @@ -103,6 +104,7 @@ struct RadiationBandOptionsImpl {

ADD_ARG(OpacityDict, opacities) = {};
ADD_ARG(disort::DisortOptions, disort);
ADD_ARG(ToonMcKay89Options, toon);

ADD_ARG(std::vector<double>, wavenumber);
ADD_ARG(std::vector<double>, weight);
Expand Down
33 changes: 33 additions & 0 deletions src/rtsolver/dtridgl_impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

// C/C++
#include <cstdlib>

// base
#include <configure.h>

namespace harp {

// Solves a tridiagonal system using the Thomas algorithm (TDMA)
// Solves a tridiagonal system A x = d using the Thomas algorithm (TDMA).
/*!
 * \param n number of unknowns; arrays are indexed 0..n-1 (no-op when n <= 0)
 * \param a sub-diagonal (a[0] is not read)
 * \param b main diagonal
 * \param c super-diagonal (c[n-1] is not read); overwritten in place
 * \param d right-hand side; overwritten in place
 * \param x output solution vector
 *
 * Near-zero pivots are clamped to +/-1e-12 (sign-preserving) so the sweep
 * never divides by zero; callers should pass diagonally dominant systems
 * for meaningful results.
 */
template <typename T>
DISPATCH_MACRO void dtridgl(int n, const T *a, const T *b, T *c, T *d, T *x) {
  if (n <= 0) return;

  const T eps = static_cast<T>(1e-12);

  // First row: normalize by the leading pivot (clamped if nearly singular)
  T piv = b[0];
  if (piv > -eps && piv < eps) piv = (piv < T(0)) ? -eps : eps;
  c[0] = c[0] / piv;
  d[0] = d[0] / piv;

  // Forward sweep
  for (int i = 1; i < n; ++i) {
    T denom = b[i] - a[i] * c[i - 1];
    // threshold comparison (not == 0) to catch near-singular pivots too,
    // and preserve the pivot's sign when clamping
    if (denom > -eps && denom < eps) denom = (denom < T(0)) ? -eps : eps;
    c[i] = (i < n - 1) ? c[i] / denom : T(0);
    d[i] = (d[i] - a[i] * d[i - 1]) / denom;
  }

  // Back substitution
  x[n - 1] = d[n - 1];
  for (int i = n - 2; i >= 0; --i) {
    x[i] = d[i] - c[i] * x[i + 1];
  }
}

} // namespace harp
Loading
Loading