Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmake/macros/macro_setup_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ macro(setup_test namel)
${TORCH_API_INCLUDE_DIR})

target_link_libraries(${namel}.${buildl}
PRIVATE pyharp::harp gtest_main)
PRIVATE pyharp::harp
$<IF:$<BOOL:${CUDAToolkit_FOUND}>,pyharp::harp_cu,>
gtest_main)

add_test(NAME ${namel}.${buildl} COMMAND ${namel}.${buildl})
endmacro()
2 changes: 2 additions & 0 deletions python/csrc/pyharp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ void bind_opacity(py::module &m);
void bind_math(py::module &m);
void bind_constants(py::module &m);
void bind_integrator(py::module &);
void bind_rtsolver(py::module &);

PYBIND11_MODULE(pyharp, m) {
m.attr("__name__") = "pyharp";
Expand All @@ -31,6 +32,7 @@ PYBIND11_MODULE(pyharp, m) {
bind_math(m);
bind_constants(m);
bind_integrator(m);
bind_rtsolver(m);

m.def(
"species_names",
Expand Down
2 changes: 2 additions & 0 deletions python/csrc/pyradiation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ void bind_radiation(py::module &m) {
.ADD_OPTION(std::string, harp::RadiationBandOptionsImpl, outdirs)
.ADD_OPTION(std::string, harp::RadiationBandOptionsImpl, solver_name)
.ADD_OPTION(disort::DisortOptions, harp::RadiationBandOptionsImpl, disort)
.ADD_OPTION(harp::ToonMcKay89Options, harp::RadiationBandOptionsImpl,
toon)
.ADD_OPTION(std::vector<double>, harp::RadiationBandOptionsImpl,
wavenumber)
.ADD_OPTION(std::vector<double>, harp::RadiationBandOptionsImpl, weight)
Expand Down
65 changes: 65 additions & 0 deletions python/csrc/pyrtsolver.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// torch
#include <torch/extension.h>

// fmt
#include <fmt/format.h>

// harp
#include <harp/rtsolver/toon_mckay89.hpp>

// python
#include "pyoptions.hpp"

namespace py = pybind11;

//! \brief Bind radiative-transfer solver types to the python module.
/*!
 * Exposes ToonMcKay89Options (option struct with wave_lower / wave_upper /
 * zenith_correction accessors) and the ToonMcKay89 torch module, whose
 * python-side forward() accepts boundary conditions as keyword arguments
 * and broadcasts them to the shapes the C++ solver expects.
 */
void bind_rtsolver(py::module &m) {
  auto pyToonMcKay89Options =
      py::class_<harp::ToonMcKay89OptionsImpl, harp::ToonMcKay89Options>(
          m, "ToonMcKay89Options");

  pyToonMcKay89Options.def(py::init<>())
      .def("__repr__",
           [](const harp::ToonMcKay89Options &a) {
             std::stringstream ss;
             a->report(ss);
             return fmt::format("ToonMcKay89Options(\n{})", ss.str());
           })
      .ADD_OPTION(std::vector<double>, harp::ToonMcKay89OptionsImpl, wave_lower)
      .ADD_OPTION(std::vector<double>, harp::ToonMcKay89OptionsImpl, wave_upper)
      .ADD_OPTION(bool, harp::ToonMcKay89OptionsImpl, zenith_correction);

  torch::python::bind_module<harp::ToonMcKay89Impl>(m, "ToonMcKay89")
      .def(py::init<>())
      .def(py::init<harp::ToonMcKay89Options>(), py::arg("options"))
      .def_readonly("options", &harp::ToonMcKay89Impl::options)
      .def(
          "forward",
          [](harp::ToonMcKay89Impl &self, torch::Tensor prop, std::string bname,
             torch::optional<torch::Tensor> temf, const py::kwargs &kwargs) {
            // collect boundary conditions passed as python keyword arguments
            std::map<std::string, torch::Tensor> bc;
            for (auto item : kwargs) {
              auto key = py::cast<std::string>(item.first);
              auto value = py::cast<torch::Tensor>(item.second);
              bc.emplace(std::move(key), std::move(value));
            }

            // keys whose tensors must be broadcast to (nwave, ncol);
            // hoisted out of the loop — it is loop-invariant
            static const std::vector<std::string> items = {"fbeam", "albedo",
                                                           "umu0"};
            for (auto &[key, value] : bc) {
              if (std::find(items.begin(), items.end(), key) != items.end()) {
                while (value.dim() < 2) {
                  value = value.unsqueeze(0);
                }
              }
            }

            // broadcast dimensions to (nwave, ncol, nlyr, nprop)
            while (prop.dim() < 4) {
              prop = prop.unsqueeze(0);
            }

            return self.forward(prop, &bc, bname, temf);
          },
          py::arg("prop"), py::arg("bname") = "", py::arg("temf") = py::none());
}
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ file(GLOB src_files
utils/*.cpp
opacity/*.cpp
radiation/*.cpp
#rtsolver/*.cpp
rtsolver/*.cpp
integrator/*.cpp
)

Expand Down Expand Up @@ -65,6 +65,7 @@ add_library(pyharp::harp ALIAS ${namel}_${buildl})
if (CUDAToolkit_FOUND)
file(GLOB cu_src_files
integrator/*.cu
rtsolver/*.cu
)

add_library(${namel}_cuda_${buildl}
Expand Down
97 changes: 97 additions & 0 deletions src/loops.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#pragma once

// torch
#include <ATen/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>

namespace harp {
namespace native {

//! One-thread-per-element kernel: thread with global id g runs f(g, work)
//! for every g < numel; surplus threads in the last block do nothing.
template <typename func_t>
__global__ void element_kernel(int64_t numel, func_t f, char *work) {
  int const gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= numel) return;  // guard the tail of the final block
  f(gid, work);
}

//! \brief Launch an elementwise functor over a TensorIterator on the GPU.
/*!
 * The functor is invoked as f(data_ptrs, byte_offsets) for every linear
 * index of the iterator, via at::native::launch_legacy_kernel<128, 1>
 * (a PyTorch-internal launcher — NOTE(review): assumed to mean 128 threads
 * per block, one element per thread; confirm against the ATen header).
 *
 * \tparam Arity total number of operands (inputs + outputs) of `iter`
 * \param iter tensor iterator describing operand buffers and strides
 * \param f device functor applied once per element
 */
template <int Arity, typename func_t>
void gpu_kernel(at::TensorIterator& iter, const func_t& f) {
  // the compile-time arity must match the iterator's operand count
  TORCH_CHECK(iter.ninputs() + iter.noutputs() == Arity);

  // raw byte pointers of each operand; captured by value into the lambda
  std::array<char*, Arity> data;
  for (int i = 0; i < Arity; i++) {
    data[i] = reinterpret_cast<char*>(iter.data_ptr(i));
  }

  // maps a linear element index to per-operand byte offsets
  auto offset_calc = ::make_offset_calculator<Arity>(iter);
  int64_t numel = iter.numel();

  at::native::launch_legacy_kernel<128, 1>(numel,
      [=] __device__(int idx) {
        auto offsets = offset_calc.get(idx);
        f(data.data(), offsets.data());
      });
}

//! \brief Launch `f` over `iter`, giving each element `work_size` bytes of
//!        device scratch memory.
/*!
 * The iteration space is divided into at most `Chunks` sequential parts so
 * the scratch pool only ever holds one chunk's worth of workspace, reducing
 * peak device-memory usage. Each launch is synchronized before the next so
 * the workspace can be reused safely.
 *
 * \tparam Chunks maximum number of sequential chunks
 * \tparam Arity total number of operands (inputs + outputs) of `iter`
 * \param iter tensor iterator describing operand buffers and strides
 * \param work_size scratch bytes required per element
 * \param f device functor called as f(data_ptrs, offsets, element_workspace)
 */
template <int Chunks, int Arity, typename func_t>
void gpu_chunk_kernel(at::TensorIterator& iter, int work_size, const func_t& f) {
  TORCH_CHECK(iter.ninputs() + iter.noutputs() == Arity);

  std::array<char*, Arity> data;
  for (int i = 0; i < Arity; i++) {
    data[i] = reinterpret_cast<char*>(iter.data_ptr(i));
  }

  auto offset_calc = ::make_offset_calculator<Arity>(iter);
  int64_t numel = iter.numel();
  if (numel == 0) return;  // nothing to do; avoid a zero-byte allocation

  // divide numel into chunks to reduce peak workspace memory usage
  int chunks = Chunks > numel ? static_cast<int>(numel) : Chunks;
  int64_t base = numel / chunks;
  int64_t rem = numel % chunks;

  // workspace sized for the largest chunk (the first `rem` chunks carry one
  // extra element); widen before multiplying to avoid int overflow
  size_t workspace_bytes =
      static_cast<size_t>(work_size) * (base + (rem > 0 ? 1 : 0));

  // allocate working memory pool, shared by all chunks
  char* d_workspace = nullptr;
  C10_CUDA_CHECK(cudaMalloc(&d_workspace, workspace_bytes));

  int64_t chunk_start = 0;

  for (int n = 0; n < chunks; n++) {
    int64_t chunk_numel = base + (n < rem ? 1 : 0);

    dim3 block(64);
    dim3 grid((chunk_numel + block.x - 1) / block.x);

    // offsets use the global element index; the workspace slot is indexed
    // locally within the chunk so the pool can be reused across chunks
    auto device_lambda = [=] __device__(int idx, char* work) {
      auto offsets = offset_calc.get(static_cast<int>(idx + chunk_start));
      f(data.data(), offsets.data(),
        work + static_cast<int64_t>(idx) * work_size);
    };

    element_kernel<<<grid, block>>>(chunk_numel, device_lambda, d_workspace);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    // wait for completion: the next chunk reuses the same workspace
    cudaDeviceSynchronize();

    chunk_start += chunk_numel;
  }

  // free working memory pool
  C10_CUDA_CHECK(cudaFree(d_workspace));
}

} // namespace native
} // namespace harp
70 changes: 70 additions & 0 deletions src/radiation/bbflux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,76 @@ torch::Tensor bbflux_wavenumber(double wn1, double wn2, torch::Tensor temp) {
return ans * sigdpi * torch::pow(temp, 4);
}

//! \brief Integrated blackbody flux between two wavenumber tensors.
/*!
 * Elementwise integral of the Planck function between wn1 and wn2 [cm^-1]
 * at temperature temp [K], in the style of DISORT's `plkavg`: a power
 * series for small v = C2 * wn / T and an exponential series otherwise.
 *
 * \param wn1 lower wavenumbers [cm^-1], broadcastable with temp
 * \param wn2 upper wavenumbers [cm^-1], must satisfy wn2 >= wn1 >= 0
 * \param temp temperatures [K], must be strictly positive
 * \return integrated flux [W/m^2]; when wn1 == wn2 everywhere, the Planck
 *         intensity at that single wavenumber is returned instead
 */
torch::Tensor bbflux_wavenumber(torch::Tensor wn1, torch::Tensor wn2,
                                torch::Tensor temp) {
  if ((wn2 < wn1).any().item<bool>() || (wn1 < 0.0).any().item<bool>()) {
    TORCH_CHECK(false, "bbflux_wavenumber: Invalid wavenumbers");
  }

  TORCH_CHECK(temp.min().item<double>() > 0.0,
              "bbflux_wavenumber: Temperature must be positive");

  const double C2 = 1.438786;       // h * c / k in units cm * K
  const double SIGMA = 5.67032e-8;  // Stefan-Boltzmann constant in W/m²K⁴
  const double VCUT = 1.5;          // series crossover for v = C2 * wn / T
  const double sigdpi = SIGMA / M_PI;
  const double conc = 15.0 / std::pow(M_PI, 4);  // normalization 15 / pi^4
  const double c1 = 1.1911e-18;  // h * c^2, in units W/(m² * sr * cm⁻⁴)
  // coefficients of the small-argument power series
  const double A1 = 1.0 / 3.0;
  const double A2 = -1.0 / 8.0;
  const double A3 = 1.0 / 60.0;
  const double A4 = -1.0 / 5040.0;
  const double A5 = 1.0 / 272160.0;
  const double A6 = -1.0 / 13305600.0;

  // Degenerate interval (wn1 == wn2 everywhere): return the Planck
  // intensity at wn rather than a zero-width integral.
  if ((wn1 == wn2).all().item<bool>()) {
    auto wn = wn1;
    auto arg = torch::exp(-C2 * wn / temp);
    return c1 * wn.pow(3) * arg / (1.0 - arg);
  }

  // dimensionless integration bounds v = C2 * wn / T (v[0] <= v[1])
  torch::Tensor v[2] = {C2 * wn1 / temp, C2 * wn2 / temp};
  // per-element count (0, 1 or 2) of bounds in the small-argument regime
  torch::Tensor smallv = torch::zeros_like(temp);
  torch::Tensor p[2];  // power-series integral from 0 to v[i]
  torch::Tensor d[2];  // exponential-series integral from v[i] to infinity

  for (int i = 0; i <= 1; ++i) {
    smallv = smallv + torch::where(v[i] < VCUT, torch::ones_like(temp),
                                   torch::zeros_like(temp));

    // power series, valid for v < VCUT (zeroed elsewhere)
    auto vsq = v[i] * v[i];
    p[i] =
        conc * vsq * v[i] *
        (A1 + v[i] * (A2 + v[i] * (A3 + vsq * (A4 + vsq * (A5 + vsq * A6)))));
    p[i] = torch::where(v[i] < VCUT, p[i], torch::zeros_like(temp));

    // exponential series, valid for v > VCUT (zeroed elsewhere)
    auto ex = torch::exp(-v[i]);
    auto exm = torch::ones_like(temp);
    d[i] = torch::zeros_like(temp);

    for (int m = 1; m <= 6; ++m) {
      auto mv = static_cast<double>(m) * v[i];
      exm = exm * ex;
      d[i] = d[i] + exm * (6.0 + mv * (6.0 + mv * (3.0 + mv))) / (m * m);
    }
    d[i] *= conc;

    d[i] = torch::where(v[i] > VCUT, d[i], torch::zeros_like(temp));
  }

  // Combine: both bounds small -> p[1] - p[0]; both large -> d[0] - d[1];
  // mixed (v[0] small, v[1] large) -> whole integral minus the two tails.
  auto ans =
      torch::where(smallv == 2, p[1] - p[0],
                   torch::where(smallv == 1, 1.0 - p[0] - d[1], d[0] - d[1]));

  return ans * sigdpi * torch::pow(temp, 4);
}

torch::Tensor bbflux_wavelength(torch::Tensor wave, double temp, int ncol) {
// Check if wave is a 1D tensor
TORCH_CHECK(wave.dim() == 1, "wavelength must be a 1D tensor");
Expand Down
4 changes: 4 additions & 0 deletions src/radiation/bbflux.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ torch::Tensor bbflux_wavenumber(torch::Tensor wave, double temp, int ncol = 1);
*/
torch::Tensor bbflux_wavenumber(double wn1, double wn2, torch::Tensor temp);

//! \brief calculate integrated blackbody flux using wavenumber
torch::Tensor bbflux_wavenumber(torch::Tensor wn1, torch::Tensor wn2,
torch::Tensor temp);

//! \brief calculate blackbody flux using wavelength
/*!
* Formula:
Expand Down
6 changes: 6 additions & 0 deletions src/radiation/radiation_band.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ RadiationBandOptions RadiationBandOptionsImpl::from_yaml(
if (op->verbose()) {
std::cout << " Solver flags: " << op->disort()->flags() << std::endl;
}
} else if (op->solver_name() == "toon") {
op->toon() = ToonMcKay89OptionsImpl::create();
op->toon()->wave_lower(std::vector<double>(op->nwave(), wmin));
op->toon()->wave_upper(std::vector<double>(op->nwave(), wmax));
} else if (op->solver_name() == "twostr") {
TORCH_CHECK(false, "twostr solver not implemented");
} else {
Expand Down Expand Up @@ -173,6 +177,8 @@ void RadiationBandImpl::reset() {
} else if (options->solver_name() == "disort") {
rtsolver = torch::nn::AnyModule(disort::Disort(options->disort()));
register_module("solver", rtsolver.ptr());
} else if (options->solver_name() == "toon") {
rtsolver = torch::nn::AnyModule(ToonMcKay89(options->toon()));
} else {
TORCH_CHECK(false, "Unknown solver: ", options->solver_name());
}
Expand Down
2 changes: 2 additions & 0 deletions src/radiation/radiation_band.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

// harp
#include <harp/opacity/opacity_options.hpp>
#include <harp/rtsolver/toon_mckay89.hpp>

// arg
#include <harp/add_arg.h>
Expand Down Expand Up @@ -103,6 +104,7 @@ struct RadiationBandOptionsImpl {

ADD_ARG(OpacityDict, opacities) = {};
ADD_ARG(disort::DisortOptions, disort);
ADD_ARG(ToonMcKay89Options, toon);

ADD_ARG(std::vector<double>, wavenumber);
ADD_ARG(std::vector<double>, weight);
Expand Down
33 changes: 33 additions & 0 deletions src/rtsolver/dtridgl_impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

// C/C++
#include <cstdlib>

// base
#include <configure.h>

namespace harp {

// Solves a tridiagonal system using the Thomas algorithm (TDMA)
// Solves a tridiagonal system A x = d using the Thomas algorithm (TDMA).
/*!
 * \param n number of unknowns; arrays are indexed 0..n-1 (no-op when n <= 0)
 * \param a sub-diagonal (a[0] is not read)
 * \param b main diagonal
 * \param c super-diagonal (c[n-1] is not read); overwritten in place
 * \param d right-hand side; overwritten in place
 * \param x output solution vector
 *
 * Near-zero pivots are clamped to +/-1e-12 (sign-preserving) so the sweep
 * never divides by zero; callers should pass diagonally dominant systems
 * for meaningful results.
 */
template <typename T>
DISPATCH_MACRO void dtridgl(int n, const T *a, const T *b, T *c, T *d, T *x) {
  if (n <= 0) return;

  const T eps = static_cast<T>(1e-12);

  // First row: normalize by the leading pivot (clamped if nearly singular)
  T piv = b[0];
  if (piv > -eps && piv < eps) piv = (piv < T(0)) ? -eps : eps;
  c[0] = c[0] / piv;
  d[0] = d[0] / piv;

  // Forward sweep
  for (int i = 1; i < n; ++i) {
    T denom = b[i] - a[i] * c[i - 1];
    // threshold comparison (not == 0) to catch near-singular pivots too,
    // and preserve the pivot's sign when clamping
    if (denom > -eps && denom < eps) denom = (denom < T(0)) ? -eps : eps;
    c[i] = (i < n - 1) ? c[i] / denom : T(0);
    d[i] = (d[i] - a[i] * d[i - 1]) / denom;
  }

  // Back substitution
  x[n - 1] = d[n - 1];
  for (int i = n - 2; i >= 0; --i) {
    x[i] = d[i] - c[i] * x[i + 1];
  }
}

} // namespace harp
Loading
Loading