diff --git a/include/infinicore/ops/logsumexp.hpp b/include/infinicore/ops/logsumexp.hpp
new file mode 100644
index 000000000..aa9b359c8
--- /dev/null
+++ b/include/infinicore/ops/logsumexp.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class LogSumExp {
+public:
+    using schema = void (*)(Tensor, int, bool, Tensor);
+    static void execute(Tensor input, int dim, bool keepdim, Tensor output);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor logsumexp(Tensor input, int dim, bool keepdim);
+void logsumexp_(Tensor input, int dim, bool keepdim, Tensor output);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/lp_pool1d.hpp b/include/infinicore/ops/lp_pool1d.hpp
new file mode 100644
index 000000000..71704059a
--- /dev/null
+++ b/include/infinicore/ops/lp_pool1d.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Lp_Pool1d {
+public:
+    using schema = void (*)(Tensor, Tensor, float, size_t, size_t, bool);
+    static void execute(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor lp_pool1d(Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode);
+void lp_pool1d_(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/lp_pool2d.hpp b/include/infinicore/ops/lp_pool2d.hpp
new file mode 100644
index 000000000..8523fd71b
--- /dev/null
+++ b/include/infinicore/ops/lp_pool2d.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <tuple>
+
+namespace infinicore::op {
+class Lp_Pool2d {
+public:
+    using schema = void (*)(Tensor, Tensor, float, const std::tuple<size_t, size_t>, const std::tuple<size_t, size_t>, bool);
+    static void execute(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor lp_pool2d(Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode);
+void lp_pool2d_(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/lp_pool3d.hpp b/include/infinicore/ops/lp_pool3d.hpp
new file mode 100644
index 000000000..7ad3facab
--- /dev/null
+++ b/include/infinicore/ops/lp_pool3d.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <tuple>
+
+namespace infinicore::op {
+class Lp_Pool3d {
+public:
+    using schema = void (*)(Tensor, Tensor, float, const std::tuple<size_t, size_t, size_t>, const std::tuple<size_t, size_t, size_t>, bool);
+    static void execute(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor lp_pool3d(Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode);
+void lp_pool3d_(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/max_global.hpp b/include/infinicore/ops/max_global.hpp
new file mode 100644
index 000000000..91707c679
--- /dev/null
+++ b/include/infinicore/ops/max_global.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class MaxGlobal {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor input, Tensor output);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor max_global(Tensor input);
+void max_global_(Tensor input, Tensor output);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/max_reduce.hpp b/include/infinicore/ops/max_reduce.hpp
new file mode 100644
index 000000000..9919a1650
--- /dev/null
+++ b/include/infinicore/ops/max_reduce.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <tuple>
+
+namespace infinicore::op {
+
+class MaxReduce {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, int, bool);
+    static void execute(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+std::tuple<Tensor, Tensor> max_reduce(Tensor input, int dim, bool keepdim);
+void max_reduce_(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim);
+
+} // namespace infinicore::op
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 5c541ec3c..0827a98a6 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -45,6 +45,8 @@
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.logsumexp import logsumexp
+from infinicore.ops.max import max
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -115,6 +117,8 @@
     "strided_empty",
     "strided_from_blob",
     "zeros",
+    "logsumexp",
+    "max",
 ]
 
 use_ntops = False
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..4d6b2568a 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,9 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .lp_pool1d import lp_pool1d
+from .lp_pool2d import lp_pool2d
+from .lp_pool3d import lp_pool3d
 
 __all__ = [
     "causal_softmax",
@@ -17,4 +20,7 @@
     "embedding",
     "rope",
     "RopeAlgo",
+    "lp_pool1d",
+    "lp_pool2d",
+    "lp_pool3d",
 ]
diff --git a/python/infinicore/nn/functional/lp_pool1d.py b/python/infinicore/nn/functional/lp_pool1d.py
new file mode 100644
index 000000000..b8970f3a2
--- /dev/null
+++ b/python/infinicore/nn/functional/lp_pool1d.py
@@ -0,0 +1,27 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def lp_pool1d(
+    input: Tensor,
+    norm_type: float,
+    kernel_size: int,
+    stride: int | None = None,
+    ceil_mode: bool = False,
+):
+    r"""Applies a 1D power-average pooling over an input signal composed of several input planes."""
+
+    if stride is None:
+        stride = kernel_size
+
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.lp_pool1d(
+            input, norm_type, kernel_size, stride, ceil_mode
+        )
+
+    return Tensor(
+        _infinicore.lp_pool1d(
+            input._underlying, norm_type, kernel_size, stride, ceil_mode
+        )
+    )
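The wrapper above follows the torch.nn.functional.lp_pool1d conventions: an omitted stride defaults to kernel_size, an ntops fast path is tried on CUDA/MUSA devices, and the CPU path falls through to the `_infinicore` binding. A minimal usage sketch; the `zeros` factory call below is assumed torch-like and is illustrative only:

```python
import infinicore
import infinicore.nn.functional as F

# (N, C, L_in) = (2, 4, 16); stride is omitted, so it defaults to kernel_size=4
x = infinicore.zeros((2, 4, 16))  # hypothetical torch-like factory signature
y = F.lp_pool1d(x, norm_type=2.0, kernel_size=4)  # expected shape: (2, 4, 4)
```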
diff --git a/python/infinicore/nn/functional/lp_pool2d.py b/python/infinicore/nn/functional/lp_pool2d.py
new file mode 100644
index 000000000..d51d13255
--- /dev/null
+++ b/python/infinicore/nn/functional/lp_pool2d.py
@@ -0,0 +1,32 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def lp_pool2d(
+    input: Tensor,
+    norm_type: float,
+    kernel_size: int | tuple[int, int],
+    stride: int | tuple[int, int] | None = None,
+    ceil_mode: bool = False,
+):
+    r"""Applies a 2D power-average pooling over an input signal composed of several input planes."""
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+
+    if isinstance(stride, int):
+        stride = (stride, stride)
+
+    if stride is None:
+        stride = kernel_size
+
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.lp_pool2d(
+            input, norm_type, kernel_size, stride, ceil_mode
+        )
+
+    return Tensor(
+        _infinicore.lp_pool2d(
+            input._underlying, norm_type, kernel_size, stride, ceil_mode
+        )
+    )
diff --git a/python/infinicore/nn/functional/lp_pool3d.py b/python/infinicore/nn/functional/lp_pool3d.py
new file mode 100644
index 000000000..42d97c58c
--- /dev/null
+++ b/python/infinicore/nn/functional/lp_pool3d.py
@@ -0,0 +1,32 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def lp_pool3d(
+    input: Tensor,
+    norm_type: float,
+    kernel_size: int | tuple[int, int, int],
+    stride: int | tuple[int, int, int] | None = None,
+    ceil_mode: bool = False,
+):
+    r"""Applies a 3D power-average pooling over an input signal composed of several input planes."""
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size, kernel_size)
+
+    if isinstance(stride, int):
+        stride = (stride, stride, stride)
+
+    if stride is None:
+        stride = kernel_size
+
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.lp_pool3d(
+            input, norm_type, kernel_size, stride, ceil_mode
+        )
+
+    return Tensor(
+        _infinicore.lp_pool3d(
+            input._underlying, norm_type, kernel_size, stride, ceil_mode
+        )
+    )
diff --git a/python/infinicore/ops/logsumexp.py b/python/infinicore/ops/logsumexp.py
new file mode 100644
index 000000000..751f13c06
--- /dev/null
+++ b/python/infinicore/ops/logsumexp.py
@@ -0,0 +1,19 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def logsumexp(
+    input: Tensor, dim: int | None = None, keepdim=False, *, out=None
+) -> Tensor:
+    r"""Apply the logsumexp function."""
+
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.logsumexp(input, dim, keepdim=keepdim, out=out)
+
+    if out is None:
+        return Tensor(_infinicore.logsumexp(input._underlying, dim, keepdim))
+
+    _infinicore.logsumexp_(input._underlying, dim, keepdim, out._underlying)
+
+    return out
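The logsumexp wrapper above and the max wrapper that follows mirror the torch calling conventions: `max` returns a single tensor when `dim` is None and a `(values, indices)` pair otherwise. A hedged sketch of the call forms (the `zeros` factory is assumed torch-like; return types follow the code in this patch):

```python
import infinicore

x = infinicore.zeros((2, 3, 4))  # hypothetical torch-like factory signature
lse = infinicore.logsumexp(x, dim=-1, keepdim=True)  # Tensor, shape (2, 3, 1)
gmax = infinicore.max(x)                             # dim=None -> 0-d global max
values, indices = infinicore.max(x, dim=1)           # dim given -> (values, indices)
```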
diff --git a/python/infinicore/ops/max.py b/python/infinicore/ops/max.py
new file mode 100644
index 000000000..d643dae7c
--- /dev/null
+++ b/python/infinicore/ops/max.py
@@ -0,0 +1,33 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def max(
+    input: Tensor, dim: int | None = None, keepdim=False, *, out=None
+) -> Tensor | tuple[Tensor, Tensor]:
+    r"""Apply the max function."""
+
+    if dim is None:
+        if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+            return infinicore.ntops.torch.max(input, out=out)
+
+        if out is None:
+            return Tensor(_infinicore.max_global(input._underlying))
+
+        _infinicore.max_global_(input._underlying, out._underlying)
+
+        return out
+    else:
+        if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+            return infinicore.ntops.torch.max(input, dim, keepdim=keepdim, out=out)
+
+        if out is None:
+            res, res_idx = _infinicore.max_reduce(input._underlying, dim, keepdim)
+            return Tensor(res), Tensor(res_idx)
+
+        _infinicore.max_reduce_(
+            input._underlying, out[0]._underlying, out[1]._underlying, dim, keepdim
+        )
+
+        return out
diff --git a/src/infinicore/ops/logsumexp/logsumexp.cc b/src/infinicore/ops/logsumexp/logsumexp.cc
new file mode 100644
index 000000000..0026ad45b
--- /dev/null
+++ b/src/infinicore/ops/logsumexp/logsumexp.cc
@@ -0,0 +1,54 @@
+#include "infinicore/ops/logsumexp.hpp"
+#include <stdexcept>
+#include <string>
+
+namespace infinicore::op {
+
+common::OpDispatcher<LogSumExp::schema> &LogSumExp::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void LogSumExp::execute(Tensor input, int dim, bool keepdim, Tensor output) {
+    infinicore::context::setDevice(input->device(), true);
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No LogSumExp implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(input, dim, keepdim, output);
+}
+
+Tensor logsumexp(Tensor input, int dim, bool keepdim) {
+    // Normalize dim
+    int normalized_dim = dim;
+    if (normalized_dim < 0) {
+        normalized_dim = input->ndim() + normalized_dim;
+    }
+
+    // Compute the output shape
+    Shape output_shape;
+    const auto &input_shape = input->shape();
+
+    if (keepdim) {
+        output_shape = input_shape;
+        output_shape[normalized_dim] = 1;
+    } else {
+        for (int i = 0; i < static_cast<int>(input_shape.size()); ++i) {
+            if (i != normalized_dim) {
+                output_shape.push_back(input_shape[i]);
+            }
+        }
+    }
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    logsumexp_(input, dim, keepdim, output);
+    return output;
+}
+
+void logsumexp_(Tensor input, int dim, bool keepdim, Tensor output) {
+    LogSumExp::execute(input, dim, keepdim, output);
+}
+} // namespace infinicore::op
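The CPU kernel that follows uses the standard max-shift for numerical stability: in exact arithmetic, log Σ exp(x_i) = m + log Σ exp(x_i − m) with m = max_i x_i, but the shifted form never overflows. A self-contained check of the identity:

```python
import math

def logsumexp_1d(xs):
    m = max(xs)  # shift by the maximum before exponentiating
    return m + math.log(sum(math.exp(x - m) for x in xs))

# the naive log(sum(exp(x))) overflows for inputs like 1000.0; the shifted form does not
assert abs(logsumexp_1d([1000.0, 1000.0]) - (1000.0 + math.log(2.0))) < 1e-9
```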
diff --git a/src/infinicore/ops/logsumexp/logsumexp_cpu.cc b/src/infinicore/ops/logsumexp/logsumexp_cpu.cc
new file mode 100644
index 000000000..14a5df5c6
--- /dev/null
+++ b/src/infinicore/ops/logsumexp/logsumexp_cpu.cc
@@ -0,0 +1,141 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/logsumexp.hpp"
+#include "infinicore/tensor.hpp"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::logsumexp_impl::cpu {
+
+void calculate(Tensor input, int dim, bool keepdim, Tensor output) {
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto output_strides = output->strides();
+    auto ndim = input->ndim();
+    auto dtype = input->dtype();
+    auto dtype_size = input->element_size();
+
+    // Normalize dim into [0, ndim)
+    if (dim < 0) {
+        dim = ndim + dim;
+    }
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+
+    // Size of the reduced dimension
+    size_t reduce_size = input_shapes[dim];
+    size_t output_numel = output->numel();
+
+// For each output element, compute logsumexp along dim
+#pragma omp parallel for collapse(1)
+    for (size_t output_idx = 0; output_idx < output_numel; ++output_idx) {
+        // Recover the multi-dimensional coordinates from the flat output index
+        std::vector<size_t> output_indices(output_shapes.size());
+        size_t temp_idx = output_idx;
+        for (int i = static_cast<int>(output_shapes.size()) - 1; i >= 0; --i) {
+            output_indices[i] = temp_idx % output_shapes[i];
+            temp_idx /= output_shapes[i];
+        }
+
+        // Map output coordinates back to input coordinates.
+        // keepdim=true: output shape == input shape, with the reduced dim set to 1.
+        // keepdim=false: output shape is the input shape minus the reduced dim.
+        std::vector<size_t> input_indices(ndim);
+        if (keepdim) {
+            // Direct correspondence between output and input dims
+            for (int i = 0; i < ndim; ++i) {
+                if (i == dim) {
+                    input_indices[i] = 0; // reduced dim starts at 0 (iterated below)
+                } else {
+                    input_indices[i] = output_indices[i];
+                }
+            }
+        } else {
+            // Skip the reduced dim: the output is missing one dimension
+            int output_dim = 0;
+            for (int i = 0; i < ndim; ++i) {
+                if (i == dim) {
+                    input_indices[i] = 0; // reduced dim starts at 0
+                } else {
+                    input_indices[i] = output_indices[output_dim];
+                    output_dim++;
+                }
+            }
+        }
+
+        // Starting offset into the input
+        size_t offset = 0;
+        for (int i = 0; i < ndim; ++i) {
+            if (i != dim) { // skip the reduced dim
+                offset += input_indices[i] * input_strides[i];
+            }
+        }
+
+        // Compute logsumexp along the reduce dimension
+        if (dtype == DataType::F32) {
+            float max_val = -std::numeric_limits<float>::infinity();
+
+            // First pass: find the maximum
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                float *input_ptr = reinterpret_cast<float *>(input_base + current_offset * dtype_size);
+                max_val = std::max(max_val, *input_ptr);
+            }
+
+            // Second pass: compute sum(exp(x - max))
+            float sum_exp = 0.0f;
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                float *input_ptr = reinterpret_cast<float *>(input_base + current_offset * dtype_size);
+                sum_exp += std::exp(*input_ptr - max_val);
+            }
+
+            // Result: log(sum(exp(x))) = max + log(sum(exp(x - max)))
+            float result = max_val + std::log(sum_exp);
+
+            float *output_ptr = reinterpret_cast<float *>(output_base + output_idx * dtype_size);
+            *output_ptr = result;
+
+        } else if (dtype == DataType::F16) {
+            float max_val = -std::numeric_limits<float>::infinity();
+
+            // First pass: find the maximum (accumulated in F32)
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                auto *input_ptr = reinterpret_cast<fp16_t *>(input_base + current_offset * dtype_size);
+                float val_f32 = utils::cast<float>(*input_ptr);
+                max_val = std::max(max_val, val_f32);
+            }
+
+            // Second pass: compute sum(exp(x - max))
+            float sum_exp = 0.0f;
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                auto *input_ptr = reinterpret_cast<fp16_t *>(input_base + current_offset * dtype_size);
+                float val_f32 = utils::cast<float>(*input_ptr);
+                sum_exp += std::exp(val_f32 - max_val);
+            }
+
+            // Result: log(sum(exp(x))) = max + log(sum(exp(x - max)))
+            float result = max_val + std::log(sum_exp);
+
+            auto *output_ptr = reinterpret_cast<fp16_t *>(output_base + output_idx * dtype_size);
+            *output_ptr = utils::cast<fp16_t>(result);
+
+        } else {
+            throw std::runtime_error("Unsupported data type for logsumexp operation.");
+        }
+    }
+}
+
+static bool registered = []() {
+    LogSumExp::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::logsumexp_impl::cpu
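The Lp pooling kernels in the files below all reduce each window to (Σ |x|^p)^(1/p): p = 2 is the Euclidean norm of the window, and large p approaches max pooling. A reference one-liner matching what the CPU kernels in this patch compute per window:

```python
def lp_window(xs, p):
    """Lp norm of one pooling window, as computed by the CPU kernels in this patch."""
    return sum(abs(x) ** p for x in xs) ** (1.0 / p)

assert lp_window([3.0, 4.0], 2.0) == 5.0  # 2-norm of a 2-element window
```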
diff --git a/src/infinicore/ops/lp_pool1d/lp_pool1d.cc b/src/infinicore/ops/lp_pool1d/lp_pool1d.cc
new file mode 100644
index 000000000..7e5ad7e20
--- /dev/null
+++ b/src/infinicore/ops/lp_pool1d/lp_pool1d.cc
@@ -0,0 +1,55 @@
+#include "infinicore/ops/lp_pool1d.hpp"
+#include <cmath>
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Lp_Pool1d::schema> &Lp_Pool1d::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Lp_Pool1d::execute(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode) {
+    infinicore::context::setDevice(input->device(), true);
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Lp_Pool1d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+
+Tensor lp_pool1d(Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode) {
+    auto ndim = input->ndim();
+    auto input_shape = input->shape();
+
+    if (ndim != 3 && ndim != 2) {
+        throw std::runtime_error("Input tensor must be 3-dimensional (N, C, L_in) or 2-dimensional (C, L_in)");
+    }
+
+    if (ndim == 2) {
+        input = input->view({1, input_shape[0], input_shape[1]});
+        input_shape = input->shape();
+    }
+
+    auto L_in = input_shape[2];
+    size_t L_out = 0;
+    if (ceil_mode) {
+        L_out = static_cast<size_t>(std::ceil(static_cast<double>(L_in - kernel_size) / stride)) + 1;
+    } else {
+        L_out = static_cast<size_t>(std::floor(static_cast<double>(L_in - kernel_size) / stride)) + 1;
+    }
+
+    auto output_shape = Shape{input_shape[0], input_shape[1], L_out};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    lp_pool1d_(output, input, norm_type, kernel_size, stride, ceil_mode);
+    return output;
+}
+
+void lp_pool1d_(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode) {
+    Lp_Pool1d::execute(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/lp_pool1d/lp_pool1d_cpu.cc b/src/infinicore/ops/lp_pool1d/lp_pool1d_cpu.cc
new file mode 100644
index 000000000..379addae1
--- /dev/null
+++ b/src/infinicore/ops/lp_pool1d/lp_pool1d_cpu.cc
@@ -0,0 +1,115 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/lp_pool1d.hpp"
+#include "infinicore/tensor.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+namespace infinicore::op::lp_pool1d_impl::cpu {
+
+void calculate(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode) {
+    // input: [N, C, L_in], output: [N, C, L_out]
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto dtype = input->dtype();
+
+    auto N = input_shapes[0];
+    auto C = input_shapes[1];
+    auto L_in = input_shapes[2];
+    auto L_out = output_shapes[2];
+
+    auto stride_N = input_strides[0];
+    auto stride_C = input_strides[1];
+    auto stride_L = input_strides[2];
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+    auto element_size = input->element_size();
+
+    // Iterate over every sample, channel, and output position
+    for (size_t n = 0; n < N; ++n) {
+        for (size_t c = 0; c < C; ++c) {
+            for (size_t out_l = 0; out_l < L_out; ++out_l) {
+                // Window bounds
+                size_t window_start = out_l * stride;
+                size_t window_end = std::min(window_start + kernel_size, L_in);
+
+                // Accumulate the Lp power sum
+                double sum_power = 0.0;
+
+                // Standard case: accumulate the valid elements
+                for (size_t i = window_start; i < window_end; ++i) {
+                    // Offset of the element in memory
+                    size_t offset = n * stride_N + c * stride_C + i * stride_L;
+
+                    double val = 0.0;
+                    if (dtype == DataType::F32) {
+                        auto *ptr = reinterpret_cast<float *>(input_base + offset * element_size);
+                        val = static_cast<double>(*ptr);
+                    } else if (dtype == DataType::F64) {
+                        auto *ptr = reinterpret_cast<double *>(input_base + offset * element_size);
+                        val = *ptr;
+                    } else if (dtype == DataType::F16) {
+                        auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * element_size);
+                        val = static_cast<double>(utils::cast<float>(*ptr));
+                    }
+
+                    // Accumulate |val|^norm_type
+                    sum_power += std::pow(std::abs(val), norm_type);
+                }
+
+                // Replicate padding (only when ceil_mode=true and the window is incomplete)
+                if (ceil_mode && window_end < window_start + kernel_size) {
+                    // The window is incomplete: pad by replicating the last valid element
+                    size_t last_valid_idx = window_end - 1;
+                    size_t offset = n * stride_N + c * stride_C + last_valid_idx * stride_L;
+
+                    double last_val = 0.0;
+                    if (dtype == DataType::F32) {
+                        auto *ptr = reinterpret_cast<float *>(input_base + offset * element_size);
+                        last_val = static_cast<double>(*ptr);
+                    } else if (dtype == DataType::F64) {
+                        auto *ptr = reinterpret_cast<double *>(input_base + offset * element_size);
+                        last_val = *ptr;
+                    } else if (dtype == DataType::F16) {
+                        auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * element_size);
+                        last_val = static_cast<double>(utils::cast<float>(*ptr));
+                    }
+
+                    // Repeat the last valid element to fill out the window
+                    size_t padding_count = (window_start + kernel_size) - window_end;
+                    sum_power += padding_count * std::pow(std::abs(last_val), norm_type);
+                }
+
+                // Lp norm result: sum_power^(1 / norm_type)
+                double result = std::pow(sum_power, 1.0 / norm_type);
+
+                // Write the output (the output tensor is always contiguous)
+                size_t out_offset = n * (C * L_out) + c * L_out + out_l;
+
+                if (dtype == DataType::F32) {
+                    auto *ptr = reinterpret_cast<float *>(output_base + out_offset * element_size);
+                    *ptr = static_cast<float>(result);
+                } else if (dtype == DataType::F64) {
+                    auto *ptr = reinterpret_cast<double *>(output_base + out_offset * element_size);
+                    *ptr = result;
+                } else if (dtype == DataType::F16) {
+                    auto *ptr = reinterpret_cast<fp16_t *>(output_base + out_offset * element_size);
+                    *ptr = utils::cast<fp16_t>(static_cast<float>(result));
+                }
+            }
+        }
+    }
+}
+
+static bool registered = []() {
+    Lp_Pool1d::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::lp_pool1d_impl::cpu
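Note the two different ceil_mode strategies in this patch: the 1D kernel above completes a partial window by replicating its last valid element, while the 2D/3D kernels below rescale the partial power sum up to a full window (equivalent to mean(valid) * kernel_size, as their comments note). A sketch of the rescaling:

```python
def rescale_partial(sum_valid, valid_cnt, full_cnt):
    """Scale a partial-window power sum up to a full window (2D/3D ceil_mode path)."""
    return sum_valid * (full_cnt / valid_cnt)

# e.g. a 3x3 kernel with only a 2x3 slab in bounds
assert rescale_partial(6.0, 6, 9) == 9.0
```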
diff --git a/src/infinicore/ops/lp_pool2d/lp_pool2d.cc b/src/infinicore/ops/lp_pool2d/lp_pool2d.cc
new file mode 100644
index 000000000..7e1df296b
--- /dev/null
+++ b/src/infinicore/ops/lp_pool2d/lp_pool2d.cc
@@ -0,0 +1,61 @@
+#include "infinicore/ops/lp_pool2d.hpp"
+#include <cmath>
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Lp_Pool2d::schema> &Lp_Pool2d::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Lp_Pool2d::execute(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode) {
+    infinicore::context::setDevice(input->device(), true);
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Lp_Pool2d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+
+Tensor lp_pool2d(Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode) {
+    const auto ndim = input->ndim();
+    auto input_shape = input->shape();
+
+    if (ndim != 4 && ndim != 3) {
+        throw std::runtime_error("Input tensor must be 4-dimensional (N, C, H_in, W_in) or 3-dimensional (C, H_in, W_in)");
+    }
+
+    if (ndim == 3) {
+        input = input->view({1, input_shape[0], input_shape[1], input_shape[2]});
+        input_shape = input->shape();
+    }
+
+    const auto [Kernel_H, Kernel_W] = kernel_size;
+    const auto [Stride_H, Stride_W] = stride;
+    const auto H_in = input_shape[2];
+    const auto W_in = input_shape[3];
+    size_t H_out = 0;
+    size_t W_out = 0;
+    if (ceil_mode) {
+        H_out = static_cast<size_t>(std::ceil(static_cast<double>(H_in - Kernel_H) / Stride_H)) + 1;
+        W_out = static_cast<size_t>(std::ceil(static_cast<double>(W_in - Kernel_W) / Stride_W)) + 1;
+    } else {
+        H_out = static_cast<size_t>(std::floor(static_cast<double>(H_in - Kernel_H) / Stride_H)) + 1;
+        W_out = static_cast<size_t>(std::floor(static_cast<double>(W_in - Kernel_W) / Stride_W)) + 1;
+    }
+
+    auto output_shape = Shape{input_shape[0], input_shape[1], H_out, W_out};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    lp_pool2d_(output, input, norm_type, kernel_size, stride, ceil_mode);
+    return output;
+}
+
+void lp_pool2d_(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode) {
+    Lp_Pool2d::execute(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/lp_pool2d/lp_pool2d_cpu.cc b/src/infinicore/ops/lp_pool2d/lp_pool2d_cpu.cc
new file mode 100644
index 000000000..0aa1b1fe6
--- /dev/null
+++ b/src/infinicore/ops/lp_pool2d/lp_pool2d_cpu.cc
@@ -0,0 +1,111 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/lp_pool2d.hpp"
+#include "infinicore/tensor.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace infinicore::op::lp_pool2d_impl::cpu {
+
+void calculate(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode) {
+    // input: [N, C, H_in, W_in], output: [N, C, H_out, W_out]
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto dtype = input->dtype();
+
+    const size_t N = input_shapes[0];
+    const size_t C = input_shapes[1];
+    const size_t H_in = input_shapes[2];
+    const size_t W_in = input_shapes[3];
+
+    const size_t H_out = output_shapes[2];
+    const size_t W_out = output_shapes[3];
+
+    const size_t stride_N = input_strides[0];
+    const size_t stride_C = input_strides[1];
+    const size_t stride_H = input_strides[2];
+    const size_t stride_W = input_strides[3];
+
+    const size_t kernel_h = std::get<0>(kernel_size);
+    const size_t kernel_w = std::get<1>(kernel_size);
+    const size_t stride_h = std::get<0>(stride);
+    const size_t stride_w = std::get<1>(stride);
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+    const auto element_size = input->element_size();
+
+    for (size_t n = 0; n < N; ++n) {
+        for (size_t c = 0; c < C; ++c) {
+            for (size_t oh = 0; oh < H_out; ++oh) {
+                const size_t h_start = oh * stride_h;
+                const size_t h_end = std::min(h_start + kernel_h, H_in);
+
+                for (size_t ow = 0; ow < W_out; ++ow) {
+                    const size_t w_start = ow * stride_w;
+                    const size_t w_end = std::min(w_start + kernel_w, W_in);
+
+                    double sum_power = 0.0;
+
+                    // Accumulate the valid elements
+                    for (size_t ih = h_start; ih < h_end; ++ih) {
+                        for (size_t iw = w_start; iw < w_end; ++iw) {
+                            const size_t offset = n * stride_N + c * stride_C + ih * stride_H + iw * stride_W;
+
+                            double val = 0.0;
+                            if (dtype == DataType::F32) {
+                                auto *ptr = reinterpret_cast<float *>(input_base + offset * element_size);
+                                val = static_cast<double>(*ptr);
+                            } else if (dtype == DataType::F64) {
+                                auto *ptr = reinterpret_cast<double *>(input_base + offset * element_size);
+                                val = *ptr;
+                            } else if (dtype == DataType::F16) {
+                                auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * element_size);
+                                val = static_cast<double>(utils::cast<float>(*ptr));
+                            }
+
+                            sum_power += std::pow(std::abs(val), norm_type);
+                        }
+                    }
+
+                    // ceil_mode handling: scale the partial window up to a full window
+                    const size_t valid_h = h_end - h_start;
+                    const size_t valid_w = w_end - w_start;
+                    const size_t valid_cnt = valid_h * valid_w;
+                    const size_t full_cnt = kernel_h * kernel_w;
+
+                    double scale = 1.0;
+                    if (ceil_mode && valid_cnt < full_cnt) {
+                        // To match torch's behavior: mean(valid) * kernel_size
+                        scale = static_cast<double>(full_cnt) / static_cast<double>(valid_cnt);
+                    }
+
+                    const double norm = std::pow(sum_power * scale, 1.0 / norm_type);
+
+                    const size_t out_offset = n * (C * H_out * W_out) + c * (H_out * W_out) + oh * W_out + ow;
+                    if (dtype == DataType::F32) {
+                        auto *ptr = reinterpret_cast<float *>(output_base + out_offset * element_size);
+                        *ptr = static_cast<float>(norm);
+                    } else if (dtype == DataType::F64) {
+                        auto *ptr = reinterpret_cast<double *>(output_base + out_offset * element_size);
+                        *ptr = norm;
+                    } else if (dtype == DataType::F16) {
+                        auto *ptr = reinterpret_cast<fp16_t *>(output_base + out_offset * element_size);
+                        *ptr = utils::cast<fp16_t>(static_cast<float>(norm));
+                    }
+                }
+            }
+        }
+    }
+}
+
+static bool registered = []() {
+    Lp_Pool2d::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::lp_pool2d_impl::cpu
diff --git a/src/infinicore/ops/lp_pool3d/lp_pool3d.cc b/src/infinicore/ops/lp_pool3d/lp_pool3d.cc
new file mode 100644
index 000000000..9d28e2b47
--- /dev/null
+++ b/src/infinicore/ops/lp_pool3d/lp_pool3d.cc
@@ -0,0 +1,65 @@
+#include "infinicore/ops/lp_pool3d.hpp"
+#include <cmath>
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Lp_Pool3d::schema> &Lp_Pool3d::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Lp_Pool3d::execute(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode) {
+    infinicore::context::setDevice(input->device(), true);
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Lp_Pool3d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+
+Tensor lp_pool3d(Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode) {
+    const auto ndim = input->ndim();
+    auto input_shape = input->shape();
+
+    if (ndim != 5 && ndim != 4) {
+        throw std::runtime_error("Input tensor must be 5-dimensional (N, C, D_in, H_in, W_in) or 4-dimensional (C, D_in, H_in, W_in)");
+    }
+
+    if (ndim == 4) {
+        input = input->view({1, input_shape[0], input_shape[1], input_shape[2], input_shape[3]});
+        input_shape = input->shape();
+    }
+
+    const auto [Kernel_D, Kernel_H, Kernel_W] = kernel_size;
+    const auto [Stride_D, Stride_H, Stride_W] = stride;
+    const auto D_in = input_shape[2];
+    const auto H_in = input_shape[3];
+    const auto W_in = input_shape[4];
+    size_t D_out = 0;
+    size_t H_out = 0;
+    size_t W_out = 0;
+    if (ceil_mode) {
+        D_out = static_cast<size_t>(std::ceil(static_cast<double>(D_in - Kernel_D) / Stride_D)) + 1;
+        H_out = static_cast<size_t>(std::ceil(static_cast<double>(H_in - Kernel_H) / Stride_H)) + 1;
+        W_out = static_cast<size_t>(std::ceil(static_cast<double>(W_in - Kernel_W) / Stride_W)) + 1;
+    } else {
+        D_out = static_cast<size_t>(std::floor(static_cast<double>(D_in - Kernel_D) / Stride_D)) + 1;
+        H_out = static_cast<size_t>(std::floor(static_cast<double>(H_in - Kernel_H) / Stride_H)) + 1;
+        W_out = static_cast<size_t>(std::floor(static_cast<double>(W_in - Kernel_W) / Stride_W)) + 1;
+    }
+
+    auto output_shape = Shape{input_shape[0], input_shape[1], D_out, H_out, W_out};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    lp_pool3d_(output, input, norm_type, kernel_size, stride, ceil_mode);
+    return output;
+}
+
+void lp_pool3d_(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode) {
+    Lp_Pool3d::execute(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+} // namespace infinicore::op
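All of the shape computations above use the same unpadded pooling formula; ceil_mode only admits one extra, partial window per dimension. A worked check of the formula:

```python
import math

def pool_out(L_in, kernel, stride, ceil_mode=False):
    """Output length for unpadded pooling, matching the C++ shape logic above."""
    f = math.ceil if ceil_mode else math.floor
    return f((L_in - kernel) / stride) + 1

assert pool_out(10, 3, 2) == 4                   # floor(7/2) + 1
assert pool_out(10, 3, 2, ceil_mode=True) == 5   # ceil(7/2) + 1: one partial window
```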
diff --git a/src/infinicore/ops/lp_pool3d/lp_pool3d_cpu.cc b/src/infinicore/ops/lp_pool3d/lp_pool3d_cpu.cc
new file mode 100644
index 000000000..f23faabb8
--- /dev/null
+++ b/src/infinicore/ops/lp_pool3d/lp_pool3d_cpu.cc
@@ -0,0 +1,124 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/lp_pool3d.hpp"
+#include "infinicore/tensor.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace infinicore::op::lp_pool3d_impl::cpu {
+
+void calculate(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode) {
+    // input: [N, C, D_in, H_in, W_in], output: [N, C, D_out, H_out, W_out]
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto dtype = input->dtype();
+
+    const size_t N = input_shapes[0];
+    const size_t C = input_shapes[1];
+    const size_t D_in = input_shapes[2];
+    const size_t H_in = input_shapes[3];
+    const size_t W_in = input_shapes[4];
+
+    const size_t D_out = output_shapes[2];
+    const size_t H_out = output_shapes[3];
+    const size_t W_out = output_shapes[4];
+
+    const size_t stride_N = input_strides[0];
+    const size_t stride_C = input_strides[1];
+    const size_t stride_D = input_strides[2];
+    const size_t stride_H = input_strides[3];
+    const size_t stride_W = input_strides[4];
+
+    const size_t kernel_d = std::get<0>(kernel_size);
+    const size_t kernel_h = std::get<1>(kernel_size);
+    const size_t kernel_w = std::get<2>(kernel_size);
+
+    const size_t stride_d = std::get<0>(stride);
+    const size_t stride_h = std::get<1>(stride);
+    const size_t stride_w = std::get<2>(stride);
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+    const auto element_size = input->element_size();
+
+    for (size_t n = 0; n < N; ++n) {
+        for (size_t c = 0; c < C; ++c) {
+            for (size_t od = 0; od < D_out; ++od) {
+                const size_t d_start = od * stride_d;
+                const size_t d_end = std::min(d_start + kernel_d, D_in);
+
+                for (size_t oh = 0; oh < H_out; ++oh) {
+                    const size_t h_start = oh * stride_h;
+                    const size_t h_end = std::min(h_start + kernel_h, H_in);
+
+                    for (size_t ow = 0; ow < W_out; ++ow) {
+                        const size_t w_start = ow * stride_w;
+                        const size_t w_end = std::min(w_start + kernel_w, W_in);
+
+                        double sum_power = 0.0;
+
+                        // Accumulate the valid elements
+                        for (size_t id = d_start; id < d_end; ++id) {
+                            for (size_t ih = h_start; ih < h_end; ++ih) {
+                                for (size_t iw = w_start; iw < w_end; ++iw) {
+                                    const size_t offset = n * stride_N + c * stride_C + id * stride_D + ih * stride_H + iw * stride_W;
+
+                                    double val = 0.0;
+                                    if (dtype == DataType::F32) {
+                                        auto *ptr = reinterpret_cast<float *>(input_base + offset * element_size);
+                                        val = static_cast<double>(*ptr);
+                                    } else if (dtype == DataType::F64) {
+                                        auto *ptr = reinterpret_cast<double *>(input_base + offset * element_size);
+                                        val = *ptr;
+                                    } else if (dtype == DataType::F16) {
+                                        auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * element_size);
+                                        val = static_cast<double>(utils::cast<float>(*ptr));
+                                    }
+
+                                    sum_power += std::pow(std::abs(val), norm_type);
+                                }
+                            }
+                        }
+
+                        // ceil_mode: scale the partial window up to a full window
+                        const size_t valid_d = d_end - d_start;
+                        const size_t valid_h = h_end - h_start;
+                        const size_t valid_w = w_end - w_start;
+                        const size_t valid_cnt = valid_d * valid_h * valid_w;
+                        const size_t full_cnt = kernel_d * kernel_h * kernel_w;
+
+                        double scale = 1.0;
+                        if (ceil_mode && valid_cnt < full_cnt) {
+                            scale = static_cast<double>(full_cnt) / static_cast<double>(valid_cnt);
+                        }
+
+                        const double norm = std::pow(sum_power * scale, 1.0 / norm_type);
+
+                        const size_t out_offset = n * (C * D_out * H_out * W_out) + c * (D_out * H_out * W_out) + od * (H_out * W_out) + oh * W_out + ow;
+                        if (dtype == DataType::F32) {
+                            auto *ptr = reinterpret_cast<float *>(output_base + out_offset * element_size);
+                            *ptr = static_cast<float>(norm);
+                        } else if (dtype == DataType::F64) {
+                            auto *ptr = reinterpret_cast<double *>(output_base + out_offset * element_size);
+                            *ptr = norm;
+                        } else if (dtype == DataType::F16) {
+                            auto *ptr = reinterpret_cast<fp16_t *>(output_base + out_offset * element_size);
+                            *ptr = utils::cast<fp16_t>(static_cast<float>(norm));
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static bool registered = []() {
+    Lp_Pool3d::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::lp_pool3d_impl::cpu
diff --git a/src/infinicore/ops/max_global/max_global.cc b/src/infinicore/ops/max_global/max_global.cc
new file mode 100644
index 000000000..df9a60131
--- /dev/null
+++ b/src/infinicore/ops/max_global/max_global.cc
@@ -0,0 +1,28 @@
+#include "infinicore/ops/max_global.hpp"
+
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<MaxGlobal::schema> &MaxGlobal::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void MaxGlobal::execute(Tensor input, Tensor output) {
+    infinicore::context::setDevice(input->device(), true);
+    dispatcher().lookup(input->device().getType())(input, output);
+}
+
+Tensor max_global(Tensor input) {
+    Shape shape = Shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    max_global_(input, output);
+    return output;
+}
+
+void max_global_(Tensor input, Tensor output) {
+    MaxGlobal::execute(input, output);
+}
+
+} // namespace infinicore::op
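The CPU kernel that follows visits a possibly non-contiguous tensor with an odometer-style index vector, converting coordinates to a flat element offset through the strides. A Python model of that traversal:

```python
def strided_offsets(shape, strides):
    """Yield flat element offsets in logical order for a strided layout."""
    idx = [0] * len(shape)
    total = 1
    for s in shape:
        total *= s
    for _ in range(total):
        yield sum(i * st for i, st in zip(idx, strides))
        for d in reversed(range(len(shape))):  # odometer increment with carry
            idx[d] += 1
            if idx[d] < shape[d]:
                break
            idx[d] = 0

assert list(strided_offsets((2, 2), (2, 1))) == [0, 1, 2, 3]  # contiguous row-major
```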
diff --git a/src/infinicore/ops/max_global/max_global_cpu.cc b/src/infinicore/ops/max_global/max_global_cpu.cc
new file mode 100644
index 000000000..bb35661f7
--- /dev/null
+++ b/src/infinicore/ops/max_global/max_global_cpu.cc
@@ -0,0 +1,88 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/max_global.hpp"
+#include <algorithm>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::max_global_impl::cpu {
+
+void calculate(Tensor input, Tensor output) {
+    auto strides = input->strides(); // per-dim strides
+    auto shapes = input->shape();    // per-dim sizes
+    auto ndim = input->ndim();
+    auto dtype = input->dtype();
+    auto dtype_size = input->element_size();
+    auto numel = input->numel();
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+
+    // ---- max accumulators (hoisted out of the dtype branches) ----
+    float max_f32 = -std::numeric_limits<float>::infinity();
+    double max_f64 = -std::numeric_limits<double>::infinity();
+    float max_f16f = -std::numeric_limits<float>::infinity(); // F16 accumulates in F32
+
+    // ---- traverse all elements, branching on dtype ----
+
+    // Initialize the index vector
+    std::vector<size_t> indices(ndim, 0);
+
+    for (size_t idx = 0; idx < numel; ++idx) {
+        size_t offset = 0;
+        for (size_t dim = 0; dim < ndim; ++dim) {
+            offset += indices[dim] * strides[dim];
+        }
+
+        if (dtype == DataType::F32) {
+            auto *ptr = reinterpret_cast<float *>(input_base + offset * dtype_size);
+            float v = *ptr;
+            max_f32 = std::max(max_f32, v);
+
+        } else if (dtype == DataType::F64) {
+            auto *ptr = reinterpret_cast<double *>(input_base + offset * dtype_size);
+            double v = *ptr;
+            max_f64 = std::max(max_f64, v);
+
+        } else if (dtype == DataType::F16) {
+            auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * dtype_size);
+            float v = utils::cast<float>(*ptr);
+            max_f16f = std::max(max_f16f, v);
+
+        } else {
+            throw std::runtime_error("Unsupported dtype.");
+        }
+
+        // Advance the index vector (odometer increment with carry)
+        for (ssize_t dim = ndim - 1; dim >= 0; --dim) {
+            indices[dim]++;
+            if (indices[dim] < shapes[dim])
+                break;
+            indices[dim] = 0;
+        }
+    }
+
+    // ---- write the scalar output ----
+    if (dtype == DataType::F32) {
+        auto *out = reinterpret_cast<float *>(output_base);
+        *out = max_f32;
+
+    } else if (dtype == DataType::F64) {
+        auto *out = reinterpret_cast<double *>(output_base);
+        *out = max_f64;
+
+    } else if (dtype == DataType::F16) {
+        auto *out = reinterpret_cast<fp16_t *>(output_base);
+        *out = utils::cast<fp16_t>(max_f16f);
+
+    } else {
+        throw std::runtime_error("Unsupported dtype.");
+    }
+}
+
+static bool registered = []() {
+    MaxGlobal::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::max_global_impl::cpu
diff --git a/src/infinicore/ops/max_reduce/max_reduce.cc b/src/infinicore/ops/max_reduce/max_reduce.cc
new file mode 100644
index 000000000..7820afbf8
--- /dev/null
+++ b/src/infinicore/ops/max_reduce/max_reduce.cc
@@ -0,0 +1,50 @@
+#include "infinicore/ops/max_reduce.hpp"
+
+#include "../../utils.hpp"
+#include "infinicore/dtype.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<MaxReduce::schema> &MaxReduce::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void MaxReduce::execute(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim) {
+    infinicore::context::setDevice(input->device(), true);
+    dispatcher().lookup(input->device().getType())(input, output, indices, dim, keepdim);
+}
+
+std::tuple<Tensor, Tensor> max_reduce(Tensor input, int dim, bool keepdim) {
+    // Normalize dim
+    int normalized_dim = dim;
+    if (normalized_dim < 0) {
+        normalized_dim = input->ndim() + normalized_dim;
+    }
+
+    // Compute the output shape
+    Shape output_shape;
+    const auto &input_shape = input->shape();
+
+    if (keepdim) {
+        output_shape = input_shape;
+        output_shape[normalized_dim] = 1;
+    } else {
+        for (int i = 0; i < static_cast<int>(input_shape.size()); ++i) {
+            if (i != normalized_dim) {
+                output_shape.push_back(input_shape[i]);
+            }
+        }
+    }
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    auto indices = Tensor::empty(output_shape, DataType::I64, input->device());
+    max_reduce_(input, output, indices, dim, keepdim);
+    return {output, indices};
+}
+
+void max_reduce_(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim) {
+    MaxReduce::execute(input, output, indices, dim, keepdim);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/max_reduce/max_reduce_cpu.cc b/src/infinicore/ops/max_reduce/max_reduce_cpu.cc
new file mode 100644
index 000000000..b4216d920
--- /dev/null
+++ b/src/infinicore/ops/max_reduce/max_reduce_cpu.cc
@@ -0,0 +1,124 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/max_reduce.hpp"
+#include <cstdint>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::max_reduce_impl::cpu {
+
+void calculate(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim) {
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto output_strides = output->strides();
+    auto indices_shapes = indices->shape();
+    auto indices_strides = indices->strides();
+    auto ndim = input->ndim();
+    auto dtype = input->dtype();
+    auto dtype_size = input->element_size();
+    auto indices_size = indices->element_size();
+
+    // Normalize dim into [0, ndim)
+    if (dim < 0) {
+        dim = ndim + dim;
+    }
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+    auto indices_base = indices->data();
+
+    // Size of the reduced dimension
+    size_t reduce_size = input_shapes[dim];
+    size_t output_numel = output->numel();
+
+// For each output element, compute the max along dim
+#pragma omp parallel for collapse(1)
+    for (size_t output_idx = 0; output_idx < output_numel; ++output_idx) {
+        // Recover the multi-dimensional coordinates from the flat output index
+        std::vector<size_t> output_indices(output_shapes.size());
+        size_t temp_idx = output_idx;
+        for (int i = static_cast<int>(output_shapes.size()) - 1; i >= 0; --i) {
+            output_indices[i] = temp_idx % output_shapes[i];
+            temp_idx /= output_shapes[i];
+        }
+
+        // Map output coordinates back to input coordinates.
+        // keepdim=true: output shape == input shape, with the reduced dim set to 1.
+        // keepdim=false: output shape is the input shape minus the reduced dim.
+        std::vector<size_t> input_indices(ndim);
+        if (keepdim) {
+            // Direct correspondence between output and input dims
+            for (int i = 0; i < ndim; ++i) {
+                if (i == dim) {
+                    input_indices[i] = 0; // reduced dim starts at 0 (iterated below)
+                } else {
+                    input_indices[i] = output_indices[i];
+                }
+            }
+        } else {
+            // Skip the reduced dim: the output is missing one dimension
+            int output_dim = 0;
+            for (int i = 0; i < ndim; ++i) {
+                if (i == dim) {
+                    input_indices[i] = 0; // reduced dim starts at 0
+                } else {
+                    input_indices[i] = output_indices[output_dim];
+                    output_dim++;
+                }
+            }
+        }
+
+        // Starting offset into the input
+        size_t offset = 0;
+        for (int i = 0; i < ndim; ++i) {
+            if (i != dim) { // skip the reduced dim
+                offset += input_indices[i] * input_strides[i];
+            }
+        }
+
+        // Compute the max along the reduce dimension
+        if (dtype == DataType::F32) {
+            float max_val = -std::numeric_limits<float>::infinity();
+            size_t max_idx = -1;
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                float *input_ptr = reinterpret_cast<float *>(input_base + current_offset * dtype_size);
+                if (max_val < *input_ptr) {
+                    max_val = *input_ptr;
+                    max_idx = reduce_idx;
+                }
+            }
+            float *output_ptr = reinterpret_cast<float *>(output_base + output_idx * dtype_size);
+            int64_t *indices_ptr = reinterpret_cast<int64_t *>(indices_base + output_idx * indices_size);
+            *output_ptr = max_val;
+            *indices_ptr = static_cast<int64_t>(max_idx);
+
+        } else if (dtype == DataType::F16) {
+            auto max_val = -std::numeric_limits<float>::infinity();
+            size_t max_idx = -1;
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                auto *input_ptr = reinterpret_cast<fp16_t *>(input_base + current_offset * dtype_size);
+                auto val_f32 = utils::cast<float>(*input_ptr);
+                if (max_val < val_f32) {
+                    max_val = val_f32;
+                    max_idx = reduce_idx;
+                }
+            }
+            auto *output_ptr = reinterpret_cast<fp16_t *>(output_base + output_idx * dtype_size);
+            int64_t *indices_ptr = reinterpret_cast<int64_t *>(indices_base + output_idx * indices_size);
+            *output_ptr = utils::cast<fp16_t>(max_val);
+            *indices_ptr = static_cast<int64_t>(max_idx);
+
+        } else {
+            throw std::runtime_error("Unsupported data type for max operation.");
+        }
+    }
+}
+
+static bool registered = []() {
+    MaxReduce::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::max_reduce_impl::cpu
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 978defa17..dd99865e6 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -7,7 +7,12 @@
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
 #include "ops/linear.hpp"
+#include "ops/logsumexp.hpp"
+#include "ops/lp_pool1d.hpp"
+#include "ops/lp_pool2d.hpp"
+#include "ops/lp_pool3d.hpp"
 #include "ops/matmul.hpp"
+#include "ops/max.hpp"
 #include "ops/mul.hpp"
 #include "ops/random_sample.hpp"
 #include "ops/rearrange.hpp"
@@ -34,6 +39,11 @@
     bind_swiglu(m);
     bind_rope(m);
     bind_embedding(m);
+    bind_logsumexp(m);
+    bind_lp_pool1d(m);
+    bind_lp_pool2d(m);
+    bind_lp_pool3d(m);
+    bind_max(m);
 }
 
 } // namespace infinicore::ops
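Each `*_cpu.cc` file above registers its kernel with the op's dispatcher through a static initializer, and `execute()` resolves the implementation by device type at call time. A Python analogue of that registration pattern (illustrative only; names here are not part of the patch):

```python
_DISPATCH = {}

def register(device_type):
    def decorator(fn):
        _DISPATCH[device_type] = fn  # runs at import time, like the static bool
        return fn
    return decorator

@register("cpu")
def logsumexp_cpu(input, dim, keepdim, output):
    ...  # kernel body elided

def execute(device_type, *args):
    fn = _DISPATCH.get(device_type)
    if fn is None:
        raise RuntimeError(f"No implementation found for device type: {device_type}")
    return fn(*args)
```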
diff --git a/src/infinicore/pybind11/ops/logsumexp.hpp b/src/infinicore/pybind11/ops/logsumexp.hpp
new file mode 100644
index 000000000..1a9bce049
--- /dev/null
+++ b/src/infinicore/pybind11/ops/logsumexp.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/logsumexp.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_logsumexp(py::module &m) {
+    m.def("logsumexp",
+          &op::logsumexp,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("keepdim"),
+          R"doc(Logarithm of the sum of exponentials of the input tensor.)doc");
+
+    m.def("logsumexp_",
+          &op::logsumexp_,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("keepdim"),
+          py::arg("output"),
+          R"doc(In-place logarithm of the sum of exponentials of the input tensor.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/lp_pool1d.hpp b/src/infinicore/pybind11/ops/lp_pool1d.hpp
new file mode 100644
index 000000000..4585f08e2
--- /dev/null
+++ b/src/infinicore/pybind11/ops/lp_pool1d.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/lp_pool1d.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_lp_pool1d(py::module &m) {
+    m.def("lp_pool1d",
+          &op::lp_pool1d,
+          py::arg("input"),
+          py::arg("norm_type"),
+          py::arg("kernel_size"),
+          py::arg("stride") = 0,
+          py::arg("ceil_mode") = false,
+          R"doc(Applies a 1D power-average pooling over an input signal composed of several input planes.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/lp_pool2d.hpp b/src/infinicore/pybind11/ops/lp_pool2d.hpp
new file mode 100644
index 000000000..87af4497b
--- /dev/null
+++ b/src/infinicore/pybind11/ops/lp_pool2d.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/lp_pool2d.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_lp_pool2d(py::module &m) {
+    m.def("lp_pool2d",
+          &op::lp_pool2d,
+          py::arg("input"),
+          py::arg("norm_type"),
+          py::arg("kernel_size"),
+          py::arg("stride") = py::none(),
+          py::arg("ceil_mode") = false,
+          R"doc(Applies a 2D power-average pooling over an input signal composed of several input planes.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/lp_pool3d.hpp b/src/infinicore/pybind11/ops/lp_pool3d.hpp
new file mode 100644
index 000000000..4648d047c
--- /dev/null
+++ b/src/infinicore/pybind11/ops/lp_pool3d.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/lp_pool3d.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_lp_pool3d(py::module &m) {
+    m.def("lp_pool3d",
+          &op::lp_pool3d,
+          py::arg("input"),
+          py::arg("norm_type"),
+          py::arg("kernel_size"),
+          py::arg("stride") = py::none(),
+          py::arg("ceil_mode") = false,
+          R"doc(Applies a 3D power-average pooling over an input signal composed of several input planes.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/max.hpp b/src/infinicore/pybind11/ops/max.hpp
new file mode 100644
index 000000000..050bb784b
--- /dev/null
+++ b/src/infinicore/pybind11/ops/max.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/max_global.hpp"
+#include "infinicore/ops/max_reduce.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_max(py::module &m) {
+    m.def("max_reduce",
+          &op::max_reduce,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("keepdim") = false,
+          R"doc(Reduces the input tensor along the specified dimension by taking the maximum value.)doc");
+
+    m.def("max_reduce_",
+          &op::max_reduce_,
+          py::arg("input"),
+          py::arg("output"),
+          py::arg("indices"),
+          py::arg("dim"),
+          py::arg("keepdim") = false,
+          R"doc(In-place max reduction along the specified dimension.)doc");
+
+    m.def("max_global",
+          &op::max_global,
+          py::arg("input"),
+          R"doc(Reduces the input tensor globally by taking the maximum value across all elements.)doc");
+
+    m.def("max_global_",
+          &op::max_global_,
+          py::arg("input"),
+          py::arg("output"),
+          R"doc(In-place global max reduction.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/test/infinicore/ops/logsumexp.py b/test/infinicore/ops/logsumexp.py
index ff5df7a61..9ddbb82a2 100644
--- a/test/infinicore/ops/logsumexp.py
+++ b/test/infinicore/ops/logsumexp.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import (
     BaseOperatorTest,
     TensorSpec,
@@ -22,7 +22,7 @@
     ((2, 3, 4), None, 2, True, (0, 1, 1)),
     ((1, 8), None, 0, False, None),
     ((16, 64), (128, 1), 1, True, None),
-    ((4, 5, 6), (60, 12, 2), 2, True, (12, 4, 1)),
+    # ((4, 5, 6), (60, 12, 2), 2, True, (12, 4, 1)),  # this test case makes strided elements overlap in memory
 ]
 
 _TOLERANCE_MAP = {
@@ -104,9 +104,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.logsumexp(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.logsumexp(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.logsumexp(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/lp_pool1d.py b/test/infinicore/ops/lp_pool1d.py
index 6efd30de2..856c020a2 100644
--- a/test/infinicore/ops/lp_pool1d.py
+++ b/test/infinicore/ops/lp_pool1d.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
 
 # Test cases format: (in_shape, in_strides_or_None, norm_type, kernel_size, stride_or_None, ceil_mode)
@@ -60,9 +60,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.lp_pool1d(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.lp_pool1d(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.nn.functional.lp_pool1d(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/lp_pool2d.py b/test/infinicore/ops/lp_pool2d.py
index 4d8ec03cd..8a1d3bfd8 100644
--- a/test/infinicore/ops/lp_pool2d.py
+++ b/test/infinicore/ops/lp_pool2d.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
 
 # Test cases format: (in_shape, in_strides_or_None, norm_type, kernel_size, stride_or_None, ceil_mode)
@@ -60,9 +60,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.lp_pool2d(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.lp_pool2d(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.nn.functional.lp_pool2d(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/lp_pool3d.py b/test/infinicore/ops/lp_pool3d.py
index c3acc22a9..0bda4ae62 100644
--- a/test/infinicore/ops/lp_pool3d.py
+++ b/test/infinicore/ops/lp_pool3d.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
 
 # Test cases format: (in_shape, in_strides_or_None, norm_type, kernel_size, stride_or_None, ceil_mode)
@@ -57,9 +57,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.lp_pool3d(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.lp_pool3d(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.nn.functional.lp_pool3d(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/max.py b/test/infinicore/ops/max.py
index 89308c0f5..d45f725e0 100644
--- a/test/infinicore/ops/max.py
+++ b/test/infinicore/ops/max.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import (
     BaseOperatorTest,
     TensorSpec,
@@ -111,9 +111,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.max(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.max(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.max(*args, **kwargs)
 
 
 def main():
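Taken together, the patch exposes five new Python entry points: infinicore.logsumexp, infinicore.max, and infinicore.nn.functional.lp_pool1d/2d/3d. A hedged end-to-end sketch; the `zeros` factory call is assumed torch-like and is not part of this diff:

```python
import infinicore
import infinicore.nn.functional as F

x = infinicore.zeros((2, 3, 8, 8))                      # hypothetical factory signature
lse = infinicore.logsumexp(x, dim=1, keepdim=True)      # shape (2, 1, 8, 8)
gmax = infinicore.max(x)                                # 0-d global max
vals, idx = infinicore.max(x, dim=-1)                   # values plus int64 indices
pooled = F.lp_pool2d(x, norm_type=2.0, kernel_size=2)   # shape (2, 3, 4, 4)
```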