diff --git a/include/infinicore/ops/logsumexp.hpp b/include/infinicore/ops/logsumexp.hpp
new file mode 100644
index 000000000..aa9b359c8
--- /dev/null
+++ b/include/infinicore/ops/logsumexp.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class LogSumExp {
+public:
+    using schema = void (*)(Tensor, int, bool, Tensor);
+    static void execute(Tensor input, int dim, bool keepdim, Tensor output);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor logsumexp(Tensor input, int dim, bool keepdim);
+void logsumexp_(Tensor input, int dim, bool keepdim, Tensor output);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/lp_pool1d.hpp b/include/infinicore/ops/lp_pool1d.hpp
new file mode 100644
index 000000000..71704059a
--- /dev/null
+++ b/include/infinicore/ops/lp_pool1d.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Lp_Pool1d {
+public:
+    using schema = void (*)(Tensor, Tensor, float, size_t, size_t, bool);
+    static void execute(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor lp_pool1d(Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode);
+void lp_pool1d_(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/lp_pool2d.hpp b/include/infinicore/ops/lp_pool2d.hpp
new file mode 100644
index 000000000..8523fd71b
--- /dev/null
+++ b/include/infinicore/ops/lp_pool2d.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <tuple>
+
+namespace infinicore::op {
+class Lp_Pool2d {
+public:
+    using schema = void (*)(Tensor, Tensor, float, const std::tuple<size_t, size_t>, const std::tuple<size_t, size_t>, bool);
+    static void execute(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor lp_pool2d(Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode);
+void lp_pool2d_(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/lp_pool3d.hpp b/include/infinicore/ops/lp_pool3d.hpp
new file mode 100644
index 000000000..7ad3facab
--- /dev/null
+++ b/include/infinicore/ops/lp_pool3d.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <tuple>
+
+namespace infinicore::op {
+class Lp_Pool3d {
+public:
+    using schema = void (*)(Tensor, Tensor, float, const std::tuple<size_t, size_t, size_t>, const std::tuple<size_t, size_t, size_t>, bool);
+    static void execute(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor lp_pool3d(Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode);
+void lp_pool3d_(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/max_global.hpp b/include/infinicore/ops/max_global.hpp
new file mode 100644
index 000000000..91707c679
--- /dev/null
+++ b/include/infinicore/ops/max_global.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class MaxGlobal {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor input, Tensor output);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor max_global(Tensor input);
+void max_global_(Tensor input, Tensor output);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/max_reduce.hpp b/include/infinicore/ops/max_reduce.hpp
new file mode 100644
index 000000000..9919a1650
--- /dev/null
+++ b/include/infinicore/ops/max_reduce.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <tuple>
+
+namespace infinicore::op {
+
+class MaxReduce {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, int, bool);
+    static void execute(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+std::tuple<Tensor, Tensor> max_reduce(Tensor input, int dim, bool keepdim);
+void max_reduce_(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim);
+
+} // namespace infinicore::op
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 5c541ec3c..0827a98a6 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -45,6 +45,8 @@
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.logsumexp import logsumexp
+from infinicore.ops.max import max
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -115,6 +117,8 @@
     "strided_empty",
     "strided_from_blob",
     "zeros",
+    "logsumexp",
+    "max",
 ]
 
 use_ntops = False
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..4d6b2568a 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,9 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .lp_pool1d import lp_pool1d
+from .lp_pool2d import lp_pool2d
+from .lp_pool3d import lp_pool3d
 
 __all__ = [
     "causal_softmax",
@@ -17,4 +20,7 @@
     "embedding",
     "rope",
     "RopeAlgo",
+    "lp_pool1d",
+    "lp_pool2d",
+    "lp_pool3d",
 ]
diff --git a/python/infinicore/nn/functional/lp_pool1d.py b/python/infinicore/nn/functional/lp_pool1d.py
new file mode 100644
index 000000000..b8970f3a2
--- /dev/null
+++ b/python/infinicore/nn/functional/lp_pool1d.py
@@ -0,0 +1,27 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def lp_pool1d(
+    input: Tensor,
+    norm_type: float,
+    kernel_size: int,
+    stride: int | None = None,
+    ceil_mode: bool = False,
+):
+    r"""Applies a 1D power-average pooling over an input signal composed of several input planes."""
+
+    if stride is None:
+        stride = kernel_size
+
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.lp_pool1d(
+            input, norm_type, kernel_size, stride, ceil_mode
+        )
+
+    return Tensor(
+        _infinicore.lp_pool1d(
+            input._underlying, norm_type, kernel_size, stride, ceil_mode
+        )
+    )
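The wrapper above follows the torch.nn.functional.lp_pool1d conventions: an omitted stride defaults to kernel_size, an ntops fast path is tried on CUDA/MUSA devices, and the CPU path falls through to the `_infinicore` binding. A minimal usage sketch; the `zeros` factory call below is assumed torch-like and is illustrative only:

```python
import infinicore
import infinicore.nn.functional as F

# (N, C, L_in) = (2, 4, 16); stride is omitted, so it defaults to kernel_size=4
x = infinicore.zeros((2, 4, 16))  # hypothetical torch-like factory signature
y = F.lp_pool1d(x, norm_type=2.0, kernel_size=4)  # expected shape: (2, 4, 4)
```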
diff --git a/python/infinicore/nn/functional/lp_pool2d.py b/python/infinicore/nn/functional/lp_pool2d.py
new file mode 100644
index 000000000..d51d13255
--- /dev/null
+++ b/python/infinicore/nn/functional/lp_pool2d.py
@@ -0,0 +1,32 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def lp_pool2d(
+    input: Tensor,
+    norm_type: float,
+    kernel_size: int | tuple[int, int],
+    stride: int | tuple[int, int] | None = None,
+    ceil_mode: bool = False,
+):
+    r"""Applies a 2D power-average pooling over an input signal composed of several input planes."""
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+
+    if isinstance(stride, int):
+        stride = (stride, stride)
+
+    if stride is None:
+        stride = kernel_size
+
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.lp_pool2d(
+            input, norm_type, kernel_size, stride, ceil_mode
+        )
+
+    return Tensor(
+        _infinicore.lp_pool2d(
+            input._underlying, norm_type, kernel_size, stride, ceil_mode
+        )
+    )
diff --git a/python/infinicore/nn/functional/lp_pool3d.py b/python/infinicore/nn/functional/lp_pool3d.py
new file mode 100644
index 000000000..42d97c58c
--- /dev/null
+++ b/python/infinicore/nn/functional/lp_pool3d.py
@@ -0,0 +1,32 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def lp_pool3d(
+    input: Tensor,
+    norm_type: float,
+    kernel_size: int | tuple[int, int, int],
+    stride: int | tuple[int, int, int] | None = None,
+    ceil_mode: bool = False,
+):
+    r"""Applies a 3D power-average pooling over an input signal composed of several input planes."""
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size, kernel_size)
+
+    if isinstance(stride, int):
+        stride = (stride, stride, stride)
+
+    if stride is None:
+        stride = kernel_size
+
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.lp_pool3d(
+            input, norm_type, kernel_size, stride, ceil_mode
+        )
+
+    return Tensor(
+        _infinicore.lp_pool3d(
+            input._underlying, norm_type, kernel_size, stride, ceil_mode
+        )
+    )
diff --git a/python/infinicore/ops/logsumexp.py b/python/infinicore/ops/logsumexp.py
new file mode 100644
index 000000000..751f13c06
--- /dev/null
+++ b/python/infinicore/ops/logsumexp.py
@@ -0,0 +1,19 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def logsumexp(
+    input: Tensor, dim: int | None = None, keepdim=False, *, out=None
+) -> Tensor:
+    r"""Apply the logsumexp function."""
+
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.logsumexp(input, dim, keepdim=keepdim, out=out)
+
+    if out is None:
+        return Tensor(_infinicore.logsumexp(input._underlying, dim, keepdim))
+
+    _infinicore.logsumexp_(input._underlying, dim, keepdim, out._underlying)
+
+    return out
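The logsumexp wrapper above and the max wrapper that follows mirror the torch calling conventions: `max` returns a single tensor when `dim` is None and a `(values, indices)` pair otherwise. A hedged sketch of the call forms (the `zeros` factory is assumed torch-like; return types follow the code in this patch):

```python
import infinicore

x = infinicore.zeros((2, 3, 4))  # hypothetical torch-like factory signature
lse = infinicore.logsumexp(x, dim=-1, keepdim=True)  # Tensor, shape (2, 3, 1)
gmax = infinicore.max(x)                             # dim=None -> 0-d global max
values, indices = infinicore.max(x, dim=1)           # dim given -> (values, indices)
```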
diff --git a/python/infinicore/ops/max.py b/python/infinicore/ops/max.py
new file mode 100644
index 000000000..d643dae7c
--- /dev/null
+++ b/python/infinicore/ops/max.py
@@ -0,0 +1,33 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def max(
+    input: Tensor, dim: int | None = None, keepdim=False, *, out=None
+) -> Tensor | tuple[Tensor, Tensor]:
+    r"""Apply the max function."""
+
+    if dim is None:
+        if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+            return infinicore.ntops.torch.max(input, out=out)
+
+        if out is None:
+            return Tensor(_infinicore.max_global(input._underlying))
+
+        _infinicore.max_global_(input._underlying, out._underlying)
+
+        return out
+    else:
+        if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+            return infinicore.ntops.torch.max(input, dim, keepdim=keepdim, out=out)
+
+        if out is None:
+            res, res_idx = _infinicore.max_reduce(input._underlying, dim, keepdim)
+            return Tensor(res), Tensor(res_idx)
+
+        _infinicore.max_reduce_(
+            input._underlying, out[0]._underlying, out[1]._underlying, dim, keepdim
+        )
+
+        return out
diff --git a/src/infinicore/ops/logsumexp/logsumexp.cc b/src/infinicore/ops/logsumexp/logsumexp.cc
new file mode 100644
index 000000000..0026ad45b
--- /dev/null
+++ b/src/infinicore/ops/logsumexp/logsumexp.cc
@@ -0,0 +1,54 @@
+#include "infinicore/ops/logsumexp.hpp"
+#include <stdexcept>
+#include <string>
+
+namespace infinicore::op {
+
+common::OpDispatcher<LogSumExp::schema> &LogSumExp::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void LogSumExp::execute(Tensor input, int dim, bool keepdim, Tensor output) {
+    infinicore::context::setDevice(input->device(), true);
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No LogSumExp implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(input, dim, keepdim, output);
+}
+
+Tensor logsumexp(Tensor input, int dim, bool keepdim) {
+    // Normalize dim
+    int normalized_dim = dim;
+    if (normalized_dim < 0) {
+        normalized_dim = input->ndim() + normalized_dim;
+    }
+
+    // Compute the output shape
+    Shape output_shape;
+    const auto &input_shape = input->shape();
+
+    if (keepdim) {
+        output_shape = input_shape;
+        output_shape[normalized_dim] = 1;
+    } else {
+        for (int i = 0; i < static_cast<int>(input_shape.size()); ++i) {
+            if (i != normalized_dim) {
+                output_shape.push_back(input_shape[i]);
+            }
+        }
+    }
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    logsumexp_(input, dim, keepdim, output);
+    return output;
+}
+
+void logsumexp_(Tensor input, int dim, bool keepdim, Tensor output) {
+    LogSumExp::execute(input, dim, keepdim, output);
+}
+} // namespace infinicore::op
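The CPU kernel that follows uses the standard max-shift for numerical stability: in exact arithmetic, log Σ exp(x_i) = m + log Σ exp(x_i − m) with m = max_i x_i, but the shifted form never overflows. A self-contained check of the identity:

```python
import math

def logsumexp_1d(xs):
    m = max(xs)  # shift by the maximum before exponentiating
    return m + math.log(sum(math.exp(x - m) for x in xs))

# the naive log(sum(exp(x))) overflows for inputs like 1000.0; the shifted form does not
assert abs(logsumexp_1d([1000.0, 1000.0]) - (1000.0 + math.log(2.0))) < 1e-9
```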
diff --git a/src/infinicore/ops/logsumexp/logsumexp_cpu.cc b/src/infinicore/ops/logsumexp/logsumexp_cpu.cc
new file mode 100644
index 000000000..14a5df5c6
--- /dev/null
+++ b/src/infinicore/ops/logsumexp/logsumexp_cpu.cc
@@ -0,0 +1,141 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/logsumexp.hpp"
+#include "infinicore/tensor.hpp"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::logsumexp_impl::cpu {
+
+void calculate(Tensor input, int dim, bool keepdim, Tensor output) {
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto output_strides = output->strides();
+    auto ndim = input->ndim();
+    auto dtype = input->dtype();
+    auto dtype_size = input->element_size();
+
+    // Normalize dim into [0, ndim)
+    if (dim < 0) {
+        dim = ndim + dim;
+    }
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+
+    // Size of the reduced dimension
+    size_t reduce_size = input_shapes[dim];
+    size_t output_numel = output->numel();
+
+// For each output element, compute logsumexp along dim
+#pragma omp parallel for collapse(1)
+    for (size_t output_idx = 0; output_idx < output_numel; ++output_idx) {
+        // Recover the multi-dimensional coordinates from the flat output index
+        std::vector<size_t> output_indices(output_shapes.size());
+        size_t temp_idx = output_idx;
+        for (int i = static_cast<int>(output_shapes.size()) - 1; i >= 0; --i) {
+            output_indices[i] = temp_idx % output_shapes[i];
+            temp_idx /= output_shapes[i];
+        }
+
+        // Map output coordinates back to input coordinates.
+        // keepdim=true: output shape == input shape, with the reduced dim set to 1.
+        // keepdim=false: output shape is the input shape minus the reduced dim.
+        std::vector<size_t> input_indices(ndim);
+        if (keepdim) {
+            // Direct correspondence between output and input dims
+            for (int i = 0; i < ndim; ++i) {
+                if (i == dim) {
+                    input_indices[i] = 0; // reduced dim starts at 0 (iterated below)
+                } else {
+                    input_indices[i] = output_indices[i];
+                }
+            }
+        } else {
+            // Skip the reduced dim: the output is missing one dimension
+            int output_dim = 0;
+            for (int i = 0; i < ndim; ++i) {
+                if (i == dim) {
+                    input_indices[i] = 0; // reduced dim starts at 0
+                } else {
+                    input_indices[i] = output_indices[output_dim];
+                    output_dim++;
+                }
+            }
+        }
+
+        // Starting offset into the input
+        size_t offset = 0;
+        for (int i = 0; i < ndim; ++i) {
+            if (i != dim) { // skip the reduced dim
+                offset += input_indices[i] * input_strides[i];
+            }
+        }
+
+        // Compute logsumexp along the reduce dimension
+        if (dtype == DataType::F32) {
+            float max_val = -std::numeric_limits<float>::infinity();
+
+            // First pass: find the maximum
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                float *input_ptr = reinterpret_cast<float *>(input_base + current_offset * dtype_size);
+                max_val = std::max(max_val, *input_ptr);
+            }
+
+            // Second pass: compute sum(exp(x - max))
+            float sum_exp = 0.0f;
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                float *input_ptr = reinterpret_cast<float *>(input_base + current_offset * dtype_size);
+                sum_exp += std::exp(*input_ptr - max_val);
+            }
+
+            // Result: log(sum(exp(x))) = max + log(sum(exp(x - max)))
+            float result = max_val + std::log(sum_exp);
+
+            float *output_ptr = reinterpret_cast<float *>(output_base + output_idx * dtype_size);
+            *output_ptr = result;
+
+        } else if (dtype == DataType::F16) {
+            float max_val = -std::numeric_limits<float>::infinity();
+
+            // First pass: find the maximum (accumulated in F32)
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                auto *input_ptr = reinterpret_cast<fp16_t *>(input_base + current_offset * dtype_size);
+                float val_f32 = utils::cast<float>(*input_ptr);
+                max_val = std::max(max_val, val_f32);
+            }
+
+            // Second pass: compute sum(exp(x - max))
+            float sum_exp = 0.0f;
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                auto *input_ptr = reinterpret_cast<fp16_t *>(input_base + current_offset * dtype_size);
+                float val_f32 = utils::cast<float>(*input_ptr);
+                sum_exp += std::exp(val_f32 - max_val);
+            }
+
+            // Result: log(sum(exp(x))) = max + log(sum(exp(x - max)))
+            float result = max_val + std::log(sum_exp);
+
+            auto *output_ptr = reinterpret_cast<fp16_t *>(output_base + output_idx * dtype_size);
+            *output_ptr = utils::cast<fp16_t>(result);
+
+        } else {
+            throw std::runtime_error("Unsupported data type for logsumexp operation.");
+        }
+    }
+}
+
+static bool registered = []() {
+    LogSumExp::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::logsumexp_impl::cpu
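The Lp pooling kernels in the files below all reduce each window to (Σ |x|^p)^(1/p): p = 2 is the Euclidean norm of the window, and large p approaches max pooling. A reference one-liner matching what the CPU kernels in this patch compute per window:

```python
def lp_window(xs, p):
    """Lp norm of one pooling window, as computed by the CPU kernels in this patch."""
    return sum(abs(x) ** p for x in xs) ** (1.0 / p)

assert lp_window([3.0, 4.0], 2.0) == 5.0  # 2-norm of a 2-element window
```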
diff --git a/src/infinicore/ops/lp_pool1d/lp_pool1d.cc b/src/infinicore/ops/lp_pool1d/lp_pool1d.cc
new file mode 100644
index 000000000..7e5ad7e20
--- /dev/null
+++ b/src/infinicore/ops/lp_pool1d/lp_pool1d.cc
@@ -0,0 +1,55 @@
+#include "infinicore/ops/lp_pool1d.hpp"
+#include <cmath>
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Lp_Pool1d::schema> &Lp_Pool1d::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Lp_Pool1d::execute(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode) {
+    infinicore::context::setDevice(input->device(), true);
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Lp_Pool1d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+
+Tensor lp_pool1d(Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode) {
+    auto ndim = input->ndim();
+    auto input_shape = input->shape();
+
+    if (ndim != 3 && ndim != 2) {
+        throw std::runtime_error("Input tensor must be 3-dimensional (N, C, L_in) or 2-dimensional (C, L_in)");
+    }
+
+    if (ndim == 2) {
+        input = input->view({1, input_shape[0], input_shape[1]});
+        input_shape = input->shape();
+    }
+
+    auto L_in = input_shape[2];
+    size_t L_out = 0;
+    if (ceil_mode) {
+        L_out = static_cast<size_t>(std::ceil(static_cast<double>(L_in - kernel_size) / stride)) + 1;
+    } else {
+        L_out = static_cast<size_t>(std::floor(static_cast<double>(L_in - kernel_size) / stride)) + 1;
+    }
+
+    auto output_shape = Shape{input_shape[0], input_shape[1], L_out};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    lp_pool1d_(output, input, norm_type, kernel_size, stride, ceil_mode);
+    return output;
+}
+
+void lp_pool1d_(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode) {
+    Lp_Pool1d::execute(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/lp_pool1d/lp_pool1d_cpu.cc b/src/infinicore/ops/lp_pool1d/lp_pool1d_cpu.cc
new file mode 100644
index 000000000..379addae1
--- /dev/null
+++ b/src/infinicore/ops/lp_pool1d/lp_pool1d_cpu.cc
@@ -0,0 +1,115 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/lp_pool1d.hpp"
+#include "infinicore/tensor.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+namespace infinicore::op::lp_pool1d_impl::cpu {
+
+void calculate(Tensor output, Tensor input, float norm_type, size_t kernel_size, size_t stride, bool ceil_mode) {
+    // input: [N, C, L_in], output: [N, C, L_out]
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto dtype = input->dtype();
+
+    auto N = input_shapes[0];
+    auto C = input_shapes[1];
+    auto L_in = input_shapes[2];
+    auto L_out = output_shapes[2];
+
+    auto stride_N = input_strides[0];
+    auto stride_C = input_strides[1];
+    auto stride_L = input_strides[2];
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+    auto element_size = input->element_size();
+
+    // Iterate over every sample, channel, and output position
+    for (size_t n = 0; n < N; ++n) {
+        for (size_t c = 0; c < C; ++c) {
+            for (size_t out_l = 0; out_l < L_out; ++out_l) {
+                // Window bounds
+                size_t window_start = out_l * stride;
+                size_t window_end = std::min(window_start + kernel_size, L_in);
+
+                // Accumulate the Lp power sum
+                double sum_power = 0.0;
+
+                // Standard case: accumulate the valid elements
+                for (size_t i = window_start; i < window_end; ++i) {
+                    // Offset of the element in memory
+                    size_t offset = n * stride_N + c * stride_C + i * stride_L;
+
+                    double val = 0.0;
+                    if (dtype == DataType::F32) {
+                        auto *ptr = reinterpret_cast<float *>(input_base + offset * element_size);
+                        val = static_cast<double>(*ptr);
+                    } else if (dtype == DataType::F64) {
+                        auto *ptr = reinterpret_cast<double *>(input_base + offset * element_size);
+                        val = *ptr;
+                    } else if (dtype == DataType::F16) {
+                        auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * element_size);
+                        val = static_cast<double>(utils::cast<float>(*ptr));
+                    }
+
+                    // Accumulate |val|^norm_type
+                    sum_power += std::pow(std::abs(val), norm_type);
+                }
+
+                // Replicate padding (only when ceil_mode=true and the window is incomplete)
+                if (ceil_mode && window_end < window_start + kernel_size) {
+                    // The window is incomplete: pad by replicating the last valid element
+                    size_t last_valid_idx = window_end - 1;
+                    size_t offset = n * stride_N + c * stride_C + last_valid_idx * stride_L;
+
+                    double last_val = 0.0;
+                    if (dtype == DataType::F32) {
+                        auto *ptr = reinterpret_cast<float *>(input_base + offset * element_size);
+                        last_val = static_cast<double>(*ptr);
+                    } else if (dtype == DataType::F64) {
+                        auto *ptr = reinterpret_cast<double *>(input_base + offset * element_size);
+                        last_val = *ptr;
+                    } else if (dtype == DataType::F16) {
+                        auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * element_size);
+                        last_val = static_cast<double>(utils::cast<float>(*ptr));
+                    }
+
+                    // Repeat the last valid element to fill out the window
+                    size_t padding_count = (window_start + kernel_size) - window_end;
+                    sum_power += padding_count * std::pow(std::abs(last_val), norm_type);
+                }
+
+                // Lp norm result: sum_power^(1 / norm_type)
+                double result = std::pow(sum_power, 1.0 / norm_type);
+
+                // Write the output (the output tensor is always contiguous)
+                size_t out_offset = n * (C * L_out) + c * L_out + out_l;
+
+                if (dtype == DataType::F32) {
+                    auto *ptr = reinterpret_cast<float *>(output_base + out_offset * element_size);
+                    *ptr = static_cast<float>(result);
+                } else if (dtype == DataType::F64) {
+                    auto *ptr = reinterpret_cast<double *>(output_base + out_offset * element_size);
+                    *ptr = result;
+                } else if (dtype == DataType::F16) {
+                    auto *ptr = reinterpret_cast<fp16_t *>(output_base + out_offset * element_size);
+                    *ptr = utils::cast<fp16_t>(static_cast<float>(result));
+                }
+            }
+        }
+    }
+}
+
+static bool registered = []() {
+    Lp_Pool1d::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::lp_pool1d_impl::cpu
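Note the two different ceil_mode strategies in this patch: the 1D kernel above completes a partial window by replicating its last valid element, while the 2D/3D kernels below rescale the partial power sum up to a full window (equivalent to mean(valid) * kernel_size, as their comments note). A sketch of the rescaling:

```python
def rescale_partial(sum_valid, valid_cnt, full_cnt):
    """Scale a partial-window power sum up to a full window (2D/3D ceil_mode path)."""
    return sum_valid * (full_cnt / valid_cnt)

# e.g. a 3x3 kernel with only a 2x3 slab in bounds
assert rescale_partial(6.0, 6, 9) == 9.0
```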
diff --git a/src/infinicore/ops/lp_pool2d/lp_pool2d.cc b/src/infinicore/ops/lp_pool2d/lp_pool2d.cc
new file mode 100644
index 000000000..7e1df296b
--- /dev/null
+++ b/src/infinicore/ops/lp_pool2d/lp_pool2d.cc
@@ -0,0 +1,61 @@
+#include "infinicore/ops/lp_pool2d.hpp"
+#include <cmath>
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Lp_Pool2d::schema> &Lp_Pool2d::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Lp_Pool2d::execute(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode) {
+    infinicore::context::setDevice(input->device(), true);
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Lp_Pool2d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+
+Tensor lp_pool2d(Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode) {
+    const auto ndim = input->ndim();
+    auto input_shape = input->shape();
+
+    if (ndim != 4 && ndim != 3) {
+        throw std::runtime_error("Input tensor must be 4-dimensional (N, C, H_in, W_in) or 3-dimensional (C, H_in, W_in)");
+    }
+
+    if (ndim == 3) {
+        input = input->view({1, input_shape[0], input_shape[1], input_shape[2]});
+        input_shape = input->shape();
+    }
+
+    const auto [Kernel_H, Kernel_W] = kernel_size;
+    const auto [Stride_H, Stride_W] = stride;
+    const auto H_in = input_shape[2];
+    const auto W_in = input_shape[3];
+    size_t H_out = 0;
+    size_t W_out = 0;
+    if (ceil_mode) {
+        H_out = static_cast<size_t>(std::ceil(static_cast<double>(H_in - Kernel_H) / Stride_H)) + 1;
+        W_out = static_cast<size_t>(std::ceil(static_cast<double>(W_in - Kernel_W) / Stride_W)) + 1;
+    } else {
+        H_out = static_cast<size_t>(std::floor(static_cast<double>(H_in - Kernel_H) / Stride_H)) + 1;
+        W_out = static_cast<size_t>(std::floor(static_cast<double>(W_in - Kernel_W) / Stride_W)) + 1;
+    }
+
+    auto output_shape = Shape{input_shape[0], input_shape[1], H_out, W_out};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    lp_pool2d_(output, input, norm_type, kernel_size, stride, ceil_mode);
+    return output;
+}
+
+void lp_pool2d_(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode) {
+    Lp_Pool2d::execute(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/lp_pool2d/lp_pool2d_cpu.cc b/src/infinicore/ops/lp_pool2d/lp_pool2d_cpu.cc
new file mode 100644
index 000000000..0aa1b1fe6
--- /dev/null
+++ b/src/infinicore/ops/lp_pool2d/lp_pool2d_cpu.cc
@@ -0,0 +1,111 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/lp_pool2d.hpp"
+#include "infinicore/tensor.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace infinicore::op::lp_pool2d_impl::cpu {
+
+void calculate(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t> kernel_size, const std::tuple<size_t, size_t> stride, bool ceil_mode) {
+    // input: [N, C, H_in, W_in], output: [N, C, H_out, W_out]
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto dtype = input->dtype();
+
+    const size_t N = input_shapes[0];
+    const size_t C = input_shapes[1];
+    const size_t H_in = input_shapes[2];
+    const size_t W_in = input_shapes[3];
+
+    const size_t H_out = output_shapes[2];
+    const size_t W_out = output_shapes[3];
+
+    const size_t stride_N = input_strides[0];
+    const size_t stride_C = input_strides[1];
+    const size_t stride_H = input_strides[2];
+    const size_t stride_W = input_strides[3];
+
+    const size_t kernel_h = std::get<0>(kernel_size);
+    const size_t kernel_w = std::get<1>(kernel_size);
+    const size_t stride_h = std::get<0>(stride);
+    const size_t stride_w = std::get<1>(stride);
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+    const auto element_size = input->element_size();
+
+    for (size_t n = 0; n < N; ++n) {
+        for (size_t c = 0; c < C; ++c) {
+            for (size_t oh = 0; oh < H_out; ++oh) {
+                const size_t h_start = oh * stride_h;
+                const size_t h_end = std::min(h_start + kernel_h, H_in);
+
+                for (size_t ow = 0; ow < W_out; ++ow) {
+                    const size_t w_start = ow * stride_w;
+                    const size_t w_end = std::min(w_start + kernel_w, W_in);
+
+                    double sum_power = 0.0;
+
+                    // Accumulate the valid elements
+                    for (size_t ih = h_start; ih < h_end; ++ih) {
+                        for (size_t iw = w_start; iw < w_end; ++iw) {
+                            const size_t offset = n * stride_N + c * stride_C + ih * stride_H + iw * stride_W;
+
+                            double val = 0.0;
+                            if (dtype == DataType::F32) {
+                                auto *ptr = reinterpret_cast<float *>(input_base + offset * element_size);
+                                val = static_cast<double>(*ptr);
+                            } else if (dtype == DataType::F64) {
+                                auto *ptr = reinterpret_cast<double *>(input_base + offset * element_size);
+                                val = *ptr;
+                            } else if (dtype == DataType::F16) {
+                                auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * element_size);
+                                val = static_cast<double>(utils::cast<float>(*ptr));
+                            }
+
+                            sum_power += std::pow(std::abs(val), norm_type);
+                        }
+                    }
+
+                    // ceil_mode handling: scale the partial window up to a full window
+                    const size_t valid_h = h_end - h_start;
+                    const size_t valid_w = w_end - w_start;
+                    const size_t valid_cnt = valid_h * valid_w;
+                    const size_t full_cnt = kernel_h * kernel_w;
+
+                    double scale = 1.0;
+                    if (ceil_mode && valid_cnt < full_cnt) {
+                        // To match torch's behavior: mean(valid) * kernel_size
+                        scale = static_cast<double>(full_cnt) / static_cast<double>(valid_cnt);
+                    }
+
+                    const double norm = std::pow(sum_power * scale, 1.0 / norm_type);
+
+                    const size_t out_offset = n * (C * H_out * W_out) + c * (H_out * W_out) + oh * W_out + ow;
+                    if (dtype == DataType::F32) {
+                        auto *ptr = reinterpret_cast<float *>(output_base + out_offset * element_size);
+                        *ptr = static_cast<float>(norm);
+                    } else if (dtype == DataType::F64) {
+                        auto *ptr = reinterpret_cast<double *>(output_base + out_offset * element_size);
+                        *ptr = norm;
+                    } else if (dtype == DataType::F16) {
+                        auto *ptr = reinterpret_cast<fp16_t *>(output_base + out_offset * element_size);
+                        *ptr = utils::cast<fp16_t>(static_cast<float>(norm));
+                    }
+                }
+            }
+        }
+    }
+}
+
+static bool registered = []() {
+    Lp_Pool2d::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::lp_pool2d_impl::cpu
diff --git a/src/infinicore/ops/lp_pool3d/lp_pool3d.cc b/src/infinicore/ops/lp_pool3d/lp_pool3d.cc
new file mode 100644
index 000000000..9d28e2b47
--- /dev/null
+++ b/src/infinicore/ops/lp_pool3d/lp_pool3d.cc
@@ -0,0 +1,65 @@
+#include "infinicore/ops/lp_pool3d.hpp"
+#include <cmath>
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Lp_Pool3d::schema> &Lp_Pool3d::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Lp_Pool3d::execute(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode) {
+    infinicore::context::setDevice(input->device(), true);
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Lp_Pool3d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+
+Tensor lp_pool3d(Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode) {
+    const auto ndim = input->ndim();
+    auto input_shape = input->shape();
+
+    if (ndim != 5 && ndim != 4) {
+        throw std::runtime_error("Input tensor must be 5-dimensional (N, C, D_in, H_in, W_in) or 4-dimensional (C, D_in, H_in, W_in)");
+    }
+
+    if (ndim == 4) {
+        input = input->view({1, input_shape[0], input_shape[1], input_shape[2], input_shape[3]});
+        input_shape = input->shape();
+    }
+
+    const auto [Kernel_D, Kernel_H, Kernel_W] = kernel_size;
+    const auto [Stride_D, Stride_H, Stride_W] = stride;
+    const auto D_in = input_shape[2];
+    const auto H_in = input_shape[3];
+    const auto W_in = input_shape[4];
+    size_t D_out = 0;
+    size_t H_out = 0;
+    size_t W_out = 0;
+    if (ceil_mode) {
+        D_out = static_cast<size_t>(std::ceil(static_cast<double>(D_in - Kernel_D) / Stride_D)) + 1;
+        H_out = static_cast<size_t>(std::ceil(static_cast<double>(H_in - Kernel_H) / Stride_H)) + 1;
+        W_out = static_cast<size_t>(std::ceil(static_cast<double>(W_in - Kernel_W) / Stride_W)) + 1;
+    } else {
+        D_out = static_cast<size_t>(std::floor(static_cast<double>(D_in - Kernel_D) / Stride_D)) + 1;
+        H_out = static_cast<size_t>(std::floor(static_cast<double>(H_in - Kernel_H) / Stride_H)) + 1;
+        W_out = static_cast<size_t>(std::floor(static_cast<double>(W_in - Kernel_W) / Stride_W)) + 1;
+    }
+
+    auto output_shape = Shape{input_shape[0], input_shape[1], D_out, H_out, W_out};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    lp_pool3d_(output, input, norm_type, kernel_size, stride, ceil_mode);
+    return output;
+}
+
+void lp_pool3d_(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode) {
+    Lp_Pool3d::execute(output, input, norm_type, kernel_size, stride, ceil_mode);
+}
+} // namespace infinicore::op
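All of the shape computations above use the same unpadded pooling formula; ceil_mode only admits one extra, partial window per dimension. A worked check of the formula:

```python
import math

def pool_out(L_in, kernel, stride, ceil_mode=False):
    """Output length for unpadded pooling, matching the C++ shape logic above."""
    f = math.ceil if ceil_mode else math.floor
    return f((L_in - kernel) / stride) + 1

assert pool_out(10, 3, 2) == 4                   # floor(7/2) + 1
assert pool_out(10, 3, 2, ceil_mode=True) == 5   # ceil(7/2) + 1: one partial window
```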
diff --git a/src/infinicore/ops/lp_pool3d/lp_pool3d_cpu.cc b/src/infinicore/ops/lp_pool3d/lp_pool3d_cpu.cc
new file mode 100644
index 000000000..f23faabb8
--- /dev/null
+++ b/src/infinicore/ops/lp_pool3d/lp_pool3d_cpu.cc
@@ -0,0 +1,124 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/lp_pool3d.hpp"
+#include "infinicore/tensor.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace infinicore::op::lp_pool3d_impl::cpu {
+
+void calculate(Tensor output, Tensor input, float norm_type, const std::tuple<size_t, size_t, size_t> kernel_size, const std::tuple<size_t, size_t, size_t> stride, bool ceil_mode) {
+    // input: [N, C, D_in, H_in, W_in], output: [N, C, D_out, H_out, W_out]
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto dtype = input->dtype();
+
+    const size_t N = input_shapes[0];
+    const size_t C = input_shapes[1];
+    const size_t D_in = input_shapes[2];
+    const size_t H_in = input_shapes[3];
+    const size_t W_in = input_shapes[4];
+
+    const size_t D_out = output_shapes[2];
+    const size_t H_out = output_shapes[3];
+    const size_t W_out = output_shapes[4];
+
+    const size_t stride_N = input_strides[0];
+    const size_t stride_C = input_strides[1];
+    const size_t stride_D = input_strides[2];
+    const size_t stride_H = input_strides[3];
+    const size_t stride_W = input_strides[4];
+
+    const size_t kernel_d = std::get<0>(kernel_size);
+    const size_t kernel_h = std::get<1>(kernel_size);
+    const size_t kernel_w = std::get<2>(kernel_size);
+
+    const size_t stride_d = std::get<0>(stride);
+    const size_t stride_h = std::get<1>(stride);
+    const size_t stride_w = std::get<2>(stride);
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+    const auto element_size = input->element_size();
+
+    for (size_t n = 0; n < N; ++n) {
+        for (size_t c = 0; c < C; ++c) {
+            for (size_t od = 0; od < D_out; ++od) {
+                const size_t d_start = od * stride_d;
+                const size_t d_end = std::min(d_start + kernel_d, D_in);
+
+                for (size_t oh = 0; oh < H_out; ++oh) {
+                    const size_t h_start = oh * stride_h;
+                    const size_t h_end = std::min(h_start + kernel_h, H_in);
+
+                    for (size_t ow = 0; ow < W_out; ++ow) {
+                        const size_t w_start = ow * stride_w;
+                        const size_t w_end = std::min(w_start + kernel_w, W_in);
+
+                        double sum_power = 0.0;
+
+                        // Accumulate the valid elements
+                        for (size_t id = d_start; id < d_end; ++id) {
+                            for (size_t ih = h_start; ih < h_end; ++ih) {
+                                for (size_t iw = w_start; iw < w_end; ++iw) {
+                                    const size_t offset = n * stride_N + c * stride_C + id * stride_D + ih * stride_H + iw * stride_W;
+
+                                    double val = 0.0;
+                                    if (dtype == DataType::F32) {
+                                        auto *ptr = reinterpret_cast<float *>(input_base + offset * element_size);
+                                        val = static_cast<double>(*ptr);
+                                    } else if (dtype == DataType::F64) {
+                                        auto *ptr = reinterpret_cast<double *>(input_base + offset * element_size);
+                                        val = *ptr;
+                                    } else if (dtype == DataType::F16) {
+                                        auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * element_size);
+                                        val = static_cast<double>(utils::cast<float>(*ptr));
+                                    }
+
+                                    sum_power += std::pow(std::abs(val), norm_type);
+                                }
+                            }
+                        }
+
+                        // ceil_mode: scale the partial window up to a full window
+                        const size_t valid_d = d_end - d_start;
+                        const size_t valid_h = h_end - h_start;
+                        const size_t valid_w = w_end - w_start;
+                        const size_t valid_cnt = valid_d * valid_h * valid_w;
+                        const size_t full_cnt = kernel_d * kernel_h * kernel_w;
+
+                        double scale = 1.0;
+                        if (ceil_mode && valid_cnt < full_cnt) {
+                            scale = static_cast<double>(full_cnt) / static_cast<double>(valid_cnt);
+                        }
+
+                        const double norm = std::pow(sum_power * scale, 1.0 / norm_type);
+
+                        const size_t out_offset = n * (C * D_out * H_out * W_out) + c * (D_out * H_out * W_out) + od * (H_out * W_out) + oh * W_out + ow;
+                        if (dtype == DataType::F32) {
+                            auto *ptr = reinterpret_cast<float *>(output_base + out_offset * element_size);
+                            *ptr = static_cast<float>(norm);
+                        } else if (dtype == DataType::F64) {
+                            auto *ptr = reinterpret_cast<double *>(output_base + out_offset * element_size);
+                            *ptr = norm;
+                        } else if (dtype == DataType::F16) {
+                            auto *ptr = reinterpret_cast<fp16_t *>(output_base + out_offset * element_size);
+                            *ptr = utils::cast<fp16_t>(static_cast<float>(norm));
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static bool registered = []() {
+    Lp_Pool3d::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::lp_pool3d_impl::cpu
diff --git a/src/infinicore/ops/max_global/max_global.cc b/src/infinicore/ops/max_global/max_global.cc
new file mode 100644
index 000000000..df9a60131
--- /dev/null
+++ b/src/infinicore/ops/max_global/max_global.cc
@@ -0,0 +1,28 @@
+#include "infinicore/ops/max_global.hpp"
+
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<MaxGlobal::schema> &MaxGlobal::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void MaxGlobal::execute(Tensor input, Tensor output) {
+    infinicore::context::setDevice(input->device(), true);
+    dispatcher().lookup(input->device().getType())(input, output);
+}
+
+Tensor max_global(Tensor input) {
+    Shape shape = Shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    max_global_(input, output);
+    return output;
+}
+
+void max_global_(Tensor input, Tensor output) {
+    MaxGlobal::execute(input, output);
+}
+
+} // namespace infinicore::op
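The CPU kernel that follows visits a possibly non-contiguous tensor with an odometer-style index vector, converting coordinates to a flat element offset through the strides. A Python model of that traversal:

```python
def strided_offsets(shape, strides):
    """Yield flat element offsets in logical order for a strided layout."""
    idx = [0] * len(shape)
    total = 1
    for s in shape:
        total *= s
    for _ in range(total):
        yield sum(i * st for i, st in zip(idx, strides))
        for d in reversed(range(len(shape))):  # odometer increment with carry
            idx[d] += 1
            if idx[d] < shape[d]:
                break
            idx[d] = 0

assert list(strided_offsets((2, 2), (2, 1))) == [0, 1, 2, 3]  # contiguous row-major
```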
diff --git a/src/infinicore/ops/max_global/max_global_cpu.cc b/src/infinicore/ops/max_global/max_global_cpu.cc
new file mode 100644
index 000000000..bb35661f7
--- /dev/null
+++ b/src/infinicore/ops/max_global/max_global_cpu.cc
@@ -0,0 +1,88 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/max_global.hpp"
+#include <algorithm>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::max_global_impl::cpu {
+
+void calculate(Tensor input, Tensor output) {
+    auto strides = input->strides(); // per-dim strides
+    auto shapes = input->shape();    // per-dim sizes
+    auto ndim = input->ndim();
+    auto dtype = input->dtype();
+    auto dtype_size = input->element_size();
+    auto numel = input->numel();
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+
+    // ---- max accumulators (hoisted out of the dtype branches) ----
+    float max_f32 = -std::numeric_limits<float>::infinity();
+    double max_f64 = -std::numeric_limits<double>::infinity();
+    float max_f16f = -std::numeric_limits<float>::infinity(); // F16 accumulates in F32
+
+    // ---- traverse all elements, branching on dtype ----
+
+    // Initialize the index vector
+    std::vector<size_t> indices(ndim, 0);
+
+    for (size_t idx = 0; idx < numel; ++idx) {
+        size_t offset = 0;
+        for (size_t dim = 0; dim < ndim; ++dim) {
+            offset += indices[dim] * strides[dim];
+        }
+
+        if (dtype == DataType::F32) {
+            auto *ptr = reinterpret_cast<float *>(input_base + offset * dtype_size);
+            float v = *ptr;
+            max_f32 = std::max(max_f32, v);
+
+        } else if (dtype == DataType::F64) {
+            auto *ptr = reinterpret_cast<double *>(input_base + offset * dtype_size);
+            double v = *ptr;
+            max_f64 = std::max(max_f64, v);
+
+        } else if (dtype == DataType::F16) {
+            auto *ptr = reinterpret_cast<fp16_t *>(input_base + offset * dtype_size);
+            float v = utils::cast<float>(*ptr);
+            max_f16f = std::max(max_f16f, v);
+
+        } else {
+            throw std::runtime_error("Unsupported dtype.");
+        }
+
+        // Advance the index vector (odometer increment with carry)
+        for (ssize_t dim = ndim - 1; dim >= 0; --dim) {
+            indices[dim]++;
+            if (indices[dim] < shapes[dim])
+                break;
+            indices[dim] = 0;
+        }
+    }
+
+    // ---- write the scalar output ----
+    if (dtype == DataType::F32) {
+        auto *out = reinterpret_cast<float *>(output_base);
+        *out = max_f32;
+
+    } else if (dtype == DataType::F64) {
+        auto *out = reinterpret_cast<double *>(output_base);
+        *out = max_f64;
+
+    } else if (dtype == DataType::F16) {
+        auto *out = reinterpret_cast<fp16_t *>(output_base);
+        *out = utils::cast<fp16_t>(max_f16f);
+
+    } else {
+        throw std::runtime_error("Unsupported dtype.");
+    }
+}
+
+static bool registered = []() {
+    MaxGlobal::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::max_global_impl::cpu
diff --git a/src/infinicore/ops/max_reduce/max_reduce.cc b/src/infinicore/ops/max_reduce/max_reduce.cc
new file mode 100644
index 000000000..7820afbf8
--- /dev/null
+++ b/src/infinicore/ops/max_reduce/max_reduce.cc
@@ -0,0 +1,50 @@
+#include "infinicore/ops/max_reduce.hpp"
+
+#include "../../utils.hpp"
+#include "infinicore/dtype.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<MaxReduce::schema> &MaxReduce::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void MaxReduce::execute(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim) {
+    infinicore::context::setDevice(input->device(), true);
+    dispatcher().lookup(input->device().getType())(input, output, indices, dim, keepdim);
+}
+
+std::tuple<Tensor, Tensor> max_reduce(Tensor input, int dim, bool keepdim) {
+    // Normalize dim
+    int normalized_dim = dim;
+    if (normalized_dim < 0) {
+        normalized_dim = input->ndim() + normalized_dim;
+    }
+
+    // Compute the output shape
+    Shape output_shape;
+    const auto &input_shape = input->shape();
+
+    if (keepdim) {
+        output_shape = input_shape;
+        output_shape[normalized_dim] = 1;
+    } else {
+        for (int i = 0; i < static_cast<int>(input_shape.size()); ++i) {
+            if (i != normalized_dim) {
+                output_shape.push_back(input_shape[i]);
+            }
+        }
+    }
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    auto indices = Tensor::empty(output_shape, DataType::I64, input->device());
+    max_reduce_(input, output, indices, dim, keepdim);
+    return {output, indices};
+}
+
+void max_reduce_(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim) {
+    MaxReduce::execute(input, output, indices, dim, keepdim);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/max_reduce/max_reduce_cpu.cc b/src/infinicore/ops/max_reduce/max_reduce_cpu.cc
new file mode 100644
index 000000000..b4216d920
--- /dev/null
+++ b/src/infinicore/ops/max_reduce/max_reduce_cpu.cc
@@ -0,0 +1,124 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/max_reduce.hpp"
+#include <cstdint>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::max_reduce_impl::cpu {
+
+void calculate(Tensor input, Tensor output, Tensor indices, int dim, bool keepdim) {
+    auto input_shapes = input->shape();
+    auto input_strides = input->strides();
+    auto output_shapes = output->shape();
+    auto output_strides = output->strides();
+    auto indices_shapes = indices->shape();
+    auto indices_strides = indices->strides();
+    auto ndim = input->ndim();
+    auto dtype = input->dtype();
+    auto dtype_size = input->element_size();
+    auto indices_size = indices->element_size();
+
+    // Normalize dim into [0, ndim)
+    if (dim < 0) {
+        dim = ndim + dim;
+    }
+
+    auto input_base = input->data();
+    auto output_base = output->data();
+    auto indices_base = indices->data();
+
+    // Size of the reduced dimension
+    size_t reduce_size = input_shapes[dim];
+    size_t output_numel = output->numel();
+
+// For each output element, compute the max along dim
+#pragma omp parallel for collapse(1)
+    for (size_t output_idx = 0; output_idx < output_numel; ++output_idx) {
+        // Recover the multi-dimensional coordinates from the flat output index
+        std::vector<size_t> output_indices(output_shapes.size());
+        size_t temp_idx = output_idx;
+        for (int i = static_cast<int>(output_shapes.size()) - 1; i >= 0; --i) {
+            output_indices[i] = temp_idx % output_shapes[i];
+            temp_idx /= output_shapes[i];
+        }
+
+        // Map output coordinates back to input coordinates.
+        // keepdim=true: output shape == input shape, with the reduced dim set to 1.
+        // keepdim=false: output shape is the input shape minus the reduced dim.
+        std::vector<size_t> input_indices(ndim);
+        if (keepdim) {
+            // Direct correspondence between output and input dims
+            for (int i = 0; i < ndim; ++i) {
+                if (i == dim) {
+                    input_indices[i] = 0; // reduced dim starts at 0 (iterated below)
+                } else {
+                    input_indices[i] = output_indices[i];
+                }
+            }
+        } else {
+            // Skip the reduced dim: the output is missing one dimension
+            int output_dim = 0;
+            for (int i = 0; i < ndim; ++i) {
+                if (i == dim) {
+                    input_indices[i] = 0; // reduced dim starts at 0
+                } else {
+                    input_indices[i] = output_indices[output_dim];
+                    output_dim++;
+                }
+            }
+        }
+
+        // Starting offset into the input
+        size_t offset = 0;
+        for (int i = 0; i < ndim; ++i) {
+            if (i != dim) { // skip the reduced dim
+                offset += input_indices[i] * input_strides[i];
+            }
+        }
+
+        // Compute the max along the reduce dimension
+        if (dtype == DataType::F32) {
+            float max_val = -std::numeric_limits<float>::infinity();
+            size_t max_idx = -1;
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                float *input_ptr = reinterpret_cast<float *>(input_base + current_offset * dtype_size);
+                if (max_val < *input_ptr) {
+                    max_val = *input_ptr;
+                    max_idx = reduce_idx;
+                }
+            }
+            float *output_ptr = reinterpret_cast<float *>(output_base + output_idx * dtype_size);
+            int64_t *indices_ptr = reinterpret_cast<int64_t *>(indices_base + output_idx * indices_size);
+            *output_ptr = max_val;
+            *indices_ptr = static_cast<int64_t>(max_idx);
+
+        } else if (dtype == DataType::F16) {
+            auto max_val = -std::numeric_limits<float>::infinity();
+            size_t max_idx = -1;
+            for (size_t reduce_idx = 0; reduce_idx < reduce_size; ++reduce_idx) {
+                size_t current_offset = offset + reduce_idx * input_strides[dim];
+                auto *input_ptr = reinterpret_cast<fp16_t *>(input_base + current_offset * dtype_size);
+                auto val_f32 = utils::cast<float>(*input_ptr);
+                if (max_val < val_f32) {
+                    max_val = val_f32;
+                    max_idx = reduce_idx;
+                }
+            }
+            auto *output_ptr = reinterpret_cast<fp16_t *>(output_base + output_idx * dtype_size);
+            int64_t *indices_ptr = reinterpret_cast<int64_t *>(indices_base + output_idx * indices_size);
+            *output_ptr = utils::cast<fp16_t>(max_val);
+            *indices_ptr = static_cast<int64_t>(max_idx);
+
+        } else {
+            throw std::runtime_error("Unsupported data type for max operation.");
+        }
+    }
+}
+
+static bool registered = []() {
+    MaxReduce::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::max_reduce_impl::cpu
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 978defa17..dd99865e6 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -7,7 +7,12 @@
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
 #include "ops/linear.hpp"
+#include "ops/logsumexp.hpp"
+#include "ops/lp_pool1d.hpp"
+#include "ops/lp_pool2d.hpp"
+#include "ops/lp_pool3d.hpp"
 #include "ops/matmul.hpp"
+#include "ops/max.hpp"
 #include "ops/mul.hpp"
 #include "ops/random_sample.hpp"
 #include "ops/rearrange.hpp"
@@ -34,6 +39,11 @@
     bind_swiglu(m);
     bind_rope(m);
     bind_embedding(m);
+    bind_logsumexp(m);
+    bind_lp_pool1d(m);
+    bind_lp_pool2d(m);
+    bind_lp_pool3d(m);
+    bind_max(m);
 }
 
 } // namespace infinicore::ops
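Each `*_cpu.cc` file above registers its kernel with the op's dispatcher through a static initializer, and `execute()` resolves the implementation by device type at call time. A Python analogue of that registration pattern (illustrative only; names here are not part of the patch):

```python
_DISPATCH = {}

def register(device_type):
    def decorator(fn):
        _DISPATCH[device_type] = fn  # runs at import time, like the static bool
        return fn
    return decorator

@register("cpu")
def logsumexp_cpu(input, dim, keepdim, output):
    ...  # kernel body elided

def execute(device_type, *args):
    fn = _DISPATCH.get(device_type)
    if fn is None:
        raise RuntimeError(f"No implementation found for device type: {device_type}")
    return fn(*args)
```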
diff --git a/src/infinicore/pybind11/ops/logsumexp.hpp b/src/infinicore/pybind11/ops/logsumexp.hpp
new file mode 100644
index 000000000..1a9bce049
--- /dev/null
+++ b/src/infinicore/pybind11/ops/logsumexp.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/logsumexp.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_logsumexp(py::module &m) {
+    m.def("logsumexp",
+          &op::logsumexp,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("keepdim"),
+          R"doc(Logarithm of the sum of exponentials of the input tensor.)doc");
+
+    m.def("logsumexp_",
+          &op::logsumexp_,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("keepdim"),
+          py::arg("output"),
+          R"doc(In-place logarithm of the sum of exponentials of the input tensor.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/lp_pool1d.hpp b/src/infinicore/pybind11/ops/lp_pool1d.hpp
new file mode 100644
index 000000000..4585f08e2
--- /dev/null
+++ b/src/infinicore/pybind11/ops/lp_pool1d.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/lp_pool1d.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_lp_pool1d(py::module &m) {
+    m.def("lp_pool1d",
+          &op::lp_pool1d,
+          py::arg("input"),
+          py::arg("norm_type"),
+          py::arg("kernel_size"),
+          py::arg("stride") = 0,
+          py::arg("ceil_mode") = false,
+          R"doc(Applies a 1D power-average pooling over an input signal composed of several input planes.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/lp_pool2d.hpp b/src/infinicore/pybind11/ops/lp_pool2d.hpp
new file mode 100644
index 000000000..87af4497b
--- /dev/null
+++ b/src/infinicore/pybind11/ops/lp_pool2d.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/lp_pool2d.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_lp_pool2d(py::module &m) {
+    m.def("lp_pool2d",
+          &op::lp_pool2d,
+          py::arg("input"),
+          py::arg("norm_type"),
+          py::arg("kernel_size"),
+          py::arg("stride") = py::none(),
+          py::arg("ceil_mode") = false,
+          R"doc(Applies a 2D power-average pooling over an input signal composed of several input planes.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/lp_pool3d.hpp b/src/infinicore/pybind11/ops/lp_pool3d.hpp
new file mode 100644
index 000000000..4648d047c
--- /dev/null
+++ b/src/infinicore/pybind11/ops/lp_pool3d.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/lp_pool3d.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_lp_pool3d(py::module &m) {
+    m.def("lp_pool3d",
+          &op::lp_pool3d,
+          py::arg("input"),
+          py::arg("norm_type"),
+          py::arg("kernel_size"),
+          py::arg("stride") = py::none(),
+          py::arg("ceil_mode") = false,
+          R"doc(Applies a 3D power-average pooling over an input signal composed of several input planes.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/max.hpp b/src/infinicore/pybind11/ops/max.hpp
new file mode 100644
index 000000000..050bb784b
--- /dev/null
+++ b/src/infinicore/pybind11/ops/max.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/max_global.hpp"
+#include "infinicore/ops/max_reduce.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_max(py::module &m) {
+    m.def("max_reduce",
+          &op::max_reduce,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("keepdim") = false,
+          R"doc(Reduces the input tensor along the specified dimension by taking the maximum value.)doc");
+
+    m.def("max_reduce_",
+          &op::max_reduce_,
+          py::arg("input"),
+          py::arg("output"),
+          py::arg("indices"),
+          py::arg("dim"),
+          py::arg("keepdim") = false,
+          R"doc(In-place max reduction along the specified dimension.)doc");
+
+    m.def("max_global",
+          &op::max_global,
+          py::arg("input"),
+          R"doc(Reduces the input tensor globally by taking the maximum value across all elements.)doc");
+
+    m.def("max_global_",
+          &op::max_global_,
+          py::arg("input"),
+          py::arg("output"),
+          R"doc(In-place global max reduction.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/test/infinicore/ops/logsumexp.py b/test/infinicore/ops/logsumexp.py
index ff5df7a61..9ddbb82a2 100644
--- a/test/infinicore/ops/logsumexp.py
+++ b/test/infinicore/ops/logsumexp.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import (
     BaseOperatorTest,
     TensorSpec,
@@ -22,7 +22,7 @@
     ((2, 3, 4), None, 2, True, (0, 1, 1)),
     ((1, 8), None, 0, False, None),
     ((16, 64), (128, 1), 1, True, None),
-    ((4, 5, 6), (60, 12, 2), 2, True, (12, 4, 1)),
+    # ((4, 5, 6), (60, 12, 2), 2, True, (12, 4, 1)),  # this test case makes strided elements overlap in memory
 ]
 
 _TOLERANCE_MAP = {
@@ -104,9 +104,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.logsumexp(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.logsumexp(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.logsumexp(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/lp_pool1d.py b/test/infinicore/ops/lp_pool1d.py
index 6efd30de2..856c020a2 100644
--- a/test/infinicore/ops/lp_pool1d.py
+++ b/test/infinicore/ops/lp_pool1d.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
 
 # Test cases format: (in_shape, in_strides_or_None, norm_type, kernel_size, stride_or_None, ceil_mode)
@@ -60,9 +60,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.lp_pool1d(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.lp_pool1d(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.nn.functional.lp_pool1d(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/lp_pool2d.py b/test/infinicore/ops/lp_pool2d.py
index 4d8ec03cd..8a1d3bfd8 100644
--- a/test/infinicore/ops/lp_pool2d.py
+++ b/test/infinicore/ops/lp_pool2d.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
 
 # Test cases format: (in_shape, in_strides_or_None, norm_type, kernel_size, stride_or_None, ceil_mode)
@@ -60,9 +60,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.lp_pool2d(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.lp_pool2d(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.nn.functional.lp_pool2d(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/lp_pool3d.py b/test/infinicore/ops/lp_pool3d.py
index c3acc22a9..0bda4ae62 100644
--- a/test/infinicore/ops/lp_pool3d.py
+++ b/test/infinicore/ops/lp_pool3d.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
 
 # Test cases format: (in_shape, in_strides_or_None, norm_type, kernel_size, stride_or_None, ceil_mode)
@@ -57,9 +57,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.lp_pool3d(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.lp_pool3d(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.nn.functional.lp_pool3d(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/max.py b/test/infinicore/ops/max.py
index 89308c0f5..d45f725e0 100644
--- a/test/infinicore/ops/max.py
+++ b/test/infinicore/ops/max.py
@@ -3,8 +3,8 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-import torch
 import infinicore
+import torch
 from framework import (
     BaseOperatorTest,
     TensorSpec,
@@ -111,9 +111,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.max(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.max(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.max(*args, **kwargs)
 
 
 def main():
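Taken together, the patch exposes five new Python entry points: infinicore.logsumexp, infinicore.max, and infinicore.nn.functional.lp_pool1d/2d/3d. A hedged end-to-end sketch; the `zeros` factory call is assumed torch-like and is not part of this diff:

```python
import infinicore
import infinicore.nn.functional as F

x = infinicore.zeros((2, 3, 8, 8))                      # hypothetical factory signature
lse = infinicore.logsumexp(x, dim=1, keepdim=True)      # shape (2, 1, 8, 8)
gmax = infinicore.max(x)                                # 0-d global max
vals, idx = infinicore.max(x, dim=-1)                   # values plus int64 indices
pooled = F.lp_pool2d(x, norm_type=2.0, kernel_size=2)   # shape (2, 3, 4, 4)
```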