From 29903e64b2d550d60a9aa97754559f2f811e9cb9 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Thu, 20 Feb 2025 15:00:13 -0800
Subject: [PATCH 1/8] Begin work on adding padding argument support. Transposes
 updated.

---
 benchmark/benchmark.cu                        | 20 +++--
 examples/cc/basic_usage/basic_usage.cu        | 14 ++--
 .../cc/basic_usage/basic_usage_autotune.cu    | 14 ++--
 examples/cc/taylor_green/tg.cu                | 16 ++--
 include/cudecomp.h                            | 47 +++++++++---
 include/internal/halo.h                       |  2 +-
 include/internal/transpose.h                  | 53 ++++++++-----
 src/autotune.cc                               | 24 +++---
 src/cudecomp.cc                               | 69 ++++++++++-------
 src/cudecomp_m.cuf                            | 76 +++++++++++++++----
 tests/cc/halo_test.cc                         |  2 +-
 tests/cc/transpose_test.cc                    | 61 +++++++++++----
 tests/fortran/transpose_test.f90              | 40 +++++++---
 13 files changed, 298 insertions(+), 140 deletions(-)

diff --git a/benchmark/benchmark.cu b/benchmark/benchmark.cu
index d2871b4..8e9391b 100644
--- a/benchmark/benchmark.cu
+++ b/benchmark/benchmark.cu
@@ -267,20 +267,20 @@ int main(int argc, char** argv) {
 #ifdef R2C
   // Get x-pencil information (real)
   cudecompPencilInfo_t pinfo_x_r;
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_r, &pinfo_x_r, 0, nullptr));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_r, &pinfo_x_r, 0, nullptr, nullptr));
 #endif
 
   // Get x-pencil information (complex)
   cudecompPencilInfo_t pinfo_x_c;
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_x_c, 0, nullptr));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_x_c, 0, nullptr, nullptr));
 
   // Get y-pencil information (complex)
   cudecompPencilInfo_t pinfo_y_c;
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_y_c, 1, nullptr));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_y_c, 1, nullptr, nullptr));
 
   // Get z-pencil information (complex)
   cudecompPencilInfo_t pinfo_z_c;
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_z_c, 2, nullptr));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_z_c, 2, nullptr, nullptr));
 
   // Get workspace size
   int64_t num_elements_work_c;
@@ -508,7 +508,8 @@ int main(int argc, char** argv) {
 
     if (!slab_xyz) {
       CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc_c, input, output, work_c_d,
-                                                get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, 0));
+                                                get_cudecomp_datatype(complex_t(0)), nullptr, nullptr,
+                                                nullptr, nullptr, 0));
     }
 
     if (!slab_xy && !slab_xyz) {
@@ -530,7 +531,8 @@ int main(int argc, char** argv) {
     // For y-z slab case, no need to perform yz transposes or z-axis FFT
     if (!slab_yz && !slab_xyz) {
       CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc_c, input, output, work_c_d,
-                                                get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, 0));
+                                                get_cudecomp_datatype(complex_t(0)), nullptr, nullptr,
+                                                nullptr, nullptr, 0));
     }
 
     if (!slab_yz && !slab_xyz) {
@@ -545,7 +547,8 @@ int main(int argc, char** argv) {
 
     if (!slab_yz && !slab_xyz) {
       CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc_c, input, output, work_c_d,
-                                                get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, 0));
+                                                get_cudecomp_datatype(complex_t(0)), nullptr, nullptr,
+                                                nullptr, nullptr, 0));
     }
 
     if (!slab_xy && !slab_xyz) {
@@ -566,7 +569,8 @@ int main(int argc, char** argv) {
 
     if (!slab_xyz) {
       CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc_c, input, output, work_c_d,
-                                                get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, 0));
+                                                get_cudecomp_datatype(complex_t(0)), nullptr, nullptr,
+                                                nullptr, nullptr, 0));
     }
 #ifdef R2C
     CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2r_x, output, output_r, CUFFT_INVERSE));
diff --git a/examples/cc/basic_usage/basic_usage.cu b/examples/cc/basic_usage/basic_usage.cu
index e91062b..c151012 100644
--- a/examples/cc/basic_usage/basic_usage.cu
+++ b/examples/cc/basic_usage/basic_usage.cu
@@ -157,15 +157,15 @@ int main(int argc, char** argv) {
   // Get X-pencil information (with halo elements)
   cudecompPencilInfo_t pinfo_x;
   int32_t halo_extents_x[3]{1, 1, 1};
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x, nullptr));
 
   // Get Y-pencil information
   cudecompPencilInfo_t pinfo_y;
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr, nullptr));
 
   // Get Z-pencil information
   cudecompPencilInfo_t pinfo_z;
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr, nullptr));
 
   // Allocate pencil memory
   int64_t data_num_elements = std::max(std::max(pinfo_x.size, pinfo_y.size), pinfo_z.size);
@@ -263,19 +263,19 @@ int main(int argc, char** argv) {
 
   // Transpose from X-pencils to Y-pencils.
   CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,
-                                            pinfo_x.halo_extents, nullptr, 0));
+                                            pinfo_x.halo_extents, nullptr, nullptr, nullptr, 0));
 
   // Transpose from Y-pencils to Z-pencils.
   CHECK_CUDECOMP_EXIT(
-      cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, 0));
+      cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
 
   // Transpose from Z-pencils to Y-pencils.
   CHECK_CUDECOMP_EXIT(
-      cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, 0));
+      cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
 
   // Transpose from Y-pencils to X-pencils.
   CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,
-                                            nullptr, pinfo_x.halo_extents, 0));
+                                            nullptr, pinfo_x.halo_extents, nullptr, nullptr, 0));
 
   // Updating halos
 
diff --git a/examples/cc/basic_usage/basic_usage_autotune.cu b/examples/cc/basic_usage/basic_usage_autotune.cu
index 3fc10ab..bb5e47d 100644
--- a/examples/cc/basic_usage/basic_usage_autotune.cu
+++ b/examples/cc/basic_usage/basic_usage_autotune.cu
@@ -190,15 +190,15 @@ int main(int argc, char** argv) {
   // Get X-pencil information (with halo elements).
   cudecompPencilInfo_t pinfo_x;
   int32_t halo_extents_x[3]{1, 1, 1};
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x, nullptr));
 
   // Get Y-pencil information
   cudecompPencilInfo_t pinfo_y;
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr, nullptr));
 
   // Get Z-pencil information
   cudecompPencilInfo_t pinfo_z;
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr));
+  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr, nullptr));
 
   // Allocate pencil memory
   int64_t data_num_elements = std::max(std::max(pinfo_x.size, pinfo_y.size), pinfo_z.size);
@@ -241,19 +241,19 @@ int main(int argc, char** argv) {
 
   // Transpose from X-pencils to Y-pencils.
   CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,
-                                            pinfo_x.halo_extents, nullptr, 0));
+                                            pinfo_x.halo_extents, nullptr, nullptr, nullptr, 0));
 
   // Transpose from Y-pencils to Z-pencils.
   CHECK_CUDECOMP_EXIT(
-      cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, 0));
+      cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
 
   // Transpose from Z-pencils to Y-pencils.
   CHECK_CUDECOMP_EXIT(
-      cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, 0));
+      cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
 
   // Transpose from Y-pencils to X-pencils.
   CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,
-                                            nullptr, pinfo_x.halo_extents, 0));
+                                            nullptr, pinfo_x.halo_extents, nullptr, nullptr, 0));
 
   // Updating halos
 
diff --git a/examples/cc/taylor_green/tg.cu b/examples/cc/taylor_green/tg.cu
index 564267f..dd9c195 100644
--- a/examples/cc/taylor_green/tg.cu
+++ b/examples/cc/taylor_green/tg.cu
@@ -381,16 +381,16 @@ public:
     cudecompGridDescCreate(handle, &grid_desc_r, &config, nullptr);
 
     // Get x-pencil information (real)
-    cudecompGetPencilInfo(handle, grid_desc_r, &pinfo_x_r, 0, nullptr);
+    cudecompGetPencilInfo(handle, grid_desc_r, &pinfo_x_r, 0, nullptr, nullptr);
 
     // Get x-pencil information (complex)
-    cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_x_c, 0, nullptr);
+    cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_x_c, 0, nullptr, nullptr);
 
     // Get y-pencil information (complex)
-    cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_y_c, 1, nullptr);
+    cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_y_c, 1, nullptr, nullptr);
 
     // Get z-pencil information (complex)
-    cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_z_c, 2, nullptr);
+    cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_z_c, 2, nullptr, nullptr);
 
     // Get workspace size (only complex workspace required)
     int64_t num_elements_work_c;
@@ -665,10 +665,10 @@ private:
     for (int i = 0; i < 3; ++i) {
       CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_r2c_x, U_r[i], U_c[i], CUFFT_FORWARD));
       cudecompTransposeXToY(handle, grid_desc_c, U_c[i], U_c[i], work_c, get_cudecomp_datatype(complex_t(0)), nullptr,
-                            nullptr, 0);
+                            nullptr, nullptr, nullptr, 0);
       CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2c_y, U_c[i], U_c[i], CUFFT_FORWARD));
       cudecompTransposeYToZ(handle, grid_desc_c, U_c[i], U_c[i], work_c, get_cudecomp_datatype(complex_t(0)), nullptr,
-                            nullptr, 0);
+                            nullptr, nullptr, nullptr, 0);
       CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2c_z, U_c[i], U_c[i], CUFFT_FORWARD));
     }
   }
@@ -677,10 +677,10 @@ private:
     for (int i = 0; i < 3; ++i) {
       CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2c_z, U_c[i], U_c[i], CUFFT_INVERSE));
       cudecompTransposeZToY(handle, grid_desc_c, U_c[i], U_c[i], work_c, get_cudecomp_datatype(complex_t(0)), nullptr,
-                            nullptr, 0);
+                            nullptr, nullptr, nullptr, 0);
       CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2c_y, U_c[i], U_c[i], CUFFT_INVERSE));
       cudecompTransposeYToX(handle, grid_desc_c, U_c[i], U_c[i], work_c, get_cudecomp_datatype(complex_t(0)), nullptr,
-                            nullptr, 0);
+                            nullptr, nullptr, nullptr, 0);
       CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2r_x, U_c[i], U_r[i], CUFFT_INVERSE));
     }
   }
diff --git a/include/cudecomp.h b/include/cudecomp.h
index 5e931d1..83ea503 100644
--- a/include/cudecomp.h
+++ b/include/cudecomp.h
@@ -187,12 +187,13 @@ typedef struct {
  * @brief A data structure containing geometry information about a pencil data buffer.
  */
 typedef struct {
-  int32_t shape[3];        ///< pencil shape (in local order, including halo elements)
-  int32_t lo[3];           ///< lower bound coordinates (in local order, excluding halo elements)
-  int32_t hi[3];           ///< upper bound coordinates (in local order, excluding halo elements)
+  int32_t shape[3];        ///< pencil shape (in local order, including halo and padding elements)
+  int32_t lo[3];           ///< lower bound coordinates (in local order, excluding halo and padding elements)
+  int32_t hi[3];           ///< upper bound coordinates (in local order, excluding halo and padding elements)
   int32_t order[3];        ///< data layout order (e.g. 2,1,0 means memory is ordered Z,Y,X)
   int32_t halo_extents[3]; ///< halo extents by dimension (in global order)
-  int64_t size;            ///< number of elements in pencil (including halo elements)
+  int32_t padding[3];      ///< padding by dimension (in global order)
+  int64_t size;            ///< number of elements in pencil (including halo and padding elements)
 } cudecompPencilInfo_t;
 
 // cuDecomp initialization/finalization functions
@@ -280,11 +281,15 @@ cudecompResult_t cudecompGridDescAutotuneOptionsSetDefaults(cudecompGridDescAuto
  * i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th
  * global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements,
  * one element on each side). If no halo regions are necessary, a NULL pointer can be provided in place of this array.
+ * @param[in] padding An array of three integers to define padding of the pencil, in global order. The i-th entry
+ * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If no padding is
+ * necesary, a NULL pointer can be provided in place of this array.
  *
  * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure.
  */
 cudecompResult_t cudecompGetPencilInfo(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
-                                       cudecompPencilInfo_t* pencil_info, int32_t axis, const int32_t halo_extents[]);
+                                       cudecompPencilInfo_t* pencil_info, int32_t axis, const int32_t halo_extents[],
+                                       const int32_t padding[]);
 
 /**
  * @brief Queries the required transpose workspace size, in elements, for a provided grid descriptor.
@@ -423,13 +428,19 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes
  * elements, one element on each side). If the input has no halo regions, a NULL pointer can be provided.
  * @param[in] output_halo_extents Similar to input_halo_extents, but for the output data. If the output has no halo
  * regions, a NULL pointer can be provided.
+ * @param[in] input_padding An array of three integers to define padding of the input data, in global order. The i-th entry
+ * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has
+ * no padding, a NULL pointer can be provided.
+ * @param[in] output_padding Similar to input_padding, but for the output data. If the output has no padding, a NULL pointer
+ * can be provided.
  * @param[in] stream CUDA stream to enqueue GPU operations into
  *
  * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure.
  */
 cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output,
                                        void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[],
-                                       const int32_t output_halo_extents[], cudaStream_t stream);
+                                       const int32_t output_halo_extents[], const int32_t input_padding[],
+                                       const int32_t output_padding[], cudaStream_t stream);
 
 /**
  * @brief Function to transpose data from Y-axis aligned pencils to a Z-axis aligned pencils.
@@ -446,13 +457,19 @@ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc
  * elements, one element on each side). If the input has no halo regions, a NULL pointer can be provided.
  * @param[in] output_halo_extents Similar to input_halo_extents, but for the output data. If the output has no halo
  * regions, a NULL pointer can be provided.
+ * @param[in] input_padding An array of three integers to define padding of the input data, in global order. The i-th entry
+ * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has
+ * no padding, a NULL pointer can be provided.
+ * @param[in] output_padding Similar to input_padding, but for the output data. If the output has no padding, a NULL pointer
+ * can be provided.
  * @param[in] stream CUDA stream to enqueue GPU operations into
  *
  * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure.
  */
 cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output,
                                        void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[],
-                                       const int32_t output_halo_extents[], cudaStream_t stream);
+                                       const int32_t output_halo_extents[], const int32_t input_padding[],
+                                       const int32_t output_padding[], cudaStream_t stream);
 
 /**
  * @brief Function to transpose data from Z-axis aligned pencils to a Y-axis aligned pencils.
@@ -469,13 +486,19 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc
  * elements, one element on each side). If the input has no halo regions, a NULL pointer can be provided.
  * @param[in] output_halo_extents Similar to input_halo_extents, but for the output data. If the output has no halo
  * regions, a NULL pointer can be provided.
+ * @param[in] input_padding An array of three integers to define padding of the input data, in global order. The i-th entry
+ * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has
+ * no padding, a NULL pointer can be provided.
+ * @param[in] output_padding Similar to input_padding, but for the output data. If the output has no padding, a NULL pointer
+ * can be provided.
  * @param[in] stream CUDA stream to enqueue GPU operations into
  *
  * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure.
  */
 cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output,
                                        void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[],
-                                       const int32_t output_halo_extents[], cudaStream_t stream);
+                                       const int32_t output_halo_extents[], const int32_t input_padding[],
+                                       const int32_t output_padding[], cudaStream_t stream);
 
 /**
  * @brief Function to transpose data from Y-axis aligned pencils to a X-axis aligned pencils.
@@ -492,13 +515,19 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc
  * elements, one element on each side). If the input has no halo regions, a NULL pointer can be provided.
  * @param[in] output_halo_extents Similar to input_halo_extents, but for the output data. If the output has no halo
  * regions, a NULL pointer can be provided.
+ * @param[in] input_padding An array of three integers to define padding of the input data, in global order. The i-th entry
+ * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has
+ * no padding, a NULL pointer can be provided.
+ * @param[in] output_padding Similar to input_padding, but for the output data. If the output has no padding, a NULL pointer
+ * can be provided.
  * @param[in] stream CUDA stream to enqueue GPU operations into
  *
  * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure.
  */
 cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output,
                                        void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[],
-                                       const int32_t output_halo_extents[], cudaStream_t stream);
+                                       const int32_t output_halo_extents[], const int32_t input_padding[],
+                                       const int32_t output_padding[], cudaStream_t stream);
 
 // Halo functions
 /**
diff --git a/include/internal/halo.h b/include/internal/halo.h
index 8cd6aa3..2a922d3 100644
--- a/include/internal/halo.h
+++ b/include/internal/halo.h
@@ -55,7 +55,7 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
 
   // Get pencil info
   cudecompPencilInfo_t pinfo_h;
-  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data()));
+  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data(), nullptr));
 
   // Get global ordered shapes
   auto shape_g_h = getShapeG(pinfo_h);
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index bfc402e..610e8ac 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -163,7 +163,8 @@ static void localPermute(const cudecompHandle_t handle, const std::array<int64_t
 template <typename T>
 static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
                                T* input, T* output, T* work, const int32_t input_halo_extents_ptr[] = nullptr,
-                               const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) {
+                               const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr,
+                               const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) {
 
   std::array<int32_t, 3> input_halo_extents{};
   std::array<int32_t, 3> output_halo_extents{};
@@ -171,11 +172,16 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
   if (output_halo_extents_ptr)
     std::copy(output_halo_extents_ptr, output_halo_extents_ptr + 3, output_halo_extents.begin());
 
+  std::array<int32_t, 3> input_padding{};
+  std::array<int32_t, 3> output_padding{};
+  if (input_padding_ptr) std::copy(input_padding_ptr, input_padding_ptr + 3, input_padding.begin());
+  if (output_padding_ptr) std::copy(output_padding_ptr, output_padding_ptr + 3, output_padding.begin());
+
   bool fwd = dir > 0;
 
   bool inplace = (input == output);
-  bool input_has_halos = anyNonzeros(input_halo_extents);
-  bool output_has_halos = anyNonzeros(output_halo_extents);
+  bool input_has_halos_padding = anyNonzeros(input_halo_extents) || anyNonzeros(input_padding);
+  bool output_has_halos_padding = anyNonzeros(output_halo_extents) || anyNonzeros(output_padding);
   bool pipelined = isTransposeCommPipelined(grid_desc->config.transpose_comm_backend);
   int memcpy_limit = pipelined ? 1 : CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY;
 
@@ -206,11 +212,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 
   // Get pencil info
   cudecompPencilInfo_t pinfo_a, pinfo_a_h;
-  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_a, ax_a, nullptr));
-  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_a_h, ax_a, input_halo_extents.data()));
+  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_a, ax_a, nullptr, nullptr));
+  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_a_h, ax_a, input_halo_extents.data(), input_padding.data()));
   cudecompPencilInfo_t pinfo_b, pinfo_b_h;
-  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_b, ax_b, nullptr));
-  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_b_h, ax_b, output_halo_extents.data()));
+  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_b, ax_b, nullptr, nullptr));
+  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_b_h, ax_b, output_halo_extents.data(), output_padding.data()));
 
   // Check if input and output orders are the same
   bool orders_equal = true;
@@ -218,10 +224,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     if (pinfo_a.order[i] != pinfo_b.order[i]) orders_equal = false;
   }
 
-  // Check if input and output halo extents are the same
-  bool halos_equal = true;
+  // Check if input and output halo extents and padding are the same
+  bool halos_padding_equal = true;
   for (int i = 0; i < 3; ++i) {
-    if (input_halo_extents[i] != output_halo_extents[i]) halos_equal = false;
+    if (input_halo_extents[i] != output_halo_extents[i]) halos_padding_equal = false;
+    if (input_padding[i] != output_padding[i]) halos_padding_equal = false;
   }
 
   // Get global ordered shapes
@@ -248,7 +255,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     // Special cases for single rank communicators
     if (orders_equal) {
       if (inplace) {
-        if (halos_equal) {
+        if (halos_padding_equal) {
           // Single rank, in place, Pack -> Unpack: No transpose necessary.
           return;
         }
@@ -295,11 +302,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
     }
 
     if (enable) {
-      if (pinfo_a.order[2] == ax_a && !input_has_halos) {
+      if (pinfo_a.order[2] == ax_a && !input_has_halos_padding) {
         // Input is already packed for all to all, skip pack
         o1 = input;
         o2 = work;
-      } else if (pinfo_a.order[2] == ax_b && orders_equal && !output_has_halos) {
+      } else if (pinfo_a.order[2] == ax_b && orders_equal && !output_has_halos_padding) {
         // Output of all to all is in correct orientation, skip unpack
         o2 = output;
       }
@@ -728,40 +735,44 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
 template <typename T>
 void cudecompTransposeXToY(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output,
                            T* work, const int32_t input_halo_extents_ptr[] = nullptr,
-                           const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) {
+                           const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr,
+                           const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) {
   nvtx::rangePush("cudecompTransposeXToY");
   cudecompTranspose_(0, 1, handle, grid_desc, input, output, work, input_halo_extents_ptr, output_halo_extents_ptr,
-                     stream);
+                     input_padding_ptr, output_padding_ptr, stream);
   nvtx::rangePop();
 }
 
 template <typename T>
 void cudecompTransposeYToZ(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output,
                            T* work, const int32_t input_halo_extents_ptr[] = nullptr,
-                           const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) {
+                           const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr,
+                           const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) {
   nvtx::rangePush("cudecompTransposeYToZ");
   cudecompTranspose_(1, 1, handle, grid_desc, input, output, work, input_halo_extents_ptr, output_halo_extents_ptr,
-                     stream);
+                     input_padding_ptr, output_padding_ptr, stream);
   nvtx::rangePop();
 }
 
 template <typename T>
 void cudecompTransposeZToY(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output,
                            T* work, const int32_t input_halo_extents_ptr[] = nullptr,
-                           const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) {
+                           const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr,
+                           const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) {
   nvtx::rangePush("cudecompTransposeZToY");
   cudecompTranspose_(2, -1, handle, grid_desc, input, output, work, input_halo_extents_ptr, output_halo_extents_ptr,
-                     stream);
+                     input_padding_ptr, output_padding_ptr, stream);
   nvtx::rangePop();
 }
 
 template <typename T>
 void cudecompTransposeYToX(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output,
                            T* work, const int32_t input_halo_extents_ptr[] = nullptr,
-                           const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) {
+                           const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr,
+                           const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) {
   nvtx::rangePush("cudecompTransposeYToX");
   cudecompTranspose_(1, -1, handle, grid_desc, input, output, work, input_halo_extents_ptr, output_halo_extents_ptr,
-                     stream);
+                     input_padding_ptr, output_padding_ptr, stream);
   nvtx::rangePop();
 }
 
diff --git a/src/autotune.cc b/src/autotune.cc
index 5248768..9df20b3 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -154,9 +154,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
 
     cudecompPencilInfo_t pinfo_x, pinfo_y, pinfo_z;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, nullptr));
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr));
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, nullptr, nullptr));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr, nullptr));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr, nullptr));
 
     // Skip any decompositions with empty pencils
     if (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) ||
@@ -268,22 +268,22 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
         if (options->transpose_op_weights[0] != 0.0) {
           CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, data,
                                                options->transpose_use_inplace_buffers[0] ? data : data2, w,
-                                               options->dtype, nullptr, nullptr, 0));
+                                               options->dtype, nullptr, nullptr, nullptr, nullptr, 0));
         }
         if (options->transpose_op_weights[1] != 0.0) {
           CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, data,
                                                options->transpose_use_inplace_buffers[1] ? data : data2, w,
-                                               options->dtype, nullptr, nullptr, 0));
+                                               options->dtype, nullptr, nullptr, nullptr, nullptr, 0));
         }
         if (options->transpose_op_weights[2] != 0.0) {
           CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, data,
                                                options->transpose_use_inplace_buffers[2] ? data : data2, w,
-                                               options->dtype, nullptr, nullptr, 0));
+                                               options->dtype, nullptr, nullptr, nullptr, nullptr, 0));
         }
         if (options->transpose_op_weights[3] != 0.0) {
           CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, data,
                                                options->transpose_use_inplace_buffers[3] ? data : data2, w,
-                                               options->dtype, nullptr, nullptr, 0));
+                                               options->dtype, nullptr, nullptr, nullptr, nullptr, 0));
         }
       }
 
@@ -303,25 +303,25 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
         if (options->transpose_op_weights[0] != 0.0) {
           CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, data,
                                                options->transpose_use_inplace_buffers[0] ? data : data2, w,
-                                               options->dtype, nullptr, nullptr, 0));
+                                               options->dtype, nullptr, nullptr, nullptr, nullptr, 0));
         }
         CHECK_CUDA(cudaEventRecord(events[1], 0));
         if (options->transpose_op_weights[1] != 0.0) {
           CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, data,
                                                options->transpose_use_inplace_buffers[1] ? data : data2, w,
-                                               options->dtype, nullptr, nullptr, 0));
+                                               options->dtype, nullptr, nullptr, nullptr, nullptr, 0));
         }
         CHECK_CUDA(cudaEventRecord(events[2], 0));
         if (options->transpose_op_weights[2] != 0.0) {
           CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, data,
                                                options->transpose_use_inplace_buffers[2] ? data : data2, w,
-                                               options->dtype, nullptr, nullptr, 0));
+                                               options->dtype, nullptr, nullptr, nullptr, nullptr, 0));
         }
         CHECK_CUDA(cudaEventRecord(events[3], 0));
         if (options->transpose_op_weights[3] != 0.0) {
           CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, data,
                                                options->transpose_use_inplace_buffers[3] ? data : data2, w,
-                                               options->dtype, nullptr, nullptr, 0));
+                                               options->dtype, nullptr, nullptr, nullptr, nullptr, 0));
         }
         CHECK_CUDA(cudaEventRecord(events[4], 0));
         CHECK_CUDA(cudaDeviceSynchronize());
@@ -536,7 +536,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
     grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
 
     cudecompPencilInfo_t pinfo;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, options->halo_axis, options->halo_extents));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, options->halo_axis, options->halo_extents, nullptr));
 
     // Skip any decompositions with empty pencils
     if (std::max(grid_desc->config.pdims[0], grid_desc->config.pdims[1]) >
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index c1d5bbc..8af3643 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -774,7 +774,8 @@ cudecompResult_t cudecompGridDescAutotuneOptionsSetDefaults(cudecompGridDescAuto
 }
 
 cudecompResult_t cudecompGetPencilInfo(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
-                                       cudecompPencilInfo_t* pencil_info, int32_t axis, const int32_t halo_extents[]) {
+                                       cudecompPencilInfo_t* pencil_info, int32_t axis, const int32_t halo_extents[],
+                                       const int32_t padding[]) {
   using namespace cudecomp;
   try {
     checkHandle(handle);
@@ -817,6 +818,13 @@ cudecompResult_t cudecompGetPencilInfo(cudecompHandle_t handle, cudecompGridDesc
       } else {
         pencil_info->halo_extents[i] = 0;
       }
+
+      if (padding) {
+        pencil_info->shape[ord] += padding[i];
+        pencil_info->padding[i] = padding[i];
+      } else {
+        pencil_info->padding[i] = 0;
+      }
       pencil_info->size *= pencil_info->shape[ord];
     }
 
@@ -888,7 +896,7 @@ cudecompResult_t cudecompGetHaloWorkspaceSize(cudecompHandle_t handle, cudecompG
     if (!halo_extents) { THROW_INVALID_USAGE("halo_extents argument cannot be null."); }
     if (!workspace_size) { THROW_INVALID_USAGE("workspace_size argument cannot be null."); }
     cudecompPencilInfo_t pinfo;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, axis, halo_extents));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, axis, halo_extents, nullptr));
     auto shape_g = getShapeG(pinfo);
     size_t halo_size_x = 4 * shape_g[1] * shape_g[2] * pinfo.halo_extents[0];
     size_t halo_size_y = 4 * shape_g[0] * shape_g[2] * pinfo.halo_extents[1];
@@ -1114,7 +1122,7 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes
     }
 
     cudecompPencilInfo_t pinfo;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, axis, nullptr));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, axis, nullptr, nullptr));
 
     int count = 0;
     for (int i = 0; i < 3; ++i) {
@@ -1144,7 +1152,8 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes
 
 cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output,
                                        void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[],
-                                       const int32_t output_halo_extents[], cudaStream_t stream) {
+                                       const int32_t output_halo_extents[], const int32_t input_padding[],
+                                       const int32_t output_padding[], cudaStream_t stream) {
   using namespace cudecomp;
   try {
     checkHandle(handle);
@@ -1156,23 +1165,25 @@ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompTransposeXToY(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
-                            reinterpret_cast<float*>(work), input_halo_extents, output_halo_extents, stream);
+                            reinterpret_cast<float*>(work), input_halo_extents, output_halo_extents,
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_DOUBLE:
       cudecompTransposeXToY(handle, grid_desc, reinterpret_cast<double*>(input), reinterpret_cast<double*>(output),
-                            reinterpret_cast<double*>(work), input_halo_extents, output_halo_extents, stream);
+                            reinterpret_cast<double*>(work), input_halo_extents, output_halo_extents,
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_FLOAT_COMPLEX:
       cudecompTransposeXToY(handle, grid_desc, reinterpret_cast<cuda::std::complex<float>*>(input),
                             reinterpret_cast<cuda::std::complex<float>*>(output),
                             reinterpret_cast<cuda::std::complex<float>*>(work), input_halo_extents, output_halo_extents,
-                            stream);
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_DOUBLE_COMPLEX:
       cudecompTransposeXToY(handle, grid_desc, reinterpret_cast<cuda::std::complex<double>*>(input),
                             reinterpret_cast<cuda::std::complex<double>*>(output),
                             reinterpret_cast<cuda::std::complex<double>*>(work), input_halo_extents,
-                            output_halo_extents, stream);
+                            output_halo_extents, input_padding, output_padding, stream);
       break;
     }
   } catch (const cudecomp::BaseException& e) {
@@ -1184,7 +1195,8 @@ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc
 
 cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output,
                                        void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[],
-                                       const int32_t output_halo_extents[], cudaStream_t stream) {
+                                       const int32_t output_halo_extents[], const int32_t input_padding[],
+                                       const int32_t output_padding[], cudaStream_t stream) {
   using namespace cudecomp;
   try {
     checkHandle(handle);
@@ -1196,26 +1208,27 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
-                            reinterpret_cast<float*>(work), input_halo_extents, output_halo_extents, stream);
+                            reinterpret_cast<float*>(work), input_halo_extents, output_halo_extents,
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_DOUBLE:
       cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast<double*>(input), reinterpret_cast<double*>(output),
-                            reinterpret_cast<double*>(work), input_halo_extents, output_halo_extents, stream);
+                            reinterpret_cast<double*>(work), input_halo_extents, output_halo_extents,
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_FLOAT_COMPLEX:
       cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast<cuda::std::complex<float>*>(input),
                             reinterpret_cast<cuda::std::complex<float>*>(output),
                             reinterpret_cast<cuda::std::complex<float>*>(work), input_halo_extents, output_halo_extents,
-                            stream);
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_DOUBLE_COMPLEX:
       cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast<cuda::std::complex<double>*>(input),
                             reinterpret_cast<cuda::std::complex<double>*>(output),
                             reinterpret_cast<cuda::std::complex<double>*>(work), input_halo_extents,
-                            output_halo_extents, stream);
+                            output_halo_extents, input_padding, output_padding, stream);
       break;
     }
-
   } catch (const cudecomp::BaseException& e) {
     std::cerr << e.what();
     return e.getResult();
@@ -1225,7 +1238,8 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc
 
 cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output,
                                        void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[],
-                                       const int32_t output_halo_extents[], cudaStream_t stream) {
+                                       const int32_t output_halo_extents[], const int32_t input_padding[],
+                                       const int32_t output_padding[], cudaStream_t stream) {
   using namespace cudecomp;
   try {
     checkHandle(handle);
@@ -1237,26 +1251,27 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompTransposeZToY(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
-                            reinterpret_cast<float*>(work), input_halo_extents, output_halo_extents, stream);
+                            reinterpret_cast<float*>(work), input_halo_extents, output_halo_extents,
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_DOUBLE:
       cudecompTransposeZToY(handle, grid_desc, reinterpret_cast<double*>(input), reinterpret_cast<double*>(output),
-                            reinterpret_cast<double*>(work), input_halo_extents, output_halo_extents, stream);
+                            reinterpret_cast<double*>(work), input_halo_extents, output_halo_extents,
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_FLOAT_COMPLEX:
       cudecompTransposeZToY(handle, grid_desc, reinterpret_cast<cuda::std::complex<float>*>(input),
                             reinterpret_cast<cuda::std::complex<float>*>(output),
                             reinterpret_cast<cuda::std::complex<float>*>(work), input_halo_extents, output_halo_extents,
-                            stream);
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_DOUBLE_COMPLEX:
       cudecompTransposeZToY(handle, grid_desc, reinterpret_cast<cuda::std::complex<double>*>(input),
                             reinterpret_cast<cuda::std::complex<double>*>(output),
                             reinterpret_cast<cuda::std::complex<double>*>(work), input_halo_extents,
-                            output_halo_extents, stream);
+                            output_halo_extents, input_padding, output_padding, stream);
       break;
     }
-
   } catch (const cudecomp::BaseException& e) {
     std::cerr << e.what();
     return e.getResult();
@@ -1266,7 +1281,8 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc
 
 cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output,
                                        void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[],
-                                       const int32_t output_halo_extents[], cudaStream_t stream) {
+                                       const int32_t output_halo_extents[], const int32_t input_padding[],
+                                       const int32_t output_padding[], cudaStream_t stream) {
   using namespace cudecomp;
   try {
     checkHandle(handle);
@@ -1278,26 +1294,27 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompTransposeYToX(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(output),
-                            reinterpret_cast<float*>(work), input_halo_extents, output_halo_extents, stream);
+                            reinterpret_cast<float*>(work), input_halo_extents, output_halo_extents,
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_DOUBLE:
       cudecompTransposeYToX(handle, grid_desc, reinterpret_cast<double*>(input), reinterpret_cast<double*>(output),
-                            reinterpret_cast<double*>(work), input_halo_extents, output_halo_extents, stream);
+                            reinterpret_cast<double*>(work), input_halo_extents, output_halo_extents,
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_FLOAT_COMPLEX:
       cudecompTransposeYToX(handle, grid_desc, reinterpret_cast<cuda::std::complex<float>*>(input),
                             reinterpret_cast<cuda::std::complex<float>*>(output),
                             reinterpret_cast<cuda::std::complex<float>*>(work), input_halo_extents, output_halo_extents,
-                            stream);
+                            input_padding, output_padding, stream);
       break;
     case CUDECOMP_DOUBLE_COMPLEX:
       cudecompTransposeYToX(handle, grid_desc, reinterpret_cast<cuda::std::complex<double>*>(input),
                             reinterpret_cast<cuda::std::complex<double>*>(output),
                             reinterpret_cast<cuda::std::complex<double>*>(work), input_halo_extents,
-                            output_halo_extents, stream);
+                            output_halo_extents, input_padding, output_padding, stream);
       break;
     }
-
   } catch (const cudecomp::BaseException& e) {
     std::cerr << e.what();
     return e.getResult();
diff --git a/src/cudecomp_m.cuf b/src/cudecomp_m.cuf
index a9d913b..d96022e 100644
--- a/src/cudecomp_m.cuf
+++ b/src/cudecomp_m.cuf
@@ -157,6 +157,7 @@ module cudecomp
     integer(c_int32_t) :: hi(3)           ! upper bound coordinates (in local order, excluding halo elements)
     integer(c_int32_t) :: order(3)        ! data layout order (e.g. 3,2,1 means memory is ordered z,y,x)
     integer(c_int32_t) :: halo_extents(3) ! halo extents by dimension (always in x,y,z order)
+    integer(c_int32_t) :: padding(3)      ! padding by dimension (always in x,y,z order)
     integer(c_int64_t) :: size            ! number of elements in pencil (including halo elements)
   end type cudecompPencilInfo
 
@@ -258,13 +259,14 @@ module cudecomp
 
   ! General functions
   interface
-    function cudecompGetPencilInfoC(handle, grid_desc, pencil_info, axis, halo_extents) &
+    function cudecompGetPencilInfoC(handle, grid_desc, pencil_info, axis, halo_extents, padding) &
       bind(C, name="cudecompGetPencilInfo") result(res)
       import
       type(cudecompHandle), value :: handle
       type(cudecompGridDesc), value :: grid_desc
       integer(c_int32_t), value :: axis
       integer(c_int32_t) :: halo_extents(3)
+      integer(c_int32_t) :: padding(3)
       type(cudecompPencilInfo) :: pencil_info
       integer(c_int) :: res
     end function cudecompGetPencilInfoC
@@ -380,7 +382,8 @@ module cudecomp
   ! Transpose functions
   interface
     function cudecompTransposeXToY_C(handle, grid_desc, input, output, work, dtype, &
-                                     input_halo_extents, output_halo_extents, stream) &
+                                     input_halo_extents, output_halo_extents, input_padding, &
+                                     output_padding, stream) &
       bind(C, name="cudecompTransposeXToY") result(res)
       import
       type(cudecompHandle), value :: handle
@@ -389,6 +392,7 @@ module cudecomp
       real(c_float), device :: input(*), output(*), work(*)
       integer(c_int), value :: dtype
       integer(c_int32_t) :: input_halo_extents(3), output_halo_extents(3)
+      integer(c_int32_t) :: input_padding(3), output_padding(3)
       integer(cuda_stream_kind), value :: stream
       integer(c_int) :: res
     end function cudecompTransposeXToY_C
@@ -396,7 +400,8 @@ module cudecomp
 
   interface
     function cudecompTransposeYToZ_C(handle, grid_desc, input, output, work, dtype, &
-                                     input_halo_extents, output_halo_extents, stream) &
+                                     input_halo_extents, output_halo_extents, input_padding, &
+                                     output_padding, stream) &
       bind(C, name="cudecompTransposeYToZ") result(res)
       import
       type(cudecompHandle), value :: handle
@@ -405,6 +410,7 @@ module cudecomp
       real(c_float), device :: input(*), output(*), work(*)
       integer(c_int), value :: dtype
       integer(c_int32_t) :: input_halo_extents(3), output_halo_extents(3)
+      integer(c_int32_t) :: input_padding(3), output_padding(3)
       integer(cuda_stream_kind), value :: stream
       integer(c_int) :: res
     end function cudecompTransposeYToZ_C
@@ -412,7 +418,8 @@ module cudecomp
 
   interface
     function cudecompTransposeZToY_C(handle, grid_desc, input, output, work, dtype, &
-                                     input_halo_extents, output_halo_extents, stream) &
+                                     input_halo_extents, output_halo_extents, input_padding, &
+                                     output_padding, stream) &
        bind(C, name="cudecompTransposeZToY") result(res)
       import
       type(cudecompHandle), value :: handle
@@ -421,6 +428,7 @@ module cudecomp
       real(c_float), device :: input(*), output(*), work(*)
       integer(c_int), value :: dtype
       integer(c_int32_t) :: input_halo_extents(3), output_halo_extents(3)
+      integer(c_int32_t) :: input_padding(3), output_padding(3)
       integer(cuda_stream_kind), value :: stream
       integer(c_int) :: res
     end function cudecompTransposeZToY_C
@@ -428,7 +436,8 @@ module cudecomp
 
   interface
     function cudecompTransposeYToX_C(handle, grid_desc, input, output, work, dtype, &
-                                     input_halo_extents, output_halo_extents, stream) &
+                                     input_halo_extents, output_halo_extents, input_padding, &
+                                     output_padding, stream) &
       bind(C, name="cudecompTransposeYToX") result(res)
       import
       type(cudecompHandle), value :: handle
@@ -437,6 +446,7 @@ module cudecomp
       real(c_float), device :: input(*), output(*), work(*)
       integer(c_int), value :: dtype
       integer(c_int32_t) :: input_halo_extents(3), output_halo_extents(3)
+      integer(c_int32_t) :: input_padding(3), output_padding(3)
       integer(cuda_stream_kind), value :: stream
       integer(c_int) :: res
     end function cudecompTransposeYToX_C
@@ -593,20 +603,24 @@ contains
   end function cudecompGridDescCreate
 
   ! General functions
-  function cudecompGetPencilInfo(handle, grid_desc, pencil_info, axis, halo_extents) result(res)
+  function cudecompGetPencilInfo(handle, grid_desc, pencil_info, axis, halo_extents, padding) result(res)
     implicit none
     type(cudecompHandle) :: handle
     type(cudecompGridDesc) :: grid_desc
     integer :: axis  ! unit offset, so x/y/z = 1/2/3
     integer, optional:: halo_extents(3)
+    integer, optional:: padding(3)
     type(cudecompPencilInfo) :: pencil_info  ! res%order is unit offset, x/y/z = 1/2/3
     integer(c_int) :: res
 
     integer :: halo_extents_(3)
+    integer :: padding_(3)
 
     halo_extents_(:) = [0, 0, 0]
+    padding_(:) = [0, 0, 0]
     if (present(halo_extents)) halo_extents_ = halo_extents
-    res = cudecompGetPencilInfoC(handle, grid_desc, pencil_info, axis - 1, halo_extents_)
+    if (present(padding)) padding_ = padding
+    res = cudecompGetPencilInfoC(handle, grid_desc, pencil_info, axis - 1, halo_extents_, padding_)
     ! Update entries for Fortran indexing
     pencil_info%order = pencil_info%order + 1
     pencil_info%lo = pencil_info%lo + 1
@@ -786,7 +800,7 @@ contains
   ! Transpose functions
   function cudecompTransposeXToY(handle, grid_desc, &
        input, output, work, dtype, input_halo_extents, output_halo_extents, &
-       stream) result(res)
+       input_padding, output_padding, stream) result(res)
     implicit none
     type(cudecompHandle) :: handle
     type(cudecompGridDesc) :: grid_desc
@@ -796,26 +810,34 @@ contains
     integer(cuda_stream_kind), optional :: stream
     integer, optional :: input_halo_extents(3)
     integer, optional :: output_halo_extents(3)
+    integer, optional :: input_padding(3)
+    integer, optional :: output_padding(3)
     integer(c_int) :: res
 
     integer(cuda_stream_kind) :: stream_
     integer :: input_halo_extents_(3)
     integer :: output_halo_extents_(3)
+    integer :: input_padding_(3)
+    integer :: output_padding_(3)
 
     stream_ = 0
     input_halo_extents_(:) = [0, 0, 0]
     output_halo_extents_(:) = [0, 0, 0]
+    input_padding_(:) = [0, 0, 0]
+    output_padding_(:) = [0, 0, 0]
     if (present(stream)) stream_ = stream
     if (present(input_halo_extents)) input_halo_extents_ = input_halo_extents
     if (present(output_halo_extents)) output_halo_extents_ = output_halo_extents
+    if (present(input_padding)) input_padding_ = input_padding
+    if (present(output_padding)) output_padding_ = output_padding
     res = cudecompTransposeXToY_C(handle, grid_desc, &
           input, output, work, dtype, input_halo_extents_, output_halo_extents_, &
-          stream_)
+          input_padding_, output_padding_, stream_)
   end function cudecompTransposeXToY
 
   function cudecompTransposeYToZ(handle, grid_desc, &
        input, output, work, dtype, input_halo_extents, output_halo_extents, &
-       stream) result(res)
+       input_padding, output_padding, stream) result(res)
     implicit none
     type(cudecompHandle) :: handle
     type(cudecompGridDesc) :: grid_desc
@@ -825,26 +847,34 @@ contains
     integer(cuda_stream_kind), optional :: stream
     integer, optional :: input_halo_extents(3)
     integer, optional :: output_halo_extents(3)
+    integer, optional :: input_padding(3)
+    integer, optional :: output_padding(3)
     integer(c_int) :: res
 
     integer(cuda_stream_kind) :: stream_
     integer :: input_halo_extents_(3)
     integer :: output_halo_extents_(3)
+    integer :: input_padding_(3)
+    integer :: output_padding_(3)
 
     stream_ = 0
     input_halo_extents_(:) = [0, 0, 0]
     output_halo_extents_(:) = [0, 0, 0]
+    input_padding_(:) = [0, 0, 0]
+    output_padding_(:) = [0, 0, 0]
     if (present(stream)) stream_ = stream
     if (present(input_halo_extents)) input_halo_extents_ = input_halo_extents
     if (present(output_halo_extents)) output_halo_extents_ = output_halo_extents
+    if (present(input_padding)) input_padding_ = input_padding
+    if (present(output_padding)) output_padding_ = output_padding
     res = cudecompTransposeYToZ_C(handle, grid_desc, &
           input, output, work, dtype, input_halo_extents_, output_halo_extents_, &
-          stream_)
+          input_padding_, output_padding_, stream_)
   end function cudecompTransposeYToZ
 
   function cudecompTransposeZToY(handle, grid_desc, &
        input, output, work, dtype, input_halo_extents, output_halo_extents, &
-       stream) result(res)
+       input_padding, output_padding, stream) result(res)
     implicit none
     type(cudecompHandle) :: handle
     type(cudecompGridDesc) :: grid_desc
@@ -854,26 +884,34 @@ contains
     integer(cuda_stream_kind), optional :: stream
     integer, optional :: input_halo_extents(3)
     integer, optional :: output_halo_extents(3)
+    integer, optional :: input_padding(3)
+    integer, optional :: output_padding(3)
     integer(c_int) :: res
 
     integer(cuda_stream_kind) :: stream_
     integer :: input_halo_extents_(3)
     integer :: output_halo_extents_(3)
+    integer :: input_padding_(3)
+    integer :: output_padding_(3)
 
     stream_ = 0
     input_halo_extents_(:) = [0, 0, 0]
     output_halo_extents_(:) = [0, 0, 0]
+    input_padding_(:) = [0, 0, 0]
+    output_padding_(:) = [0, 0, 0]
     if (present(stream)) stream_ = stream
     if (present(input_halo_extents)) input_halo_extents_ = input_halo_extents
     if (present(output_halo_extents)) output_halo_extents_ = output_halo_extents
+    if (present(input_padding)) input_padding_ = input_padding
+    if (present(output_padding)) output_padding_ = output_padding
     res = cudecompTransposeZToY_C(handle, grid_desc, &
           input, output, work, dtype, input_halo_extents_, output_halo_extents_, &
-          stream_)
+          input_padding_, output_padding_, stream_)
   end function cudecompTransposeZToY
 
   function cudecompTransposeYToX(handle, grid_desc, &
        input, output, work, dtype, input_halo_extents, output_halo_extents, &
-       stream) result(res)
+       input_padding, output_padding ,stream) result(res)
     implicit none
     type(cudecompHandle) :: handle
     type(cudecompGridDesc) :: grid_desc
@@ -883,21 +921,29 @@ contains
     integer(cuda_stream_kind), optional :: stream
     integer, optional :: input_halo_extents(3)
     integer, optional :: output_halo_extents(3)
+    integer, optional :: input_padding(3)
+    integer, optional :: output_padding(3)
     integer(c_int) :: res
 
     integer(cuda_stream_kind) :: stream_
     integer :: input_halo_extents_(3)
     integer :: output_halo_extents_(3)
+    integer :: input_padding_(3)
+    integer :: output_padding_(3)
 
     stream_ = 0
     input_halo_extents_(:) = [0, 0, 0]
     output_halo_extents_(:) = [0, 0, 0]
+    input_padding_(:) = [0, 0, 0]
+    output_padding_(:) = [0, 0, 0]
     if (present(stream)) stream_ = stream
     if (present(input_halo_extents)) input_halo_extents_ = input_halo_extents
     if (present(output_halo_extents)) output_halo_extents_ = output_halo_extents
+    if (present(input_padding)) input_padding_ = input_padding
+    if (present(output_padding)) output_padding_ = output_padding
     res = cudecompTransposeYToX_C(handle, grid_desc, &
           input, output, work, dtype, input_halo_extents_, output_halo_extents_, &
-          stream_)
+          input_padding_, output_padding_, stream_)
   end function cudecompTransposeYToX
 
   ! Halo functions
diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc
index 10c69cb..86e175c 100644
--- a/tests/cc/halo_test.cc
+++ b/tests/cc/halo_test.cc
@@ -401,7 +401,7 @@ static int run_test(const std::string& arguments, bool silent) {
 
     // Get pencil information
     cudecompPencilInfo_t pinfo;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, args.axis, args.halo_extents.data()));
+    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo, args.axis, args.halo_extents.data(), nullptr));
 
     // Get workspace size
     int64_t workspace_num_elements;
diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc
index 1595199..88fe76a 100644
--- a/tests/cc/transpose_test.cc
+++ b/tests/cc/transpose_test.cc
@@ -123,9 +123,9 @@ static bool compare_pencils(const std::vector<real_t>& ref, const std::vector<re
       lx[2] = i / (pinfo.shape[0] * pinfo.shape[1]);
 
       // Only compare values inside internal region
-      if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]]) &&
-          lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]]) &&
-          lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]])) {
+      if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]]) &&
+          lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]]) &&
+          lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]])) {
         return false;
       }
     }
@@ -150,9 +150,9 @@ static void initialize_pencil(std::vector<real_t>& ref, const cudecompPencilInfo
     int64_t gi = gx[0] + gdims[0] * (gx[1] + gx[2] * gdims[1]);
 
     // Only set values inside internal region
-    if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]]) &&
-        lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]]) &&
-        lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]])) {
+    if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]]) &&
+        lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]]) &&
+        lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]])) {
       ref[i] = gi;
     } else {
       ref[i] = -1;
@@ -198,6 +198,12 @@ static void usage(const char* pname) {
           "\t\tY-pencil halo_extents setting. (default: 0 0 0) \n"
           "\t--hez\n"
           "\t\tZ-pencil halo_extents setting. (default: 0 0 0) \n"
+          "\t--pdx\n"
+          "\t\tX-pencil padding setting. (default: 0 0 0) \n"
+          "\t--pdy\n"
+          "\t\tY-pencil padding setting. (default: 0 0 0) \n"
+          "\t--pdz\n"
+          "\t\tZ-pencil padding setting. (default: 0 0 0) \n"
           "\t--mem_order\n"
           "\t\ttranspose_mem_order setting. (default: unset) \n"
           "\t-m|--use-managed-memory\n"
@@ -220,6 +226,9 @@ struct transposeTestArgs {
   std::array<int, 3> halo_extents_x{};
   std::array<int, 3> halo_extents_y{};
   std::array<int, 3> halo_extents_z{};
+  std::array<int, 3> padding_x{};
+  std::array<int, 3> padding_y{};
+  std::array<int, 3> padding_z{};
   bool out_of_place = false;
   bool use_managed_memory = false;
   std::array<int, 9> mem_order{-1, -1, -1, -1, -1, -1, -1, -1, -1};
@@ -248,6 +257,9 @@ static transposeTestArgs parse_arguments(const std::string& arguments) {
                                            {"hex", required_argument, 0, '7'},
                                            {"hey", required_argument, 0, '8'},
                                            {"hez", required_argument, 0, '9'},
+                                           {"pdx", required_argument, 0, '&'},
+                                           {"pdy", required_argument, 0, '*'},
+                                           {"pdz", required_argument, 0, '('},
                                            {"mem_order", required_argument, 0, 'q'},
                                            {"out-of-place", no_argument, 0, 'o'},
                                            {"use-managed-memory", no_argument, 0, 'm'},
@@ -255,7 +267,7 @@ static transposeTestArgs parse_arguments(const std::string& arguments) {
                                            {0, 0, 0, 0}};
 
     int option_index = 0;
-    int ch = getopt_long(argc, argv, "x:y:z:b:r:c:1:2:3:4:7:8:9:q:omh", long_options, &option_index);
+    int ch = getopt_long(argc, argv, "x:y:z:b:r:c:1:2:3:4:7:8:9:&:*:(:q:omh", long_options, &option_index);
     if (ch == -1) break;
 
     switch (ch) {
@@ -297,6 +309,27 @@ static transposeTestArgs parse_arguments(const std::string& arguments) {
         optind++;
       }
       break;
+    case '&':
+      optind--;
+      for (int i = 0; i < 3; ++i) {
+        args.padding_x[i] = atoi(argv[optind]);
+        optind++;
+      }
+      break;
+    case '*':
+      optind--;
+      for (int i = 0; i < 3; ++i) {
+        args.padding_y[i] = atoi(argv[optind]);
+        optind++;
+      }
+      break;
+    case '(':
+      optind--;
+      for (int i = 0; i < 3; ++i) {
+        args.padding_z[i] = atoi(argv[optind]);
+        optind++;
+      }
+      break;
     case 'o': args.out_of_place = true; break;
     case 'm': args.use_managed_memory = true; break;
     case 'q':
@@ -389,15 +422,15 @@ static int run_test(const std::string& arguments, bool silent) {
 
     // Get x-pencil information
     cudecompPencilInfo_t pinfo_x;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, args.halo_extents_x.data()));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, args.halo_extents_x.data(), args.padding_x.data()));
 
     // Get y-pencil information
     cudecompPencilInfo_t pinfo_y;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, args.halo_extents_y.data()));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, args.halo_extents_y.data(), args.padding_x.data()));
 
     // Get z-pencil information
     cudecompPencilInfo_t pinfo_z;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, args.halo_extents_z.data()));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, args.halo_extents_z.data(), args.padding_x.data()));
 
     // Get workspace size
     int64_t workspace_num_elements;
@@ -449,7 +482,7 @@ static int run_test(const std::string& arguments, bool silent) {
 
     CHECK_CUDA(cudaMemset(work_d, 0, workspace_num_elements * dtype_size));
     CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, input, output, work_d, get_cudecomp_datatype(real_t(0)),
-                                              pinfo_x.halo_extents, pinfo_y.halo_extents, 0));
+                                              pinfo_x.halo_extents, pinfo_y.halo_extents, pinfo_x.padding, pinfo_y.padding, 0));
     CHECK_CUDA(cudaMemcpy(data.data(), output, data.size() * sizeof(*output), cudaMemcpyDeviceToHost));
     if (!compare_pencils(yref, data, pinfo_y)) {
       fprintf(stderr, "FAILED cudecompTransposeXToY\n");
@@ -460,7 +493,7 @@ static int run_test(const std::string& arguments, bool silent) {
 
     CHECK_CUDA(cudaMemset(work_d, 0, workspace_num_elements * dtype_size));
     CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, input, output, work_d, get_cudecomp_datatype(real_t(0)),
-                                              pinfo_y.halo_extents, pinfo_z.halo_extents, 0));
+                                              pinfo_y.halo_extents, pinfo_z.halo_extents, pinfo_y.padding, pinfo_z.padding, 0));
     CHECK_CUDA(cudaMemcpy(data.data(), output, data.size() * sizeof(*data_d), cudaMemcpyDeviceToHost));
     if (!compare_pencils(zref, data, pinfo_z)) {
       fprintf(stderr, "FAILED cudecompTransposeYToZ\n");
@@ -471,7 +504,7 @@ static int run_test(const std::string& arguments, bool silent) {
 
     CHECK_CUDA(cudaMemset(work_d, 0, workspace_num_elements * dtype_size));
     CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, input, output, work_d, get_cudecomp_datatype(real_t(0)),
-                                              pinfo_z.halo_extents, pinfo_y.halo_extents, 0));
+                                              pinfo_z.halo_extents, pinfo_y.halo_extents, pinfo_z.padding, pinfo_y.padding, 0));
     CHECK_CUDA(cudaMemcpy(data.data(), output, data.size() * sizeof(*data_d), cudaMemcpyDeviceToHost));
     if (!compare_pencils(yref, data, pinfo_y)) {
       fprintf(stderr, "FAILED cudecompTransposeZToY\n");
@@ -482,7 +515,7 @@ static int run_test(const std::string& arguments, bool silent) {
 
     CHECK_CUDA(cudaMemset(work_d, 0, workspace_num_elements * dtype_size));
     CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, input, output, work_d, get_cudecomp_datatype(real_t(0)),
-                                              pinfo_y.halo_extents, pinfo_x.halo_extents, 0));
+                                              pinfo_y.halo_extents, pinfo_x.halo_extents, pinfo_y.padding, pinfo_x.padding, 0));
     CHECK_CUDA(cudaMemcpy(data.data(), output, data.size() * sizeof(*data_d), cudaMemcpyDeviceToHost));
     if (!compare_pencils(xref, data, pinfo_x)) {
       fprintf(stderr, "FAILED cudecompTransposeYToX\n");
diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90
index c36c7b8..9b18d15 100644
--- a/tests/fortran/transpose_test.f90
+++ b/tests/fortran/transpose_test.f90
@@ -69,12 +69,12 @@ module transpose_CUDECOMP_DOUBLE_COMPLEX_mod
   function compare_pencils(ref, res, pinfo) result(mismatch)
     implicit none
     type(cudecompPencilInfo) :: pinfo
-    ARRTYPE :: ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), &
-                   pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), &
-                   pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)))
-    ARRTYPE :: res(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), &
-                   pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), &
-                   pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)))
+    ARRTYPE :: ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), &
+                   pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), &
+                   pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3)))
+    ARRTYPE :: res(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), &
+                   pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), &
+                   pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3)))
 
     logical :: mismatch
     mismatch = any(ref(pinfo%lo(1): pinfo%hi(1), pinfo%lo(2): pinfo%hi(2), pinfo%lo(3): pinfo%hi(3)) /= &
@@ -90,10 +90,10 @@ subroutine initialize_pencil(ref, pinfo, gdims)
     integer :: gdims(3)
     integer :: gx(3)
 
-    ! Allocate reference pencil with halo regions
-    allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), &
-                 pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), &
-                 pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3))))
+    ! Allocate reference pencil with halo and padding regions
+    allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), &
+                 pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), &
+                 pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3))))
 
     ref = -1
 
@@ -175,6 +175,7 @@ function run_test(arguments, silent) result(res)
     logical :: axis_contiguous(3)
     integer :: gdims_dist(3)
     integer :: halo_extents_x(3), halo_extents_y(3), halo_extents_z(3)
+    integer :: padding_x(3), padding_y(3), padding_z(3)
     integer :: mem_order(3, 3)
     logical :: out_of_place, use_managed_memory
     integer :: pr, pc
@@ -307,6 +308,24 @@ function run_test(arguments, silent) result(res)
             read(arg, *) halo_extents_z(j)
           enddo
           skip_count = 3
+        case('--pdx')
+          do j = 1, 3
+            call get_command_argument(i+j, arg)
+            read(arg, *) padding_x(j)
+          enddo
+          skip_count = 3
+        case('--pdy')
+          do j = 1, 3
+            call get_command_argument(i+j, arg)
+            read(arg, *) padding_y(j)
+          enddo
+          skip_count = 3
+        case('--pdz')
+          do j = 1, 3
+            call get_command_argument(i+j, arg)
+            read(arg, *) padding_z(j)
+          enddo
+          skip_count = 3
         case('--mem_order')
           l = 1
           do j = 1, 3
@@ -570,7 +589,6 @@ program main
     call read_testfile(testfile, testcases)
   endif
 
-
   nfailed = 0
   allocate(failed_cases(size(testcases)))
 

From ab1add29a8a855ead99a5372a48b2f727222fe4d Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Fri, 21 Feb 2025 12:20:05 -0800
Subject: [PATCH 2/8] Add padding support to halo implementation and API.

---
 examples/cc/basic_usage/basic_usage.cu        |  6 +-
 .../cc/basic_usage/basic_usage_autotune.cu    |  6 +-
 include/cudecomp.h                            | 15 +++-
 include/internal/common.h                     |  4 +
 include/internal/halo.h                       | 78 +++++++++++--------
 include/internal/transpose.h                  |  4 -
 src/autotune.cc                               | 12 +--
 src/cudecomp.cc                               | 30 +++----
 src/cudecomp_m.cuf                            | 33 +++++---
 tests/cc/halo_test.cc                         | 35 +++++----
 tests/fortran/halo_test.f90                   | 46 ++++++-----
 11 files changed, 161 insertions(+), 108 deletions(-)

diff --git a/examples/cc/basic_usage/basic_usage.cu b/examples/cc/basic_usage/basic_usage.cu
index c151012..9201285 100644
--- a/examples/cc/basic_usage/basic_usage.cu
+++ b/examples/cc/basic_usage/basic_usage.cu
@@ -284,15 +284,15 @@ int main(int argc, char** argv) {
 
   // Update X-pencil halos in X direction
   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE,
-                                           pinfo_x.halo_extents, halo_periods, 0, 0));
+                                           pinfo_x.halo_extents, halo_periods, 0, nullptr, 0));
 
   // Update X-pencil halos in Y direction
   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE,
-                                           pinfo_x.halo_extents, halo_periods, 1, 0));
+                                           pinfo_x.halo_extents, halo_periods, 1, nullptr, 0));
 
   // Update X-pencil halos in Z direction
   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE,
-                                           pinfo_x.halo_extents, halo_periods, 2, 0));
+                                           pinfo_x.halo_extents, halo_periods, 2, nullptr, 0));
 
   // Cleanup resources
   free(data);
diff --git a/examples/cc/basic_usage/basic_usage_autotune.cu b/examples/cc/basic_usage/basic_usage_autotune.cu
index bb5e47d..a61b039 100644
--- a/examples/cc/basic_usage/basic_usage_autotune.cu
+++ b/examples/cc/basic_usage/basic_usage_autotune.cu
@@ -262,15 +262,15 @@ int main(int argc, char** argv) {
 
   // Update X-pencil halos in X direction
   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE,
-                                           pinfo_x.halo_extents, halo_periods, 0, 0));
+                                           pinfo_x.halo_extents, halo_periods, 0, nullptr, 0));
 
   // Update X-pencil halos in Y direction
   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE,
-                                           pinfo_x.halo_extents, halo_periods, 1, 0));
+                                           pinfo_x.halo_extents, halo_periods, 1, nullptr, 0));
 
   // Update X-pencil halos in Z direction
   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE,
-                                           pinfo_x.halo_extents, halo_periods, 2, 0));
+                                           pinfo_x.halo_extents, halo_periods, 2, nullptr, 0));
 
   // Cleanup resources
   free(data);
diff --git a/include/cudecomp.h b/include/cudecomp.h
index 83ea503..82f47ee 100644
--- a/include/cudecomp.h
+++ b/include/cudecomp.h
@@ -546,13 +546,16 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc
  * If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis. A NULL
  * pointer can be provided if none of the domain axes are periodic.
  * @param[in] dim Which pencil dimension (global indexed) to perform the halo update
+ * @param[in] padding An array of three integers to define padding of the input data, in global order. The i-th entry
+ * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has
+ * no padding, a NULL pointer can be provided.
  * @param[in] stream CUDA stream to enqueue GPU operations into
  *
  * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure.
  */
 cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work,
                                       cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[],
-                                      int32_t dim, cudaStream_t stream);
+                                      int32_t dim, const int32_t padding[], cudaStream_t stream);
 
 /**
  * @brief Function to perform halo communication of Y-axis aligned pencil data
@@ -569,13 +572,16 @@ cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_
  * @param[in] halo_periods An array of three booleans to define halo periodicity of the input data, in global order.
  * If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis.
  * @param[in] dim Which pencil dimension (global indexed) to perform the halo update
+ * @param[in] padding An array of three integers to define padding of the input data, in global order. The i-th entry
+ * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has
+ * no padding, a NULL pointer can be provided.
  * @param[in] stream CUDA stream to enqueue GPU operations into
  *
  * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure.
  */
 cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work,
                                       cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[],
-                                      int32_t dim, cudaStream_t stream);
+                                      int32_t dim, const int32_t padding[], cudaStream_t stream);
 
 /**
  * @brief Function to perform halo communication of Z-axis aligned pencil data
@@ -592,13 +598,16 @@ cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_
  * @param[in] halo_periods An array of three booleans to define halo periodicity of the input data, in global order.
  * If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis.
  * @param[in] dim Which pencil dimension (global indexed) to perform the halo update
+ * @param[in] padding An array of three integers to define padding of the input data, in global order. The i-th entry
+ * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has
+ * no padding, a NULL pointer can be provided.
  * @param[in] stream CUDA stream to enqueue GPU operations into
  *
  * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure.
  */
 cudecompResult_t cudecompUpdateHalosZ(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work,
                                       cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[],
-                                      int32_t dim, cudaStream_t stream);
+                                      int32_t dim, const int32_t padding[], cudaStream_t stream);
 
 #ifdef __cplusplus
 }
diff --git a/include/internal/common.h b/include/internal/common.h
index c971a38..9f899f0 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -329,6 +329,10 @@ static inline std::vector<int64_t> getSplits(int64_t N, int nchunks, int pad) {
   return splits;
 }
 
+template <typename T> static inline bool anyNonzeros(const std::array<T, 3>& arr) {
+  return (arr[0] != T(0) || arr[1] != T(0) || arr[2] != T(0));
+}
+
 // Assigns an integer ID to every unique value in a vector
 template <typename T>
 std::unordered_map<T, unsigned int> getUniqueIds(const std::vector<T>& v) {
diff --git a/include/internal/halo.h b/include/internal/halo.h
index 2a922d3..a636286 100644
--- a/include/internal/halo.h
+++ b/include/internal/halo.h
@@ -46,19 +46,25 @@ namespace cudecomp {
 
 template <typename T>
 void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* work,
-                          const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], int32_t dim,
+                          const int32_t halo_extents_ptr[], const bool halo_periods_ptr[],
+                          int32_t dim, const int32_t padding_ptr[],
                           cudaStream_t stream) {
   std::array<int32_t, 3> halo_extents{};
   if (halo_extents_ptr) std::copy(halo_extents_ptr, halo_extents_ptr + 3, halo_extents.begin());
   std::array<bool, 3> halo_periods{};
   if (halo_periods_ptr) std::copy(halo_periods_ptr, halo_periods_ptr + 3, halo_periods.begin());
+  std::array<int32_t, 3> padding{};
+  if (padding_ptr) std::copy(padding_ptr, padding_ptr + 3, padding.begin());
 
   // Get pencil info
   cudecompPencilInfo_t pinfo_h;
   CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data(), nullptr));
+  cudecompPencilInfo_t pinfo_h_p; // with padding
+  CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h_p, ax, halo_extents.data(), padding.data()));
 
   // Get global ordered shapes
   auto shape_g_h = getShapeG(pinfo_h);
+  auto shape_g_h_p = getShapeG(pinfo_h_p);
 
   // Get neighbors
   std::array<int32_t, 2> neighbors;
@@ -120,9 +126,13 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
   }
 
   bool managed = isManagedPointer(input);
+  bool input_has_padding = anyNonzeros(padding);
+
   if (c == 2 &&
-      (haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend) ||
-      (managed && haloBackendRequiresMpi(grid_desc->config.halo_comm_backend)))) {
+      (input_has_padding ||
+       haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend) ||
+       (managed && haloBackendRequiresMpi(grid_desc->config.halo_comm_backend)))) {
+    // For padded input, always stage to work space.
     // For managed memory, always stage to work space if using MPI.
     // For any memory, always stage to workspace if using NVSHMEM.
     // Can revisit for NVSHMEM if input is NVSHMEM allocated.
@@ -136,21 +146,21 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     std::array<int32_t, 3> lx{};
 
     // Left
-    lx[dim] = shape_g_h[dim] - 2 * halo_extents[dim];
-    memcpy_params.src[0] = input + getPencilPtrOffset(pinfo_h, lx);
-    memcpy_params.dest[0] = input + getPencilPtrOffset(pinfo_h, {0, 0, 0});
+    lx[dim] = shape_g_h_p[dim] - 2 * halo_extents[dim] - padding[dim];
+    memcpy_params.src[0] = input + getPencilPtrOffset(pinfo_h_p, lx);
+    memcpy_params.dest[0] = input + getPencilPtrOffset(pinfo_h_p, {0, 0, 0});
 
     // Right
     lx[dim] = halo_extents[dim];
-    memcpy_params.src[1] = input + getPencilPtrOffset(pinfo_h, lx);
-    lx[dim] = shape_g_h[dim] - halo_extents[dim];
-    memcpy_params.dest[1] = input + getPencilPtrOffset(pinfo_h, lx);
+    memcpy_params.src[1] = input + getPencilPtrOffset(pinfo_h_p, lx);
+    lx[dim] = shape_g_h_p[dim] - halo_extents[dim] - padding[dim];
+    memcpy_params.dest[1] = input + getPencilPtrOffset(pinfo_h_p, lx);
 
     for (int i = 0; i < 2; ++i) {
-      memcpy_params.src_strides[0][i] = pinfo_h.shape[0] * pinfo_h.shape[1];
-      memcpy_params.src_strides[1][i] = pinfo_h.shape[0];
-      memcpy_params.dest_strides[0][i] = pinfo_h.shape[0] * pinfo_h.shape[1];
-      memcpy_params.dest_strides[1][i] = pinfo_h.shape[0];
+      memcpy_params.src_strides[0][i] = pinfo_h_p.shape[0] * pinfo_h_p.shape[1];
+      memcpy_params.src_strides[1][i] = pinfo_h_p.shape[0];
+      memcpy_params.dest_strides[0][i] = pinfo_h_p.shape[0] * pinfo_h_p.shape[1];
+      memcpy_params.dest_strides[1][i] = pinfo_h_p.shape[0];
       memcpy_params.extents[0][i] = (dim == pinfo_h.order[2]) ? halo_extents[dim] : pinfo_h.shape[2];
       memcpy_params.extents[1][i] = (dim == pinfo_h.order[1]) ? halo_extents[dim] : pinfo_h.shape[1];
       memcpy_params.extents[2][i] = (dim == pinfo_h.order[0]) ? halo_extents[dim] : pinfo_h.shape[0];
@@ -172,17 +182,17 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     // Pack
     // Left
     lx[dim] = halo_extents[dim];
-    memcpy_params.src[0] = input + getPencilPtrOffset(pinfo_h, lx);
+    memcpy_params.src[0] = input + getPencilPtrOffset(pinfo_h_p, lx);
     memcpy_params.dest[0] = send_buff;
 
     // Right
-    lx[dim] = shape_g_h[dim] - 2 * halo_extents[dim];
-    memcpy_params.src[1] = input + getPencilPtrOffset(pinfo_h, lx);
+    lx[dim] = shape_g_h_p[dim] - 2 * halo_extents[dim] - padding[dim];
+    memcpy_params.src[1] = input + getPencilPtrOffset(pinfo_h_p, lx);
     memcpy_params.dest[1] = send_buff + halo_size;
 
     for (int i = 0; i < 2; ++i) {
-      memcpy_params.src_strides[0][i] = pinfo_h.shape[0] * pinfo_h.shape[1];
-      memcpy_params.src_strides[1][i] = pinfo_h.shape[0];
+      memcpy_params.src_strides[0][i] = pinfo_h_p.shape[0] * pinfo_h_p.shape[1];
+      memcpy_params.src_strides[1][i] = pinfo_h_p.shape[0];
       memcpy_params.dest_strides[1][i] = (dim == pinfo_h.order[0]) ? halo_extents[dim] : pinfo_h.shape[0];
       memcpy_params.dest_strides[0][i] =
           memcpy_params.dest_strides[1][i] * ((dim == pinfo_h.order[1]) ? halo_extents[dim] : pinfo_h.shape[1]);
@@ -203,16 +213,16 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     // Unpack
     // Left
     memcpy_params.src[0] = recv_buff;
-    memcpy_params.dest[0] = input + getPencilPtrOffset(pinfo_h, {0, 0, 0});
+    memcpy_params.dest[0] = input + getPencilPtrOffset(pinfo_h_p, {0, 0, 0});
 
     // Right
     memcpy_params.src[1] = recv_buff + halo_size;
-    lx[dim] = shape_g_h[dim] - halo_extents[dim];
-    memcpy_params.dest[1] = input + getPencilPtrOffset(pinfo_h, lx);
+    lx[dim] = shape_g_h_p[dim] - halo_extents[dim] - padding[dim];
+    memcpy_params.dest[1] = input + getPencilPtrOffset(pinfo_h_p, lx);
 
     for (int i = 0; i < 2; ++i) {
-      memcpy_params.dest_strides[0][i] = pinfo_h.shape[0] * pinfo_h.shape[1];
-      memcpy_params.dest_strides[1][i] = pinfo_h.shape[0];
+      memcpy_params.dest_strides[0][i] = pinfo_h_p.shape[0] * pinfo_h_p.shape[1];
+      memcpy_params.dest_strides[1][i] = pinfo_h_p.shape[0];
       memcpy_params.src_strides[1][i] = (dim == pinfo_h.order[0]) ? halo_extents[dim] : pinfo_h.shape[0];
       memcpy_params.src_strides[0][i] =
           memcpy_params.src_strides[1][i] * ((dim == pinfo_h.order[1]) ? halo_extents[dim] : pinfo_h.shape[1]);
@@ -250,9 +260,9 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     send_offsets[0] = getPencilPtrOffset(pinfo_h, lx);
     recv_offsets[0] = getPencilPtrOffset(pinfo_h, {0, 0, 0});
     // Right
-    lx[dim] = shape_g_h[dim] - 2 * halo_extents[dim];
+    lx[dim] = shape_g_h_p[dim] - 2 * halo_extents[dim];
     send_offsets[1] = getPencilPtrOffset(pinfo_h, lx);
-    lx[dim] = shape_g_h[dim] - halo_extents[dim];
+    lx[dim] = shape_g_h_p[dim] - halo_extents[dim];
     recv_offsets[1] = getPencilPtrOffset(pinfo_h, lx);
 
     cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets,
@@ -263,34 +273,34 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
 
 template <typename T>
 void cudecompUpdateHalosX(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* work,
-                          const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], int32_t dim,
-                          cudaStream_t stream) {
+                          const int32_t halo_extents_ptr[], const bool halo_periods_ptr[],
+                          int32_t dim, const int32_t padding_ptr[], cudaStream_t stream) {
   std::stringstream os;
   os << "cudecompUpdateHalosX_" << dim;
   nvtx::rangePush(os.str());
-  cudecompUpdateHalos_(0, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, stream);
+  cudecompUpdateHalos_(0, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, padding_ptr, stream);
   nvtx::rangePop();
 }
 
 template <typename T>
 void cudecompUpdateHalosY(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* work,
-                          const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], int32_t dim,
-                          cudaStream_t stream) {
+                          const int32_t halo_extents_ptr[], const bool halo_periods_ptr[],
+                          int32_t dim, const int32_t padding_ptr[], cudaStream_t stream) {
   std::stringstream os;
   os << "cudecompUpdateHalosY_" << dim;
   nvtx::rangePush(os.str());
-  cudecompUpdateHalos_(1, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, stream);
+  cudecompUpdateHalos_(1, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, padding_ptr, stream);
   nvtx::rangePop();
 }
 
 template <typename T>
 void cudecompUpdateHalosZ(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* work,
-                          const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], int32_t dim,
-                          cudaStream_t stream) {
+                          const int32_t halo_extents_ptr[], const bool halo_periods_ptr[],
+                          int32_t dim, const int32_t padding_ptr[], cudaStream_t stream) {
   std::stringstream os;
   os << "cudecompUpdateHalosZ_" << dim;
   nvtx::rangePush(os.str());
-  cudecompUpdateHalos_(2, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, stream);
+  cudecompUpdateHalos_(2, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, padding_ptr, stream);
   nvtx::rangePop();
 }
 
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index 610e8ac..8c67d4d 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -54,10 +54,6 @@ static inline bool isTransposeCommPipelined(cudecompTransposeCommBackend_t commT
           commType == CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL);
 }
 
-template <typename T> static inline bool anyNonzeros(const std::array<T, 3>& arr) {
-  return (arr[0] != T(0) || arr[1] != T(0) || arr[2] != T(0));
-}
-
 #if CUTENSOR_MAJOR >= 2
 static inline cutensorDataType_t getCutensorDataType(float) { return CUTENSOR_R_32F; }
 static inline cutensorDataType_t getCutensorDataType(double) { return CUTENSOR_R_64F; }
diff --git a/src/autotune.cc b/src/autotune.cc
index 9df20b3..d3a0ccb 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -648,15 +648,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
           switch (options->halo_axis) {
           case 0:
             CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents,
-                                                options->halo_periods, dim, 0));
+                                                options->halo_periods, dim, nullptr, 0));
             break;
           case 1:
             CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents,
-                                                options->halo_periods, dim, 0));
+                                                options->halo_periods, dim, nullptr, 0));
             break;
           case 2:
             CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents,
-                                                options->halo_periods, dim, 0));
+                                                options->halo_periods, dim, nullptr, 0));
             break;
           }
         }
@@ -672,15 +672,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
           switch (options->halo_axis) {
           case 0:
             CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents,
-                                                options->halo_periods, dim, 0));
+                                                options->halo_periods, dim, nullptr, 0));
             break;
           case 1:
             CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents,
-                                                options->halo_periods, dim, 0));
+                                                options->halo_periods, dim, nullptr, 0));
             break;
           case 2:
             CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents,
-                                                options->halo_periods, dim, 0));
+                                                options->halo_periods, dim, nullptr, 0));
             break;
           }
         }
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 8af3643..7ffb7d9 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -1324,7 +1324,7 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc
 
 cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work,
                                       cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[],
-                                      int32_t dim, cudaStream_t stream) {
+                                      int32_t dim, const int32_t padding[], cudaStream_t stream) {
   using namespace cudecomp;
   try {
     checkHandle(handle);
@@ -1343,19 +1343,19 @@ cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompUpdateHalosX(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(work),
-                           halo_extents, halo_periods, dim, stream);
+                           halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_DOUBLE:
       cudecompUpdateHalosX(handle, grid_desc, reinterpret_cast<double*>(input), reinterpret_cast<double*>(work),
-                           halo_extents, halo_periods, dim, stream);
+                           halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_FLOAT_COMPLEX:
       cudecompUpdateHalosX(handle, grid_desc, reinterpret_cast<cuda::std::complex<float>*>(input),
-                           reinterpret_cast<cuda::std::complex<float>*>(work), halo_extents, halo_periods, dim, stream);
+                           reinterpret_cast<cuda::std::complex<float>*>(work), halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_DOUBLE_COMPLEX:
       cudecompUpdateHalosX(handle, grid_desc, reinterpret_cast<cuda::std::complex<double>*>(input),
-                           reinterpret_cast<cuda::std::complex<double>*>(work), halo_extents, halo_periods, dim,
+                           reinterpret_cast<cuda::std::complex<double>*>(work), halo_extents, halo_periods, dim, padding,
                            stream);
       break;
     }
@@ -1369,7 +1369,7 @@ cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_
 
 cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work,
                                       cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[],
-                                      int32_t dim, cudaStream_t stream) {
+                                      int32_t dim, const int32_t padding[], cudaStream_t stream) {
   using namespace cudecomp;
   try {
     checkHandle(handle);
@@ -1388,19 +1388,19 @@ cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompUpdateHalosY(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(work),
-                           halo_extents, halo_periods, dim, stream);
+                           halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_DOUBLE:
       cudecompUpdateHalosY(handle, grid_desc, reinterpret_cast<double*>(input), reinterpret_cast<double*>(work),
-                           halo_extents, halo_periods, dim, stream);
+                           halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_FLOAT_COMPLEX:
       cudecompUpdateHalosY(handle, grid_desc, reinterpret_cast<cuda::std::complex<float>*>(input),
-                           reinterpret_cast<cuda::std::complex<float>*>(work), halo_extents, halo_periods, dim, stream);
+                           reinterpret_cast<cuda::std::complex<float>*>(work), halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_DOUBLE_COMPLEX:
       cudecompUpdateHalosY(handle, grid_desc, reinterpret_cast<cuda::std::complex<double>*>(input),
-                           reinterpret_cast<cuda::std::complex<double>*>(work), halo_extents, halo_periods, dim,
+                           reinterpret_cast<cuda::std::complex<double>*>(work), halo_extents, halo_periods, dim, padding,
                            stream);
       break;
     }
@@ -1414,7 +1414,7 @@ cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_
 
 cudecompResult_t cudecompUpdateHalosZ(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work,
                                       cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[],
-                                      int32_t dim, cudaStream_t stream) {
+                                      int32_t dim, const int32_t padding[], cudaStream_t stream) {
   using namespace cudecomp;
   try {
     checkHandle(handle);
@@ -1433,19 +1433,19 @@ cudecompResult_t cudecompUpdateHalosZ(cudecompHandle_t handle, cudecompGridDesc_
     switch (dtype) {
     case CUDECOMP_FLOAT:
       cudecompUpdateHalosZ(handle, grid_desc, reinterpret_cast<float*>(input), reinterpret_cast<float*>(work),
-                           halo_extents, halo_periods, dim, stream);
+                           halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_DOUBLE:
       cudecompUpdateHalosZ(handle, grid_desc, reinterpret_cast<double*>(input), reinterpret_cast<double*>(work),
-                           halo_extents, halo_periods, dim, stream);
+                           halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_FLOAT_COMPLEX:
       cudecompUpdateHalosZ(handle, grid_desc, reinterpret_cast<cuda::std::complex<float>*>(input),
-                           reinterpret_cast<cuda::std::complex<float>*>(work), halo_extents, halo_periods, dim, stream);
+                           reinterpret_cast<cuda::std::complex<float>*>(work), halo_extents, halo_periods, dim, padding, stream);
       break;
     case CUDECOMP_DOUBLE_COMPLEX:
       cudecompUpdateHalosZ(handle, grid_desc, reinterpret_cast<cuda::std::complex<double>*>(input),
-                           reinterpret_cast<cuda::std::complex<double>*>(work), halo_extents, halo_periods, dim,
+                           reinterpret_cast<cuda::std::complex<double>*>(work), halo_extents, halo_periods, dim, padding,
                            stream);
       break;
     }
diff --git a/src/cudecomp_m.cuf b/src/cudecomp_m.cuf
index d96022e..55945e6 100644
--- a/src/cudecomp_m.cuf
+++ b/src/cudecomp_m.cuf
@@ -455,7 +455,7 @@ module cudecomp
   ! Halo functions
   interface
     function cudecompUpdateHalosX_C(handle, grid_desc, input, work, dtype, &
-                                    halo_extents, halo_periods, dim, stream) &
+                                    halo_extents, halo_periods, dim, padding, stream) &
       bind(C, name="cudecompUpdateHalosX") result(res)
       import
       type(cudecompHandle), value :: handle
@@ -466,6 +466,7 @@ module cudecomp
       integer(c_int32_t) :: halo_extents(3)
       logical(c_bool) :: halo_periods(3)
       integer(c_int32_t), value :: dim
+      integer(c_int32_t) :: padding(3)
       integer(cuda_stream_kind), value :: stream
       integer(c_int) :: res
     end function cudecompUpdateHalosX_C
@@ -473,7 +474,7 @@ module cudecomp
 
   interface
     function cudecompUpdateHalosY_C(handle, grid_desc, input, work, dtype, &
-                                    halo_extents, halo_periods, dim, stream) &
+                                    halo_extents, halo_periods, dim, padding, stream) &
       bind(C, name="cudecompUpdateHalosY") result(res)
       import
       type(cudecompHandle), value :: handle
@@ -484,6 +485,7 @@ module cudecomp
       integer(c_int32_t) :: halo_extents(3)
       logical(c_bool) :: halo_periods(3)
       integer(c_int32_t), value :: dim
+      integer(c_int32_t) :: padding(3)
       integer(cuda_stream_kind), value :: stream
       integer(c_int) :: res
     end function cudecompUpdateHalosY_C
@@ -491,7 +493,7 @@ module cudecomp
 
   interface
     function cudecompUpdateHalosZ_C(handle, grid_desc, input, work, dtype, &
-                                    halo_extents, halo_periods, dim, stream) &
+                                    halo_extents, halo_periods, dim, padding, stream) &
       bind(C, name="cudecompUpdateHalosZ") result(res)
       import
       type(cudecompHandle), value :: handle
@@ -502,6 +504,7 @@ module cudecomp
       integer(c_int32_t) :: halo_extents(3)
       logical(c_bool) :: halo_periods(3)
       integer(c_int32_t), value :: dim
+      integer(c_int32_t) :: padding(3)
       integer(cuda_stream_kind), value :: stream
       integer(c_int) :: res
     end function cudecompUpdateHalosZ_C
@@ -949,7 +952,7 @@ contains
   ! Halo functions
   function cudecompUpdateHalosX(handle, grid_desc, &
        input, work, dtype, halo_extents, halo_periods, &
-       dim, stream) result(res)
+       dim, padding, stream) result(res)
     implicit none
     type(cudecompHandle) :: handle
     type(cudecompGridDesc) :: grid_desc
@@ -959,24 +962,28 @@ contains
     integer :: halo_extents(3)
     logical :: halo_periods(3)
     integer :: dim
+    integer, optional :: padding(3)
     integer(cuda_stream_kind), optional :: stream
     integer(c_int) :: res
 
     integer(cuda_stream_kind) :: stream_
     logical(c_bool) :: halo_periods_c(3)
+    integer :: padding_(3)
 
     halo_periods_c(:) = halo_periods
 
     stream_ = 0
+    padding_ = [0, 0, 0]
     if (present(stream)) stream_ = stream
+    if (present(padding)) padding_ = padding
     res = cudecompUpdateHalosX_C(handle, grid_desc, &
           input, work, dtype, halo_extents, halo_periods_c, &
-          dim - 1, stream_)
+          dim - 1, padding_, stream_)
   end function cudecompUpdateHalosX
 
   function cudecompUpdateHalosY(handle, grid_desc, &
        input, work, dtype, halo_extents, halo_periods, &
-       dim, stream) result(res)
+       dim, padding, stream) result(res)
     implicit none
     type(cudecompHandle) :: handle
     type(cudecompGridDesc) :: grid_desc
@@ -986,24 +993,28 @@ contains
     integer :: halo_extents(3)
     logical :: halo_periods(3)
     integer :: dim
+    integer, optional :: padding(3)
     integer(cuda_stream_kind), optional :: stream
     integer(c_int) :: res
 
     integer(cuda_stream_kind) :: stream_
     logical(c_bool) :: halo_periods_c(3)
+    integer :: padding_(3)
 
     halo_periods_c(:) = halo_periods
 
     stream_ = 0
+    padding_ = [0, 0, 0]
     if (present(stream)) stream_ = stream
+    if (present(padding)) padding_ = padding
     res = cudecompUpdateHalosY_C(handle, grid_desc, &
           input, work, dtype, halo_extents, halo_periods_c, &
-          dim - 1, stream_)
+          dim - 1, padding_, stream_)
   end function cudecompUpdateHalosY
 
   function cudecompUpdateHalosZ(handle, grid_desc, &
        input, work, dtype, halo_extents, halo_periods, &
-       dim, stream) result(res)
+       dim, padding, stream) result(res)
     implicit none
     type(cudecompHandle) :: handle
     type(cudecompGridDesc) :: grid_desc
@@ -1013,19 +1024,23 @@ contains
     integer :: halo_extents(3)
     logical :: halo_periods(3)
     integer :: dim
+    integer, optional :: padding(3)
     integer(cuda_stream_kind), optional :: stream
     integer(c_int) :: res
 
     integer(cuda_stream_kind) :: stream_
     logical(c_bool) :: halo_periods_c(3)
+    integer :: padding_(3)
 
     halo_periods_c(:) = halo_periods
 
     stream_ = 0
+    padding_ = [0, 0, 0]
     if (present(stream)) stream_ = stream
+    if (present(padding)) padding_ = padding
     res = cudecompUpdateHalosZ_C(handle, grid_desc, &
           input, work, dtype, halo_extents, halo_periods_c, &
-          dim - 1, stream_)
+          dim - 1, padding, stream_)
   end function cudecompUpdateHalosZ
 
   ! Helper function to copy string
diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc
index 86e175c..2a231be 100644
--- a/tests/cc/halo_test.cc
+++ b/tests/cc/halo_test.cc
@@ -115,9 +115,7 @@ static std::vector<std::string> read_testfile(const std::string& filename) {
 static bool compare_pencils(const std::vector<real_t>& ref, const std::vector<real_t>& res,
                             const cudecompPencilInfo_t& pinfo) {
   for (int64_t i = 0; i < ref.size(); ++i) {
-    if (ref[i] != real_t(-1)) {
-      if (ref[i] != res[i]) return false;
-    }
+    if (ref[i] != res[i]) return false;
   }
   return true;
 }
@@ -138,9 +136,9 @@ static void initialize_pencil(std::vector<real_t>& ref, const cudecompPencilInfo
     int64_t gi = gx[0] + gdims[0] * (gx[1] + gx[2] * gdims[1]);
 
     // Only set values inside internal region
-    if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]]) &&
-        lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]]) &&
-        lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]])) {
+    if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]]) &&
+        lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]]) &&
+        lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]])) {
       ref[i] = gi;
     } else {
       ref[i] = -1;
@@ -176,6 +174,12 @@ static void initialize_reference(std::vector<real_t>& ref, const cudecompPencilI
         }
       }
     }
+    // Also mark any padded elements for unset value (-1)
+    if (lx[0] >= pinfo.shape[0] - pinfo.padding[pinfo.order[0]] ||
+        lx[1] >= pinfo.shape[1] - pinfo.padding[pinfo.order[1]] ||
+        lx[2] >= pinfo.shape[2] - pinfo.padding[pinfo.order[2]]) {
+      unset = true;
+    }
 
     int64_t gi = (unset) ? -1 : gx[0] + gdims[0] * (gx[1] + gx[2] * gdims[1]);
     ref[i] = gi;
@@ -247,6 +251,7 @@ struct haloTestArgs {
   std::array<int, 3> gdims_dist{};
   std::array<int, 3> halo_extents{1, 1, 1};
   std::array<bool, 3> halo_periods{true, true, true};
+  std::array<int, 3> padding{};
   int axis = 0;
   bool use_managed_memory = false;
   std::array<int, 9> mem_order{-1, -1, -1, -1, -1, -1, -1, -1, -1};
@@ -271,12 +276,13 @@ static haloTestArgs parse_arguments(const std::string& arguments) {
         {"hex", required_argument, 0, '7'}, {"hey", required_argument, 0, '8'},
         {"hez", required_argument, 0, '9'}, {"hpx", required_argument, 0, 'e'},
         {"hpy", required_argument, 0, 'f'}, {"hpz", required_argument, 0, 'g'},
-        {"ax", required_argument, 0, 'a'},  {"use-managed-memory", no_argument, 0, 'm'},
-        {"mem_order", required_argument, 0, 'q'},
+        {"pdx", required_argument, 0, '&'}, {"pdy", required_argument, 0, '*'},
+        {"pdz", required_argument, 0, '*'}, {"ax", required_argument, 0, 'a'},
+        {"use-managed-memory", no_argument, 0, 'm'}, {"mem_order", required_argument, 0, 'q'},
         {"help", no_argument, 0, 'h'},      {0, 0, 0, 0}};
 
     int option_index = 0;
-    int ch = getopt_long(argc, argv, "x:y:z:b:r:c:1:2:3:4:7:8:9:e:f:g:a:q:mh", long_options, &option_index);
+    int ch = getopt_long(argc, argv, "x:y:z:b:r:c:1:2:3:4:7:8:9:e:f:g:a:q:&:*:(:mh", long_options, &option_index);
     if (ch == -1) break;
 
     switch (ch) {
@@ -303,6 +309,9 @@ static haloTestArgs parse_arguments(const std::string& arguments) {
     case 'e': args.halo_periods[0] = atoi(optarg); break;
     case 'f': args.halo_periods[1] = atoi(optarg); break;
     case 'g': args.halo_periods[2] = atoi(optarg); break;
+    case '&': args.padding[0] = atoi(optarg); break;
+    case '*': args.padding[1] = atoi(optarg); break;
+    case '(': args.padding[2] = atoi(optarg); break;
     case 'a': args.axis = atoi(optarg); break;
     case 'm': args.use_managed_memory = true; break;
     case 'q':
@@ -401,7 +410,7 @@ static int run_test(const std::string& arguments, bool silent) {
 
     // Get pencil information
     cudecompPencilInfo_t pinfo;
-    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo, args.axis, args.halo_extents.data(), nullptr));
+    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo, args.axis, args.halo_extents.data(), args.padding.data()));
 
     // Get workspace size
     int64_t workspace_num_elements;
@@ -441,15 +450,15 @@ static int run_test(const std::string& arguments, bool silent) {
       switch (args.axis) {
       case 0:
         CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, input, work_d, get_cudecomp_datatype(real_t(0)),
-                                                 pinfo.halo_extents, args.halo_periods.data(), i, 0));
+                                                 pinfo.halo_extents, args.halo_periods.data(), i, pinfo.padding, 0));
         break;
       case 1:
         CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, input, work_d, get_cudecomp_datatype(real_t(0)),
-                                                 pinfo.halo_extents, args.halo_periods.data(), i, 0));
+                                                 pinfo.halo_extents, args.halo_periods.data(), i, pinfo.padding, 0));
         break;
       case 2:
         CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, input, work_d, get_cudecomp_datatype(real_t(0)),
-                                                 pinfo.halo_extents, args.halo_periods.data(), i, 0));
+                                                 pinfo.halo_extents, args.halo_periods.data(), i, pinfo.padding, 0));
         break;
       }
     }
diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90
index 3f001f4..041398b 100644
--- a/tests/fortran/halo_test.f90
+++ b/tests/fortran/halo_test.f90
@@ -70,12 +70,8 @@ function compare_pencils(ref, res, pinfo) result(mismatch)
     use cudecomp
     implicit none
     type(cudecompPencilInfo) :: pinfo
-    ARRTYPE :: ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), &
-                   pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), &
-                   pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)))
-    ARRTYPE :: res(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), &
-                   pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), &
-                   pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)))
+    ARRTYPE :: ref(pinfo%shape(1), pinfo%shape(2), pinfo%shape(3))
+    ARRTYPE :: res(pinfo%shape(1), pinfo%shape(2), pinfo%shape(3))
 
     logical :: mismatch
     mismatch = any(ref /= res)
@@ -91,10 +87,10 @@ subroutine initialize_pencil(ref, pinfo, gdims)
     integer :: gdims(3)
     integer :: gx(3)
 
-    ! Allocate reference pencil with halo regions
-    allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), &
-                 pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), &
-                 pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3))))
+    ! Allocate reference pencil with halo and padding regions
+    allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), &
+                 pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), &
+                 pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3))))
 
     ref = -1
 
@@ -127,10 +123,10 @@ subroutine initialize_reference(ref, pinfo, gdims, halo_periods)
     logical :: halo_periods(3)
     logical :: unset
 
-    ! Allocate reference pencil with halo regions
-    allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), &
-                 pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), &
-                 pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3))))
+    ! Allocate reference pencil with halo and padding regions
+    allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), &
+                 pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), &
+                 pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3))))
 
     ref = -1
 
@@ -229,6 +225,7 @@ function run_test(arguments, silent) result(res)
     integer :: gdims_dist(3)
     integer :: halo_extents(3)
     logical :: halo_periods(3)
+    integer :: padding(3)
     integer :: mem_order(3, 3)
     logical :: use_managed_memory
     integer :: pr, pc
@@ -282,6 +279,7 @@ function run_test(arguments, silent) result(res)
     gdims_dist(:) = 0
     halo_extents(:) = 1
     halo_periods(:) = .true.
+    padding(:) = 0
     mem_order(:, :) = -1
     axis = 1
     use_managed_memory = .false.
@@ -369,6 +367,18 @@ function run_test(arguments, silent) result(res)
           read(arg, *) iarg
           halo_periods(3) = iarg
           skip_count = 1
+        case('--pdx')
+          read(args(i+1), *) arg
+          read(arg, *) padding(1)
+          skip_count = 1
+        case('--pdy')
+          read(args(i+1), *) arg
+          read(arg, *) padding(2)
+          skip_count = 1
+        case('--pdz')
+          read(args(i+1), *) arg
+          read(arg, *) padding(3)
+          skip_count = 1
         case('--ax')
           read(args(i+1), *) arg
           read(arg, *) axis
@@ -437,7 +447,7 @@ function run_test(arguments, silent) result(res)
     endif
 
     ! Get pencil information
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo, axis, halo_extents))
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo, axis, halo_extents, padding))
 
     ! Get workspace size
     CHECK_CUDECOMP(cudecompGetHaloWorkspaceSize(handle, grid_desc, axis, halo_extents, workspace_num_elements))
@@ -477,11 +487,11 @@ function run_test(arguments, silent) result(res)
     do i = 1, 3
       select case(axis)
         case(1)
-          CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i))
+          CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i, padding))
         case(2)
-          CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i))
+          CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i, padding))
         case(3)
-          CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i))
+          CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i, padding))
       end select
     end do
 

From 1d3b65522c3dee40b1b74900304e5bd56554878c Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 10 Mar 2025 11:54:59 -0700
Subject: [PATCH 3/8] Fixes after rebase.

---
 tests/cc/transpose_test.cc       |  4 ++--
 tests/fortran/transpose_test.f90 | 23 +++++++++++++----------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc
index 88fe76a..0e1ddc8 100644
--- a/tests/cc/transpose_test.cc
+++ b/tests/cc/transpose_test.cc
@@ -426,11 +426,11 @@ static int run_test(const std::string& arguments, bool silent) {
 
     // Get y-pencil information
     cudecompPencilInfo_t pinfo_y;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, args.halo_extents_y.data(), args.padding_x.data()));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, args.halo_extents_y.data(), args.padding_y.data()));
 
     // Get z-pencil information
     cudecompPencilInfo_t pinfo_z;
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, args.halo_extents_z.data(), args.padding_x.data()));
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, args.halo_extents_z.data(), args.padding_z.data()));
 
     // Get workspace size
     int64_t workspace_num_elements;
diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90
index 9b18d15..908042d 100644
--- a/tests/fortran/transpose_test.f90
+++ b/tests/fortran/transpose_test.f90
@@ -233,6 +233,9 @@ function run_test(arguments, silent) result(res)
     halo_extents_x(:) = 0
     halo_extents_y(:) = 0
     halo_extents_z(:) = 0
+    padding_x(:) = 0
+    padding_y(:) = 0
+    padding_z(:) = 0
     mem_order(:,:) = -1
     out_of_place = .false.
     use_managed_memory = .false.
@@ -310,19 +313,19 @@ function run_test(arguments, silent) result(res)
           skip_count = 3
         case('--pdx')
           do j = 1, 3
-            call get_command_argument(i+j, arg)
+            read(args(i+j), *) arg
             read(arg, *) padding_x(j)
           enddo
           skip_count = 3
         case('--pdy')
           do j = 1, 3
-            call get_command_argument(i+j, arg)
+            read(args(i+j), *) arg
             read(arg, *) padding_y(j)
           enddo
           skip_count = 3
         case('--pdz')
           do j = 1, 3
-            call get_command_argument(i+j, arg)
+            read(args(i+j), *) arg
             read(arg, *) padding_z(j)
           enddo
           skip_count = 3
@@ -389,13 +392,13 @@ function run_test(arguments, silent) result(res)
     endif
 
     ! Get x-pencil information
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_x, 1, halo_extents_x))
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_x, 1, halo_extents_x, padding_x))
 
     ! Get y-pencil information
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_y, 2, halo_extents_y))
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_y, 2, halo_extents_y, padding_y))
 
     ! Get z-pencil information
-    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_z, 3, halo_extents_z))
+    CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_z, 3, halo_extents_z, padding_z))
 
     ! Get workspace size
     CHECK_CUDECOMP(cudecompGetTransposeWorkspaceSize(handle, grid_desc, workspace_num_elements))
@@ -444,7 +447,7 @@ function run_test(arguments, silent) result(res)
     endif
 
     work_d = 0
-    CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, input, output, work_d, dtype, pinfo_x%halo_extents, pinfo_y%halo_extents))
+    CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, input, output, work_d, dtype, pinfo_x%halo_extents, pinfo_y%halo_extents, pinfo_x%padding, pinfo_y%padding))
     data = output
     if (compare_pencils(yref, data, pinfo_y)) then
       print*, "FAILED cudecompTranposeXToY"
@@ -463,7 +466,7 @@ function run_test(arguments, silent) result(res)
     endif
 
     work_d = 0
-    CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, input, output, work_d, dtype, pinfo_y%halo_extents, pinfo_z%halo_extents))
+    CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, input, output, work_d, dtype, pinfo_y%halo_extents, pinfo_z%halo_extents, pinfo_y%padding, pinfo_z%padding))
     data = output
     if (compare_pencils(zref, data, pinfo_z)) then
       print*, "FAILED cudecompTranposeYToZ"
@@ -482,7 +485,7 @@ function run_test(arguments, silent) result(res)
     endif
 
     work_d = 0
-    CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, input, output, work_d, dtype, pinfo_z%halo_extents, pinfo_y%halo_extents))
+    CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, input, output, work_d, dtype, pinfo_z%halo_extents, pinfo_y%halo_extents, pinfo_z%padding, pinfo_y%padding))
     data = output
     if (compare_pencils(yref, data, pinfo_y)) then
       print*, "FAILED cudecompTranposeZToY"
@@ -501,7 +504,7 @@ function run_test(arguments, silent) result(res)
     endif
 
     work_d = 0
-    CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, input, output, work_d, dtype, pinfo_y%halo_extents, pinfo_x%halo_extents))
+    CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, input, output, work_d, dtype, pinfo_y%halo_extents, pinfo_x%halo_extents, pinfo_y%padding, pinfo_x%padding))
     data = output
     if (compare_pencils(xref, data, pinfo_x)) then
       print*, "FAILED cudecompTranposeYToX"

From 0261a29656f8881ecc6e30276932aa3a811321c5 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 10 Mar 2025 15:39:34 -0700
Subject: [PATCH 4/8] Adding padding test cases with some additional test
 reorganization/optimization.

---
 tests/test_config.yaml | 187 +++++++++++++++++++++++++++++++++++++++--
 tests/test_runner.py   |  48 +++++++++--
 2 files changed, 225 insertions(+), 10 deletions(-)

diff --git a/tests/test_config.yaml b/tests/test_config.yaml
index abef7f2..293e86f 100644
--- a/tests/test_config.yaml
+++ b/tests/test_config.yaml
@@ -4,7 +4,8 @@ base: &base
     NVSHMEM_DISABLE_NCCL: 1
 
   dtypes: ['R32', 'R64', 'C32', 'C64']
-
+  apply_skips: None
+  use_single_pdim: False
 
 transpose_test_base: &transpose_test_base
   <<: *base
@@ -13,7 +14,8 @@ transpose_test_base: &transpose_test_base
   args : ['backend',
           'gx', 'gy', 'gz',
           'gd',
-          'hex' ,'hey', 'hez']
+          'hex' ,'hey', 'hez',
+          'pdx', 'pdy', 'pdz']
 
   backend: [1, 2, 3, 4, 5, 6, 7]
 
@@ -27,6 +29,10 @@ transpose_test_base: &transpose_test_base
   hey: ["0 0 0"]
   hez: ["0 0 0"]
 
+  pdx: ["0 0 0"]
+  pdy: ["0 0 0"]
+  pdz: ["0 0 0"]
+
   out_of_place: [True, False]
   managed_memory : [True, False]
   run_autotuning : False
@@ -36,11 +42,51 @@ transpose_test_base: &transpose_test_base
 transpose_test: &transpose_test
   <<: *transpose_test_base
 
+transpose_test_halo: &transpose_test_halo
+  <<: *transpose_test_base
+
+  backend: [1, 2] # Limit testing to one non-pipelined and one pipelined backend
+  dtypes: ['R32'] # Limit testing to one dtype
+
+  hex: ["0 0 0", "1 1 1"]
+  hey: ["0 0 0", "1 1 1"]
+  hez: ["0 0 0", "1 1 1"]
+  apply_skips: 'transpose'
+
+transpose_test_padding: &transpose_test_padding
+  <<: *transpose_test_base
+
+  backend: [1, 2] # Limit testing to one non-pipelined and one pipelined backend
+  dtypes: ['R32'] # Limit testing to one dtype
+
+  pdx: ["0 0 0", "1 1 1"]
+  pdy: ["0 0 0", "1 1 1"]
+  pdz: ["0 0 0", "1 1 1"]
+  apply_skips: 'transpose'
+
+transpose_test_gdimdist: &transpose_test_gdimdist
+  <<: *transpose_test_base
+  backend: [1] # Limit testing to one backend
+  dtypes: ['R32'] # Limit testing to one dtype
+
+  gd: ["16 16 16"]
+  apply_skips: 'transpose'
+
+transpose_test_mix: &transpose_test_mix
+  <<: *transpose_test_base
+
+  backend: [1, 2] # Limit testing to one non-pipelined and one pipelined backend
+  dtypes: ['R32'] # Limit testing to one dtype
+
   hex: ["0 0 0", "1 1 1"]
   hey: ["0 0 0", "1 1 1"]
   hez: ["0 0 0", "1 1 1"]
 
-  gd: ["0 0 0", "16 16 16"]
+  pdx: ["0 0 0", "1 1 1"]
+  pdy: ["0 0 0", "1 1 1"]
+  pdz: ["0 0 0", "1 1 1"]
+
+  apply_skips: 'transpose_mix'
 
 transpose_test_ac: &transpose_test_ac
   <<: *transpose_test_base
@@ -48,7 +94,8 @@ transpose_test_ac: &transpose_test_ac
           'acx', 'acy', 'acz',
           'gx', 'gy', 'gz',
           'gd',
-          'hex' ,'hey', 'hez']
+          'hex' ,'hey', 'hez',
+          'pdx', 'pdy', 'pdz']
 
   backend: [1] # Limit this testing to one backend
   dtypes: ['R32'] # Limit to one data type
@@ -61,6 +108,22 @@ transpose_test_cc:
   <<: *transpose_test
   executable_prefix: 'cc/transpose_test'
 
+transpose_test_halo_cc:
+  <<: *transpose_test_halo
+  executable_prefix: 'cc/transpose_test'
+
+transpose_test_padding_cc:
+  <<: *transpose_test_padding
+  executable_prefix: 'cc/transpose_test'
+
+transpose_test_gdimdist_cc:
+  <<: *transpose_test_gdimdist
+  executable_prefix: 'cc/transpose_test'
+
+transpose_test_mix_cc:
+  <<: *transpose_test_mix
+  executable_prefix: 'cc/transpose_test'
+
 transpose_test_ac_cc:
   <<: *transpose_test_ac
   executable_prefix: 'cc/transpose_test'
@@ -70,6 +133,26 @@ transpose_test_fortran:
   executable_prefix: 'fortran/transpose_test'
   fortran_indexing: true
 
+transpose_test_halo_fortran:
+  <<: *transpose_test_halo
+  executable_prefix: 'fortran/transpose_test'
+  fortran_indexing: true
+
+transpose_test_padding_fortran:
+  <<: *transpose_test_padding
+  executable_prefix: 'fortran/transpose_test'
+  fortran_indexing: true
+
+transpose_test_gdimdist_fortran:
+  <<: *transpose_test_gdimdist
+  executable_prefix: 'fortran/transpose_test'
+  fortran_indexing: true
+
+transpose_test_mix_fortran:
+  <<: *transpose_test_mix
+  executable_prefix: 'fortran/transpose_test'
+  fortran_indexing: true
+
 transpose_test_ac_fortran:
   <<: *transpose_test_ac
   executable_prefix: 'fortran/transpose_test'
@@ -83,6 +166,7 @@ halo_test_base: &halo_test_base
           'gd',
           'hex' ,'hey', 'hez',
           'hpx', 'hpy', 'hpz',
+          'pdx', 'pdy', 'pdz',
           'ax']
 
   backend: [1, 2, 3, 4, 5]
@@ -101,6 +185,10 @@ halo_test_base: &halo_test_base
   hpy: [1]
   hpz: [1]
 
+  pdx: [0]
+  pdy: [0]
+  pdz: [0]
+
   ax: [0, 1, 2]
 
   managed_memory : [True, False]
@@ -112,6 +200,13 @@ halo_test_base: &halo_test_base
 
 halo_test: &halo_test
   <<: *halo_test_base
+
+halo_test_halomix: &halo_test_halomix
+  <<: *halo_test_base
+
+  backend: [1] # Limit testing to one backend
+  dtypes: ['R32'] # Limit testing to one dtype
+
   hex: [0, 1]
   hey: [0, 1]
   hez: [0, 1]
@@ -119,8 +214,49 @@ halo_test: &halo_test
   hpx: [0, 1]
   hpy: [0, 1]
   hpz: [0, 1]
+  apply_skips: 'halo'
 
-  gd: ["0 0 0", "16 16 16"]
+halo_test_padding: &halo_test_padding
+  <<: *halo_test_base
+
+  backend: [1] # Limit testing to one backend
+  dtypes: ['R32'] # Limit testing to one dtype
+
+  pdx: [0, 1]
+  pdy: [0, 1]
+  pdz: [0, 1]
+
+  apply_skips: 'halo_padding'
+
+halo_test_gdimdist: &halo_test_gdimdist
+  <<: *halo_test_base
+
+  backend: [1] # Limit testing to one backend
+  dtypes: ['R32'] # Limit testing to one dtype
+
+  gd: ["16 16 16"]
+
+  apply_skips: 'halo'
+
+halo_test_mix: &halo_test_mix
+  <<: *halo_test_base
+
+  backend: [1] # Limit testing to one backend
+  dtypes: ['R32'] # Limit testing to one dtype
+
+  pdx: [0, 1]
+  pdy: [0, 1]
+  pdz: [0, 1]
+
+  hex: [0, 1]
+  hey: [0, 1]
+  hez: [0, 1]
+
+  hpx: [0, 1]
+  hpy: [0, 1]
+  hpz: [0, 1]
+
+  apply_skips: 'halo_padding'
 
 halo_test_ac: &halo_test_ac
   <<: *halo_test_base
@@ -130,6 +266,7 @@ halo_test_ac: &halo_test_ac
           'gd',
           'hex' ,'hey', 'hez',
           'hpx', 'hpy', 'hpz',
+          'pdx', 'pdy', 'pdz',
           'ax']
 
   backend: [1] # Limit this testing to one backend
@@ -143,6 +280,22 @@ halo_test_cc:
   <<: *halo_test
   executable_prefix: 'cc/halo_test'
 
+halo_test_halomix_cc:
+  <<: *halo_test_halomix
+  executable_prefix: 'cc/halo_test'
+
+halo_test_padding_cc:
+  <<: *halo_test_padding
+  executable_prefix: 'cc/halo_test'
+
+halo_test_gdimdist_cc:
+  <<: *halo_test_gdimdist
+  executable_prefix: 'cc/halo_test'
+
+halo_test_mix_cc:
+  <<: *halo_test_mix
+  executable_prefix: 'cc/halo_test'
+
 halo_test_ac_cc:
   <<: *halo_test_ac
   executable_prefix: 'cc/halo_test'
@@ -153,6 +306,30 @@ halo_test_fortran:
   executable_prefix: 'fortran/halo_test'
   fortran_indexing: true
 
+halo_test_halomix_fortran:
+  <<: *halo_test_halomix
+  ax: [1, 2, 3]
+  executable_prefix: 'fortran/halo_test'
+  fortran_indexing: true
+
+halo_test_padding_fortran:
+  <<: *halo_test_padding
+  ax: [1, 2, 3]
+  executable_prefix: 'fortran/halo_test'
+  fortran_indexing: true
+
+halo_test_gdimdist_fortran:
+  <<: *halo_test_gdimdist
+  ax: [1, 2, 3]
+  executable_prefix: 'fortran/halo_test'
+  fortran_indexing: true
+
+halo_test_mix_fortran:
+  <<: *halo_test_mix
+  ax: [1, 2, 3]
+  executable_prefix: 'fortran/halo_test'
+  fortran_indexing: true
+
 halo_test_ac_fortran:
   <<: *halo_test_ac
   ax: [1, 2, 3]
diff --git a/tests/test_runner.py b/tests/test_runner.py
index 44b2b03..e988709 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -25,10 +25,36 @@ def load_yaml_config(yaml_file, config_name):
     config = yaml.safe_load(f)[config_name]
   return config
 
-def should_skip_case(arg_dict):
+def should_skip_case(arg_dict, key):
   skip = False
 
-  try:
+  if key == 'transpose':
+    # Skip cases with all halo extents, padding, and gdimdist as zero
+    if ((arg_dict["hex"] == "0 0 0" and arg_dict["hey"] == "0 0 0" and arg_dict["hez"] == "0 0 0") and
+        (arg_dict["pdx"] == "0 0 0" and arg_dict["pdy"] == "0 0 0" and arg_dict["pdz"] == "0 0 0") and
+        (arg_dict["gd"] == "0 0 0")):
+      skip = True
+
+    # Skip cases where halo extents in X and Z are unequal as these cases are redundant
+    if (arg_dict["hex"] != arg_dict["hez"]):
+      skip = True
+
+    # Skip cases where padding in X and Z are unequal as these cases are redundant
+    if (arg_dict["pdx"] != arg_dict["pdz"]):
+      skip = True
+
+  elif key == 'transpose_mix':
+    skip = should_skip_case(arg_dict, 'transpose')
+
+    # Skip cases where all halo extents are zero
+    if (arg_dict["hex"] == "0 0 0" and arg_dict["hey"] == "0 0 0" and arg_dict["hez"] == "0 0 0"):
+      skip = True
+
+    # Skip cases where all padding is zero
+    if (arg_dict["pdx"] == "0 0 0" and arg_dict["pdy"] == "0 0 0" and arg_dict["pdz"] == "0 0 0"):
+      skip = True
+
+  elif key == 'halo':
     # No need to test periodic flags if halo extent is zero
     if arg_dict["hex"] == 0 and arg_dict["hpx"] == 1:
       skip = True
@@ -40,8 +66,14 @@ def should_skip_case(arg_dict):
     # Skip cases with all halo extents as zero
     if arg_dict["hex"] == 0 and arg_dict["hey"] == 0 and arg_dict["hez"] == 0:
       skip = True
-  except:
-    pass
+
+  elif key == 'halo_padding':
+    skip = should_skip_case(arg_dict, 'halo')
+
+    # Skip cases where all padding is zero
+    if (arg_dict["pdx"] == 0 and arg_dict["pdy"] == 0 and arg_dict["pdz"] == 0):
+      skip = True
+
 
   return skip
 
@@ -61,6 +93,11 @@ def generate_command_lines(config, args):
 
   cmds = []
   prs = get_factors(args.ngpu)
+  if len(prs) > 3:
+    prs = [prs[0], prs[len(pr) // 2], prs[-1]]
+  if config['use_single_pdim']:
+    prs = [prs[min(len(prs) + 1, 1)]]
+
   pcs = [args.ngpu // x for x in prs]
   if config['run_autotuning']:
     prs = [0] + prs
@@ -81,7 +118,8 @@ def generate_command_lines(config, args):
           continue
 
       # Check additional skip conditions
-      if should_skip_case(arg_dict): continue
+      if (config['apply_skips']):
+        if should_skip_case(arg_dict, config['apply_skips']): continue
 
       extra_flags = []
       extra_flags.append(['-m' if x else '' for x in config['managed_memory']])

From 534f2cf84b6a9e7030ca71aa3c5fa3bff5038d62 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 10 Mar 2025 15:54:56 -0700
Subject: [PATCH 5/8] Fix in test_runner.py

---
 tests/test_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_runner.py b/tests/test_runner.py
index e988709..e641b92 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -94,7 +94,7 @@ def generate_command_lines(config, args):
   cmds = []
   prs = get_factors(args.ngpu)
   if len(prs) > 3:
-    prs = [prs[0], prs[len(pr) // 2], prs[-1]]
+    prs = [prs[0], prs[len(prs) // 2], prs[-1]]
   if config['use_single_pdim']:
     prs = [prs[min(len(prs) + 1, 1)]]
 

From 9ad04d066f2a04040b0765e322834497e4070de9 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Tue, 11 Mar 2025 14:00:21 -0700
Subject: [PATCH 6/8] Use workspace caching in test implementation to improve
 efficiency.

---
 tests/cc/halo_test.cc            | 30 +++++++++++++++++++++++++++---
 tests/cc/transpose_test.cc       | 31 ++++++++++++++++++++++++++++---
 tests/fortran/halo_test.f90      | 21 +++++++++++++++++----
 tests/fortran/transpose_test.f90 | 21 ++++++++++++++++++---
 4 files changed, 90 insertions(+), 13 deletions(-)

diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc
index 2a231be..1ece724 100644
--- a/tests/cc/halo_test.cc
+++ b/tests/cc/halo_test.cc
@@ -340,6 +340,7 @@ static haloTestArgs parse_arguments(const std::string& arguments) {
 int rank, nranks;
 cudecompHandle_t handle;
 std::unordered_map<cudecompHaloCommBackend_t, cudecompGridDesc_t> grid_desc_cache;
+std::tuple<int, real_t*, size_t> workspace{-1, nullptr, 0};
 
 // Cache a single grid descriptor per backend type. This keeps NCCL/NVSHMEM initialized between tests for
 // better throughput.
@@ -437,8 +438,32 @@ static int run_test(const std::string& arguments, bool silent) {
     }
     int64_t dtype_size;
     CHECK_CUDECOMP(cudecompGetDataTypeSize(get_cudecomp_datatype(real_t(0)), &dtype_size));
-    CHECK_CUDECOMP(
-        cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_d), workspace_num_elements * dtype_size));
+
+    // Allocate workspace (reuse exising workspace if able)
+    if (std::get<0>(workspace) == static_cast<int>(config.halo_comm_backend)){
+      work_d = std::get<1>(workspace);
+      if (std::get<2>(workspace) < workspace_num_elements * dtype_size) {
+        CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d));
+        CHECK_CUDECOMP(
+            cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_d), workspace_num_elements * dtype_size));
+        std::get<1>(workspace) = work_d;
+        std::get<2>(workspace) = workspace_num_elements * dtype_size;
+      }
+    } else if (std::get<0>(workspace) > 0 && std::get<0>(workspace) != static_cast<int>(config.halo_comm_backend)) {
+      CHECK_CUDECOMP(cudecompFree(handle, grid_desc_cache[static_cast<cudecompHaloCommBackend_t>(std::get<0>(workspace))],
+                                  std::get<1>(workspace)));
+      CHECK_CUDECOMP(
+          cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_d), workspace_num_elements * dtype_size));
+      std::get<0>(workspace) = static_cast<int>(config.halo_comm_backend);
+      std::get<1>(workspace) = work_d;
+      std::get<2>(workspace) = workspace_num_elements * dtype_size;
+    } else {
+      CHECK_CUDECOMP(
+          cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_d), workspace_num_elements * dtype_size));
+      std::get<0>(workspace) = static_cast<int>(config.halo_comm_backend);
+      std::get<1>(workspace) = work_d;
+      std::get<2>(workspace) = workspace_num_elements * dtype_size;
+    }
 
     // Running correctness tests
     if (!silent && rank == 0) printf("running correctness tests...\n");
@@ -470,7 +495,6 @@ static int run_test(const std::string& arguments, bool silent) {
     }
 
     CHECK_CUDA(cudaFree(data_d));
-    CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d));
   } catch (const std::exception& e) {
     return 1;
   }
diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc
index 0e1ddc8..6f3c19b 100644
--- a/tests/cc/transpose_test.cc
+++ b/tests/cc/transpose_test.cc
@@ -36,6 +36,7 @@
 #include <numeric>
 #include <sstream>
 #include <string>
+#include <tuple>
 #include <unordered_map>
 #include <vector>
 
@@ -358,6 +359,7 @@ static transposeTestArgs parse_arguments(const std::string& arguments) {
 int rank, nranks;
 cudecompHandle_t handle;
 std::unordered_map<cudecompTransposeCommBackend_t, cudecompGridDesc_t> grid_desc_cache;
+std::tuple<int, real_t*, size_t> workspace{-1, nullptr, 0};
 
 // Cache a single grid descriptor per backend type. This keeps NCCL/NVSHMEM initialized between tests for
 // better throughput.
@@ -458,8 +460,32 @@ static int run_test(const std::string& arguments, bool silent) {
     }
     int64_t dtype_size;
     CHECK_CUDECOMP(cudecompGetDataTypeSize(get_cudecomp_datatype(real_t(0)), &dtype_size));
-    CHECK_CUDECOMP(
-        cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_d), workspace_num_elements * dtype_size));
+
+    // Allocate workspace (reuse exising workspace if able)
+    if (std::get<0>(workspace) == static_cast<int>(config.transpose_comm_backend)){
+      work_d = std::get<1>(workspace);
+      if (std::get<2>(workspace) < workspace_num_elements * dtype_size) {
+        CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d));
+        CHECK_CUDECOMP(
+            cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_d), workspace_num_elements * dtype_size));
+        std::get<1>(workspace) = work_d;
+        std::get<2>(workspace) = workspace_num_elements * dtype_size;
+      }
+    } else if (std::get<0>(workspace) > 0 && std::get<0>(workspace) != static_cast<int>(config.transpose_comm_backend)) {
+      CHECK_CUDECOMP(cudecompFree(handle, grid_desc_cache[static_cast<cudecompTransposeCommBackend_t>(std::get<0>(workspace))],
+                                  std::get<1>(workspace)));
+      CHECK_CUDECOMP(
+          cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_d), workspace_num_elements * dtype_size));
+      std::get<0>(workspace) = static_cast<int>(config.transpose_comm_backend);
+      std::get<1>(workspace) = work_d;
+      std::get<2>(workspace) = workspace_num_elements * dtype_size;
+    } else {
+      CHECK_CUDECOMP(
+          cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_d), workspace_num_elements * dtype_size));
+      std::get<0>(workspace) = static_cast<int>(config.transpose_comm_backend);
+      std::get<1>(workspace) = work_d;
+      std::get<2>(workspace) = workspace_num_elements * dtype_size;
+    }
 
     real_t* data_2_d = nullptr;
     if (args.out_of_place) {
@@ -524,7 +550,6 @@ static int run_test(const std::string& arguments, bool silent) {
 
     CHECK_CUDA(cudaFree(data_d));
     if (data_2_d) CHECK_CUDA(cudaFree(data_2_d));
-    CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d));
   } catch (const std::exception& e) {
     return 1;
   }
diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90
index 041398b..589f455 100644
--- a/tests/fortran/halo_test.f90
+++ b/tests/fortran/halo_test.f90
@@ -64,6 +64,8 @@ module halo_CUDECOMP_DOUBLE_COMPLEX_mod
   integer :: rank, nranks
   type(cudecompGridDesc) :: grid_desc_cache(5)
   logical :: grid_desc_cache_set(5) = .false.
+  ARRTYPE, pointer, device, contiguous :: work_d(:)
+  integer :: work_backend = -1
 
   contains
   function compare_pencils(ref, res, pinfo) result(mismatch)
@@ -249,7 +251,6 @@ function run_test(arguments, silent) result(res)
     ARRTYPE, allocatable :: ref(:, :, :), init(:, :, :), data(:)
     ARRTYPE, allocatable, device, target:: data_d(:)
     ARRTYPE, allocatable, managed, target:: data_m(:)
-    ARRTYPE, pointer, device, contiguous :: work_d(:)
     ARRTYPE, pointer, device:: input(:)
     integer :: dtype = DTYPE
 
@@ -465,7 +466,21 @@ function run_test(arguments, silent) result(res)
     else
       allocate(data_d(data_num_elements))
     endif
-    CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements))
+
+    ! Allocate workspace (reuse exising workspace if able)
+    if (work_backend == config%halo_comm_backend) then
+      if (size(work_d) < workspace_num_elements) then
+        CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d))
+        CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements))
+      endif
+    elseif (work_backend > 0 .and. work_backend /= config%halo_comm_backend) then
+      CHECK_CUDECOMP(cudecompFree(handle, grid_desc_cache(work_backend), work_d))
+      CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements))
+      work_backend = config%halo_comm_backend;
+    else
+      CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements))
+      work_backend = config%halo_comm_backend;
+    endif
 
     ! Running correctness tests
     if (.not. silent .and. rank == 0) write(*,"('Running correctness tests using ', a, ' backend ...')") &
@@ -508,8 +523,6 @@ function run_test(arguments, silent) result(res)
       deallocate(data_d)
     endif
 
-    CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d))
-
   end function
 end module
 
diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90
index 908042d..29b9cc0 100644
--- a/tests/fortran/transpose_test.f90
+++ b/tests/fortran/transpose_test.f90
@@ -64,6 +64,8 @@ module transpose_CUDECOMP_DOUBLE_COMPLEX_mod
   integer :: rank, nranks
   type(cudecompGridDesc) :: grid_desc_cache(7)
   logical :: grid_desc_cache_set(7) = .false.
+  ARRTYPE, pointer, device, contiguous :: work_d(:)
+  integer :: work_backend = -1
 
   contains
   function compare_pencils(ref, res, pinfo) result(mismatch)
@@ -198,7 +200,6 @@ function run_test(arguments, silent) result(res)
     ARRTYPE, allocatable :: xref(:, :, :), yref(:, :, :), zref(:, :, :), data(:)
     ARRTYPE, allocatable, device, target:: data_d(:), data_2_d(:)
     ARRTYPE, allocatable, managed, target:: data_m(:), data_2_m(:)
-    ARRTYPE, pointer, device, contiguous :: work_d(:)
     ARRTYPE, pointer, device:: input(:), output(:)
     integer :: dtype = DTYPE
 
@@ -417,7 +418,22 @@ function run_test(arguments, silent) result(res)
     else
       allocate(data_d(data_num_elements))
     endif
-    CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements))
+
+    ! Allocate workspace (reuse exising workspace if able)
+    if (work_backend == config%transpose_comm_backend) then
+      if (size(work_d) < workspace_num_elements) then
+        CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d))
+        CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements))
+      endif
+    elseif (work_backend > 0 .and. work_backend /= config%transpose_comm_backend) then
+      CHECK_CUDECOMP(cudecompFree(handle, grid_desc_cache(work_backend), work_d))
+      CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements))
+      work_backend = config%transpose_comm_backend;
+    else
+      CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements))
+      work_backend = config%transpose_comm_backend;
+    endif
+
     if (out_of_place) then
       if (use_managed_memory) then
         allocate(data_2_m(data_num_elements))
@@ -519,7 +535,6 @@ function run_test(arguments, silent) result(res)
       deallocate(data_d)
       if (out_of_place) deallocate(data_2_d)
     endif
-    CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d))
 
   end function run_test
 end module

From 3a7ad3cd220e5abb3edd38a46b1c60517bcb8e21 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Tue, 11 Mar 2025 14:52:10 -0700
Subject: [PATCH 7/8] Update documentation.

---
 docs/api/f_api.rst   | 58 ++++++++++++++++++++++++++++----------------
 docs/basic_usage.rst | 38 ++++++++++++++++-------------
 2 files changed, 58 insertions(+), 38 deletions(-)

diff --git a/docs/api/f_api.rst b/docs/api/f_api.rst
index 48df4c7..0fef5b4 100644
--- a/docs/api/f_api.rst
+++ b/docs/api/f_api.rst
@@ -94,12 +94,13 @@ ____________________
 
   A data structure containing geometry information about a pencil data buffer.
 
-  :f integer shape(3): pencil shape (in local order, including halo elements)
-  :f integer lo(3): lower bound coordinates (in local order, excluding halo elements)
-  :f integer hi(3): upper bound coordinates (in local order, excluding halo elements)
+  :f integer shape(3): pencil shape (in local order, including halo and padding elements)
+  :f integer lo(3): lower bound coordinates (in local order, excluding halo and padding elements)
+  :f integer hi(3): upper bound coordinates (in local order, excluding halo and padding elements)
   :f integer order(3): data layout order (e.g. 3,2,1 means memory is ordered Z,Y,X)
   :f integer halo_extents(3): halo extents by dimension (in global order)
-  :f int64 size: number of elements in pencil (including halo elements)
+  :f integer padding(3): padding by dimension (in global order)
+  :f int64 size: number of elements in pencil (including halo and padding elements)
 
 Communication Backends
 ---------------------------------
@@ -345,7 +346,7 @@ Helper Functions
 cudecompGetPencilInfo
 _____________________
 
-.. f:function:: cudecompGetPencilInfo(handle, grid_desc, pencil_info, axis[, halo_extents])
+.. f:function:: cudecompGetPencilInfo(handle, grid_desc, pencil_info, axis[, halo_extents, padding])
 
   Collects geometry information about assigned pencils, by domain axis.
 
@@ -355,7 +356,8 @@ _____________________
   :p cudecompGridDesc grid_desc [in]: A cuDecomp grid descriptor.
   :p cudecompPencilInfo pencil_info [out]: A cuDecomp pencil information structure.
   :p integer axis [in]: The domain axis the desired pencil is aligned with.
-  :p integer halo_extents(3) [in, optional]: An array of three integers to define halo region extents of the pencil, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side).
+  :p integer halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the pencil, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side).
+  :p integer padding(3) [in,optional]: An array of three integers to define padding of the pencil, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis.
   :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure.
 
 ------
@@ -434,7 +436,7 @@ Transposition Functions
 cudecompTransposeXToY
 _____________________
 
-.. f:function:: cudecompTransposeXToY(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, stream])
+.. f:function:: cudecompTransposeXToY(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, input_padding, output_padding, stream])
 
   Function to transpose data from X-axis aligned pencils to a Y-axis aligned pencils.
 
@@ -447,8 +449,10 @@ _____________________
   :p T work(*) [in]: Device array to use for transpose workspace.
   :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation.
   :p integer input_halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). If not provided, input data is assumed to have no halos.
-  :p integer output_halo_extents(3) [in,optional]: Similar to :code:`intput_halo_extents` but for the output data. If not provided, output data is assumed to have no halos.
-  :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
+  :p integer output_halo_extents(3) [in,optional]: Similar to :code:`input_halo_extents` but for the output data. If not provided, output data is assumed to have no halos.
+  :p integer input_padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis.
+  :p integer output_padding(3) [in,optional]: Similar to :code:`input_padding`, but for the output data.
+  :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
   :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure.
 
 ------
@@ -458,7 +462,7 @@ _____________________
 cudecompTransposeYtoZ
 _____________________
 
-.. f:function:: cudecompTransposeYToZ(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, stream])
+.. f:function:: cudecompTransposeYToZ(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, input_padding, output_padding, stream])
 
   Function to transpose data from Y-axis aligned pencils to a Z-axis aligned pencils.
 
@@ -472,7 +476,9 @@ _____________________
   :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation.
   :p integer input_halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). If not provided, input data is assumed to have no halos.
   :p integer output_halo_extents(3) [in,optional]: Similar to :code:`intput_halo_extents` but for the output data. If not provided, output data is assumed to have no halos.
-  :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
+  :p integer input_padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis.
+  :p integer output_padding(3) [in,optional]: Similar to :code:`input_padding`, but for the output data.
+  :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
   :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure.
 
 ------
@@ -482,7 +488,7 @@ _____________________
 cudecompTransposeZToY
 _____________________
 
-.. f:function:: cudecompTransposeZToY(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, stream])
+.. f:function:: cudecompTransposeZToY(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, input_padding, output_padding, stream])
 
   Function to transpose data from Z-axis aligned pencils to a Y-axis aligned pencils.
 
@@ -496,7 +502,9 @@ _____________________
   :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation.
   :p integer input_halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). If not provided, input data is assumed to have no halos.
   :p integer output_halo_extents(3) [in,optional]: Similar to :code:`intput_halo_extents` but for the output data. If not provided, output data is assumed to have no halos.
-  :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
+  :p integer input_padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis.
+  :p integer output_padding(3) [in,optional]: Similar to :code:`input_padding`, but for the output data.
+  :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
   :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure.
 
 
@@ -507,7 +515,7 @@ _____________________
 cudecompTransposeYToX
 _____________________
 
-.. f:function:: cudecompTransposeYToX(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, stream])
+.. f:function:: cudecompTransposeYToX(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, input_padding, output_padding, stream])
 
   Function to transpose data from Y-axis aligned pencils to a X-axis aligned pencils.
 
@@ -521,7 +529,9 @@ _____________________
   :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation.
   :p integer input_halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). If not provided, input data is assumed to have no halos.
   :p integer output_halo_extents(3) [in,optional]: Similar to :code:`intput_halo_extents` but for the output data. If not provided, output data is assumed to have no halos.
-  :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
+  :p integer input_padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis.
+  :p integer output_padding(3) [in,optional]: Similar to :code:`input_padding`, but for the output data.
+  :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
   :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure.
 
 ------
@@ -534,7 +544,7 @@ Halo Exchange Functions
 cudecompUpdateHalosX
 ____________________
 
-.. f:function:: cudecompUpdateHalosX(handle, grid_desc, input, work, dtype, halo_extents, halo_periods[, stream])
+.. f:function:: cudecompUpdateHalosX(handle, grid_desc, input, work, dtype, halo_extents, halo_periods, dim[, padding, stream])
 
   Function to perform halo communication of X-axis aligned pencil data.
 
@@ -547,7 +557,9 @@ ____________________
   :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation.
   :p integer halo_extents(3) [in]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side).
   :p logical halo_periods(3) [in]: An array of three boolean values to define halo periodicity of the input data, in global order. If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis.
-  :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
+  :p integer dim [in]: Which pencil dimension (global indexed) to perform the halo update.
+  :p integer padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis.
+  :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
   :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure.
 
 ------
@@ -557,7 +569,7 @@ ____________________
 cudecompUpdateHalosY
 ____________________
 
-.. f:function:: cudecompUpdateHalosY(handle, grid_desc, input, work, dtype, halo_extents, halo_periods[, stream])
+.. f:function:: cudecompUpdateHalosY(handle, grid_desc, input, work, dtype, halo_extents, halo_periods, dim[, padding, stream])
 
   Function to perform halo communication of Y-axis aligned pencil data.
 
@@ -570,7 +582,9 @@ ____________________
   :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation.
   :p integer halo_extents(3) [in]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side).
   :p logical halo_periods(3) [in]: An array of three boolean values to define halo periodicity of the input data, in global order. If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis.
-  :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
+  :p integer dim [in]: Which pencil dimension (global indexed) to perform the halo update.
+  :p integer padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis.
+  :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
   :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure.
 
 ------
@@ -580,7 +594,7 @@ ____________________
 cudecompUpdateHalosZ
 ____________________
 
-.. f:function:: cudecompUpdateHalosZ(handle, grid_desc, input, work, dtype, halo_extents, halo_periods[, stream])
+.. f:function:: cudecompUpdateHalosZ(handle, grid_desc, input, work, dtype, halo_extents, halo_periods, dim[, padding, stream])
 
   Function to perform halo communication of Z-axis aligned pencil data.
 
@@ -593,5 +607,7 @@ ____________________
   :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation.
   :p integer halo_extents(3) [in]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side).
   :p logical halo_periods(3) [in]: An array of three boolean values to define halo periodicity of the input data, in global order. If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis.
-  :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
+  :p integer dim [in]: Which pencil dimension (global indexed) to perform the halo update.
+  :p integer padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis.
+  :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream.
   :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure.
diff --git a/docs/basic_usage.rst b/docs/basic_usage.rst
index 365403f..abfe6e6 100644
--- a/docs/basic_usage.rst
+++ b/docs/basic_usage.rst
@@ -221,10 +221,9 @@ First, we can query basic information (i.e. metadata) about the pencil configura
 assigned to this process using the :ref:`cudecompGetPencilInfo-ref` function. This function returns a
 pencil info structure (:ref:`cudecompPencilInfo_t-ref`) that contains the shape, global lower and upper
 index bounds (:code:`lo` and :code:`hi`), size of the pencil, and an :code:`order` array to indicate the memory layout
-that will be used (to handle permuted, `axis-contiguous` layouts). Additionally, there is a :code:`halo_extents` data
-member that indicates the depth of halos for the pencil, by axis, if the argument was provided
-to this function. This data member is a copy of the argument provided to the function
-and is stored for convenience.
+that will be used (to handle permuted layouts). Additionally, there are :code:`halo_extents`  and :code:`padding` data
+members that indicates halo and padding configurations for the pencil, by axis. This data member is a copy of the
+argument provided to the function and is stored for convenience.
 
 It should be noted that these metadata structures are provided solely for users to
 interpret and access data from the data buffers used as input/output arguments to the different
@@ -234,7 +233,8 @@ for pencil buffers, nor uses these pencil information structures as input argume
 In this example, we apply halo elements to the :math:`X`-pencils only. For the other pencils,
 we instead pass a :code:`nullptr` for the :code:`halo_extents` argument, which is equivalent
 to setting :code:`halo_extents = [0, 0, 0]` in C/C++. For Fortran, :code:`halo_extents` is optional
-and defaults to no halo regions.
+and defaults to no halo regions. Similarly, we pass a :code:`nullptr` for the :code:`padding` argument to specify
+no padding for all pencils. For Fortran, :code:`padding` is optional, and defaults to no padding. 
 
 .. tabs::
 
@@ -243,15 +243,15 @@ and defaults to no halo regions.
     // Get X-pencil information (with halo elements).
     cudecompPencilInfo_t pinfo_x;
     int32_t halo_extents_x[3]{1, 1, 1};
-    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x));
+    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x, nullptr));
 
     // Get Y-pencil information
     cudecompPencilInfo_t pinfo_y;
-    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr));
+    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr, nullptr));
 
     // Get Z-pencil information
     cudecompPencilInfo_t pinfo_z;
-    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr));
+    CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr, nullptr));
 
   .. code-tab:: fortran
 
@@ -572,8 +572,10 @@ Transposing the data
 --------------------
 Now, we can use cuDecomp's transposition routines to transpose our data. In these calls, we are using
 the :code:`data_d` array as both input and output (in-place), but you can also use distinct input and output buffers for
-out-of-place operations. For the transposes between :math:`Y`- and :math:`Z`-pencils, we can pass
-null pointers to the halo extent arguments to the routines to ignore them in C/C++, or leave them unspecified in Fortran.
+out-of-place operations. For the transposes between :math:`Y`- and :math:`Z`-pencils, we pass
+null pointers to the halo extent arguments to the routines in C/C++, or leave them unspecified in Fortran.
+For all transposes, we pass null pointers to the padding arguments to the routines to disable padding in C/C++, or leave
+them unspecified in Fortran.
 
 .. tabs::
 
@@ -581,19 +583,19 @@ null pointers to the halo extent arguments to the routines to ignore them in C/C
 
     // Transpose from X-pencils to Y-pencils.
     CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc, data_d, data_d, transpose_work_d,
-                                              CUDECOMP_DOUBLE, pinfo_x.halo_extents, nullptr, 0));
+                                              CUDECOMP_DOUBLE, pinfo_x.halo_extents, nullptr, nullptr, nullptr, 0));
 
     // Transpose from Y-pencils to Z-pencils.
     CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d,
-                                              CUDECOMP_DOUBLE, nullptr, nullptr, 0));
+                                              CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
 
     // Transpose from Z-pencils to Y-pencils.
     CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d,
-                                              CUDECOMP_DOUBLE, nullptr, nullptr, 0));
+                                              CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
 
     // Transpose from Y-pencils to X-pencils.
     CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, data_d, data_d, transpose_work_d,
-                                              CUDECOMP_DOUBLE, nullptr, pinfo_x.halo_extents, 0));
+                                              CUDECOMP_DOUBLE, nullptr, pinfo_x.halo_extents, nullptr, nullptr, 0));
 
   .. code-tab:: fortran
 
@@ -618,6 +620,8 @@ Updating halo regions
 In this example, we have halos for the :math:`X`-pencils only. We can use cuDecomp's halo update
 routines to update the halo regions of this pencil in the three domain directions. In this example,
 we set the :code:`halo_periods` argument to enable periodic halos along all directions.
+We pass null pointers to the padding arguments to the routines to disable padding in C/C++, or leave
+them unspecified in Fortran.
 
 .. tabs::
 
@@ -629,17 +633,17 @@ we set the :code:`halo_periods` argument to enable periodic halos along all dire
     // Update X-pencil halos in X direction
     CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d,
                                              CUDECOMP_DOUBLE, pinfo_x.halo_extents, halo_periods,
-                                             0, 0));
+                                             0, nullptr, 0));
 
     // Update X-pencil halos in Y direction
     CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d,
                                              CUDECOMP_DOUBLE, pinfo_x.halo_extents, halo_periods,
-                                             1, 0));
+                                             1, nullptr, 0));
 
     // Update X-pencil halos in Z direction
     CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d,
                                              CUDECOMP_DOUBLE, pinfo_x.halo_extents, halo_periods,
-                                             2, 0));
+                                             2, nullptr, 0));
 
   .. code-tab:: fortran
 

From a3805be878052f2106455811d6c5ca145e2616b0 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 17 Mar 2025 10:32:53 -0700
Subject: [PATCH 8/8] Remove unused CHECK_CUDA macros from Fortran tests.

---
 tests/fortran/halo_test.f90      | 1 -
 tests/fortran/transpose_test.f90 | 1 -
 2 files changed, 2 deletions(-)

diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90
index 589f455..d64c681 100644
--- a/tests/fortran/halo_test.f90
+++ b/tests/fortran/halo_test.f90
@@ -29,7 +29,6 @@
 #define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
 #define CHECK_CUDECOMP(f) if (f /= CUDECOMP_RESULT_SUCCESS) then; res = 1; return; endif
 #define CHECK_CUDA_EXIT(f) if (f /= cudaSuccess) call exit(1)
-#define CHECK_CUDA(f) if (f /= cudaSuccess) thenl res = 1; return; endif
 
 #ifdef R32
 #define ARRTYPE real(real32)
diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90
index 29b9cc0..02810df 100644
--- a/tests/fortran/transpose_test.f90
+++ b/tests/fortran/transpose_test.f90
@@ -29,7 +29,6 @@
 #define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
 #define CHECK_CUDECOMP(f) if (f /= CUDECOMP_RESULT_SUCCESS) then; res = 1; return; endif
 #define CHECK_CUDA_EXIT(f) if (f /= cudaSuccess) call exit(1)
-#define CHECK_CUDA(f) if (f /= cudaSuccess) thenl res = 1; return; endif
 
 #ifdef R32
 #define ARRTYPE real(real32)