NVIDIA · bernhardmgruber · Jan 22, 2025 · Jan 21, 2025
@@ -202,8 +202,8 @@ private:
   /// Internal specialization.
   using InternalBlockHistogram =
     ::cuda::std::_If<ALGORITHM == BLOCK_HISTO_SORT,
-                     BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
-                     BlockHistogramAtomic<BINS>>;
+                     detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
+                     detail::BlockHistogramAtomic<BINS>>;
 
   /// Shared memory storage layout type for BlockHistogram
   using _TempStorage = typename InternalBlockHistogram::TempStorage;

@@ -250,9 +250,9 @@ private:
     BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
   };
 
-  using WarpReductions        = BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
-  using RakingCommutativeOnly = BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
-  using Raking                = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using WarpReductions        = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using RakingCommutativeOnly = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using Raking                = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
 
   /// Internal specialization type
   using InternalBlockReduce =

@@ -250,9 +250,9 @@ private:
       ? BLOCK_SCAN_RAKING
       : ALGORITHM;
 
-  using WarpScans = BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
   using Raking =
-    BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
+    detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
 
   /// Define the delegate type for the desired algorithm
   using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;

@@ -45,7 +45,8 @@
 #endif // no system header
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide
  *        histograms from data samples partitioned across a CUDA thread block.
@@ -72,13 +73,19 @@ struct BlockHistogramAtomic
   template <typename T, typename CounterT, int ITEMS_PER_THREAD>
   _CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS])
   {
-// Update histogram
+    // Update histogram
 #pragma unroll
     for (int i = 0; i < ITEMS_PER_THREAD; ++i)
     {
       atomicAdd(histogram + items[i], 1);
     }
   }
 };
+} // namespace detail
+
+template <int BINS>
+using BlockHistogramAtomic CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockHistogramAtomic<BINS>;
 
 CUB_NAMESPACE_END
@@ -49,7 +49,8 @@
 #include <cub/util_ptx.cuh>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide
  *        histograms from data samples partitioned across a CUDA thread block.
@@ -243,5 +244,18 @@ struct BlockHistogramSort
     }
   }
 };
+} // namespace detail
+
+template <typename T,
+          int BLOCK_DIM_X,
+          int ITEMS_PER_THREAD,
+          int BINS,
+          int BLOCK_DIM_Y,
+          int BLOCK_DIM_Z,
+          int LEGACY_PTX_ARCH = 0>
+using BlockHistogramSort CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") =
+  detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -50,7 +50,8 @@
 #include <cub/warp/warp_reduce.cuh>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread
  *        block. Supports non-commutative reduction operators.
@@ -257,5 +258,11 @@ struct BlockReduceRaking
     return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
   }
 };
+} // namespace detail
+
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
+using BlockReduceRaking CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -50,7 +50,8 @@
 #include <cub/warp/warp_reduce.cuh>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction
  *        across a CUDA thread block. Does not support non-commutative reduction operators. Does not
@@ -83,7 +84,7 @@ struct BlockReduceRakingCommutativeOnly
 
   // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have
   // valid values
-  using FallBack = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using FallBack = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
 
   /// Constants
   enum
@@ -231,5 +232,11 @@ struct BlockReduceRakingCommutativeOnly
     return partial;
   }
 };
+} // namespace detail
+
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
+using BlockReduceRakingCommutativeOnly CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -51,7 +51,8 @@
 #include <cuda/ptx>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction
  *        across a CUDA thread block. Supports non-commutative reduction operators.
@@ -256,5 +257,11 @@ struct BlockReduceWarpReductions
     return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
   }
 };
+} // namespace detail
+
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
+using BlockReduceWarpReductions CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -52,7 +52,8 @@
 #include <cub/warp/warp_scan.cuh>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA
  * thread block.
@@ -794,5 +795,11 @@ struct BlockScanRaking
     }
   }
 };
+} // namespace detail
+
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int LEGACY_PTX_ARCH = 0>
+using BlockScanRaking CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -50,7 +50,8 @@
 #include <cuda/ptx>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA
  *        thread block.
@@ -537,5 +538,10 @@ struct BlockScanWarpScans
     exclusive_output = scan_op(block_prefix, exclusive_output);
   }
 };
+} // namespace detail
+template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
+using BlockScanWarpScans CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -83,8 +83,6 @@ template <class T>
 struct reduce_max_exists<T, decltype(__reduce_max_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type
 {};
 
-} // namespace detail
-
 /**
  * @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned
  *        across a CUDA thread warp.
@@ -739,5 +737,11 @@ struct WarpReduceShfl
     return output;
   }
 };
+} // namespace detail
+
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
+using WarpReduceShfl CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -52,7 +52,8 @@
 #include <cuda/ptx>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned
  *        across a CUDA thread warp.
@@ -411,5 +412,10 @@ struct WarpReduceSmem
     return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<true>());
   }
 };
+} // namespace detail
 
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
+using WarpReduceSmem CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;
 CUB_NAMESPACE_END
@@ -51,7 +51,8 @@
 #include <cuda/ptx>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned
  *        across a CUDA thread warp.
@@ -513,7 +514,7 @@ struct WarpScanShfl
     // Iterate scan steps
     int segment_first_lane = 0;
 
-// Iterate scan steps
+    // Iterate scan steps
 #pragma unroll
     for (int STEP = 0; STEP < STEPS; STEP++)
     {
@@ -550,7 +551,7 @@ struct WarpScanShfl
     // Find index of first set bit
     int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
 
-// Iterate scan steps
+    // Iterate scan steps
 #pragma unroll
     for (int STEP = 0; STEP < STEPS; STEP++)
     {
@@ -674,5 +675,11 @@ struct WarpScanShfl
     Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
   }
 };
+} // namespace detail
+
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
+using WarpScanShfl CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::WarpScanShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -52,7 +52,8 @@
 #include <cuda/ptx>
 
 CUB_NAMESPACE_BEGIN
-
+namespace detail
+{
 /**
  * @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned
  *        across a CUDA thread warp.
@@ -432,5 +433,11 @@ struct WarpScanSmem
     }
   }
 };
+} // namespace detail
+
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
+using WarpScanSmem CCCL_DEPRECATED_BECAUSE(
+  "This class is considered an implementation detail and the public interface will be "
+  "removed.") = detail::WarpScanSmem<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;
 
 CUB_NAMESPACE_END
@@ -174,8 +174,8 @@ public:
 
   /// Internal specialization.
   /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two
-  using InternalWarpReduce =
-    ::cuda::std::_If<IS_POW_OF_TWO, WarpReduceShfl<T, LOGICAL_WARP_THREADS>, WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;
+  using InternalWarpReduce = ::cuda::std::
+    _If<IS_POW_OF_TWO, detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS>, detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;
 
 #endif // _CCCL_DOXYGEN_INVOKED
 

@@ -180,8 +180,8 @@ private:
 
   /// Internal specialization.
   /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two
-  using InternalWarpScan =
-    ::cuda::std::_If<IS_POW_OF_TWO, WarpScanShfl<T, LOGICAL_WARP_THREADS>, WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
+  using InternalWarpScan = ::cuda::std::
+    _If<IS_POW_OF_TWO, detail::WarpScanShfl<T, LOGICAL_WARP_THREADS>, detail::WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
 
   /// Shared memory storage layout type for WarpScan
   using _TempStorage = typename InternalWarpScan::TempStorage;

@@ -239,8 +239,8 @@ For example, :cpp:struct:`cub::WarpReduce` dispatches to two different implement
 
     using InternalWarpReduce = cuda::std::conditional_t<
       IS_POW_OF_TWO,
-      WarpReduceShfl<T, LOGICAL_WARP_THREADS>,  // shuffle-based implementation
-      WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation
+      detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS>,  // shuffle-based implementation
+      detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation
 
 Specializations provide different shared memory requirements,
 so the actual ``_TempStorage`` type is defined as: