Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cub/cub/block/block_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,8 @@ private:
/// Internal specialization.
using InternalBlockHistogram =
::cuda::std::_If<ALGORITHM == BLOCK_HISTO_SORT,
BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
BlockHistogramAtomic<BINS>>;
detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
detail::BlockHistogramAtomic<BINS>>;

/// Shared memory storage layout type for BlockHistogram
using _TempStorage = typename InternalBlockHistogram::TempStorage;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/block/block_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,9 @@ private:
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};

using WarpReductions = BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using RakingCommutativeOnly = BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using Raking = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using WarpReductions = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using RakingCommutativeOnly = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using Raking = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;

/// Internal specialization type
using InternalBlockReduce =
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/block/block_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,9 @@ private:
? BLOCK_SCAN_RAKING
: ALGORITHM;

using WarpScans = BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using Raking =
BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;

/// Define the delegate type for the desired algorithm
using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
Expand Down
11 changes: 9 additions & 2 deletions cub/cub/block/specializations/block_histogram_atomic.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
#endif // no system header

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide
* histograms from data samples partitioned across a CUDA thread block.
Expand All @@ -72,13 +73,19 @@ struct BlockHistogramAtomic
template <typename T, typename CounterT, int ITEMS_PER_THREAD>
_CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS])
{
// Update histogram
// Update histogram
#pragma unroll
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
atomicAdd(histogram + items[i], 1);
}
}
};
} // namespace detail

template <int BINS>
using BlockHistogramAtomic CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockHistogramAtomic<BINS>;

CUB_NAMESPACE_END
16 changes: 15 additions & 1 deletion cub/cub/block/specializations/block_histogram_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@
#include <cub/util_ptx.cuh>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide
* histograms from data samples partitioned across a CUDA thread block.
Expand Down Expand Up @@ -243,5 +244,18 @@ struct BlockHistogramSort
}
}
};
} // namespace detail

template <typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
int BINS,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int LEGACY_PTX_ARCH = 0>
using BlockHistogramSort CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") =
detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
9 changes: 8 additions & 1 deletion cub/cub/block/specializations/block_reduce_raking.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
#include <cub/warp/warp_reduce.cuh>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread
* block. Supports non-commutative reduction operators.
Expand Down Expand Up @@ -257,5 +258,11 @@ struct BlockReduceRaking
return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
}
};
} // namespace detail

template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
using BlockReduceRaking CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
#include <cub/warp/warp_reduce.cuh>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction
* across a CUDA thread block. Does not support non-commutative reduction operators. Does not
Expand Down Expand Up @@ -83,7 +84,7 @@ struct BlockReduceRakingCommutativeOnly

// The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have
// valid values
using FallBack = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using FallBack = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;

/// Constants
enum
Expand Down Expand Up @@ -231,5 +232,11 @@ struct BlockReduceRakingCommutativeOnly
return partial;
}
};
} // namespace detail

template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
using BlockReduceRakingCommutativeOnly CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction
* across a CUDA thread block. Supports non-commutative reduction operators.
Expand Down Expand Up @@ -256,5 +257,11 @@ struct BlockReduceWarpReductions
return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
}
};
} // namespace detail

template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
using BlockReduceWarpReductions CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
9 changes: 8 additions & 1 deletion cub/cub/block/specializations/block_scan_raking.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
#include <cub/warp/warp_scan.cuh>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA
* thread block.
Expand Down Expand Up @@ -794,5 +795,11 @@ struct BlockScanRaking
}
}
};
} // namespace detail

template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int LEGACY_PTX_ARCH = 0>
using BlockScanRaking CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
8 changes: 7 additions & 1 deletion cub/cub/block/specializations/block_scan_warp_scans.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA
* thread block.
Expand Down Expand Up @@ -537,5 +538,10 @@ struct BlockScanWarpScans
exclusive_output = scan_op(block_prefix, exclusive_output);
}
};
} // namespace detail
template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
using BlockScanWarpScans CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
8 changes: 6 additions & 2 deletions cub/cub/warp/specializations/warp_reduce_shfl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ template <class T>
struct reduce_max_exists<T, decltype(__reduce_max_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type
{};

} // namespace detail

/**
* @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned
* across a CUDA thread warp.
Expand Down Expand Up @@ -739,5 +737,11 @@ struct WarpReduceShfl
return output;
}
};
} // namespace detail

template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
using WarpReduceShfl CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
8 changes: 7 additions & 1 deletion cub/cub/warp/specializations/warp_reduce_smem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned
* across a CUDA thread warp.
Expand Down Expand Up @@ -411,5 +412,10 @@ struct WarpReduceSmem
return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<true>());
}
};
} // namespace detail

template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
using WarpReduceSmem CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;
CUB_NAMESPACE_END
13 changes: 10 additions & 3 deletions cub/cub/warp/specializations/warp_scan_shfl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned
* across a CUDA thread warp.
Expand Down Expand Up @@ -513,7 +514,7 @@ struct WarpScanShfl
// Iterate scan steps
int segment_first_lane = 0;

// Iterate scan steps
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
Expand Down Expand Up @@ -550,7 +551,7 @@ struct WarpScanShfl
// Find index of first set bit
int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));

// Iterate scan steps
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
Expand Down Expand Up @@ -674,5 +675,11 @@ struct WarpScanShfl
Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
}
};
} // namespace detail

template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
using WarpScanShfl CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::WarpScanShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
9 changes: 8 additions & 1 deletion cub/cub/warp/specializations/warp_scan_smem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
#include <cuda/ptx>

CUB_NAMESPACE_BEGIN

namespace detail
{
/**
* @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned
* across a CUDA thread warp.
Expand Down Expand Up @@ -432,5 +433,11 @@ struct WarpScanSmem
}
}
};
} // namespace detail

template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
using WarpScanSmem CCCL_DEPRECATED_BECAUSE(
"This class is considered an implementation detail and the public interface will be "
"removed.") = detail::WarpScanSmem<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>;

CUB_NAMESPACE_END
4 changes: 2 additions & 2 deletions cub/cub/warp/warp_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ public:

/// Internal specialization.
/// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two
using InternalWarpReduce =
::cuda::std::_If<IS_POW_OF_TWO, WarpReduceShfl<T, LOGICAL_WARP_THREADS>, WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;
using InternalWarpReduce = ::cuda::std::
_If<IS_POW_OF_TWO, detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS>, detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;

#endif // _CCCL_DOXYGEN_INVOKED

Expand Down
4 changes: 2 additions & 2 deletions cub/cub/warp/warp_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,8 @@ private:

/// Internal specialization.
/// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two
using InternalWarpScan =
::cuda::std::_If<IS_POW_OF_TWO, WarpScanShfl<T, LOGICAL_WARP_THREADS>, WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
using InternalWarpScan = ::cuda::std::
_If<IS_POW_OF_TWO, detail::WarpScanShfl<T, LOGICAL_WARP_THREADS>, detail::WarpScanSmem<T, LOGICAL_WARP_THREADS>>;

/// Shared memory storage layout type for WarpScan
using _TempStorage = typename InternalWarpScan::TempStorage;
Expand Down
4 changes: 2 additions & 2 deletions docs/cub/developer_overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,8 @@ For example, :cpp:struct:`cub::WarpReduce` dispatches to two different implement

using InternalWarpReduce = cuda::std::conditional_t<
IS_POW_OF_TWO,
WarpReduceShfl<T, LOGICAL_WARP_THREADS>, // shuffle-based implementation
WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation
detail::WarpReduceShfl<T, LOGICAL_WARP_THREADS>, // shuffle-based implementation
detail::WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation

Specializations provide different shared memory requirements,
so the actual ``_TempStorage`` type is defined as:
Expand Down