diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 56e1d54db..ec3f7f025 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -346,91 +346,6 @@ __device__ __forceinline__ unsigned char quantize_2D(float *__restrict__ quadran
     }
 }
 
-__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n)
-{
-  const int tid = threadIdx.x + (blockDim.x*blockIdx.x);
-  const int numThreads = blockDim.x*gridDim.x;
-
-  for(int i = tid; i < n; i+=numThreads)
-  {
-      int idx = (index1[i]*maxidx1) + index2[i];
-      atomicAdd(&histogram[idx], src[i]);
-  }
-}
-
-#define THREADS_ESTIMATE 512
-#define NUM_ESTIMATE 8
-#define BLOCK_ESTIMATE 4096
-
-template<typename T>
-__launch_bounds__(THREADS_ESTIMATE, 1)
-__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n)
-{
-  const int n_full = (BLOCK_ESTIMATE*(n/BLOCK_ESTIMATE)) + (n % BLOCK_ESTIMATE == 0 ? 0 : BLOCK_ESTIMATE);
-  int valid_items = (blockIdx.x+1 == gridDim.x) ? n - (blockIdx.x*BLOCK_ESTIMATE) : BLOCK_ESTIMATE;
-  const int base_idx = (blockIdx.x * BLOCK_ESTIMATE);
-  const float reciprocal_num_blocks = 1.0f/(n < 4096 ? 1.0f : (n/BLOCK_ESTIMATE));
-
-  T vals[NUM_ESTIMATE];
-
-  typedef hipcub::BlockRadixSort<T, THREADS_ESTIMATE, NUM_ESTIMATE> BlockRadixSort;
-  typedef hipcub::BlockLoad<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
-
-  __shared__ union {
-      typename LoadFloat::TempStorage loadf;
-      typename BlockRadixSort::TempStorage sort;
-      int smem_qidx[BLOCK_ESTIMATE];
-  } temp_storage;
-
-  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_ESTIMATE)
-  {
-      valid_items = n - i > BLOCK_ESTIMATE ? BLOCK_ESTIMATE : n - i;
-
-      // do not process half-blocks
-      if(valid_items < BLOCK_ESTIMATE && n > BLOCK_ESTIMATE){ continue; }
-
-      #pragma unroll 4
-      for(int j = 0; j < NUM_ESTIMATE; j++)
-          vals[j] = max_val;
-
-      __syncthreads();
-      LoadFloat(temp_storage.loadf).Load(&(A[i]), vals, valid_items);
-
-      #pragma unroll 4
-      for(int j = 0; j < NUM_ESTIMATE; j++)
-          vals[j] = ((float)vals[j]) * reciprocal_num_blocks;
-
-
-      __syncthreads();
-      // sort into striped pattern to mitigate bank conflicts
-      // striped pattern index for thread 0 [0, 1024, 2048, 3096]
-      // striped pattern index for thread 1 [1, 1025, 2049, 3097]
-      BlockRadixSort(temp_storage.sort).SortBlockedToStriped(vals);
-
-      __syncthreads();
-      for(int j = threadIdx.x; j < BLOCK_ESTIMATE; j+=blockDim.x)
-          temp_storage.smem_qidx[j] = -1;
-
-      __syncthreads();
-
-      if(threadIdx.x < 256)
-      {
-          float q_interval = (1.0f-(2.0f*offset))/255.0f;
-          int local_idx = round(((offset+(threadIdx.x*q_interval))*(valid_items-1)));
-          temp_storage.smem_qidx[local_idx] = threadIdx.x;
-      }
-
-      __syncthreads();
-
-      for(int i = threadIdx.x; i < BLOCK_ESTIMATE; i+=blockDim.x)
-      {
-          if(temp_storage.smem_qidx[i] != -1)
-              atomicAdd(&code[temp_storage.smem_qidx[i]], vals[i/THREADS_ESTIMATE]);
-      }
-  }
-}
-
-
 __launch_bounds__(TH, 4)
 __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n)
 {
@@ -2984,9 +2899,6 @@ template __global__ void kdequant_mm_int32_fp16<4, 512>(int *__restrict__ const
 template __device__ unsigned char dQuantize<0>(float* smem_code, const float rand, float x);
 template __device__ unsigned char dQuantize<1>(float* smem_code, const float rand, float x);
 
-template __global__ void kEstimateQuantiles(float *__restrict__ const A, float *code, const float offset, const float max_val, const int n);
-template __global__ void kEstimateQuantiles(half *__restrict__ const A, float *code, const float offset, const half max_val, const int n);
-
 #define MAKE_PreconditionOptimizer32bit1State(oname, gtype) \
 template __global__ void kPreconditionOptimizer32bit1State<gtype, oname, 4096, 8>(gtype* g, gtype* p, \
                 float* state1, float *unorm, \
diff --git a/csrc/kernels_hip.cuh b/csrc/kernels_hip.cuh
index 811299d05..00718071c 100644
--- a/csrc/kernels_hip.cuh
+++ b/csrc/kernels_hip.cuh
@@ -11,10 +11,6 @@
 #ifndef kernels
 #define kernels
 
-template <typename T>
-__global__ void
-    kEstimateQuantiles(T* __restrict__ const A, float* code, const float offset, const T max_val, const int n);
-
 __global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n);
 __global__ void kDequantize(float* code, unsigned char* A, float* out, const int n);
 
@@ -103,9 +99,6 @@ __global__ void kOptimizerStatic8bit1StateBlockwise(
 template <typename T, int BLOCK_SIZE, int NUM_VALS>
 __global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n);
 
-__global__ void
-    kHistogramScatterAdd2D(float* histogram, int* index1, int* index2, float* src, const int maxidx1, const int n);
-
 template <typename T, int SPMM_ITEMS, int BITS>
 __global__ void kspmm_coo_very_sparse_naive(
     int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,
diff --git a/csrc/ops.hip b/csrc/ops.hip
index a9c3e0202..260b74b30 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -24,24 +24,6 @@ using namespace BinSearch;
 using std::cout;
 using std::endl;
 
-void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
-{
-  int threads = 512;
-  int num_blocks = n/threads;
-  num_blocks = n % threads == 0 ? num_blocks : num_blocks + 1;
-  hipLaunchKernelGGL(( kHistogramScatterAdd2D), dim3(num_blocks), dim3(512), 0, 0, histogram, index1, index2, src, maxidx1, n);
-  CUDA_CHECK_RETURN(hipPeekAtLastError());
-}
-
-template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
-{
-  int num_blocks = n/4096;
-  num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;
-  CUDA_CHECK_RETURN(hipMemset(code, 0, 256*sizeof(float)));
-  hipLaunchKernelGGL(( kEstimateQuantiles<T>), dim3(num_blocks), dim3(512), 0, 0, A, code, offset, std::numeric_limits<T>::max(), n);
-  CUDA_CHECK_RETURN(hipPeekAtLastError());
-}
-
 void quantize(float *code, float *A, unsigned char *out, int n)
 {
   int num_blocks = n/1024;
@@ -752,9 +734,6 @@ template int igemmlt<32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
 template int igemmlt<8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
 template int igemmlt<8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
 
-template void estimateQuantiles(half *A, float *code, float offset, int n);
-template void estimateQuantiles(float *A, float *code, float offset, int n);
-
 template void quantizeBlockwise<half, 1, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<half, 0, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<half, 0, FP4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh
index 624ebe326..0f8db2ee4 100644
--- a/csrc/ops_hip.cuh
+++ b/csrc/ops_hip.cuh
@@ -124,8 +124,6 @@ class ContextHipsparse {
     }
 };
 
-template <typename T> void estimateQuantiles(T* A, float* code, float offset, int n);
-
 void quantize(float* code, float* A, unsigned char* out, int n);
 void dequantize(float* code, unsigned char* A, float* out, int n, hipStream_t stream);
 template <typename T>
@@ -160,8 +158,6 @@ void optimizerStatic8bitBlockwise(
 
 template <typename T> void percentileClipping(T* g, float* gnorm_vec, int step, const int n);
 
-void histogramScatterAdd2D(float* histogram, int* index1, int* index2, float* src, int maxidx1, int n);
-
 void gemmex(
     Context* context, bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda,
     int ldb, int ldc
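
For anyone still calling the removed entry points: kHistogramScatterAdd2D was a plain scatter-add into a flattened 2D histogram with row stride maxidx1, as the deleted kernel body above shows. A minimal host-side C++ sketch of the same semantics follows; the histogramScatterAdd2D_ref name and the std::vector interface are illustrative stand-ins, not part of the library:

#include <cstddef>
#include <vector>

// Host-side reference for the removed kHistogramScatterAdd2D kernel:
// histogram[index1[i]*maxidx1 + index2[i]] += src[i] for each i in [0, n).
// The GPU kernel used atomicAdd because distinct i can map to the same bin.
void histogramScatterAdd2D_ref(std::vector<float>& histogram,
                               const std::vector<int>& index1,
                               const std::vector<int>& index2,
                               const std::vector<float>& src,
                               int maxidx1) {
    for (std::size_t i = 0; i < src.size(); ++i)
        histogram[index1[i] * maxidx1 + index2[i]] += src[i];
}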