From 3228ca86d74a50d4f7c5170bc473d29c30f3dec5 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 14:15:25 +0530 Subject: [PATCH 01/10] Update kernels_hip.cuh --- csrc/kernels_hip.cuh | 3 --- 1 file changed, 3 deletions(-) diff --git a/csrc/kernels_hip.cuh b/csrc/kernels_hip.cuh index 811299d05..d902129a3 100644 --- a/csrc/kernels_hip.cuh +++ b/csrc/kernels_hip.cuh @@ -103,9 +103,6 @@ __global__ void kOptimizerStatic8bit1StateBlockwise( template __global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n); -__global__ void - kHistogramScatterAdd2D(float* histogram, int* index1, int* index2, float* src, const int maxidx1, const int n); - template __global__ void kspmm_coo_very_sparse_naive( int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out, From 94c1b7751bdd1d10014cf861a4e28ede66262530 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 14:21:11 +0530 Subject: [PATCH 02/10] Update kernels.hip --- csrc/kernels.hip | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/csrc/kernels.hip b/csrc/kernels.hip index 56e1d54db..53b2725a3 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -346,18 +346,6 @@ __device__ __forceinline__ unsigned char quantize_2D(float *__restrict__ quadran } } -__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n) -{ - const int tid = threadIdx.x + (blockDim.x*blockIdx.x); - const int numThreads = blockDim.x*gridDim.x; - - for(int i = tid; i < n; i+=numThreads) - { - int idx = (index1[i]*maxidx1) + index2[i]; - atomicAdd(&histogram[idx], src[i]); - } -} - #define THREADS_ESTIMATE 512 #define NUM_ESTIMATE 8 #define BLOCK_ESTIMATE 4096 From cd3f0b779f6c285cd969689dd509ad08698e0964 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 14:23:14 +0530 Subject: [PATCH 03/10] Update ops.hip --- csrc/ops.hip | 9 --------- 1 file changed, 9 deletions(-) diff --git a/csrc/ops.hip b/csrc/ops.hip index a9c3e0202..ccdbc1026 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -24,15 +24,6 @@ using namespace BinSearch; using std::cout; using std::endl; -void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n) -{ - int threads = 512; - int num_blocks = n/threads; - num_blocks = n % threads == 0 ? num_blocks : num_blocks + 1; - hipLaunchKernelGGL(( kHistogramScatterAdd2D), dim3(num_blocks), dim3(512), 0, 0, histogram, index1, index2, src, maxidx1, n); - CUDA_CHECK_RETURN(hipPeekAtLastError()); -} - template void estimateQuantiles(T *A, float *code, float offset, int n) { int num_blocks = n/4096; From 98bb06ed6245da3af44497c1df04c8da06f00d2a Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 14:25:32 +0530 Subject: [PATCH 04/10] Update ops_hip.cuh --- csrc/ops_hip.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh index 624ebe326..ebae292c4 100644 --- a/csrc/ops_hip.cuh +++ b/csrc/ops_hip.cuh @@ -160,8 +160,6 @@ void optimizerStatic8bitBlockwise( template void percentileClipping(T* g, float* gnorm_vec, int step, const int n); -void histogramScatterAdd2D(float* histogram, int* index1, int* index2, float* src, int maxidx1, int n); - void gemmex( Context* context, bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda, int ldb, int ldc From 3bad4541e3d9fc186cf680009bfef7c980bb0aaa Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 15:17:59 +0530 Subject: [PATCH 05/10] Update kernels_hip.cuh --- csrc/kernels_hip.cuh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/csrc/kernels_hip.cuh b/csrc/kernels_hip.cuh index d902129a3..00718071c 100644 --- a/csrc/kernels_hip.cuh +++ b/csrc/kernels_hip.cuh @@ -11,10 +11,6 @@ #ifndef kernels #define kernels -template -__global__ void - kEstimateQuantiles(T* __restrict__ const A, float* code, const float offset, const T max_val, const int n); - __global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n); __global__ void kDequantize(float* code, unsigned char* A, float* out, const int n); From e0c766dcc34b6147d5a6e8aa505dbb15c08233a5 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 15:20:37 +0530 Subject: [PATCH 06/10] Update kernels.hip --- csrc/kernels.hip | 73 ------------------------------------------------ 1 file changed, 73 deletions(-) diff --git a/csrc/kernels.hip b/csrc/kernels.hip index 53b2725a3..6b0f1eac5 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -346,79 +346,6 @@ __device__ __forceinline__ unsigned char quantize_2D(float *__restrict__ quadran } } -#define THREADS_ESTIMATE 512 -#define NUM_ESTIMATE 8 -#define BLOCK_ESTIMATE 4096 - -template -__launch_bounds__(THREADS_ESTIMATE, 1) -__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n) -{ - const int n_full = (BLOCK_ESTIMATE*(n/BLOCK_ESTIMATE)) + (n % BLOCK_ESTIMATE == 0 ? 0 : BLOCK_ESTIMATE); - int valid_items = (blockIdx.x+1 == gridDim.x) ? n - (blockIdx.x*BLOCK_ESTIMATE) : BLOCK_ESTIMATE; - const int base_idx = (blockIdx.x * BLOCK_ESTIMATE); - const float reciprocal_num_blocks = 1.0f/(n < 4096 ? 1.0f : (n/BLOCK_ESTIMATE)); - - T vals[NUM_ESTIMATE]; - - typedef hipcub::BlockRadixSort BlockRadixSort; - typedef hipcub::BlockLoad LoadFloat; - - __shared__ union { - typename LoadFloat::TempStorage loadf; - typename BlockRadixSort::TempStorage sort; - int smem_qidx[BLOCK_ESTIMATE]; - } temp_storage; - - for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_ESTIMATE) - { - valid_items = n - i > BLOCK_ESTIMATE ? BLOCK_ESTIMATE : n - i; - - // do not process half-blocks - if(valid_items < BLOCK_ESTIMATE && n > BLOCK_ESTIMATE){ continue; } - - #pragma unroll 4 - for(int j = 0; j < NUM_ESTIMATE; j++) - vals[j] = max_val; - - __syncthreads(); - LoadFloat(temp_storage.loadf).Load(&(A[i]), vals, valid_items); - - #pragma unroll 4 - for(int j = 0; j < NUM_ESTIMATE; j++) - vals[j] = ((float)vals[j]) * reciprocal_num_blocks; - - - __syncthreads(); - // sort into striped pattern to mitigate bank conflicts - // striped pattern index for thread 0 [0, 1024, 2048, 3096] - // striped pattern index for thread 1 [1, 1025, 2049, 3097] - BlockRadixSort(temp_storage.sort).SortBlockedToStriped(vals); - - __syncthreads(); - for(int j = threadIdx.x; j < BLOCK_ESTIMATE; j+=blockDim.x) - temp_storage.smem_qidx[j] = -1; - - __syncthreads(); - - if(threadIdx.x < 256) - { - float q_interval = (1.0f-(2.0f*offset))/255.0f; - int local_idx = round(((offset+(threadIdx.x*q_interval))*(valid_items-1))); - temp_storage.smem_qidx[local_idx] = threadIdx.x; - } - - __syncthreads(); - - for(int i = threadIdx.x; i < BLOCK_ESTIMATE; i+=blockDim.x) - { - if(temp_storage.smem_qidx[i] != -1) - atomicAdd(&code[temp_storage.smem_qidx[i]], vals[i/THREADS_ESTIMATE]); - } - } -} - - __launch_bounds__(TH, 4) __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n) { From f35a063db5bd5fb87c0ccf70df2687b7079b33af Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 15:22:55 +0530 Subject: [PATCH 07/10] Update kernels.hip --- csrc/kernels.hip | 3 --- 1 file changed, 3 deletions(-) diff --git a/csrc/kernels.hip b/csrc/kernels.hip index 6b0f1eac5..ec3f7f025 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -2899,9 +2899,6 @@ template __global__ void kdequant_mm_int32_fp16<4, 512>(int *__restrict__ const template __device__ unsigned char dQuantize<0>(float* smem_code, const float rand, float x); template __device__ unsigned char dQuantize<1>(float* smem_code, const float rand, float x); -template __global__ void kEstimateQuantiles(float *__restrict__ const A, float *code, const float offset, const float max_val, const int n); -template __global__ void kEstimateQuantiles(half *__restrict__ const A, float *code, const float offset, const half max_val, const int n); - #define MAKE_PreconditionOptimizer32bit1State(oname, gtype) \ template __global__ void kPreconditionOptimizer32bit1State(gtype* g, gtype* p, \ float* state1, float *unorm, \ From fca01f310358169d49b686bce1fae7a9c4d37c93 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 15:30:34 +0530 Subject: [PATCH 08/10] Update ops.hip --- csrc/ops.hip | 3 --- 1 file changed, 3 deletions(-) diff --git a/csrc/ops.hip b/csrc/ops.hip index ccdbc1026..1840b7864 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -743,9 +743,6 @@ template int igemmlt<32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, con template int igemmlt<8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream); template int igemmlt<8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream); -template void estimateQuantiles(half *A, float *code, float offset, int n); -template void estimateQuantiles(float *A, float *code, float offset, int n); - template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); From 5569c2de672006ed6353cf85e0a34b4ddeec59a1 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 15:34:01 +0530 Subject: [PATCH 09/10] Update ops_hip.cuh --- csrc/ops_hip.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh index ebae292c4..0f8db2ee4 100644 --- a/csrc/ops_hip.cuh +++ b/csrc/ops_hip.cuh @@ -124,8 +124,6 @@ class ContextHipsparse { } }; -template void estimateQuantiles(T* A, float* code, float offset, int n); - void quantize(float* code, float* A, unsigned char* out, int n); void dequantize(float* code, unsigned char* A, float* out, int n, hipStream_t stream); template From 7a17f2d6f7ecfb78cf72d94de4b3f3f3ef4e1453 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Fri, 20 Jun 2025 15:44:51 +0530 Subject: [PATCH 10/10] Update ops.hip --- csrc/ops.hip | 9 --------- 1 file changed, 9 deletions(-) diff --git a/csrc/ops.hip b/csrc/ops.hip index 1840b7864..260b74b30 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -24,15 +24,6 @@ using namespace BinSearch; using std::cout; using std::endl; -template void estimateQuantiles(T *A, float *code, float offset, int n) -{ - int num_blocks = n/4096; - num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1; - CUDA_CHECK_RETURN(hipMemset(code, 0, 256*sizeof(float))); - hipLaunchKernelGGL(( kEstimateQuantiles), dim3(num_blocks), dim3(512), 0, 0, A, code, offset, std::numeric_limits::max(), n); - CUDA_CHECK_RETURN(hipPeekAtLastError()); -} - void quantize(float *code, float *A, unsigned char *out, int n) { int num_blocks = n/1024;