From 3228ca86d74a50d4f7c5170bc473d29c30f3dec5 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:15:25 +0530
Subject: [PATCH 01/10] Update kernels_hip.cuh

---
 csrc/kernels_hip.cuh | 3 ---
 1 file changed, 3 deletions(-)
diff --git a/csrc/kernels_hip.cuh b/csrc/kernels_hip.cuh
index 811299d05..d902129a3 100644
--- a/csrc/kernels_hip.cuh
+++ b/csrc/kernels_hip.cuh
@@ -103,9 +103,6 @@ __global__ void kOptimizerStatic8bit1StateBlockwise(
 template <typename T, int BLOCK_SIZE, int NUM_VALS>
 __global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n);
 
-__global__ void
-    kHistogramScatterAdd2D(float* histogram, int* index1, int* index2, float* src, const int maxidx1, const int n);
-
 template <typename T, int SPMM_ITEMS, int BITS>
 __global__ void kspmm_coo_very_sparse_naive(
     int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,

From 94c1b7751bdd1d10014cf861a4e28ede66262530 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:21:11 +0530
Subject: [PATCH 02/10] Update kernels.hip

---
 csrc/kernels.hip | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 56e1d54db..53b2725a3 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -346,18 +346,6 @@ __device__ __forceinline__ unsigned char quantize_2D(float *__restrict__ quadran
     }
 }
 
-__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n)
-{
-  const int tid = threadIdx.x + (blockDim.x*blockIdx.x);
-  const int numThreads = blockDim.x*gridDim.x;
-
-  for(int i = tid; i < n; i+=numThreads)
-  {
-      int idx = (index1[i]*maxidx1) + index2[i];
-      atomicAdd(&histogram[idx], src[i]);
-  }
-}
-
 #define THREADS_ESTIMATE 512
 #define NUM_ESTIMATE 8
 #define BLOCK_ESTIMATE 4096

From cd3f0b779f6c285cd969689dd509ad08698e0964 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:23:14 +0530
Subject: [PATCH 03/10] Update ops.hip

---
 csrc/ops.hip | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index a9c3e0202..ccdbc1026 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -24,15 +24,6 @@ using namespace BinSearch;
 using std::cout;
 using std::endl;
 
-void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
-{
-  int threads = 512;
-  int num_blocks = n/threads;
-  num_blocks = n % threads == 0 ? num_blocks : num_blocks + 1;
- hipLaunchKernelGGL(( kHistogramScatterAdd2D), dim3(num_blocks), dim3(512), 0, 0, histogram, index1, index2, src, maxidx1, n);
-  CUDA_CHECK_RETURN(hipPeekAtLastError());
-}
-
 template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
 {
   int num_blocks = n/4096;

From 98bb06ed6245da3af44497c1df04c8da06f00d2a Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:25:32 +0530
Subject: [PATCH 04/10] Update ops_hip.cuh

---
 csrc/ops_hip.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh
index 624ebe326..ebae292c4 100644
--- a/csrc/ops_hip.cuh
+++ b/csrc/ops_hip.cuh
@@ -160,8 +160,6 @@ void optimizerStatic8bitBlockwise(
 
 template <typename T> void percentileClipping(T* g, float* gnorm_vec, int step, const int n);
 
-void histogramScatterAdd2D(float* histogram, int* index1, int* index2, float* src, int maxidx1, int n);
-
 void gemmex(
     Context* context, bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda,
     int ldb, int ldc

From 3bad4541e3d9fc186cf680009bfef7c980bb0aaa Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 15:17:59 +0530
Subject: [PATCH 05/10] Update kernels_hip.cuh

---
 csrc/kernels_hip.cuh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/csrc/kernels_hip.cuh b/csrc/kernels_hip.cuh
index d902129a3..00718071c 100644
--- a/csrc/kernels_hip.cuh
+++ b/csrc/kernels_hip.cuh
@@ -11,10 +11,6 @@
 #ifndef kernels
 #define kernels
 
-template <typename T>
-__global__ void
-    kEstimateQuantiles(T* __restrict__ const A, float* code, const float offset, const T max_val, const int n);
-
 __global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n);
 __global__ void kDequantize(float* code, unsigned char* A, float* out, const int n);
 

From e0c766dcc34b6147d5a6e8aa505dbb15c08233a5 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 15:20:37 +0530
Subject: [PATCH 06/10] Update kernels.hip

---
 csrc/kernels.hip | 73 ------------------------------------------------
 1 file changed, 73 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 53b2725a3..6b0f1eac5 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -346,79 +346,6 @@ __device__ __forceinline__ unsigned char quantize_2D(float *__restrict__ quadran
     }
 }
 
-#define THREADS_ESTIMATE 512
-#define NUM_ESTIMATE 8
-#define BLOCK_ESTIMATE 4096
-
-template<typename T>
-__launch_bounds__(THREADS_ESTIMATE, 1)
-__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n)
-{
-  const int n_full = (BLOCK_ESTIMATE*(n/BLOCK_ESTIMATE)) + (n % BLOCK_ESTIMATE == 0 ? 0 : BLOCK_ESTIMATE);
-  int valid_items = (blockIdx.x+1 == gridDim.x) ? n - (blockIdx.x*BLOCK_ESTIMATE) : BLOCK_ESTIMATE;
-  const int base_idx = (blockIdx.x * BLOCK_ESTIMATE);
-  const float reciprocal_num_blocks = 1.0f/(n < 4096 ? 1.0f : (n/BLOCK_ESTIMATE));
-
-  T vals[NUM_ESTIMATE];
-
-  typedef hipcub::BlockRadixSort<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::NullType, 4, true, hipcub::BLOCK_SCAN_RAKING> BlockRadixSort;
-  typedef hipcub::BlockLoad<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
-
-  __shared__ union {
-      typename LoadFloat::TempStorage loadf;
-      typename BlockRadixSort::TempStorage sort;
-      int smem_qidx[BLOCK_ESTIMATE];
-  } temp_storage;
-
-  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_ESTIMATE)
-  {
-      valid_items = n - i > BLOCK_ESTIMATE ? BLOCK_ESTIMATE : n - i;
-
-      // do not process half-blocks
-      if(valid_items < BLOCK_ESTIMATE && n > BLOCK_ESTIMATE){ continue; }
-
-      #pragma unroll 4
-      for(int j = 0; j < NUM_ESTIMATE; j++)
-          vals[j] = max_val;
-
-      __syncthreads();
-      LoadFloat(temp_storage.loadf).Load(&(A[i]), vals, valid_items);
-
-      #pragma unroll 4
-      for(int j = 0; j < NUM_ESTIMATE; j++)
-          vals[j] = ((float)vals[j]) * reciprocal_num_blocks;
-
-
-      __syncthreads();
-      // sort into striped pattern to mitigate bank conflicts
-      // striped pattern index for thread 0 [0, 1024, 2048, 3096]
-      // striped pattern index for thread 1 [1, 1025, 2049, 3097]
-      BlockRadixSort(temp_storage.sort).SortBlockedToStriped(vals);
-
-      __syncthreads();
-      for(int j = threadIdx.x; j < BLOCK_ESTIMATE; j+=blockDim.x)
-          temp_storage.smem_qidx[j] = -1;
-
-      __syncthreads();
-
-      if(threadIdx.x < 256)
-      {
-          float q_interval = (1.0f-(2.0f*offset))/255.0f;
-          int local_idx = round(((offset+(threadIdx.x*q_interval))*(valid_items-1)));
-          temp_storage.smem_qidx[local_idx] = threadIdx.x;
-      }
-
-      __syncthreads();
-
-      for(int i = threadIdx.x; i < BLOCK_ESTIMATE; i+=blockDim.x)
-      {
-          if(temp_storage.smem_qidx[i] != -1)
-              atomicAdd(&code[temp_storage.smem_qidx[i]], vals[i/THREADS_ESTIMATE]);
-      }
-  }
-}
-
-
 __launch_bounds__(TH, 4)
 __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n)
 {

From f35a063db5bd5fb87c0ccf70df2687b7079b33af Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 15:22:55 +0530
Subject: [PATCH 07/10] Update kernels.hip

---
 csrc/kernels.hip | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 6b0f1eac5..ec3f7f025 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -2899,9 +2899,6 @@ template __global__ void kdequant_mm_int32_fp16<4, 512>(int *__restrict__ const
 template __device__ unsigned char dQuantize<0>(float* smem_code, const float rand, float x);
 template __device__ unsigned char dQuantize<1>(float* smem_code, const float rand, float x);
 
-template __global__ void kEstimateQuantiles(float *__restrict__ const A, float *code, const float offset, const float max_val, const int n);
-template __global__ void kEstimateQuantiles(half *__restrict__ const A, float *code, const float offset, const half max_val, const int n);
-
 #define MAKE_PreconditionOptimizer32bit1State(oname, gtype) \
 template __global__ void kPreconditionOptimizer32bit1State<gtype, oname, 4096, 8>(gtype* g, gtype* p, \
                 float* state1, float *unorm, \

From fca01f310358169d49b686bce1fae7a9c4d37c93 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 15:30:34 +0530
Subject: [PATCH 08/10] Update ops.hip

---
 csrc/ops.hip | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index ccdbc1026..1840b7864 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -743,9 +743,6 @@ template int igemmlt<32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, con
 template int igemmlt<8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
 template int igemmlt<8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
 
-template void estimateQuantiles(half *A, float *code, float offset, int n);
-template void estimateQuantiles(float *A, float *code, float offset, int n);
-
 template void quantizeBlockwise<half, 1, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<half, 0, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<half, 0, FP4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);

From 5569c2de672006ed6353cf85e0a34b4ddeec59a1 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 15:34:01 +0530
Subject: [PATCH 09/10] Update ops_hip.cuh

---
 csrc/ops_hip.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh
index ebae292c4..0f8db2ee4 100644
--- a/csrc/ops_hip.cuh
+++ b/csrc/ops_hip.cuh
@@ -124,8 +124,6 @@ class ContextHipsparse {
     }
 };
 
-template <typename T> void estimateQuantiles(T* A, float* code, float offset, int n);
-
 void quantize(float* code, float* A, unsigned char* out, int n);
 void dequantize(float* code, unsigned char* A, float* out, int n, hipStream_t stream);
 template <typename T, int STOCHASTIC, int DATA_TYPE>

From 7a17f2d6f7ecfb78cf72d94de4b3f3f3ef4e1453 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 20 Jun 2025 15:44:51 +0530
Subject: [PATCH 10/10] Update ops.hip

---
 csrc/ops.hip | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index 1840b7864..260b74b30 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -24,15 +24,6 @@ using namespace BinSearch;
 using std::cout;
 using std::endl;
 
-template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
-{
-  int num_blocks = n/4096;
-  num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;
-	CUDA_CHECK_RETURN(hipMemset(code, 0, 256*sizeof(float)));
- hipLaunchKernelGGL(( kEstimateQuantiles<T>), dim3(num_blocks), dim3(512), 0, 0, A, code, offset, std::numeric_limits<T>::max(), n);
-  CUDA_CHECK_RETURN(hipPeekAtLastError());
-}
-
 void quantize(float *code, float *A, unsigned char *out, int n)
 {
   int num_blocks = n/1024;