From 867ddfe212a6aa2408064124b2f86444b0e5b8e7 Mon Sep 17 00:00:00 2001
From: Zimin Li <coollizimin@gmail.com>
Date: Fri, 22 Nov 2024 10:34:29 +0800
Subject: [PATCH] Change conv mathType and add pytorch sync when profiling

---
 operatorspy/tests/conv.py | 18 ++++++++++++++++++
 src/ops/conv/cuda/conv.cc |  1 +
 2 files changed, 19 insertions(+)

diff --git a/operatorspy/tests/conv.py b/operatorspy/tests/conv.py
index 21b699db..4658e8ad 100644
--- a/operatorspy/tests/conv.py
+++ b/operatorspy/tests/conv.py
@@ -41,14 +41,32 @@ class ConvDescriptor(Structure):
 def conv(x, w, stride, padding, dilation):
     match len(x.shape) - 2:
         case 1:
+            if PROFILE:
+                ans = F.conv1d(
+                    x, w, stride=stride, padding=padding, dilation=dilation
+                )
+                torch.cuda.synchronize()
+                return ans
             return F.conv1d(
                 x, w, stride=stride, padding=padding, dilation=dilation
             )
         case 2:
+            if PROFILE:
+                ans = F.conv2d(
+                    x, w, stride=stride, padding=padding, dilation=dilation
+                )
+                torch.cuda.synchronize()  # wait for queued GPU work so the profile covers the whole conv
+                return ans
             return F.conv2d(
                 x, w, stride=stride, padding=padding, dilation=dilation
             )
         case 3:
+            if PROFILE:
+                ans = F.conv3d(
+                    x, w, stride=stride, padding=padding, dilation=dilation
+                )
+                torch.cuda.synchronize()  # wait for queued GPU work so the profile covers the whole conv
+                return ans
             return F.conv3d(
                 x, w, stride=stride, padding=padding, dilation=dilation
             )
diff --git a/src/ops/conv/cuda/conv.cc b/src/ops/conv/cuda/conv.cc
index 9a352878..e7736667 100644
--- a/src/ops/conv/cuda/conv.cc
+++ b/src/ops/conv/cuda/conv.cc
@@ -92,6 +92,7 @@ infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle,
     checkCudnnError(cudnnCreateTensorDescriptor(&y_desc));
     checkCudnnError(cudnnSetTensorNdDescriptorEx(y_desc, CUDNN_TENSOR_NCHW, static_cast<cudnnDataType_t>(tensor_dt), new_ndim, y_shape));
 
+    checkCudnnError(cudnnSetConvolutionMathType(op_desc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); // opt in to Tensor Core math with implicit type conversion
 
     // tuning: get the best algorithm
     int requestedAlgoCount = 1;