10 changes: 10 additions & 0 deletions docker/bash.sh
@@ -247,6 +247,16 @@ while (( $# )); do
shift 2
;;

-e)
DOCKER_ENV+=( --env "$2" )
shift 2
;;

-v)
DOCKER_FLAGS+=( --volume "$2" )
shift 2
;;

--dry-run)
DRY_RUN=true
shift
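(Note: the new -e and -v switches forward directly to docker's own --env and --volume flags, so callers can inject environment variables and bind mounts without editing the script, e.g. docker/bash.sh -v /path/to/adreno-sdk:/adreno-opencl -e ADRENO_TARGET_CLML_VERSION=3 <image>; paths and image name are illustrative.)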
8 changes: 8 additions & 0 deletions python/tvm/relay/op/contrib/clml.py
@@ -556,6 +556,14 @@ def check_depth_to_space(extract):
return False
return True

pass_context = tvm.get_global_func("transform.GetCurrentPassContext")()
target_version = (
pass_context.config["relay.ext.clml.target_version"]
if "relay.ext.clml.target_version" in pass_context.config
else 3
)
print("CLML Target Version: ", target_version)

return [
("clml.pad_conv2d", pad_conv_pattern(), check_conv),
("clml.conv2d", conv_pattern(), check_conv),
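(Note: the pattern table now keys off the PassContext option relay.ext.clml.target_version, defaulting to 3 when unset; a user would select, say, a v2 target with tvm.transform.PassContext(config={"relay.ext.clml.target_version": 2}) around partitioning. Usage sketch, not part of this diff.)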
4 changes: 4 additions & 0 deletions src/relay/backend/contrib/clml/codegen.cc
@@ -35,6 +35,10 @@
#include "../codegen_json/codegen_json.h"

namespace tvm {

constexpr const char* kCLMLTargetVersion = "relay.ext.clml.target_version";
TVM_REGISTER_PASS_CONFIG_OPTION(kCLMLTargetVersion, Integer);

namespace relay {
namespace contrib {

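(With the option registered, C++ passes can query it from the current PassContext as well; a minimal sketch, assuming the same default of 3 used on the Python side:

  // Sketch: read the registered CLML target-version option, falling
  // back to 3 when the user did not set it in the PassContext.
  int GetCLMLTargetVersion() {
    auto ctx = tvm::transform::PassContext::Current();
    return ctx->GetConfig<tvm::Integer>(kCLMLTargetVersion, tvm::Integer(3)).value()->value;
  }

Not part of the diff; shown only to illustrate how the option is consumed.)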
346 changes: 151 additions & 195 deletions src/runtime/contrib/clml/clml_runtime.cc

Large diffs are not rendered by default.

119 changes: 115 additions & 4 deletions src/runtime/contrib/clml/clml_runtime.h
@@ -23,6 +23,12 @@
*/
#ifndef TVM_RUNTIME_CONTRIB_CLML_CLML_RUNTIME_H_
#define TVM_RUNTIME_CONTRIB_CLML_CLML_RUNTIME_H_

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#if !defined(CL_TARGET_OPENCL_VERSION)
#define CL_TARGET_OPENCL_VERSION 300
#endif
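(Note: CL_USE_DEPRECATED_OPENCL_1_2_APIS keeps the OpenCL 1.2 entry points visible in newer headers, and pinning CL_TARGET_OPENCL_VERSION to 300, unless the build already defines one, fixes the header feature set and silences the headers' target-version warning.)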

#include <CL/cl.h>
#include <CL/opencl.h>
#include <stdlib.h>
@@ -48,8 +54,110 @@

#define CAT_I(a, b) a##b
#define CAT(a, b) CAT_I(a, b)
#define GET_ML_INTERFACE CAT(CAT(clGetMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
#define GET_ML_API_INTERFACE CAT(CAT(CLMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)

#define CLML_CHECK_ERROR(e, API) \
{ ICHECK(e == CL_SUCCESS) << "CLML Error:" #API " code=" << e; }

#if CL_QCOM_ML_OPS_H_MAJOR_VERSION > 3
#define V4_API(API, ...) \
e = (reinterpret_cast<CLMLInterfaceV4QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf)) \
->API(__VA_ARGS__); \
CLML_CHECK_ERROR(e, API);
#else
#define V4_API(API, ...) LOG(FATAL) << "CLML Error:" #API " - Incompatible V4 API call\n";
#endif

#if CL_QCOM_ML_OPS_H_MAJOR_VERSION > 2
#define V3_API(API, ...) \
e = (reinterpret_cast<CLMLInterfaceV3QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf)) \
->API(__VA_ARGS__); \
CLML_CHECK_ERROR(e, API);
#else
#define V3_API(API, ...) LOG(FATAL) << "CLML Error:" #API " - Incompatible V3 API call\n";
#endif

#if CL_QCOM_ML_OPS_H_MAJOR_VERSION > 1
#define V2_API(API, ...) \
e = (reinterpret_cast<CLMLInterfaceV2QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf)) \
->API(__VA_ARGS__); \
CLML_CHECK_ERROR(e, API);
#else
#define V2_API(API, ...) LOG(FATAL) << "CLML Error:" #API " - Incompatible V2 API call\n";
#endif

#define V1_API(API, ...) \
e = (reinterpret_cast<CLMLInterfaceV1QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf)) \
->API(__VA_ARGS__); \
CLML_CHECK_ERROR(e, API);

#define CLML_CALL(API, ...) \
{ \
cl_int e; \
switch (CLMLWorkspace::Global()->target_major) { \
case 1: \
V1_API(API, __VA_ARGS__); \
break; \
case 2: \
V2_API(API, __VA_ARGS__); \
break; \
case 3: \
V3_API(API, __VA_ARGS__); \
break; \
case 4: \
V4_API(API, __VA_ARGS__); \
break; \
default: \
LOG(FATAL) << "CLML Error:" #API " - Unsupported target version \n"; \
} \
}
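(Call-site sketch, not part of the diff: CLML_CALL hides the versioned-interface cast, so runtime code invokes any CLML entry point uniformly and the matching CLMLInterfaceVxQCOM function pointer is selected at run time. For example, the tensor write in clml_utils.cc below becomes:

  CLML_CALL(clEnqueueWriteMLTensorDataQCOM, CLML_QUEUE, data, layout, tensor->tensor,
            tensor->memory, 0, nullptr, &evt);

Error checking happens inside the macro via CLML_CHECK_ERROR.)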

#define CLML_CALL_VERSIONED(APICALL, VERSION, ...) CAT(CAT(V, VERSION), _API)(APICALL, __VA_ARGS__)

#define CALL_CASE(VERSION, API, ...) \
case VERSION: \
CLML_CALL_VERSIONED(API, VERSION, __VA_ARGS__); \
break;

// clCreateMLOpClipQCOM
#define CLML_CALL_clCreateMLOpClipQCOM(...) \
cl_int e; \
switch (CLMLWorkspace::Global()->target_major) { \
CALL_CASE(2, clCreateMLOpClipQCOM, __VA_ARGS__) \
CALL_CASE(3, clCreateMLOpClipQCOM, __VA_ARGS__) \
CALL_CASE(4, clCreateMLOpClipQCOM, __VA_ARGS__) \
default: \
LOG(FATAL) << "CLML Error: - Unsupported target version \n"; \
}

// clCreateMLTensorQCOM and clCreateMLTensorWithUsageQCOM
#define CALL_clCreateMLTensorQCOM(VERSION, CONTEXT, TENSORPROPS, TENSORDESC, USAGE, TENSOR) \
CALL_CASE(VERSION, clCreateMLTensorQCOM, CONTEXT, TENSORPROPS, TENSORDESC, TENSOR)

#define CALL_clCreateMLTensorWithUsageQCOM(VERSION, CONTEXT, TENSORPROPS, TENSORDESC, USAGE, \
TENSOR) \
CALL_CASE(VERSION, clCreateMLTensorWithUsageQCOM, CONTEXT, TENSORPROPS, TENSORDESC, USAGE, TENSOR)

#define CLML_CALL_clCreateMLTensorQCOM(...) \
cl_int e; \
switch (CLMLWorkspace::Global()->target_major) { \
CALL_clCreateMLTensorQCOM(1, __VA_ARGS__); \
CALL_clCreateMLTensorQCOM(2, __VA_ARGS__); \
CALL_clCreateMLTensorQCOM(3, __VA_ARGS__); \
CALL_clCreateMLTensorWithUsageQCOM(4, __VA_ARGS__); \
default: \
LOG(FATAL) << "CLML Error: - Unsupported target version \n"; \
}
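(Note: for interface versions 1-3 the usage argument is intentionally dropped, since clCreateMLTensorQCOM predates usage hints, while version 4 forwards it to clCreateMLTensorWithUsageQCOM; callers can therefore pass a usage value unconditionally.)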

/* Version compatibility for CLML Tensor creation */
#if CL_QCOM_ML_OPS_H_MAJOR_VERSION < 4
typedef enum _cl_ml_tensor_usage_qcom {
CL_TENSOR_USAGE_INVALID_QCOM = 0,
CL_TENSOR_USAGE_UNUSED_QCOM = 1,
CL_TENSOR_USAGE_PARAMETER_QCOM = 2,
CL_TENSOR_USAGE_CNN_QCOM = 3,
CL_TENSOR_USAGE_TNN_QCOM = 4,
} cl_ml_tensor_usage_qcom;
#endif
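(Defining the usage enum locally for pre-v4 SDKs lets the usage-aware signatures below compile against older headers; the enumerator values are presumably chosen to match the v4 header.)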

/*! \brief Magic number for CLML Tuning cache entry */
static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45;
@@ -81,7 +189,7 @@ class CLMLWorkspace {
virtual CLMLThreadEntry* GetThreadEntry();

/* CLML Context */
GET_ML_API_INTERFACE* h_ClmlIntf = nullptr;
void* h_ClmlIntf = nullptr;
cl::OpenCLWorkspace* workspace = nullptr;
cl::OpenCLThreadEntry* tentry = nullptr;
cl_device_id device_id;
Expand All @@ -107,6 +215,10 @@ class CLMLWorkspace {

/* DDR memory management */
std::map<cl_mem, std::pair<int, int>> ddr_global_pool; // buf, size and ref count

/* Device API version information */
int target_major;
int target_minor;
};
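(Since the V*_API macros cast h_ClmlIntf according to target_major, the handle is now an opaque void* holding whichever CLMLInterfaceVxQCOM struct was obtained at load time; target_major/target_minor are presumably populated during workspace initialization from the device's reported CLML version.)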

/*! \brief Thread local workspace */
@@ -172,7 +284,6 @@ struct tensor_dims_t {
uint32_t n, c, h, w;
};

#define CLML_INTF CLMLWorkspace::Global()->h_ClmlIntf
#define CLML_QUEUE \
CLMLWorkspace::Global()->workspace->GetQueue(CLMLWorkspace::Global()->tentry->device)
#define CLML_CTX CLMLWorkspace::Global()->workspace->contexts[CLMLWorkspace::Global()->platform_id]
43 changes: 16 additions & 27 deletions src/runtime/contrib/clml/clml_utils.cc
@@ -40,14 +40,10 @@ using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
*/
void CopyDataToCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor, void* data,
cl_ml_tensor_layout_qcom layout) {
cl_int result = 0;
cl_event evt = nullptr;
result = CLML_INTF->clEnqueueWriteMLTensorDataQCOM(CLML_QUEUE, data, layout, tensor->tensor,
tensor->memory,
0, // n waitlist
nullptr, // waitlist
&evt); // event
ICHECK((evt != nullptr) && result == CL_SUCCESS) << "clEnqueueWriteMLTensorDataQCOM:" << result;
CLML_CALL(clEnqueueWriteMLTensorDataQCOM, CLML_QUEUE, data, layout, tensor->tensor,
tensor->memory, 0, nullptr, &evt);
ICHECK(evt != nullptr) << "clEnqueueWriteMLTensorDataQCOM";
}

/*!
@@ -62,13 +58,8 @@ void CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tenso
cl_int result = 0;
cl_event readEvent = nullptr;
// Read the output tensor
result = CLML_INTF->clEnqueueReadMLTensorDataQCOM(CLML_QUEUE, tensor->tensor, tensor->memory,
data, layout,
0, // n waitlist
nullptr, // waitlist
&readEvent); // event
ICHECK(result == CL_SUCCESS) << "clEnqueueReadMLTensorDataQCOM:" << result;

CLML_CALL(clEnqueueReadMLTensorDataQCOM, CLML_QUEUE, tensor->tensor, tensor->memory, data, layout,
0, nullptr, &readEvent);
result = clWaitForEvents(1, &readEvent);
ICHECK(result == CL_SUCCESS) << "clWaitForEvents:" << result;
}
@@ -83,14 +74,14 @@ void CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tenso
* \return CLML tensor
*/
cl_ml_tensor_qcom DeviceMakeCLMLTensor(cl_context context, tensor_dims_t dims,
cl_ml_tensor_layout_qcom layout, cl_channel_type dtype) {
cl_ml_tensor_layout_qcom layout, cl_channel_type dtype,
cl_ml_tensor_usage_qcom usage) {
cl_ml_tensor_qcom tensor;
cl_int result = CL_OUT_OF_RESOURCES;

cl_ml_tensor_desc_qcom desc = {
dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {0}};
result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &tensor);
ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, usage, &tensor);
ICHECK(tensor) << "clCreateMLTensorQCOM";
return tensor;
}

@@ -195,11 +186,9 @@ cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type,
* \param dtype tensor data type
* \return CLML Tensor descriptor.
*/
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(const JSONGraphNode& tensor_rep,
void* data,
std::vector<size_t> c_shape,
cl_ml_tensor_layout_qcom layout,
cl_uint dtype) {
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
cl_ml_tensor_layout_qcom layout, cl_uint dtype, cl_ml_tensor_usage_qcom usage) {
std::vector<int64_t> shape = tensor_rep.GetOpShape()[0];
std::vector<size_t> clml_shape(shape.begin(), shape.end());
if (c_shape.size() > 0) {
@@ -217,7 +206,7 @@ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(const JSONGraphNod
dims.w = clml_shape[3];

auto tensor_dsc = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
tensor_dsc->tensor = DeviceMakeCLMLTensor(CLML_CTX, dims, layout, dtype);
tensor_dsc->tensor = DeviceMakeCLMLTensor(CLML_CTX, dims, layout, dtype, usage);
return tensor_dsc;
}

Expand All @@ -232,9 +221,9 @@ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(const JSONGraphNod
* \return CLML Tensor descriptor.
*/
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONNode(
const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_uint dtype, void* data,
std::vector<size_t> shape) {
return MakeCLMLTensor(node, data, shape, layout, dtype);
const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_ml_tensor_usage_qcom usage,
cl_uint dtype, void* data, std::vector<size_t> shape) {
return MakeCLMLTensor(node, data, shape, layout, dtype, usage);
}
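(Call sites change accordingly; a sketch of the updated shape, with arguments illustrative:

  auto input = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
                                          CL_TENSOR_USAGE_CNN_QCOM, CL_FLOAT);

Data and shape keep their defaults of nullptr and {}.)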

/*!
14 changes: 6 additions & 8 deletions src/runtime/contrib/clml/clml_utils.h
@@ -45,7 +45,7 @@ void CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tenso
cl_ml_tensor_qcom DeviceMakeCLMLTensor(
cl_context context, tensor_dims_t dims,
cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
cl_channel_type dtype = CL_FLOAT);
cl_channel_type dtype = CL_FLOAT, cl_ml_tensor_usage_qcom usage = CL_TENSOR_USAGE_INVALID_QCOM);

cl_mem AllocateOnChipTensorMemory(size_t size, cl_uint on_chip_mem_offset);

@@ -58,15 +58,13 @@ cl_channel_type MakeCLDataType(const DLDataType& data_type);
cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type,
const cl_channel_type& acc_type = CL_FLOAT);

std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(const JSONGraphNode& tensor_rep,
void* data,
std::vector<size_t> c_shape,
cl_ml_tensor_layout_qcom layout,
cl_uint dtype);
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
cl_ml_tensor_layout_qcom layout, cl_uint dtype, cl_ml_tensor_usage_qcom usage);

std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONNode(
const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_uint dtype, void* data = nullptr,
std::vector<size_t> shape = {});
const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_ml_tensor_usage_qcom usage,
cl_uint dtype, void* data = nullptr, std::vector<size_t> shape = {});

std::vector<cl_uint> GetVectorValues(const std::vector<std::string>& val);

27 changes: 18 additions & 9 deletions tests/scripts/ci.py
@@ -370,6 +370,7 @@ def generate_command(
precheck: Optional[Callable[[], None]] = None,
post_build: Optional[List[str]] = None,
additional_flags: Optional[Dict[str, str]] = None,
env: Optional[Dict[str, str]] = None,
):
"""
Helper to generate CLIs that:
@@ -424,17 +425,22 @@ def fn(
if kwargs.get(option_name, False):
scripts.extend(script.format(build_dir=build_dir) for script in extra_scripts)

docker_env = {
# Need to specify the library path manually or else TVM can't
# determine which build directory to use (i.e. if there are
# multiple copies of libtvm.so laying around)
"TVM_LIBRARY_PATH": str(REPO_ROOT / get_build_dir(name)),
"VERBOSE": "true" if verbose else "false",
}

if env is not None:
docker_env.update(env)

docker(
name=gen_name(f"ci-{name}"),
image=f"ci_{name}" if docker_image is None else docker_image,
scripts=scripts,
env={
# Need to specify the library path manually or else TVM can't
# determine which build directory to use (i.e. if there are
# multiple copies of libtvm.so laying around)
"TVM_LIBRARY_PATH": str(REPO_ROOT / get_build_dir(name)),
"VERBOSE": "true" if verbose else "false",
},
env=docker_env,
interactive=interactive,
additional_flags=additional_flags,
)
@@ -719,10 +725,13 @@ def add_subparser(
help="Run Adreno build and test(s)",
post_build=["./tests/scripts/task_build_adreno_bins.sh"],
additional_flags={
"--volume": os.environ.get("ADRENO_OPENCL", "") + ":/adreno-opencl",
"--env": "ADRENO_OPENCL=/adreno-opencl",
"--volume": os.environ.get("ADRENO_OPENCL", "/tmp/") + ":/adreno-opencl",
"--net": "host",
},
env={
"ADRENO_OPENCL": "/adreno-opencl",
"ADRENO_TARGET_CLML_VERSION": os.environ.get("ADRENO_TARGET_CLML_VERSION", "3"),
},
options={
"test": (
"run Adreno API/Python tests",
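(Taken together, ci.py now threads a per-target env dict into the docker invocation, so Adreno runs can pin the CLML target version from the host shell, e.g. ADRENO_TARGET_CLML_VERSION=2 python tests/scripts/ci.py adreno, with 3 as the default; the invocation is illustrative. ADRENO_OPENCL likewise falls back to /tmp/ when unset, so the volume mount never receives an empty host path.)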