diff --git a/docker/bash.sh b/docker/bash.sh
index a3d57bfd42c0..f83becbf6148 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -247,6 +247,16 @@ while (( $# )); do
             shift 2
             ;;
 
+        -e)
+            DOCKER_ENV+=( --env "$2" )
+            shift 2
+            ;;
+
+        -v)
+            DOCKER_FLAGS+=( --volume "$2" )
+            shift 2
+            ;;
+
         --dry-run)
             DRY_RUN=true
             shift
diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index dace7aaab913..1bfe947a845e 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -556,6 +556,14 @@ def check_depth_to_space(extract):
             return False
         return True
 
+    pass_context = tvm.get_global_func("transform.GetCurrentPassContext")()
+    target_version = (
+        pass_context.config["relay.ext.clml.target_version"]
+        if "relay.ext.clml.target_version" in pass_context.config
+        else 3
+    )
+    print("CLML Target Version: ", target_version)
+
     return [
         ("clml.pad_conv2d", pad_conv_pattern(), check_conv),
         ("clml.conv2d", conv_pattern(), check_conv),
diff --git a/src/relay/backend/contrib/clml/codegen.cc b/src/relay/backend/contrib/clml/codegen.cc
index 5d6fc0c2cf02..83c6ac31e328 100644
--- a/src/relay/backend/contrib/clml/codegen.cc
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -35,6 +35,10 @@
 #include "../codegen_json/codegen_json.h"
 
 namespace tvm {
+
+constexpr const char* kCLMLTargetVersion = "relay.ext.clml.target_version";
+TVM_REGISTER_PASS_CONFIG_OPTION(kCLMLTargetVersion, Integer);
+
 namespace relay {
 namespace contrib {
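Note on the option registered above: the Python-side pattern table reads it through `transform.GetCurrentPassContext`. As a minimal illustrative sketch (not part of this patch, assuming TVM's C++ `PassContext` API from `tvm/ir/transform.h`; the helper name is hypothetical), the same option can be read from C++:

```cpp
#include <tvm/ir/transform.h>

// Hypothetical helper, for illustration only: fetch the configured CLML
// target version, defaulting to 3 exactly as clml.py does above.
int64_t GetCLMLTargetVersion() {
  tvm::transform::PassContext ctx = tvm::transform::PassContext::Current();
  tvm::Integer ver =
      ctx->GetConfig<tvm::Integer>("relay.ext.clml.target_version", tvm::Integer(3)).value();
  return ver->value;
}
```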
diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index 8e69cb8bd13b..c580123b1347 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -73,7 +73,7 @@ CLMLWorkspace::CLMLWorkspace() {
   if (is_on_chip_memory) {
     result = clGetDeviceInfo(device_id, CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM,
-                             sizeof(onchip_mem_size), &onchip_mem_size, NULL);
+                             sizeof(onchip_mem_size), &onchip_mem_size, nullptr);
     ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo(CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM):"
                                  << result;
     LOG(WARNING) << "On chip memory size:" << onchip_mem_size;
@@ -92,17 +92,31 @@ CLMLWorkspace::CLMLWorkspace() {
   result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, nullptr);
   ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;
 
-  for (cl_uint i = 0; i < numVersions; ++i) {
-    if (majorVersions[i] == CL_QCOM_ML_OPS_H_MAJOR_VERSION) {
-      h_ClmlIntf = GET_ML_INTERFACE(0);
-      LOG(WARNING) << "CLML Target version:" << majorVersions[i];
-      break;
-    }
+  target_major = majorVersions[numVersions - 1];
+  target_minor = minorVersions[numVersions - 1];
+
+  LOG(WARNING) << "CLML Target Version:" << target_major << "." << target_minor;
+
+  if (target_major > CL_QCOM_ML_OPS_H_MAJOR_VERSION) {
+    LOG(WARNING) << "Runtime is compiled with version " << CL_QCOM_ML_OPS_H_MAJOR_VERSION
+                 << " whereas the target supports " << target_major
+                 << "\nTrying to use API interface version:" << CL_QCOM_ML_OPS_H_MAJOR_VERSION
+                 << "\nSome functionality may not work as expected ...";
+    target_major = CL_QCOM_ML_OPS_H_MAJOR_VERSION;
+    target_minor = 0;
   }
-  ICHECK(h_ClmlIntf != nullptr)
-      << "clGetMLInterfaceVxQCOM:" << result
-      << " Perhaps there is mispatch between CLML SDK version to target supported version:"
-      << majorVersions[numVersions - 1];
+
+  // ICHECK(target_minor <= CL_QCOM_ML_OPS_H_MINOR_VERSION)
+  //     << "CLML runtime compiled with minor version " << CL_QCOM_ML_OPS_H_MINOR_VERSION
+  //     << " whereas the target supports higher version " << target_minor;
+
+  clGetMLInterfaceQCOM(&h_ClmlIntf, target_major, target_minor);
+
+  ICHECK(nullptr != h_ClmlIntf) << "Couldn't get API interface; target is not supported."
+                                << " Compiled version: " << CL_QCOM_ML_OPS_H_MAJOR_VERSION << "."
+                                << CL_QCOM_ML_OPS_H_MINOR_VERSION
+                                << " Target version: " << target_major << "." << target_minor;
+
   char* tune_flag;
   if ((tune_flag = getenv("CLML_IS_TUNING_RUN"))) is_tuning_run = std::stoi(tune_flag);
@@ -135,13 +149,11 @@ class CLMLRuntime : public JSONRuntimeBase {
 #ifdef TVM_GRAPH_EXECUTOR_CLML
     cl_int result = 0;
     if (this->layer_.tuning_cache) {
-      result = CLML_INTF->clReleaseMLTuningCacheQCOM(this->layer_.tuning_cache);
-      ICHECK(result == CL_SUCCESS) << "clReleaseMLTuningCacheQCOM:" << result;
+      CLML_CALL(clReleaseMLTuningCacheQCOM, this->layer_.tuning_cache);
     }
     for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
       auto tensor_desc = it->second.first;
-      result = CLML_INTF->clReleaseMLTensorQCOM(tensor_desc->tensor);
-      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
+      CLML_CALL(clReleaseMLTensorQCOM, tensor_desc->tensor)
       if (this->layer_.ddr_storage_ref_map.find(tensor_desc->memory) !=
           this->layer_.ddr_storage_ref_map.end()) {
         ReleaseDDRMemory(tensor_desc->memory);
@@ -151,21 +163,17 @@ class CLMLRuntime : public JSONRuntimeBase {
       }
     }
     for (size_t i = 0; i < this->layer_.function.size(); ++i) {
-      result = CLML_INTF->clReleaseMLOpQCOM(this->layer_.function[i]);
-      ICHECK(result == CL_SUCCESS) << "clReleaseMLOpQCOM:" << result;
+      CLML_CALL(clReleaseMLOpQCOM, this->layer_.function[i])
     }
     for (auto it = this->layer_.in_placeholder.begin(); it != this->layer_.in_placeholder.end();
          it++) {
-      result = CLML_INTF->clReleaseMLTensorQCOM(it->second->tensor);
-      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
+      CLML_CALL(clReleaseMLTensorQCOM, it->second->tensor)
     }
     for (auto it = this->layer_.out_placeholder.begin(); it != this->layer_.out_placeholder.end();
         it++) {
-      result = CLML_INTF->clReleaseMLTensorQCOM((*it)->tensor);
-      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
+      CLML_CALL(clReleaseMLTensorQCOM, (*it)->tensor)
     }
-    result = CLML_INTF->clReleaseMLTensorMemoryDescriptorSetQCOM(layer_.descriptorSet);
-    ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorMemoryDescriptorSetQCOM:" << result;
+    CLML_CALL(clReleaseMLTensorMemoryDescriptorSetQCOM, layer_.descriptorSet)
 
     if (this->layer_.recordable_queue) {
       clReleaseCommandQueue(this->layer_.recordable_queue);
@@ -214,8 +222,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     }
 
     // A Tuning run, so create the cache from scratch
-    result = CLML_INTF->clCreateMLTuningCacheQCOM(&layer_.tuning_cache);
-    ICHECK(result == CL_SUCCESS) << "clCreateMLTuningCacheQCOM:" << result;
+    CLML_CALL(clCreateMLTuningCacheQCOM, &layer_.tuning_cache)
     if (!cws->is_tuning_run && cws->tuning_file) {
       std::vector<unsigned char> tune_buffer;
       std::string tune_blob;
@@ -241,9 +248,8 @@
       if (tune_buffer.size()) {
         LOG(INFO) << "Loading tuning cache for symbol:" << clml_symbol
                   << " size:" << tune_buffer.size();
-        result = CLML_INTF->clLoadMLTuningCacheQCOM(layer_.tuning_cache, tune_buffer.size(),
-                                                    tune_buffer.data());
-        ICHECK(result == CL_SUCCESS) << "clLoadMLTuningCacheQCOM:" << result;
+        CLML_CALL(clLoadMLTuningCacheQCOM, layer_.tuning_cache, tune_buffer.size(),
+                  tune_buffer.data())
       } else {
         LOG(WARNING) << "Tuning cache not found for symbol:" << clml_symbol << " in file "
                      << cws->tuning_file;
@@ -259,7 +265,6 @@
    * \return Status of inference.
    */
   void Run() override {
-    cl_int result = 0;
     cl_command_queue queue = CLML_QUEUE;
     std::vector<cl_event>& evts = cws->workspace->GetEventQueue(cws->tentry->device);
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
@@ -282,10 +287,9 @@
           evts.resize(evts.size() + 1);
           evt = &(evts.back());
         }
-        result = CLML_INTF->clEnqueueCopyMLTensorDataQCOM(
-            queue, layer_.in_placeholder[nid]->tensor, layer_.in_placeholder[nid]->memory,
-            layer_.inputs[nid]->tensor, layer_.inputs[nid]->memory, 0, NULL, evt);
-        ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result;
+        CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor,
+                  layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor,
+                  layer_.inputs[nid]->memory, 0, nullptr, evt);
       } else {
         DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
         cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -309,17 +313,13 @@
         queue = CLML_QUEUE;
         evts.resize(evts.size() + 1);
         cl_event* evt = &(evts.back());
-        result = CLML_INTF->clEnqueueRecordingMLOpQCOM(queue, this->layer_.recording, 0, nullptr, 0,
-                                                       nullptr, 0, nullptr, 0, nullptr, 0, nullptr,
-                                                       0, nullptr, 0, nullptr, 0, nullptr, evt);
-        ICHECK(result == CL_SUCCESS) << "clEnqueueRecordingMLOpQCOM:" << result;
+        CLML_CALL(clEnqueueRecordingMLOpQCOM, queue, this->layer_.recording, 0, nullptr, 0, nullptr,
+                  0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, evt);
         t->Stop();
         duration += t->SyncAndGetElapsedNanos();
       } else {
-        result = CLML_INTF->clEnqueueRecordingMLOpQCOM(queue, this->layer_.recording, 0, nullptr, 0,
-                                                       nullptr, 0, nullptr, 0, nullptr, 0, nullptr,
-                                                       0, nullptr, 0, nullptr, 0, nullptr, nullptr);
-        ICHECK(result == CL_SUCCESS) << "clEnqueueRecordingMLOpQCOM:" << result;
+        CLML_CALL(clEnqueueRecordingMLOpQCOM, queue, this->layer_.recording, 0, nullptr, 0, nullptr,
+                  0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, nullptr);
       }
     } else {
       for (size_t i = 0; i < this->layer_.function.size(); ++i) {
@@ -332,17 +332,16 @@
           queue = CLML_QUEUE;
          evts.resize(evts.size() + 1);
           cl_event* evt = &(evts.back());
-          result = CLML_INTF->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
-                                                this->layer_.descriptorSet, 0, nullptr, evt);
+          CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
+                    0, nullptr, evt);
           t->Stop();
           duration += t->SyncAndGetElapsedNanos();
           LOG(WARNING) << "Layer:" << this->layer_.layer_names[i]
                        << " Duration:" << t->SyncAndGetElapsedNanos();
         } else {
-          result = CLML_INTF->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
-                                                this->layer_.descriptorSet, 0, nullptr, nullptr);
+          CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
+                    0, nullptr, nullptr);
         }
-        ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result;
       }
     }
     if (getenv("CLML_PROFILING")) {
@@ -368,10 +367,9 @@
         evts.resize(evts.size() + 1);
         evt = &(evts.back());
       }
-      result = CLML_INTF->clEnqueueCopyMLTensorDataQCOM(
-          queue, layer_.outputs[i]->tensor, layer_.outputs[i]->memory,
-          layer_.out_placeholder[i]->tensor, layer_.out_placeholder[i]->memory, 0, nullptr, evt);
-      ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result;
+      CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor,
+                layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor,
+                layer_.out_placeholder[i]->memory, 0, nullptr, evt);
     } else {
       DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
       cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -447,10 +445,8 @@
     for (size_t nid = 0; nid < nodes_.size(); ++nid) {
       const auto& node = nodes_[nid];
       uint32_t size = 0;
-      cl_int result = CL_OUT_OF_HOST_MEMORY;
-      result = CLML_INTF->clGetMLTensorMemorySizeQCOM(CLML_CTX,
-                                                      layer_.storage_map[nid].first->tensor, &size);
-      ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result;
+      CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, layer_.storage_map[nid].first->tensor,
+                &size);
 
       if ((node.GetOpType() == "kernel") || (node.GetOpType() == "input")) {
         std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
@@ -522,16 +518,17 @@
   std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONEntry(
       size_t nid, std::vector<size_t> shape, cl_ml_tensor_layout_qcom layout, cl_uint dtype) {
     const JSONGraphNode node = nodes_[nid];
+    cl_ml_tensor_usage_qcom usage = CL_TENSOR_USAGE_CNN_QCOM;
 
     if (this->layer_.storage_map.find(nid) == this->layer_.storage_map.end()) {
       void* node_data = nullptr;
 
       if (node.GetOpType() == "const") {
         uint32_t eid = EntryID(nid, 0);
         node_data = data_entry_[eid]->data;
+        usage = CL_TENSOR_USAGE_PARAMETER_QCOM;
       }
-      auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, dtype, node_data, shape);
-
+      auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, node_data, shape);
       this->layer_.storage_map.insert({nid, std::make_pair(clml_tensor, node)});
 
       if ("input" == node.GetOpType()) {
@@ -539,11 +536,11 @@
         // Input copy placeholder Tensor
         if (layout == CL_TENSOR_LAYOUT_OPTIMAL_QCOM) {
           this->layer_.in_placeholder.insert(
-              {nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, dtype, node_data,
-                                               shape)});
+              {nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, usage, dtype,
+                                               node_data, shape)});
         } else {
           this->layer_.in_placeholder.insert(
-              {nid, MakeCLMLTensorFromJSONNode(node, layout, dtype, node_data, shape)});
+              {nid, MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, node_data, shape)});
         }
       }
 
@@ -631,12 +628,12 @@
       this->layer_.outputs.push_back(this->layer_.storage_map[nid].first);
       if (this->layer_.out_shapes.find(nid) != this->layer_.out_shapes.end()) {
         // Handle customized shapes here
-        this->layer_.out_placeholder.push_back(
-            MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype, nullptr,
-                                       this->layer_.out_shapes[nid]));
+        this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
+            nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype, nullptr,
+            this->layer_.out_shapes[nid]));
       } else {
-        this->layer_.out_placeholder.push_back(
-            MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype));
+        this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
+            nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype));
       }
     }
@@ -652,8 +649,7 @@
       auto tensor_desc = it->second.first;
       uint32_t mem_size = 0;
       result = CL_OUT_OF_HOST_MEMORY;
-      result = CLML_INTF->clGetMLTensorMemorySizeQCOM(CLML_CTX, tensor_desc->tensor, &mem_size);
-      ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result;
+      CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, tensor_desc->tensor, &mem_size);
 
       JSONGraphNode node = it->second.second;
       void* node_data = nullptr;
@@ -707,13 +703,11 @@
     LOG_STATS << "Total Local Pool:" << ddr_local_pool;
 
     // Setup descriptor set
-    result = CLML_INTF->clCreateMLTensorMemoryDescriptorSetQCOM(&this->layer_.descriptorSet);
-    ICHECK(result == CL_SUCCESS) << "clCreateMLTensorMemoryDescriptorSetQCOM:" << result;
+    CLML_CALL(clCreateMLTensorMemoryDescriptorSetQCOM, &this->layer_.descriptorSet);
 
-    result = CLML_INTF->clUpdateMLTensorMemoryDescriptorSetQCOM(
-        this->layer_.descriptorSet, static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
-        this->layer_.tensorMemDescs.data());
-    ICHECK(result == CL_SUCCESS) << "clUpdateMLTensorMemoryDescriptorSetQCOM:" << result;
+    CLML_CALL(clUpdateMLTensorMemoryDescriptorSetQCOM, this->layer_.descriptorSet,
+              static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
+              this->layer_.tensorMemDescs.data());
 
     if (cws->is_tuning_run) {
       LOG(WARNING) << "CLML Tuning In Progress:";
@@ -721,23 +715,18 @@
      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true);
      for (size_t i = 0; i < this->layer_.function.size(); ++i) {
        LOG(WARNING) << "CLML Tuning:" << this->layer_.layer_names[i];
-        result = CLML_INTF->clTuneMLOpQCOM(CLML_QUEUE, this->layer_.function[i],
-                                           this->layer_.descriptorSet, this->layer_.tuning_cache,
-                                           nullptr);
-        ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result;
+        CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet,
+                  this->layer_.tuning_cache, nullptr);
       }
       cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, false);
 
       size_t cache_len_bytes = 0;
       size_t len_ret = 0;
-      result =
-          CLML_INTF->clSaveMLTuningCacheQCOM(layer_.tuning_cache, 0, nullptr, &cache_len_bytes);
-      ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM:" << result;
+      CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, 0, nullptr, &cache_len_bytes);
 
       std::vector<unsigned char> saved_cache(cache_len_bytes, 0);
-      result = CLML_INTF->clSaveMLTuningCacheQCOM(layer_.tuning_cache, saved_cache.size(),
-                                                  saved_cache.data(), &len_ret);
-      ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM" << result;
+      CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, saved_cache.size(),
+                saved_cache.data(), &len_ret);
 
       std::string tune_str;
       dmlc::MemoryStringStream mstrm(&tune_str);
@@ -757,10 +746,8 @@
     }
     if (cws->is_recordable_queue) {
       for (size_t i = 0; i < this->layer_.function.size(); ++i) {
-        result =
-            CLML_INTF->clEnqueueMLOpQCOM(this->layer_.recordable_queue, this->layer_.function[i],
-                                         this->layer_.descriptorSet, 0, nullptr, nullptr);
-        ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM - Recordable Queue:" << result;
+        CLML_CALL(clEnqueueMLOpQCOM, this->layer_.recordable_queue, this->layer_.function[i],
+                  this->layer_.descriptorSet, 0, nullptr, nullptr);
       }
 
       result = clEndRecordingQCOM(this->layer_.recording);
@@ -797,7 +784,6 @@
     std::vector<cl_uint> v_dilation = GetVectorValues(dilation);
     cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_strides[0], v_strides[1]};
     cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_dilation[0], v_dilation[1]};
-    cl_int result = 0;
     cl_uint groups = std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
     if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
@@ -846,8 +832,9 @@
     } else {
       cl_ml_tensor_desc_qcom desc = {};
       desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
-      result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &layer_.unusedTensor);
-      ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
+      CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
+                                     &layer_.unusedTensor);
+      ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
       bias->tensor = layer_.unusedTensor;
     }
     // Output
@@ -865,15 +852,12 @@
     cl_ml_op_qcom op = nullptr;
     if (!has_bn) {
       if (!has_act) {
-        result = CLML_INTF->clCreateMLOpConvolutionForwardQCOM(
-            CLML_CTX, nullptr, &conv_desc, input->tensor, weight->tensor, bias->tensor,
-            output->tensor, &op, nullptr);
-        ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+        CLML_CALL(clCreateMLOpConvolutionForwardQCOM, CLML_CTX, nullptr, &conv_desc, input->tensor,
+                  weight->tensor, bias->tensor, output->tensor, &op, nullptr);
       } else {
-        result = CLML_INTF->clCreateMLOpFusedConvolutionActivationForwardQCOM(
-            CLML_CTX, nullptr, &conv_desc, &act_desc, input->tensor, weight->tensor, bias->tensor,
-            nullptr, output->tensor, &op, layer_.tuning_cache);
-        ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+        CLML_CALL(clCreateMLOpFusedConvolutionActivationForwardQCOM, CLML_CTX, nullptr, &conv_desc,
+                  &act_desc, input->tensor, weight->tensor, bias->tensor, nullptr, output->tensor,
+                  &op, layer_.tuning_cache);
       }
       layer->function.push_back(op);
     } else {
@@ -897,18 +881,15 @@
       cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
 
       if (!has_act) {
-        result = CLML_INTF->clCreateMLOpFusedConvolutionBatchNormForwardQCOM(
-            CLML_CTX, nullptr, &conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor,
-            output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op,
-            layer_.tuning_cache);
-        ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormForwardQCOM, CLML_CTX, nullptr, &conv_desc,
+                  &bn_desc, input->tensor, weight->tensor, bias->tensor, output->tensor,
+                  bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op,
+                  layer_.tuning_cache);
       } else {
-        result = CLML_INTF->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM(
-            CLML_CTX, nullptr, &conv_desc, &bn_desc, &act_desc, input->tensor, weight->tensor,
-            bias->tensor, output->tensor, nullptr, bn_mean->tensor, bn_var->tensor,
-            bn_scale->tensor, bn_bias->tensor, &op, layer_.tuning_cache);
-
-        ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
Error:" << result; + CLML_CALL(clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM, CLML_CTX, nullptr, + &conv_desc, &bn_desc, &act_desc, input->tensor, weight->tensor, bias->tensor, + output->tensor, nullptr, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, + bn_bias->tensor, &op, layer_.tuning_cache); } layer->function.push_back(op); } @@ -924,7 +905,6 @@ class CLMLRuntime : public JSONRuntimeBase { */ void CreateReLULayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid, cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) { - cl_int result = 0; cl_ml_op_qcom op = nullptr; DLDataType tvm_dtype = node.GetOpDataType()[0]; cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); @@ -938,13 +918,13 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &layer_.unusedTensor); - ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result; + CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM, + &layer_.unusedTensor); + ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor"; - result = CLML_INTF->clCreateMLOpActivationForwardQCOM(CLML_CTX, nullptr, &act_desc, - input->tensor, layer_.unusedTensor, - output->tensor, &op, layer_.tuning_cache); - ICHECK(op && result == CL_SUCCESS) << "Activation Error:" << result; + CLML_CALL(clCreateMLOpActivationForwardQCOM, CLML_CTX, nullptr, &act_desc, input->tensor, + layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache); + ICHECK(op) << "Activation Error"; layer->function.push_back(op); return; @@ -959,7 +939,6 @@ class CLMLRuntime : public JSONRuntimeBase { * \param nid The node index of JSON graph node, which points to this operator. */ void CreateBatchNormLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) { - cl_int result = 0; cl_ml_op_qcom op = nullptr; DLDataType tvm_dtype = node.GetOpDataType()[0]; cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); @@ -994,10 +973,10 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode}; - result = CLML_INTF->clCreateMLOpBatchNormForwardQCOM( - CLML_CTX, opProperties.data(), &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor, - bn_scale->tensor, bn_bias->tensor, output->tensor, &op, layer_.tuning_cache); - ICHECK(op && result == CL_SUCCESS) << "Batchnorm Error:" << result; + CLML_CALL(clCreateMLOpBatchNormForwardQCOM, CLML_CTX, opProperties.data(), &bn_desc, + input->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, + output->tensor, &op, layer_.tuning_cache); + ICHECK(op) << "Batchnorm Error"; layer->function.push_back(op); return; @@ -1013,7 +992,6 @@ class CLMLRuntime : public JSONRuntimeBase { * \param nid The node index of JSON graph node, which points to this operator. 
   */
  void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1044,13 +1022,13 @@
 
     cl_ml_tensor_desc_qcom desc = {};
     cl_ml_tensor_qcom unusedTensor = nullptr;
     desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
-    result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &unusedTensor);
-    ICHECK(unusedTensor && result == CL_SUCCESS) << ":" << result;
+    CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
+                                   &unusedTensor);
+    ICHECK(unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
 
-    result = CLML_INTF->clCreateMLOpPoolingForwardQCOM(CLML_CTX, nullptr, &pool_desc, input->tensor,
-                                                       unusedTensor, output->tensor, &op,
-                                                       layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
+    CLML_CALL(clCreateMLOpPoolingForwardQCOM, CLML_CTX, nullptr, &pool_desc, input->tensor,
+              unusedTensor, output->tensor, &op, layer_.tuning_cache);
+    ICHECK(op) << "Pooling Error";
 
     layer->function.push_back(op);
     return;
@@ -1066,7 +1044,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreateGlobalPoolingLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1089,13 +1066,13 @@
 
     cl_ml_tensor_desc_qcom desc = {};
     desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
-    result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &layer_.unusedTensor);
-    ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result;
+    CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
+                                   &layer_.unusedTensor);
+    ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
 
-    result = CLML_INTF->clCreateMLOpPoolingForwardQCOM(CLML_CTX, nullptr, &pool_desc, input->tensor,
-                                                       layer_.unusedTensor, output->tensor, &op,
-                                                       layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
+    CLML_CALL(clCreateMLOpPoolingForwardQCOM, CLML_CTX, nullptr, &pool_desc, input->tensor,
+              layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
+    ICHECK(op) << "Pooling Error";
 
     layer->function.push_back(op);
     return;
@@ -1110,8 +1087,6 @@
   void CreateSoftmaxLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
     cl_ml_tensor_layout_qcom layout;
-    cl_int result = 0;
-    cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
     auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
@@ -1140,9 +1115,8 @@
    * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateSoftMaxLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_ml_tensor_layout_qcom layout;
+    cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
     cl_softmax_mode_qcom mode = CL_SOFTMAX_MODE_SPATIAL_QCOM;
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1151,9 +1125,9 @@
     auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
     cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, mode,
                                                cl_arithmetic_mode};
-    result = CLML_INTF->clCreateMLOpSoftmaxQCOM(CLML_CTX, nullptr, &softmax_desc, input->tensor,
-                                                output->tensor, &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result;
+    CLML_CALL(clCreateMLOpSoftmaxQCOM, CLML_CTX, nullptr, &softmax_desc, input->tensor,
+              output->tensor, &op, layer_.tuning_cache);
+    ICHECK(op) << "SoftMax Error";
     layer->function.push_back(op);
     return;
   }
@@ -1166,7 +1140,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreatePadLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1195,9 +1168,9 @@
         {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
         cl_arithmetic_mode};
 
-    result = CLML_INTF->clCreateMLOpPadQCOM(CLML_CTX, nullptr, &pad_desc, input->tensor,
-                                            output->tensor, &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Pad Error:" << result;
+    CLML_CALL(clCreateMLOpPadQCOM, CLML_CTX, nullptr, &pad_desc, input->tensor, output->tensor, &op,
+              layer_.tuning_cache);
+    ICHECK(op) << "Pad Error";
 
     layer->function.push_back(op);
     return;
@@ -1211,7 +1184,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreateBatchFlattenLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1219,9 +1191,9 @@
                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
-    result = CLML_INTF->clCreateMLOpReshapeQCOM(CLML_CTX, nullptr, input->tensor, output->tensor,
-                                                &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result;
+    CLML_CALL(clCreateMLOpReshapeQCOM, CLML_CTX, nullptr, input->tensor, output->tensor, &op,
+              layer_.tuning_cache);
+    ICHECK(op) << "Reshape Error";
 
     layer->function.push_back(op);
     return;
@@ -1235,7 +1207,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1243,9 +1214,9 @@
                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
-    result = CLML_INTF->clCreateMLOpReshapeQCOM(CLML_CTX, nullptr, input->tensor, output->tensor,
-                                                &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result;
+    CLML_CALL(clCreateMLOpReshapeQCOM, CLML_CTX, nullptr, input->tensor, output->tensor, &op,
+              layer_.tuning_cache);
+    ICHECK(op) << "Reshape Error";
 
     layer->function.push_back(op);
     return;
@@ -1260,7 +1231,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreateConcatLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     std::vector<JSONGraphNodeEntry> input_ = node.GetInputs();
     DLDataType tvm_dtype = node.GetOpDataType()[0];
@@ -1277,9 +1247,9 @@
     }
     cl_ml_op_concat_desc_qcom concatDesc = {axis, (cl_uint)inputSize, cl_arithmetic_mode};
 
-    result = CLML_INTF->clCreateMLOpConcatQCOM(CLML_CTX, nullptr, &concatDesc, concatInputs,
-                                               output->tensor, &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Concat Error:" << result;
+    CLML_CALL(clCreateMLOpConcatQCOM, CLML_CTX, nullptr, &concatDesc, concatInputs, output->tensor,
+              &op, layer_.tuning_cache);
+    ICHECK(op) << "Concat Error";
 
     layer->function.push_back(op);
 
@@ -1296,7 +1266,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateDenseLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1333,10 +1302,9 @@
       cl_ml_op_fully_connected_desc_qcom fc_desc{1,  // refer clml_ops.txt for struct
                                                  w_transform, cl_arithmetic_mode};
 
-      result = CLML_INTF->clCreateMLOpFullyConnectedQCOM(CLML_CTX, nullptr, &fc_desc, input->tensor,
-                                                         weight->tensor, bias->tensor,
-                                                         output->tensor, &op, layer_.tuning_cache);
-      ICHECK(op && result == CL_SUCCESS) << "FC layer Error:" << result;
+      CLML_CALL(clCreateMLOpFullyConnectedQCOM, CLML_CTX, nullptr, &fc_desc, input->tensor,
+                weight->tensor, bias->tensor, output->tensor, &op, layer_.tuning_cache);
+      ICHECK(op) << "FC layer Error";
       layer->function.push_back(op);
     } else {
       cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
@@ -1351,10 +1319,9 @@
                                          {{0.0}, CL_FLOAT},  // beta
                                          cl_arithmetic_mode};
 
-      result =
-          CLML_INTF->clCreateMLOpGemmQCOM(CLML_CTX, 0, &gemmDesc, input->tensor, weight->tensor,
-                                          output->tensor, &op, layer_.tuning_cache);
-      ICHECK(op && result == CL_SUCCESS) << "Gemm layer Error:" << result;
+      CLML_CALL(clCreateMLOpGemmQCOM, CLML_CTX, nullptr, &gemmDesc, input->tensor, weight->tensor,
+                output->tensor, &op, layer_.tuning_cache);
+      ICHECK(op) << "Gemm layer Error";
       layer->function.push_back(op);
       if (has_bias) {
         cl_ml_op_binary_desc_qcom binaryDesc = {CL_TENSOR_OP_ADD_QCOM,
                                                 {{1.0}, CL_FLOAT},  // beta
                                                 {{1.0}, CL_FLOAT},  // gamma
                                                 cl_arithmetic_mode};
-        result = CLML_INTF->clCreateMLOpBinaryQCOM(CLML_CTX, 0, &binaryDesc, bias->tensor,
-                                                   layer_.unusedTensor, output->tensor, &op,
-                                                   layer_.tuning_cache);
+        CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &binaryDesc, bias->tensor,
+                  layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
+        ICHECK(op) << "Binary Op Error";
         layer->function.push_back(op);
       }
     }
@@ -1381,8 +1348,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreateDenseLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
-    cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
     auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
@@ -1410,7 +1375,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateBatchMatmulLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1446,9 +1410,9 @@
                                        {{0.0}, CL_FLOAT},  // beta
                                        cl_arithmetic_mode};
 
-    result = CLML_INTF->clCreateMLOpGemmQCOM(CLML_CTX, 0, &gemmDesc, input->tensor, weight->tensor,
-                                             output->tensor, &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "BatchMatmul Error:" << result;
+    CLML_CALL(clCreateMLOpGemmQCOM, CLML_CTX, nullptr, &gemmDesc, input->tensor, weight->tensor,
+              output->tensor, &op, layer_.tuning_cache);
+    ICHECK(op) << "BatchMatmul Error";
     layer->function.push_back(op);
 
     return;
@@ -1463,11 +1427,8 @@
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreateBatchMatmulLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
-    cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
     auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
     auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {in_dims.c, in_dims.h},
                                              CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
@@ -1495,7 +1456,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreateClipLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1509,9 +1469,9 @@
     cl_ml_op_clip_desc_qcom clip_desc = {
         CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode};
 
-    result = CLML_INTF->clCreateMLOpClipQCOM(CLML_CTX, nullptr, &clip_desc, input->tensor,
-                                             output->tensor, &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Clip Error:" << result;
+    CLML_CALL_clCreateMLOpClipQCOM(CLML_CTX, nullptr, &clip_desc, input->tensor, output->tensor,
+                                   &op, layer_.tuning_cache);
+    ICHECK(op) << "Clip Error";
 
     layer->function.push_back(op);
     return;
@@ -1525,7 +1485,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateBinaryLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1550,10 +1509,9 @@
     cl_ml_op_binary_desc_qcom add_desc = {
         binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode};
 
-    result = CLML_INTF->clCreateMLOpBinaryQCOM(CLML_CTX, nullptr, &add_desc, input_a->tensor,
-                                               input_b->tensor, output->tensor, &op,
-                                               layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << op_name << " Node Error:" << result;
+    CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &add_desc, input_a->tensor,
+              input_b->tensor, output->tensor, &op, layer_.tuning_cache);
+    ICHECK(op) << op_name << " Node Error";
 
     layer->function.push_back(op);
     return;
@@ -1567,7 +1525,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreateDepthToSpaceLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1578,9 +1535,9 @@
     cl_uint block_size = std::stoi(node.GetAttr<std::vector<std::string>>("block_size")[0]);
     cl_ml_op_depthtospace_desc_qcom dtos_desc = {block_size, cl_arithmetic_mode};
 
-    result = CLML_INTF->clCreateMLOpDepthToSpaceQCOM(CLML_CTX, nullptr, &dtos_desc, input->tensor,
-                                                     output->tensor, &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "DepthToSpace Layer Error:" << result;
+    CLML_CALL(clCreateMLOpDepthToSpaceQCOM, CLML_CTX, nullptr, &dtos_desc, input->tensor,
+              output->tensor, &op, layer_.tuning_cache);
+    ICHECK(op) << "DepthToSpace Layer Error";
 
     layer->function.push_back(op);
     return;
@@ -1594,7 +1551,6 @@
    * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateResizeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
-    cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -1605,9 +1561,9 @@
     cl_bool align_corners = std::stoi(node.GetAttr<std::vector<std::string>>("align_corners")[0]);
     cl_ml_op_resize_bilinear_desc_qcom resize_desc = {align_corners, false, cl_arithmetic_mode};
 
-    result = CLML_INTF->clCreateMLOpResizeBilinearQCOM(
-        CLML_CTX, nullptr, &resize_desc, input->tensor, output->tensor, &op, layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Resize Layer Error:" << result;
+    CLML_CALL(clCreateMLOpResizeBilinearQCOM, CLML_CTX, nullptr, &resize_desc, input->tensor,
+              output->tensor, &op, layer_.tuning_cache);
+    ICHECK(op) << "Resize Layer Error";
 
     layer->function.push_back(op);
     return;
diff --git a/src/runtime/contrib/clml/clml_runtime.h b/src/runtime/contrib/clml/clml_runtime.h
index 2a6ce02626d4..f346ce7af696 100644
--- a/src/runtime/contrib/clml/clml_runtime.h
+++ b/src/runtime/contrib/clml/clml_runtime.h
@@ -23,6 +23,12 @@
  */
 #ifndef TVM_RUNTIME_CONTRIB_CLML_CLML_RUNTIME_H_
 #define TVM_RUNTIME_CONTRIB_CLML_CLML_RUNTIME_H_
+
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#if !defined(CL_TARGET_OPENCL_VERSION)
+#define CL_TARGET_OPENCL_VERSION 300
+#endif
+
 #include
 #include
 #include
@@ -48,8 +54,110 @@
 #define CAT_I(a, b) a##b
 #define CAT(a, b) CAT_I(a, b)
-#define GET_ML_INTERFACE CAT(CAT(clGetMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
-#define GET_ML_API_INTERFACE CAT(CAT(CLMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
+
+#define CLML_CHECK_ERROR(e, API) \
+  { ICHECK(e == CL_SUCCESS) << "CLML Error:" #API " code=" << e; }
+
+#if CL_QCOM_ML_OPS_H_MAJOR_VERSION > 3
+#define V4_API(API, ...)                                                             \
+  e = (reinterpret_cast<CLMLInterfaceV4QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf))  \
+          ->API(__VA_ARGS__);                                                        \
+  CLML_CHECK_ERROR(e, API);
+#else
+#define V4_API(API, ...) LOG(FATAL) << "CLML Error:" #API " - Incompatible V4 API call\n";
+#endif
+
+#if CL_QCOM_ML_OPS_H_MAJOR_VERSION > 2
+#define V3_API(API, ...)                                                             \
+  e = (reinterpret_cast<CLMLInterfaceV3QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf))  \
+          ->API(__VA_ARGS__);                                                        \
+  CLML_CHECK_ERROR(e, API);
+#else
+#define V3_API(API, ...) LOG(FATAL) << "CLML Error:" #API " - Incompatible V3 API call\n";
+#endif
+
+#if CL_QCOM_ML_OPS_H_MAJOR_VERSION > 1
+#define V2_API(API, ...)                                                             \
+  e = (reinterpret_cast<CLMLInterfaceV2QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf))  \
+          ->API(__VA_ARGS__);                                                        \
+  CLML_CHECK_ERROR(e, API);
+#else
+#define V2_API(API, ...) LOG(FATAL) << "CLML Error:" #API " - Incompatible V2 API call\n";
+#endif
+
+#define V1_API(API, ...)                                                             \
+  e = (reinterpret_cast<CLMLInterfaceV1QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf))  \
+          ->API(__VA_ARGS__);                                                        \
+  CLML_CHECK_ERROR(e, API);
+
+#define CLML_CALL(API, ...)                                                      \
+  {                                                                              \
+    cl_int e;                                                                    \
+    switch (CLMLWorkspace::Global()->target_major) {                             \
+      case 1:                                                                    \
+        V1_API(API, __VA_ARGS__);                                                \
+        break;                                                                   \
+      case 2:                                                                    \
+        V2_API(API, __VA_ARGS__);                                                \
+        break;                                                                   \
+      case 3:                                                                    \
+        V3_API(API, __VA_ARGS__);                                                \
+        break;                                                                   \
+      case 4:                                                                    \
+        V4_API(API, __VA_ARGS__);                                                \
+        break;                                                                   \
+      default:                                                                   \
+        LOG(FATAL) << "CLML Error:" #API " - Unsupported target version \n";     \
+    }                                                                            \
+  }
+
+#define CLML_CALL_VERSIONED(APICALL, VERSION, ...) CAT(CAT(V, VERSION), _API)(APICALL, __VA_ARGS__)
+
+#define CALL_CASE(VERSION, API, ...)                 \
+  case VERSION:                                      \
+    CLML_CALL_VERSIONED(API, VERSION, __VA_ARGS__);  \
+    break;
+
+// clCreateMLOpClipQCOM
+#define CLML_CALL_clCreateMLOpClipQCOM(...)                          \
+  cl_int e;                                                          \
+  switch (CLMLWorkspace::Global()->target_major) {                   \
+    CALL_CASE(2, clCreateMLOpClipQCOM, __VA_ARGS__)                  \
+    CALL_CASE(3, clCreateMLOpClipQCOM, __VA_ARGS__)                  \
+    CALL_CASE(4, clCreateMLOpClipQCOM, __VA_ARGS__)                  \
+    default:                                                         \
+      LOG(FATAL) << "CLML Error: - Unsupported target version \n";   \
+  }
+
+// clCreateMLTensorQCOM and clCreateMLTensorWithUsageQCOM
+#define CALL_clCreateMLTensorQCOM(VERSION, CONTEXT, TENSORPROPS, TENSORDESC, USAGE, TENSOR) \
+  CALL_CASE(VERSION, clCreateMLTensorQCOM, CONTEXT, TENSORPROPS, TENSORDESC, TENSOR)
+
+#define CALL_clCreateMLTensorWithUsageQCOM(VERSION, CONTEXT, TENSORPROPS, TENSORDESC, USAGE, \
+                                           TENSOR)                                          \
+  CALL_CASE(VERSION, clCreateMLTensorWithUsageQCOM, CONTEXT, TENSORPROPS, TENSORDESC, USAGE, TENSOR)
+
+#define CLML_CALL_clCreateMLTensorQCOM(...)                          \
+  cl_int e;                                                          \
+  switch (CLMLWorkspace::Global()->target_major) {                   \
+    CALL_clCreateMLTensorQCOM(1, __VA_ARGS__);                       \
+    CALL_clCreateMLTensorQCOM(2, __VA_ARGS__);                       \
+    CALL_clCreateMLTensorQCOM(3, __VA_ARGS__);                       \
+    CALL_clCreateMLTensorWithUsageQCOM(4, __VA_ARGS__);              \
+    default:                                                         \
+      LOG(FATAL) << "CLML Error: - Unsupported target version \n";   \
+  }
+
+/* Version compatibility for CLML Tensor creation */
+#if CL_QCOM_ML_OPS_H_MAJOR_VERSION < 4
+typedef enum _cl_ml_tensor_usage_qcom {
+  CL_TENSOR_USAGE_INVALID_QCOM = 0,
+  CL_TENSOR_USAGE_UNUSED_QCOM = 1,
+  CL_TENSOR_USAGE_PARAMETER_QCOM = 2,
+  CL_TENSOR_USAGE_CNN_QCOM = 3,
+  CL_TENSOR_USAGE_TNN_QCOM = 4,
+} cl_ml_tensor_usage_qcom;
+#endif
 
 /*! \brief Magic number for CLML Tuning cache entry */
 static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45;
@@ -81,7 +189,7 @@ class CLMLWorkspace {
   virtual CLMLThreadEntry* GetThreadEntry();
 
   /* CLML Context */
-  GET_ML_API_INTERFACE* h_ClmlIntf = nullptr;
+  void* h_ClmlIntf = nullptr;
   cl::OpenCLWorkspace* workspace = nullptr;
   cl::OpenCLThreadEntry* tentry = nullptr;
   cl_device_id device_id;
@@ -107,6 +215,10 @@ class CLMLWorkspace {
 
   /* DDR memory management */
   std::map<cl_mem, std::pair<int, int>> ddr_global_pool;  // buf, size and ref count
+
+  /* Device API version information */
+  int target_major;
+  int target_minor;
 };
 
 /*! \brief Thread local workspace */
@@ -172,7 +284,6 @@ struct tensor_dims_t {
   uint32_t n, c, h, w;
 };
 
-#define CLML_INTF CLMLWorkspace::Global()->h_ClmlIntf
 #define CLML_QUEUE \
   CLMLWorkspace::Global()->workspace->GetQueue(CLMLWorkspace::Global()->tentry->device)
 #define CLML_CTX CLMLWorkspace::Global()->workspace->contexts[CLMLWorkspace::Global()->platform_id]
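For reference, an illustrative expansion of the dispatch machinery above (a sketch only; the names follow the macros defined in this header, and the concrete op call is just an example). With `target_major == 3` at runtime, `CLML_CALL(clCreateMLOpReshapeQCOM, ...)` behaves roughly like:

```cpp
// Illustrative only: what the V3_API branch of CLML_CALL reduces to.
{
  cl_int e;
  // Cast the opaque handle to the v3 interface table and call through it.
  e = (reinterpret_cast<CLMLInterfaceV3QCOM*>(CLMLWorkspace::Global()->h_ClmlIntf))
          ->clCreateMLOpReshapeQCOM(CLML_CTX, nullptr, input->tensor, output->tensor, &op,
                                    layer_.tuning_cache);
  CLML_CHECK_ERROR(e, clCreateMLOpReshapeQCOM);  // ICHECK(e == CL_SUCCESS) << ...
}
```

This keeps call sites version-agnostic: the single `h_ClmlIntf` pointer is bound once in `CLMLWorkspace` to whichever interface version the target reports, and the switch selects the matching table type.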
diff --git a/src/runtime/contrib/clml/clml_utils.cc b/src/runtime/contrib/clml/clml_utils.cc
index e1e6fc754231..354bd104b81f 100644
--- a/src/runtime/contrib/clml/clml_utils.cc
+++ b/src/runtime/contrib/clml/clml_utils.cc
@@ -40,14 +40,10 @@ using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
  */
 void CopyDataToCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor, void* data,
                           cl_ml_tensor_layout_qcom layout) {
-  cl_int result = 0;
   cl_event evt = nullptr;
-  result = CLML_INTF->clEnqueueWriteMLTensorDataQCOM(CLML_QUEUE, data, layout, tensor->tensor,
-                                                     tensor->memory,
-                                                     0,        // n waitlist
-                                                     nullptr,  // waitlist
-                                                     &evt);    // event
-  ICHECK((evt != nullptr) && result == CL_SUCCESS) << "clEnqueueWriteMLTensorDataQCOM:" << result;
+  CLML_CALL(clEnqueueWriteMLTensorDataQCOM, CLML_QUEUE, data, layout, tensor->tensor,
+            tensor->memory, 0, nullptr, &evt);
+  ICHECK(evt != nullptr) << "clEnqueueWriteMLTensorDataQCOM";
 }
 
 /*!
@@ -62,13 +58,8 @@ void CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tenso
   cl_int result = 0;
   cl_event readEvent = nullptr;
   // Read the output tensor
-  result = CLML_INTF->clEnqueueReadMLTensorDataQCOM(CLML_QUEUE, tensor->tensor, tensor->memory,
-                                                    data, layout,
-                                                    0,            // n waitlist
-                                                    nullptr,      // waitlist
-                                                    &readEvent);  // event
-  ICHECK(result == CL_SUCCESS) << "clEnqueueReadMLTensorDataQCOM:" << result;
-
+  CLML_CALL(clEnqueueReadMLTensorDataQCOM, CLML_QUEUE, tensor->tensor, tensor->memory, data, layout,
+            0, nullptr, &readEvent);
   result = clWaitForEvents(1, &readEvent);
   ICHECK(result == CL_SUCCESS) << "clWaitForEvents:" << result;
 }
@@ -83,14 +74,14 @@
  * \return CLML tensor
  */
 cl_ml_tensor_qcom DeviceMakeCLMLTensor(cl_context context, tensor_dims_t dims,
-                                       cl_ml_tensor_layout_qcom layout, cl_channel_type dtype) {
+                                       cl_ml_tensor_layout_qcom layout, cl_channel_type dtype,
+                                       cl_ml_tensor_usage_qcom usage) {
   cl_ml_tensor_qcom tensor;
-  cl_int result = CL_OUT_OF_RESOURCES;
 
   cl_ml_tensor_desc_qcom desc = {
       dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {0}};
-  result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &tensor);
-  ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
+  CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, usage, &tensor);
+  ICHECK(tensor) << "clCreateMLTensorQCOM";
   return tensor;
 }
@@ -195,11 +186,9 @@ cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type,
  * \param dtype tensor data type
  * \return CLML Tensor descriptor.
  */
-std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(const JSONGraphNode& tensor_rep,
-                                                              void* data,
-                                                              std::vector<size_t> c_shape,
-                                                              cl_ml_tensor_layout_qcom layout,
-                                                              cl_uint dtype) {
+std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
+    const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
+    cl_ml_tensor_layout_qcom layout, cl_uint dtype, cl_ml_tensor_usage_qcom usage) {
   std::vector<int64_t> shape = tensor_rep.GetOpShape()[0];
   std::vector<size_t> clml_shape(shape.begin(), shape.end());
   if (c_shape.size() > 0) {
@@ -217,7 +206,7 @@
   dims.w = clml_shape[3];
 
   auto tensor_dsc = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-  tensor_dsc->tensor = DeviceMakeCLMLTensor(CLML_CTX, dims, layout, dtype);
+  tensor_dsc->tensor = DeviceMakeCLMLTensor(CLML_CTX, dims, layout, dtype, usage);
   return tensor_dsc;
 }
@@ -232,9 +221,9 @@
  * \return CLML Tensor descriptor.
  */
 std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONNode(
-    const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_uint dtype, void* data,
-    std::vector<size_t> shape) {
-  return MakeCLMLTensor(node, data, shape, layout, dtype);
+    const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_ml_tensor_usage_qcom usage,
+    cl_uint dtype, void* data, std::vector<size_t> shape) {
+  return MakeCLMLTensor(node, data, shape, layout, dtype, usage);
 }
 
 /*!
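Likewise, the usage-aware tensor creation used above reduces to one of two SDK entry points depending on `target_major` (illustrative sketch only; `intf_v3` and `intf_v4` are hypothetical locals standing in for the casted `h_ClmlIntf`):

```cpp
// v1..v3: the older entry point has no usage parameter, so the wrapper
// accepts `usage` at every call site but drops it here.
e = intf_v3->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &tensor);
// v4: `usage` is forwarded to the usage-aware entry point.
e = intf_v4->clCreateMLTensorWithUsageQCOM(CLML_CTX, nullptr, &desc, usage, &tensor);
```

Passing `usage` uniformly keeps every caller identical across SDK versions; only the dispatch macro knows whether the target can honour it.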
diff --git a/src/runtime/contrib/clml/clml_utils.h b/src/runtime/contrib/clml/clml_utils.h
index 79a8312aeb5e..2051793cf18b 100644
--- a/src/runtime/contrib/clml/clml_utils.h
+++ b/src/runtime/contrib/clml/clml_utils.h
@@ -45,7 +45,7 @@ void CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tenso
 cl_ml_tensor_qcom DeviceMakeCLMLTensor(
     cl_context context, tensor_dims_t dims,
     cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-    cl_channel_type dtype = CL_FLOAT);
+    cl_channel_type dtype = CL_FLOAT, cl_ml_tensor_usage_qcom usage = CL_TENSOR_USAGE_INVALID_QCOM);
 
 cl_mem AllocateOnChipTensorMemory(size_t size, cl_uint on_chip_mem_offset);
@@ -58,15 +58,13 @@
 cl_channel_type MakeCLDataType(const DLDataType& data_type);
 
 cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type,
                                         const cl_channel_type& acc_type = CL_FLOAT);
 
-std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(const JSONGraphNode& tensor_rep,
-                                                              void* data,
-                                                              std::vector<size_t> c_shape,
-                                                              cl_ml_tensor_layout_qcom layout,
-                                                              cl_uint dtype);
+std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
+    const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
+    cl_ml_tensor_layout_qcom layout, cl_uint dtype, cl_ml_tensor_usage_qcom usage);
 
 std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONNode(
-    const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_uint dtype, void* data = nullptr,
-    std::vector<size_t> shape = {});
+    const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_ml_tensor_usage_qcom usage,
+    cl_uint dtype, void* data = nullptr, std::vector<size_t> shape = {});
 
 std::vector<cl_uint> GetVectorValues(const std::vector<std::string>& val);
diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index a2facff6d6db..b9bc93343b6b 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -370,6 +370,7 @@ def generate_command(
     precheck: Optional[Callable[[], None]] = None,
     post_build: Optional[List[str]] = None,
     additional_flags: Optional[Dict[str, str]] = None,
+    env: Optional[Dict[str, str]] = None,
 ):
     """
     Helper to generate CLIs that:
@@ -424,17 +425,22 @@ def fn(
         if kwargs.get(option_name, False):
             scripts.extend(script.format(build_dir=build_dir) for script in extra_scripts)
 
+        docker_env = {
+            # Need to specify the library path manually or else TVM can't
+            # determine which build directory to use (i.e. if there are
+            # multiple copies of libtvm.so laying around)
+            "TVM_LIBRARY_PATH": str(REPO_ROOT / get_build_dir(name)),
+            "VERBOSE": "true" if verbose else "false",
+        }
+
+        if env is not None:
+            docker_env.update(env)
+
         docker(
             name=gen_name(f"ci-{name}"),
             image=f"ci_{name}" if docker_image is None else docker_image,
             scripts=scripts,
-            env={
-                # Need to specify the library path manually or else TVM can't
-                # determine which build directory to use (i.e. if there are
-                # multiple copies of libtvm.so laying around)
-                "TVM_LIBRARY_PATH": str(REPO_ROOT / get_build_dir(name)),
-                "VERBOSE": "true" if verbose else "false",
-            },
+            env=docker_env,
             interactive=interactive,
             additional_flags=additional_flags,
         )
@@ -719,10 +725,13 @@ def add_subparser(
         help="Run Adreno build and test(s)",
         post_build=["./tests/scripts/task_build_adreno_bins.sh"],
         additional_flags={
-            "--volume": os.environ.get("ADRENO_OPENCL", "") + ":/adreno-opencl",
-            "--env": "ADRENO_OPENCL=/adreno-opencl",
+            "--volume": os.environ.get("ADRENO_OPENCL", "/tmp/") + ":/adreno-opencl",
             "--net": "host",
         },
+        env={
+            "ADRENO_OPENCL": "/adreno-opencl",
+            "ADRENO_TARGET_CLML_VERSION": os.environ.get("ADRENO_TARGET_CLML_VERSION", "3"),
+        },
         options={
             "test": (
                 "run Adreno API/Python tests",