From 99600201d0c6e6c0b9d710300cd6474749405c8c Mon Sep 17 00:00:00 2001
From: Siva Rama Krishna Reddy B <quic_sivb@quicinc.com>
Date: Wed, 25 Jan 2023 18:22:03 +0530
Subject: [PATCH 1/3] [RUNTIME][CLML] OpenCLML tuning and profiling enhanced

Tuning cache bin is serialized through DMLC::Stream to support multiple
CLML sub graphs with in a tvm module. Individual tuning cache blobs are
saved to same output file.

New API on OpenCLWorkspace to enable or disable profiling on command queue
rather doing this only when Timer is invoked. This is required to perform
CLML operator tuning.

CLML layer profiling now uses OpenCL Timer interface.

This PR also fix avoiding pad operator offloading at the very first layer
(to be specific before at least one convolution layer) due to the limitation
of CLML pad operator is concerned about layout. Please refer to CLML SDK
documentation for more details.
---
 src/runtime/contrib/clml/clml_runtime.cc | 144 +++++++++++++++--------
 src/runtime/opencl/opencl_common.h       |  42 ++++---
 2 files changed, 117 insertions(+), 69 deletions(-)
diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index 1fb694a91201..4c32aedffe90 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -29,16 +29,26 @@
 #endif
 #include <stdlib.h>
 #include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/registry.h>
 
 #include <fstream>
 #include <map>
 #include <utility>
 
+#include "../../file_utils.h"
 #include "../../opencl/opencl_common.h"
 #include "../json/json_node.h"
 #include "../json/json_runtime.h"
 
+#define CAT_I(a, b) a##b
+#define CAT(a, b) CAT_I(a, b)
+#define GET_ML_INTERFACE CAT(CAT(clGetMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
+#define GET_ML_API_INTERFACE CAT(CAT(CLMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
+
+/*! \brief Magic number for CLML Tuning cache entry */
+static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45;
+
 namespace tvm {
 namespace runtime {
 namespace contrib {
@@ -58,7 +68,7 @@ class CLMLRuntime : public JSONRuntimeBase {
    */
   explicit CLMLRuntime(const std::string& symbol_name, const std::string& graph_json,
                        const Array<String>& const_names)
-      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
+      : JSONRuntimeBase(symbol_name, graph_json, const_names), clml_symbol(symbol_name) {}
 
   ~CLMLRuntime() {
 #ifdef TVM_GRAPH_EXECUTOR_CLML
@@ -153,40 +163,59 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;
 
     for (cl_uint i = 0; i < numVersions; ++i) {
-#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 2
-      if (majorVersions[i] == 2) {
-        h_ClmlIntf = clGetMLInterfaceV2QCOM(0);
-        LOG(WARNING) << "CLML Target version:" << majorVersions[i];
-        break;
-      }
-#endif
-#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 3
-      if (majorVersions[i] == 3) {
-        h_ClmlIntf = clGetMLInterfaceV3QCOM(0);
+      if (majorVersions[i] == CL_QCOM_ML_OPS_H_MAJOR_VERSION) {
+        h_ClmlIntf = GET_ML_INTERFACE(0);
         LOG(WARNING) << "CLML Target version:" << majorVersions[i];
         break;
       }
-#endif
     }
     ICHECK(h_ClmlIntf != NULL)
         << "clGetMLInterfaceVxQCOM:" << result
         << " Perhaps there is mispatch between CLML SDK version to target supported version:"
         << majorVersions[numVersions - 1];
     char* tune_flag;
-    if ((tune_flag = getenv("CLML_IS_TUNNING_RUN")))
+    if ((tune_flag = getenv("CLML_IS_TUNING_RUN")))
       this->is_tuning_run = std::stoi(tune_flag);
     else
       this->is_tuning_run = 0;
 
-    if (!(tuning_file = getenv("CLML_TUNNING_CACHE"))) this->is_tuning_run = 0;
+    if (!(tuning_file = getenv("CLML_TUNING_CACHE"))) this->is_tuning_run = 0;
     // A Tuning run, so create the cache from scratch
     result = h_ClmlIntf->clCreateMLTuningCacheQCOM(&tuning_cache);
     ICHECK(result == CL_SUCCESS) << "clCreateMLTuningCacheQCOM:" << result;
     if (!this->is_tuning_run && this->tuning_file) {
-      std::vector<unsigned char> buffer;
-      buffer = readBinFile(this->tuning_file);
-      result = h_ClmlIntf->clLoadMLTuningCacheQCOM(tuning_cache, buffer.size(), buffer.data());
-      ICHECK(result == CL_SUCCESS) << "clLoadMLTuningCacheQCOM:" << result;
+      std::vector<unsigned char> tune_buffer;
+      std::string tune_blob;
+      LoadBinaryFromFile(this->tuning_file, &tune_blob);
+      dmlc::MemoryStringStream mstrm(const_cast<std::string*>(&tune_blob));
+      dmlc::Stream* strm = &mstrm;
+
+      uint64_t header, reserve;
+      std::string tune_symbol;
+      while (strm->Read(&header)) {
+        if (header != kTVMCLMLTuningCacheMagic) break;
+        if (!strm->Read(&reserve)) break;
+        if (!strm->Read(&tune_symbol)) break;
+        LOG(INFO) << "Tuning Cache Symbol:" << tune_symbol;
+        if (tune_symbol == clml_symbol) {
+          strm->Read(&tune_buffer);
+          break;
+        } else {
+          std::vector<unsigned char> tmp_buf;
+          if (!strm->Read(&tmp_buf)) break;
+        }
+      }
+
+      if (tune_buffer.size()) {
+        LOG(INFO) << "Loading tuning cache for symbol:" << clml_symbol
+                  << " size:" << tune_buffer.size();
+        result = h_ClmlIntf->clLoadMLTuningCacheQCOM(tuning_cache, tune_buffer.size(),
+                                                     tune_buffer.data());
+        ICHECK(result == CL_SUCCESS) << "clLoadMLTuningCacheQCOM:" << result;
+      } else {
+        LOG(WARNING) << "Tuning cache not cound for symbol :" << clml_symbol << " in file "
+                     << this->tuning_file;
+      }
     }
   }
 
@@ -281,32 +310,34 @@ class CLMLRuntime : public JSONRuntimeBase {
       }
     }
 
+    int64_t duration = 0;
     for (size_t i = 0; i < this->layer_.function.size(); ++i) {
       // Make CLML subgraphs accounted by OpenCLTimerNode.
-      if (getenv("CLML_PROFILING") || workspace->IsProfiling(tentry->device)) {
+
+      // if (workspace->IsProfiling(tentry->device)) {
+      if (getenv("CLML_PROFILING")) {
+        Timer t;
+        auto f = Registry::Get(std::string("profiling.timer.opencl"));
+        t = f->operator()(tentry->device);
+        t->Start();
+        queue = workspace->GetQueue(tentry->device);
         evts.resize(evts.size() + 1);
         cl_event* evt = &(evts.back());
+
         result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
                                                this->layer_.descriptorSet, 0, NULL, evt);
+        t->Stop();
+        duration += t->SyncAndGetElapsedNanos();
+        LOG(WARNING) << "Layer:" << this->layer_.layer_names[i]
+                     << " Duration:" << t->SyncAndGetElapsedNanos();
       } else {
         result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
                                                this->layer_.descriptorSet, 0, NULL, NULL);
       }
       ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result;
     }
-
     if (getenv("CLML_PROFILING")) {
-      cl_ulong start, end;
-      cl_ulong duration = 0;
-      clWaitForEvents(1, &(evts.back()));
-      for (size_t i = 0; i < this->layer_.layer_names.size(); ++i) {
-        clGetEventProfilingInfo(evts[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start,
-                                nullptr);
-        clGetEventProfilingInfo(evts[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr);
-        duration += (end - start);
-        LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] << " Duration:" << (end - start);
-      }
-      LOG(WARNING) << "Total Duration:" << duration;
+      LOG(WARNING) << "Total Duration for " << clml_symbol << " is:" << duration;
     }
 
     for (size_t i = 0; i < outputs_.size(); ++i) {
@@ -476,30 +507,42 @@ class CLMLRuntime : public JSONRuntimeBase {
 
     if (this->is_tuning_run) {
       LOG(WARNING) << "CLML Tunning In Progress:";
+      // Let the command queue recreated in profiling mode.
+      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(tentry->device, true);
       for (size_t i = 0; i < this->layer_.function.size(); ++i) {
-        LOG(WARNING) << "CLML Tunning:" << i;
+        LOG(WARNING) << "CLML Tunning:" << this->layer_.layer_names[i];
         result = h_ClmlIntf->clTuneMLOpQCOM(workspace->GetQueue(tentry->device),
                                             this->layer_.function[i], this->layer_.descriptorSet,
                                             this->tuning_cache, NULL);
         ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result;
       }
+      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(tentry->device, false);
 
-      size_t cacheLenBytes = 0;
-      size_t lenRet = 0;
-      result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, NULL, &cacheLenBytes);
+      size_t cache_len_bytes = 0;
+      size_t len_ret = 0;
+      result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, NULL, &cache_len_bytes);
       ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM:" << result;
 
-      std::vector<unsigned char> savedCache(cacheLenBytes, 0);
-      result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, savedCache.size(),
-                                                   savedCache.data(), &lenRet);
-      assert(result == CL_SUCCESS);
-
-      std::ofstream cache_out(tuning_file, std::ios_base::binary);
-      if (cache_out) {
-        cache_out.write(reinterpret_cast<char*>(savedCache.data()), savedCache.size());
-        cache_out.close();
-      }
-      LOG(WARNING) << "CLML: Tuning cache dumped to:" << tuning_file;
+      std::vector<unsigned char> saved_cache(cache_len_bytes, 0);
+      result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, saved_cache.size(),
+                                                   saved_cache.data(), &len_ret);
+      ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM" << result;
+
+      std::string tune_str;
+      dmlc::MemoryStringStream mstrm(&tune_str);
+      dmlc::Stream* strm = &mstrm;
+      uint64_t header = kTVMCLMLTuningCacheMagic;
+      uint64_t reserved = 0x0;
+      strm->Write(header);
+      strm->Write(reserved);
+      strm->Write(clml_symbol);
+      strm->Write(saved_cache);
+
+      std::ofstream fs(tuning_file, std::ios::app | std::ios::binary);
+      ICHECK(!fs.fail()) << "Cannot open " << tuning_file;
+      fs.write(&tune_str[0], tune_str.length());
+      LOG(WARNING) << "CLML: Tuning cache dumped to:" << tuning_file << " size" << tune_str.length()
+                   << " with tuning blob len " << saved_cache.size();
     }
   }
 
@@ -1308,12 +1351,7 @@ class CLMLRuntime : public JSONRuntimeBase {
 
   CachedLayer layer_;
   // CLML Context
-#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 2
-  CLMLInterfaceV2QCOM* h_ClmlIntf = NULL;
-#endif
-#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 3
-  CLMLInterfaceV3QCOM* h_ClmlIntf = NULL;
-#endif
+  GET_ML_API_INTERFACE* h_ClmlIntf = NULL;
   cl::OpenCLWorkspace* workspace = NULL;
   cl::OpenCLThreadEntry* tentry = NULL;
   cl_ml_tuningcache_qcom tuning_cache = NULL;
@@ -1330,6 +1368,8 @@ class CLMLRuntime : public JSONRuntimeBase {
                  << "Please build with USE_CLML_GRAPH_EXECUTOR.";
   }
 #endif
+  /*! CLML sub graph symbol in TVM main module */
+  std::string clml_symbol;
 };
 
 runtime::Module CLMLRuntimeCreate(const String& symbol_name, const String& graph_json,
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 7bbb358f8f92..1718d7e34cb1 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -284,6 +284,29 @@ class OpenCLWorkspace : public DeviceAPI {
 
     return prop & CL_QUEUE_PROFILING_ENABLE;
   }
+  // Enable queue profiling, recreate if required
+  void EnableQueueProfiling(Device dev, bool enable) {
+    bool is_enabled = cl::OpenCLWorkspace::Global()->IsProfiling(dev);
+    if (is_enabled == enable) {
+      return;
+    }
+    cl_command_queue_properties prop = 0;
+    if (enable) {
+      prop = CL_QUEUE_PROFILING_ENABLE;
+    } else {
+      prop = 0;
+    }
+    auto queue = cl::OpenCLWorkspace::Global()->GetQueue(dev);
+    OPENCL_CALL(clFlush(queue));
+    OPENCL_CALL(clFinish(queue));
+    OPENCL_CALL(clReleaseCommandQueue(queue));
+    cl_int err_code;
+    cl_device_id did = cl::OpenCLWorkspace::Global()->devices[dev.device_id];
+    auto profiling_queue =
+        clCreateCommandQueue(cl::OpenCLWorkspace::Global()->context, did, prop, &err_code);
+    OPENCL_CHECK_ERROR(err_code);
+    cl::OpenCLWorkspace::Global()->queues[dev.device_id] = profiling_queue;
+  }
 
   // override device API
   void SetDevice(Device dev) final;
@@ -508,26 +531,11 @@ class OpenCLTimerNode : public TimerNode {
   Device dev_;
 
   void recreateCommandQueue() {
-    cl_command_queue_properties prop;
-
     if (!cl::OpenCLWorkspace::Global()->IsProfiling(dev_)) {
-      prop = CL_QUEUE_PROFILING_ENABLE;
+      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, true);
     } else {
-      prop = 0;
+      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, false);
     }
-
-    auto queue = cl::OpenCLWorkspace::Global()->GetQueue(dev_);
-
-    OPENCL_CALL(clFlush(queue));
-    OPENCL_CALL(clFinish(queue));
-    OPENCL_CALL(clReleaseCommandQueue(queue));
-
-    cl_int err_code;
-    cl_device_id did = cl::OpenCLWorkspace::Global()->devices[dev_.device_id];
-    auto profiling_queue =
-        clCreateCommandQueue(cl::OpenCLWorkspace::Global()->context, did, prop, &err_code);
-    OPENCL_CHECK_ERROR(err_code);
-    cl::OpenCLWorkspace::Global()->queues[dev_.device_id] = profiling_queue;
   }
 };
 }  // namespace runtime

From 7e77a7eff8291c8d620e01775ea0ec4dd15649e6 Mon Sep 17 00:00:00 2001
From: Siva <quic_sivb@quicinc.com>
Date: Fri, 27 Jan 2023 12:24:11 +0530
Subject: [PATCH 2/3] Update src/runtime/opencl/opencl_common.h

Co-authored-by: Egor Churaev <egor.churaev@gmail.com>
---
 src/runtime/opencl/opencl_common.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 1718d7e34cb1..ad0afb167bc9 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -290,12 +290,7 @@ class OpenCLWorkspace : public DeviceAPI {
     if (is_enabled == enable) {
       return;
     }
-    cl_command_queue_properties prop = 0;
-    if (enable) {
-      prop = CL_QUEUE_PROFILING_ENABLE;
-    } else {
-      prop = 0;
-    }
+    cl_command_queue_properties prop = (enable) ? CL_QUEUE_PROFILING_ENABLE : 0;
     auto queue = cl::OpenCLWorkspace::Global()->GetQueue(dev);
     OPENCL_CALL(clFlush(queue));
     OPENCL_CALL(clFinish(queue));

From 535d1e061d116c84a00b1598390bf3d45ab657c3 Mon Sep 17 00:00:00 2001
From: Siva Rama Krishna Reddy B <quic_sivb@quicinc.com>
Date: Fri, 27 Jan 2023 12:27:45 +0530
Subject: [PATCH 3/3] * review comments

---
 src/runtime/contrib/clml/clml_runtime.cc | 1 -
 src/runtime/opencl/opencl_common.h       | 7 ++-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index 4c32aedffe90..739a90c1bf6f 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -314,7 +314,6 @@ class CLMLRuntime : public JSONRuntimeBase {
     for (size_t i = 0; i < this->layer_.function.size(); ++i) {
       // Make CLML subgraphs accounted by OpenCLTimerNode.
 
-      // if (workspace->IsProfiling(tentry->device)) {
       if (getenv("CLML_PROFILING")) {
         Timer t;
         auto f = Registry::Get(std::string("profiling.timer.opencl"));
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index ad0afb167bc9..c172a0f94539 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -526,11 +526,8 @@ class OpenCLTimerNode : public TimerNode {
   Device dev_;
 
   void recreateCommandQueue() {
-    if (!cl::OpenCLWorkspace::Global()->IsProfiling(dev_)) {
-      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, true);
-    } else {
-      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, false);
-    }
+    cl::OpenCLWorkspace::Global()->EnableQueueProfiling(
+        dev_, !cl::OpenCLWorkspace::Global()->IsProfiling(dev_));
   }
 };
 }  // namespace runtime