From 9ce7ce34eb810cee6f0428681ae8130583d8dd6b Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Mon, 3 Oct 2022 11:50:21 -0500
Subject: [PATCH 01/13] [Hexagon] Add support for instrumentation based
 profiling for Hexagon

This's done by instrumenting the code with profiling builtin calls using a TIR pass.
During codegen, these builtin calls are replaced with the calls to a hexagon specific
handler which records the runtime information into a buffer. This buffer is written
into a JSON file ('lwp.json') which is processed to construct function and loop-level
profiling information as a csv file.

At a high-level, this PR makes the following changes:

1) Add a TIR pass (src/tir/transforms/profile_instrumentation.cc) to instrument the
functions and loops with profilging builtins.
2) Hexagon codegen changes to replace profilng builtin calls with the call to Hexagon
specific handler. This handler record the runtime data into a buffer. For all other
targets, these builtin calls are ignored.
3) Add API to RPC Launcher to get the profiling data as a JSON file
4) A python script to process the profiling data and construct a CSV file
5) Add TVM script based unit tests to test and demonstrate various profiling config
flags: tests/python/unittest/test_tir_transform_profiling_instr.py
6) Adds two tests in tests/python/contrib/test_hexagon/test_launcher.py to demonstrate
necessary changes to enable profiling and to collect and process runtime data.

For additional details, please refer to src/runtime/hexagon/profiler/README.md
---
 apps/hexagon_launcher/README.md               |  40 ++
 .../cmake/hexagon/CMakeLists.txt              |   5 +-
 apps/hexagon_launcher/launcher_android.cc     |   9 +-
 apps/hexagon_launcher/launcher_core.h         |   3 +
 apps/hexagon_launcher/launcher_hexagon.cc     |  10 +-
 apps/hexagon_launcher/launcher_main.cc        |  14 +-
 apps/hexagon_launcher/launcher_rpc.idl        |   2 +-
 cmake/modules/Hexagon.cmake                   |   6 +
 cmake/modules/HexagonSDK.cmake                |   6 +
 include/tvm/tir/builtin.h                     |   5 +
 include/tvm/tir/transform.h                   |   6 +
 python/tvm/contrib/hexagon/build.py           |  79 ++++
 .../tvm/contrib/hexagon/hexagon_profiler.py   |  36 ++
 .../hexagon/profiling/process_lwp_data.py     | 387 ++++++++++++++++++
 python/tvm/contrib/hexagon/session.py         |   5 +
 python/tvm/tir/transform/transform.py         |  11 +
 src/driver/driver_api.cc                      |   7 +
 src/runtime/dso_library.cc                    |  14 +
 src/runtime/hexagon/profiler/README.md        | 152 +++++++
 src/runtime/hexagon/profiler/lwp_handler.S    |  87 ++++
 src/runtime/hexagon/profiler/prof_utils.cc    |  80 ++++
 src/runtime/hexagon/profiler/prof_utils.h     |  20 +
 src/runtime/hexagon/rpc/hexagon/rpc_server.cc |  13 +
 .../hexagon/rpc/simulator/rpc_server.cc       |  13 +
 src/target/llvm/codegen_hexagon.cc            |  20 +
 src/target/llvm/codegen_llvm.cc               |   3 +
 src/tir/op/builtin.cc                         |   3 +
 src/tir/transforms/profile_instrumentation.cc | 287 +++++++++++++
 .../contrib/test_hexagon/test_launcher.py     | 245 +++++++++++
 .../test_tir_transform_profiling_instr.py     | 340 +++++++++++++++
 30 files changed, 1897 insertions(+), 11 deletions(-)
 create mode 100755 python/tvm/contrib/hexagon/hexagon_profiler.py
 create mode 100644 python/tvm/contrib/hexagon/profiling/process_lwp_data.py
 create mode 100644 src/runtime/hexagon/profiler/README.md
 create mode 100644 src/runtime/hexagon/profiler/lwp_handler.S
 create mode 100644 src/runtime/hexagon/profiler/prof_utils.cc
 create mode 100644 src/runtime/hexagon/profiler/prof_utils.h
 create mode 100644 src/tir/transforms/profile_instrumentation.cc
 create mode 100644 tests/python/unittest/test_tir_transform_profiling_instr.py

diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md
index cc433f245759..48c76a16ae96 100644
--- a/apps/hexagon_launcher/README.md
+++ b/apps/hexagon_launcher/README.md
@@ -189,6 +189,46 @@ lowered = tvm.relay.build(
 lowered.export_library("model-aot.so", tvm.contrib.hexagon.link_shared)
 ```
 
+
+## Profiling using hexagon launcher
+
+### Enabling lightweight profiling (LWP) instrumentation
+
+This profiling option can be used to get function and loop level processor cycles.
+This needs to be enabled explicitly while compiling a model. For example:
+
+```
+with tvm.transform.PassContext(config={'tir.instrument_lwp':True} ):
+    lib = relay.build(...)
+```
+
+Here, `instrument_lwp` is used to enable the tir pass which instruments the code with the builtin calls.
+
+During codegen, profiling builtin calls can be replaced with a target specific handler to record runtime
+information into a buffer. This buffer is written into a JSON file which is proccessed to construct
+function and loop level profiling information.
+
+To generate LWP JSON file, add `--gen_lwp_json` flag to launcher_android:
+
+```
+./launcher_android --in_config input.json --out_config output.json --gen_lwp_json
+```
+
+Please note that `--gen_lwp_json` flag by itself doesn't enable profiling and is only used to dump
+the profiling data into a json file called lwp.json. This file will be created at the same location
+on the device where launcher_android is executed from. To generate the data, profiling instrumentation
+must be enabled while compiling a model as mentioned above.
+
+Use this command to pull `lwp.json` from the device:
+
+```
+adb -s <DEVICE-ID> pull /path/to/lwp.json
+```
+
+**Note:** Please refer to src/runtime/hexagon/profiler/README.md for information on how
+to enable profiling using Hexagon RPC launcher and also to learn about additional profiling related
+config options.
+
 # Disclaimer
 
 The launcher does not perform any correctness verification. In order to verify
diff --git a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
index fa17dcb4778b..4f5d0d34a08e 100644
--- a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
+++ b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
@@ -16,7 +16,7 @@
 # under the License.
 
 cmake_minimum_required(VERSION 3.2)
-project(HexagonLauncherRPCSkel C CXX)
+project(HexagonLauncherRPCSkel C CXX ASM)
 
 include("${CMAKE_CURRENT_SOURCE_DIR}/../HexagonLauncher.cmake")
 # From the include above get
@@ -68,11 +68,14 @@ set(SKEL_SRCS
   "${LAUNCHER_SRC}/launcher_core.cc"
   "${LAUNCHER_SRC}/launcher_hexagon.cc"
 )
+set(PROFILER_DIR "${TVM_SOURCE_DIR}/src/runtime/hexagon/profiler")
 
 add_library(launcher_rpc_skel SHARED
   "${LAUNCHER_RPC_H}"
   "${LAUNCHER_RPC_SKEL_C}"
   "${SKEL_SRCS}"
+  "${PROFILER_DIR}/prof_utils.cc"
+  "${PROFILER_DIR}/lwp_handler.S"
 )
 
 ExternalProject_Add(static_hexagon_tvm_runtime
diff --git a/apps/hexagon_launcher/launcher_android.cc b/apps/hexagon_launcher/launcher_android.cc
index 008e4fdfe1c4..34db0bdacb60 100644
--- a/apps/hexagon_launcher/launcher_android.cc
+++ b/apps/hexagon_launcher/launcher_android.cc
@@ -55,7 +55,8 @@ AEEResult set_remote_stack_size(int size) {
 }
 
 struct RPCChannel : public ExecutionSession {
-  explicit RPCChannel(const std::string& uri) {
+  explicit RPCChannel(const std::string& uri, bool gen_lwp_json = false)
+      : ExecutionSession(gen_lwp_json) {
     enable_unsigned_pd(true);
     set_remote_stack_size(128 * 1024);
 
@@ -127,7 +128,7 @@ struct RPCChannel : public ExecutionSession {
   }
 
   bool run(uint64_t* pcycles, uint64_t* usecs) override {
-    AEEResult rc = launcher_rpc_run(handle, pcycles, usecs);
+    AEEResult rc = launcher_rpc_run(handle, pcycles, usecs, gen_lwp_json);
     if (rc != AEE_SUCCESS) {
       std::cout << "error running model: " << std::hex << rc << '\n';
     }
@@ -158,8 +159,8 @@ struct RPCChannel : public ExecutionSession {
   std::vector<void*> allocations;
 };
 
-ExecutionSession* create_execution_session() {
-  auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN);
+ExecutionSession* create_execution_session(bool gen_lwp_json) {
+  auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN, gen_lwp_json);
   if (session->handle == -1) {
     delete session;
     session = nullptr;
diff --git a/apps/hexagon_launcher/launcher_core.h b/apps/hexagon_launcher/launcher_core.h
index a32bf937af58..da0dfcbbd5a6 100644
--- a/apps/hexagon_launcher/launcher_core.h
+++ b/apps/hexagon_launcher/launcher_core.h
@@ -94,6 +94,8 @@ struct Model {
 };
 
 struct ExecutionSession {
+  explicit ExecutionSession(bool lwp_json = false) : gen_lwp_json(lwp_json) {}
+
   template <typename T>
   T* alloc(size_t bytes, size_t align = 1) {
     return reinterpret_cast<T*>(alloc_mem(bytes, align));
@@ -111,6 +113,7 @@ struct ExecutionSession {
   virtual bool get_num_outputs(int* num_outputs) = 0;
   virtual bool get_output(int output_idx, tensor_meta* output_meta, int meta_size,
                           void* output_data, int data_size) = 0;
+  bool gen_lwp_json = false;
 };
 
 bool read_model_config(const std::string& file_name, ModelConfig* model_config);
diff --git a/apps/hexagon_launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc
index d4fbf4bf5d73..84bef6fae2f1 100644
--- a/apps/hexagon_launcher/launcher_hexagon.cc
+++ b/apps/hexagon_launcher/launcher_hexagon.cc
@@ -36,6 +36,7 @@ extern "C" {
 #include "launcher_rpc.h"
 
 static std::unique_ptr<Model> TheModel;
+bool WriteLWPOutput(const std::string&);
 
 static AEEResult error_too_small(const std::string& func_name, const std::string& value_name,
                                  int given, int needed) {
@@ -204,7 +205,7 @@ AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int out
 }
 
 AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcycles,
-                                          uint64_t* usecs) {
+                                          uint64_t* usecs, int gen_lwp_json) {
   if (!TheModel) {
     // No model created.
     LOG(ERROR) << __func__ << ": no model created";
@@ -238,6 +239,13 @@ AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcyc
   *pcycles = pc_end - pc_begin;
   *usecs = us_end - us_begin;
 
+  if (gen_lwp_json) {
+    if (!WriteLWPOutput("lwp.json")) {
+      LOG(ERROR) << "ERROR: failed to generate lwp json file";
+      return AEE_EFAILED;
+    }
+  }
+
   // Unlock HVX.
   int unl = qurt_hvx_unlock();
   if (unl != 0) {
diff --git a/apps/hexagon_launcher/launcher_main.cc b/apps/hexagon_launcher/launcher_main.cc
index 163d582db440..1ef3b5d2ff3c 100644
--- a/apps/hexagon_launcher/launcher_main.cc
+++ b/apps/hexagon_launcher/launcher_main.cc
@@ -27,12 +27,14 @@
 #include "launcher_core.h"
 #include "launcher_util.h"
 
-ExecutionSession* create_execution_session();
+ExecutionSession* create_execution_session(bool gen_lwp_json);
 
-int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path) {
+int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path,
+                       bool* gen_lwp_json) {
   static option long_options[] = {
       {"in_config", required_argument, nullptr, 0},
       {"out_config", required_argument, nullptr, 0},
+      {"gen_lwp_json", optional_argument, nullptr, 0},
   };
 
   bool show_usage = false;
@@ -49,6 +51,9 @@ int parse_command_line(int argc, char* argv[], std::string* in_path, std::string
       case 1:
         *out_path = std::string(optarg);
         break;
+      case 2:
+        *gen_lwp_json = true;
+        break;
     }
   }
   if (in_path->empty() || out_path->empty() || show_usage) {
@@ -61,7 +66,8 @@ int parse_command_line(int argc, char* argv[], std::string* in_path, std::string
 
 int main(int argc, char* argv[]) {
   std::string in_path, out_path;
-  if (parse_command_line(argc, argv, &in_path, &out_path) != 0) {
+  bool gen_lwp_json;
+  if (parse_command_line(argc, argv, &in_path, &out_path, &gen_lwp_json) != 0) {
     return 1;
   }
 
@@ -70,7 +76,7 @@ int main(int argc, char* argv[]) {
     return 1;
   }
 
-  ExecutionSession* session_ptr = create_execution_session();
+  ExecutionSession* session_ptr = create_execution_session(gen_lwp_json);
   if (session_ptr == nullptr) {
     return 1;
   }
diff --git a/apps/hexagon_launcher/launcher_rpc.idl b/apps/hexagon_launcher/launcher_rpc.idl
index 6677108a76f0..27e5d1d15d68 100644
--- a/apps/hexagon_launcher/launcher_rpc.idl
+++ b/apps/hexagon_launcher/launcher_rpc.idl
@@ -29,5 +29,5 @@ interface launcher_rpc : remote_handle64 {
   AEEResult set_input(in long input_idx, in buffer input_meta, in buffer input_value);
   AEEResult get_num_outputs(rout long num_outputs);
   AEEResult get_output(in long output_idx, rout buffer output_meta, rout buffer output_value);
-  AEEResult run(rout uint64_t pcycles, rout uint64_t usecs);
+  AEEResult run(rout uint64_t pcycles, rout uint64_t usecs, in long gen_lwp_json);
 };
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index aad770120201..75ccb75435f3 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -244,10 +244,14 @@ if(USE_HEXAGON_RPC)
       # TODO(masahi): Remove rpc_local_session.cc after verifying that things work without it
       "${TVMRT_SOURCE_DIR}/rpc/rpc_local_session.cc"
     )
+    set(PROFILER_DIR "${TVMRT_SOURCE_DIR}/hexagon/profiler")
     # Add the hardware-specific RPC code into the skel library.
+    set_property(SOURCE ${PROFILER_DIR}/lwp_handler.S PROPERTY LANGUAGE C)
     add_library(hexagon_rpc_skel SHARED
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon/rpc_server.cc"
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon_rpc_skel.c"
+      "${PROFILER_DIR}/prof_utils.cc"
+      "${PROFILER_DIR}/lwp_handler.S"
     )
     target_include_directories(hexagon_rpc_skel
       SYSTEM PRIVATE "${TVMRT_SOURCE_DIR}/hexagon/rpc"
@@ -256,6 +260,8 @@ if(USE_HEXAGON_RPC)
     # executed via run_main_on_sim.
     add_library(hexagon_rpc_sim SHARED
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/simulator/rpc_server.cc"
+      "${PROFILER_DIR}/prof_utils.cc"
+      "${PROFILER_DIR}/lwp_handler.S"
     )
     target_link_libraries(hexagon_rpc_sim
       -Wl,--whole-archive tvm_runtime -Wl,--no-whole-archive
diff --git a/cmake/modules/HexagonSDK.cmake b/cmake/modules/HexagonSDK.cmake
index 173f0f3b2d67..ddb158cad95e 100644
--- a/cmake/modules/HexagonSDK.cmake
+++ b/cmake/modules/HexagonSDK.cmake
@@ -157,9 +157,15 @@ function(_get_hexagon_sdk_property_impl
     if(_property STREQUAL "SDK_INCLUDE")
       set(_dirs "${_hexagon_sdk_root}/incs" "${_hexagon_sdk_root}/incs/stddef")
     elseif(_property STREQUAL "QURT_INCLUDE")
+      # Set the Hexagon arch directory for runtime linker.
+      set(_rtld_dir "hexagon_toolv84_${_hexagon_arch}")
+      if(_hexagon_arch STREQUAL "v69")
+        set(_rtld_dir "hexagon_toolv84_v68") # Use hexagon_toolv84_v68 for v69
+      endif()
       set(_dirs
         "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/include/posix"
         "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/include/qurt"
+        "${_hexagon_sdk_root}/ipc/fastrpc/rtld/ship/${_rtld_dir}"
       )
     elseif(_property STREQUAL "QURT_LIB")
       set(_dirs "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/lib/pic")
diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index a1a97595bfd8..eee26a3c7efa 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -747,6 +747,11 @@ TVM_DLL const Op& assume();
  */
 TVM_DLL const Op& undef();
 
+/*!
+ * \brief Profiling intrinsic
+ */
+TVM_DLL const Op& profile_intrinsic();
+
 /*! \brief The kind of structure field info used in intrinsic */
 enum TVMStructFieldKind : int {
   // array head address
diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index 6aa1aca69970..27bceed2776e 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -682,6 +682,12 @@ TVM_DLL Pass RemoveWeightLayoutRewriteBlock();
  */
 TVM_DLL Pass ManifestSharedMemoryLocalStage();
 
+/*!
+ * \brief Insert intrinsic calls to instrument function and loop level profiling.
+ * \return The pass.
+ */
+TVM_DLL Pass InstrumentProfileIntrinsics();
+
 }  // namespace transform
 }  // namespace tir
 }  // namespace tvm
diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index 8960d110b85e..62ae145d9271 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -34,6 +34,8 @@
 import tvm
 from ..._ffi import libinfo
 from .session import Session
+from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
+from tvm.contrib.utils import TempDirectory
 
 
 HEXAGON_RPC_LIB_DIR = os.environ.get("HEXAGON_RPC_LIB_DIR")
@@ -336,6 +338,35 @@ def get_graph_debug_executor(
         """
         return session.get_graph_debug_executor(graph_json, module, dump_root=dump_root)
 
+    @abc.abstractmethod
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+        remote_path: Union[str, pathlib.Path] = None,
+        temp_dir: TempDirectory = None,
+    ):
+        """Extract profile output.
+
+        Parameters
+        ----------
+        hex_profiler : HexagonProfiler
+            HexagonProfiler object that contains the profiling related information.
+        session : Session
+            Remote session. The session must be established (via __enter__)
+            prior to calling this function.
+        remote_path: Union[str, pathlib.Path]
+            Remote path for on-device runs (ignored for the simulator run)
+        temp_dir : TempDirectory
+            Local directory where profile output can be saved (ignored for the simulator run)
+
+        Returns
+        -------
+        profile_data : str
+            Path of the profiling data file
+        """
+        ...
+
 
 class HexagonLauncherAndroid(HexagonLauncherRPC):
     """Hexagon Launcher for Android."""
@@ -516,6 +547,32 @@ def stop_server(self, cleanup=True):
         if cleanup:
             self.cleanup_directory()
 
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+        remote_path: Union[str, pathlib.Path] = None,
+        temp_dir: TempDirectory = None,
+    ):
+        """Abstract method implementation. See description in HexagonLauncherRPC."""
+        profile_data = ""
+        if hex_profiler.is_lwp_enabled():
+            if not temp_dir:
+                raise RuntimeError("tempdir not passed")
+            fname = "lwp.json"
+            out_path = os.path.join(remote_path, fname)
+            profile_data = temp_dir.relpath(fname)
+            rv = session.get_profile_output(hex_profiler.get_mode(), fname)
+            if rv:
+                subprocess.check_call(self._adb_device_sub_cmd + ["pull", out_path, profile_data])
+            else:
+                raise RuntimeError("Error generating profile output")
+        elif hex_profiler.profiling_mode == "etm":
+            hex_profiler.pull_files_for_etm_processing(self._workspace)
+        else:
+            raise RuntimeError("Profiling not enabled")
+        return profile_data
+
 
 class HexagonLauncherSimulator(HexagonLauncherRPC):
     """Hexagon Launcher for Hexagon simulator."""
@@ -622,6 +679,28 @@ def stop_server(self, cleanup=True):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         self._server_process.terminate()
 
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+        remote_path: Union[str, pathlib.Path] = None,
+        temp_dir: TempDirectory = None,
+    ):
+        """Abstract method implementation. See description in HexagonLauncherRPC."""
+        profile_data = ""
+        if hex_profiler.is_lwp_enabled():
+            fname = "lwp.json"
+            profile_data = f"{self._workspace}/{fname}"
+            rv = session.get_profile_output(hex_profiler.get_mode(), fname)
+            if not rv:
+                raise RuntimeError("Error generating profile output")
+        elif hex_profiler.profiling_mode == "etm":
+            raise RuntimeError("ETM Profiling not supported on the simulator")
+        else:
+            raise RuntimeError("Profiling not enabled")
+
+        return profile_data
+
 
 # https://stackoverflow.com/a/52872579/2689797
 def _is_port_in_use(port: int) -> bool:
diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
new file mode 100755
index 000000000000..51f768b71f07
--- /dev/null
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tvm.ir.transform import PassContext
+
+class HexagonProfiler:
+    """Hexagon Profiler"""
+
+    def __init__(self):
+        """Configure HexagonProfiler"""
+        self.profiling_mode = None
+        config = PassContext.current().config
+        if ("tir.instrument_lwp", True) in config.items():
+            self.profiling_mode = "lwp"
+        if self.profiling_mode is None:
+            raise "Profiling mode was not set or was not a valid one."
+
+    def get_mode(self):
+        return self.profiling_mode
+
+    def is_lwp_enabled(self):
+        return self.profiling_mode == "lwp"
diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
new file mode 100644
index 000000000000..ef20c43f8a83
--- /dev/null
+++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
@@ -0,0 +1,387 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+import csv
+import subprocess
+import argparse
+import os
+from re import search, compile
+from collections import OrderedDict
+from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
+
+ENABLE_DEBUG = False
+"""
+Process lightweight profiling output and generate a .csv filea
+
+Please note that some assumptions have been made while processing
+the lightweight profiling output. They are as follows:
+
+1) We don't expect profiled functions to call another profiled functions.
+  This constraint can be relaxed if needed but it simplifies the processing
+  significantly without introducing any limitations for our use case.
+2) For now, it's also assumed that every unique section (loop) ID has same start
+  and end offset which will not be true while a loop gets unrolled as it will
+  create multiple profiling section with the same ID. The current
+  implementation doesn't handle this case.
+
+"""
+
+
+def get_func_info(model_so):
+    """Get all the .text sections along with their start and end offset values"""
+    hexagon_nm_path = os.environ["HEXAGON_TOOLCHAIN"] + "/bin/hexagon-nm"
+    out = subprocess.Popen(
+        [hexagon_nm_path, "--print-size", model_so],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+    )
+    stdo, stde = out.communicate()
+    stdo = stdo.decode("utf-8")
+
+    func_info = []
+    for l in stdo.split("\n"):
+        info = {}
+        if search(" (T|t) ", l):  # If .text section
+            parts = l.split(" ")
+            assert len(parts) == 4
+            info["start"] = int(parts[0], base=16)
+            info["end"] = int(parts[0], base=16) + int(parts[1], base=16)
+            info["name"] = parts[3]
+            func_info.append(info)
+
+    # Sort the entries in the increasing order of the start offset value.
+    func_info = sorted(func_info, key=lambda d: d["start"])
+
+    if ENABLE_DEBUG:
+        print("func_info :\n ")
+        for f in func_info:
+            print(f)
+    return func_info
+
+
+def find_func(func_info, offset):
+    """For a given offset, find the function it belongs to."""
+    fidx = 0
+    lidx = len(func_info) - 1
+    while fidx <= lidx:
+        midx = (fidx + lidx) // 2
+        ms = func_info[midx]["start"]
+        me = func_info[midx]["end"]
+        if fidx == lidx:
+            assert offset >= ms and offset <= me, (
+                f"Couldn't find a function for this offset: {offset}"
+            )
+            return fidx
+        else:
+            if offset > me:
+                fidx = midx + 1
+            elif offset < ms:
+                lidx = midx - 1
+            else:
+                return midx
+    assert False, "Possible mismatch between model .so and LWP data"
+
+
+def accumulate_cycles(overall_cycles, func_cycles, func_name):
+    """Accumulate function cycles"""
+    acc_cycles = overall_cycles[func_name]
+    for id in func_cycles:
+        assert id in acc_cycles, f"id [{id}] missing in the existing function record"
+        assert acc_cycles[id]["start"] == func_cycles[id]["start"], (
+            "Offset value doesn't match with the existing function record."
+        )
+        acc_cycles[id]["cycles"] += func_cycles[id]["cycles"]
+        acc_cycles[id]["count"] += func_cycles[id]["count"]
+    overall_cycles.update({func_name: acc_cycles})
+    return overall_cycles
+
+
+def adjust_per_loop_counts(overall_cycles, data):
+    """
+    Use execution count and the number of entries recorded for each function/loop
+    to compute the overall cycles spent on them.
+    """
+    for func in overall_cycles:
+        func_cycles = overall_cycles[func]
+        for id in func_cycles:
+            exec_count = data["loop_counts"][id]
+            rec_count = func_cycles[id]["count"]
+            assert exec_count != 0, "Execution count should have been non-zero."
+            assert rec_count != 0, "Entry count should have been non-zero."
+            exec_cycles = ((int(func_cycles[id]["cycles"])) * exec_count) // rec_count
+            func_cycles[id]["cycles"] = exec_cycles
+            func_cycles[id]["count"] = exec_count
+        overall_cycles.update({func: OrderedDict(sorted(func_cycles.items()))})
+    return overall_cycles
+
+
+def create_csv_report(overall_cycles, fname):
+    """Create csv report"""
+    header = [
+        "function name",
+        "loop/function id",
+        "loop depth",
+        "start offset",
+        "end offset",
+        "pcycles",
+        "parent count",
+    ]
+    with open(fname, "w") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        for func in overall_cycles:
+            func_cycles = overall_cycles[func]
+            data = []
+            root = -1
+            outer_most = -1
+            for key, value in func_cycles.items():
+                if value["parent"] == -1:
+                    assert root == -1, "Can't have multiple root nodes."
+                    root = key
+
+                data.append(func)
+                data.append(key)
+                if value["parent"] == -1:
+                    data.append("-")  # Total cycles over all invocations of this function.
+                elif value["parent"] == root:
+                    data.append(0)
+                    outer_most = key
+                else:
+                    if outer_most > -1:
+                        data.append(key - outer_most)
+                    else:
+                        data.append(key - value["parent"])
+                data.append(hex(value["start"]))
+                data.append(hex(value["end"]))
+                data.append(value["cycles"])
+                data.append(value["count"])
+                writer.writerow(data)
+                data.clear()
+
+
+def process_data(data, func_info, so_ld_addr):
+    """Process data"""
+    # Keep an ordered list of loop IDs as they are being visited. This is used
+    # to match entry and exit pairs. Once the function/loop is processed, it's
+    # removed from the list.
+    ordered_visited_list = []
+    # Store information regarding visited nodes as they are being processed. Once
+    # the function/loop is processed, it's removed from the set.
+    visited_set = {}
+    # Dictionary to store cycles for the entire model which is grouped into functions.
+    overall_cycles = {}
+    func_cycles = {}
+
+    func_idx = -1
+    func_name = ""
+    prev_func_name = ""
+    func_start = 0
+    func_end = 0
+    save_data = False
+    # Iterate over all the entries in the LWP data file and process them
+    # to construct a report.
+    for entry in data["entries"]:
+        id = entry["id"]
+        offset = entry["ret"] - so_ld_addr
+
+        # Recorded return address should fall within the function begin and end
+        # offsets. If not, find the function it belongs to.
+        if offset < func_start or offset > func_end:
+            prev_func_name = func_name
+            if ENABLE_DEBUG:
+                print("offset : ", offset)
+                print("id : ", id)
+
+            func_idx = find_func(func_info, offset)
+            func_name = func_info[func_idx]["name"]
+            func_start = func_info[func_idx]["start"]
+            func_end = func_info[func_idx]["end"]
+            if ENABLE_DEBUG:
+                print("func_name : ", func_name)
+
+            if save_data:
+                # overall_cycles = save_func_cycles(prev_func_name, overall_cycles, func_cycles, ordered_visited_list)
+                # Done processing the previous function, copy its info into 'overall_cycles'.
+                if prev_func_name not in overall_cycles:
+                    overall_cycles[prev_func_name] = func_cycles.copy()
+                else:
+                    # Accumulate cycles into existing function entry.
+                    overall_cycles = accumulate_cycles(overall_cycles, func_cycles, prev_func_name)
+                # We don't allow for fused operators (functions) calling another operator.
+                if ENABLE_DEBUG:
+                    print("ordered_visited_list : ", ordered_visited_list)
+
+                assert len(ordered_visited_list) == 0, (
+                    f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n"
+                    f"\t Possible reasons -- \n"
+                    f"\t\t1) Mismatch between model .so and json file.\n"
+                    f"\t\t2) LWP buffer may have overflowed resulting into missing entries!"
+                )
+                func_cycles.clear()
+
+            save_data = True
+
+        if id not in visited_set:  # Found 'entry' record
+            visited_info = {"func_idx": func_idx, "ret": offset, "cyc": entry["cyc"]}
+            visited_set[id] = visited_info
+            ordered_visited_list.append(id)
+        else:  # Found 'exit' record
+            # This should be the last entry in the ordered_visited_list. If not, error out.
+            assert ordered_visited_list[-1] == id, (
+                "Problem with LWP output - Interleaved handler calls found."
+                f"Loop [{ordered_visited_list[-1]}] hasn't exited yet."
+            )
+            ordered_visited_list.pop()
+            entry_node = visited_set.pop(id)
+            assert entry_node["func_idx"] == func_idx, (
+                f'Error - Found under a different function name : {entry_node["func_idx"]}'
+            )
+            cycles = entry["cyc"] - entry_node["cyc"]
+            parent = -1
+            if ordered_visited_list:
+                parent = int(ordered_visited_list[-1])
+            if id in func_cycles:
+                fcycles = func_cycles[id]
+                fcycles["cycles"] += cycles
+                fcycles["count"] += 1
+                func_cycles[id] = fcycles
+            else:
+                func_cycles[id] = {
+                    "cycles": cycles,
+                    "start": entry_node["ret"],
+                    "end": offset,
+                    "parent": parent,
+                    "count": 1,
+                }
+
+    # Done processing the previous function, copy its info into 'overall_cycles'.
+    if func_name not in overall_cycles:
+        overall_cycles[func_name] = func_cycles.copy()
+    else:
+        # Accumulate cycles into existing function entry.
+        overall_cycles = accumulate_cycles(overall_cycles, func_cycles, func_name)
+    # We don't allow for fused operators (functions) calling another operator.
+    if ENABLE_DEBUG:
+        print("ordered_visited_list : ", ordered_visited_list)
+
+    assert len(ordered_visited_list) == 0, (
+        f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n"
+        f"\t Possible reasons -- \n"
+        f"\t\t1) Mismatch between model .so and json file.\n"
+        f"\t\t2) LWP buffer may have overflowed resulting into missing entries!"
+        % prev_func_name
+    )
+
+    overall_cycles = adjust_per_loop_counts(overall_cycles, data)
+    return overall_cycles
+
+
+def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log: str):
+    """Get load address of the binary file"""
+    if serial_number == "simulator":
+        basedir = os.path.dirname(lwp_json)
+        if run_log is None:
+            run_log = os.path.join(basedir, "stdout.txt")
+        else:
+            # If the directory name is specified for the run_log of the
+            # simulator (stdout.txt) then it must be same as lwp_json.
+            run_log_dir = os.path.dirname(run_log)
+            assert run_log_dir == "" or run_log_dir == basedir, (
+                f"stdout.txt and {os.path.basename(lwp_json)} must be in the same directory"
+            )
+            run_log = os.path.join(basedir, os.path.basename(run_log))
+        # To extract load address for the simulator run
+        pattern = compile(r"Model.*: (\w+):")
+    else:
+        # To extract load address for on-device run
+        binary_name = os.path.basename(binary_path)
+        pattern = compile(r"{}, len \w+, laddr (\w+)".format(binary_name))
+
+    with open(run_log, "r") as f:
+        lines = f.read()
+        a = pattern.search(lines)
+        load_addr = int(a.group(1), 16)
+
+    return load_addr
+
+
+def process_lwp_output(
+    binary_path: str,
+    serial_number: str,
+    lwp_json: str,
+    run_log: str,
+    lwp_out: str,
+    enable_debug: bool = False,
+):
+    """Process lightweight profiling data"""
+    # Enable debug messages
+    global ENABLE_DEBUG
+    ENABLE_DEBUG = enable_debug
+
+    # Get load address for the binary
+    load_addr = get_load_addr(binary_path, serial_number, lwp_json, run_log)
+    # Opening JSON file
+    with open(lwp_json, "r") as f:
+      # Returns JSON object as a dictionary
+      data = json.load(f)
+
+    # Get function names, and their start and end offsets from the model .so
+    func_info = get_func_info(binary_path)
+
+    # Get the load address for model .so.
+    so_ld_addr = load_addr
+
+    # Process profiling data to construct a CSV report.
+    overall_cycles = process_data(data, func_info, so_ld_addr)
+    create_csv_report(overall_cycles, lwp_out)
+    print("lwp processed output written to -- ", lwp_out)
+
+
+def get_args():
+    """Add commandline arguments to run the script manually if needed"""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lwp-json", help="LWP json file", required=True)
+    parser.add_argument("--serial-num", help="device-id/simulator", required=True)
+    parser.add_argument("--test-so", help="Test shared library", required=True)
+    parser.add_argument(
+        "--run-log",
+        help="Logcat file for on-device run and stdout.txt for simulator run",
+        required=True,
+    )
+    parser.add_argument("--lwp-out", help="LWP output file name", required=True)
+    parser.add_argument(
+        "--debug",
+        help="Enable debug output from the script",
+        dest="debug",
+        action="store_true",
+        required=False,
+    )
+    parser.set_defaults(debug=False)
+    args = parser.parse_args()
+
+    global ENABLE_DEBUG
+    ENABLE_DEBUG = args.debug
+
+    return args
+
+
+if __name__ == "__main__":
+    args = get_args()
+    process_lwp_output(
+        args.test_so, args.serial_num, args.lwp_json, args.run_log, args.lwp_out, args.debug
+    )
diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index e242a95aa8b8..ef9bc7b2e46a 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -391,3 +391,8 @@ def _aot_executor_from_factory(
             remote_file_path = self.upload(binary_path, binary_name)
 
         return self.get_aot_executor(remote_file_path)
+
+    def get_profile_output(self, mode: str, path: str):
+        assert isinstance(mode, str), f"Invalid mode type, {type(mode)} != str"
+        assert isinstance(path, str), f"Invalid path type, {type(path)} != str"
+        return self._rpc.get_function("tvm.hexagon.get_profile_output")(mode, path)
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index d95d15c0dfbe..70451ae116c4 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -983,3 +983,14 @@ def ManifestSharedMemoryLocalStage():
         The result pass
     """
     return _ffi_api.ManifestSharedMemoryLocalStage()  # type: ignore
+
+
+def InstrumentProfileIntrinsics():
+    """Insert intrinsic calls to instrument function and loop level profiling.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.InstrumentProfileIntrinsics()  # type: ignore
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index b0af0fb65e16..689f2be83950 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -51,6 +51,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array<Array<ObjectRef>>);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool);
 
 using runtime::PackedFunc;
 using runtime::TVMArgs;
@@ -156,6 +157,8 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
       pass_ctx->GetConfig<Array<Array<ObjectRef>>>("tir.add_lower_pass", Array<Array<ObjectRef>>())
           .value();
 
+  bool instrument_lwp = pass_ctx->GetConfig<Bool>("tir.instrument_lwp", Bool(false)).value();
+
   Array<transform::Pass> user_lower_phase0 = Array<transform::Pass>();
   Array<transform::Pass> user_lower_phase1 = Array<transform::Pass>();
   Array<transform::Pass> user_lower_phase2 = Array<transform::Pass>();
@@ -252,6 +255,10 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
   pass_list.push_back(
       tir::transform::CommonSubexprElimTIR(!disable_cse_tir, enable_equiv_terms_in_cse_tir));
 
+  if (instrument_lwp) {
+    pass_list.push_back(tir::transform::InstrumentProfileIntrinsics());
+  }
+
   return pass_list;
 }
 
diff --git a/src/runtime/dso_library.cc b/src/runtime/dso_library.cc
index 81eb30ee12d2..a0c6c48b5e44 100644
--- a/src/runtime/dso_library.cc
+++ b/src/runtime/dso_library.cc
@@ -34,6 +34,12 @@
 #include <dlfcn.h>
 #endif
 
+#if defined(__hexagon__)
+extern "C" {
+#include <HAP_farf.h>
+}
+#endif
+
 namespace tvm {
 namespace runtime {
 
@@ -118,6 +124,14 @@ void DSOLibrary::Load(const std::string& name) {
   lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
   ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " "
                                  << dlerror();
+#if defined(__hexagon__)
+  int p;
+  int rc = dlinfo(lib_handle_, RTLD_DI_LOAD_ADDR, &p);
+  if (rc)
+    FARF(ERROR, "error getting model .so start address : %u", rc);
+  else
+    FARF(ALWAYS, "Model .so Start Address : %x", p);
+#endif
 }
 
 void* DSOLibrary::GetSymbol_(const char* name) { return dlsym(lib_handle_, name); }
diff --git a/src/runtime/hexagon/profiler/README.md b/src/runtime/hexagon/profiler/README.md
new file mode 100644
index 000000000000..c629382b4010
--- /dev/null
+++ b/src/runtime/hexagon/profiler/README.md
@@ -0,0 +1,152 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Hexagon lightweight instrumentation based profiling (LWP)
+
+For Hexagon, LWP can be used to get function and loop level processor cycle count.
+This's done by instrumenting the code with profiling builtin calls using a TIR pass.
+During codegen, these builtin calls are replaced with the calls to a hexagon specific
+handler which records the runtime information into a buffer.
+This buffer is written into a JSON file ('lwp.json') which is processed to construct
+function and loop level profiling information as a csv file.
+
+**Note:** During codegen, the profiling builtin calls are ignored for other targets.
+
+The TIR pass offers several config flags to control the level of instrumentation
+as mentioned below:
+
+1) `lwp_disable_func_prof`: To disable function level profiling. By default, it's
+set to 'False', i.e., the function level profiling is enabled.
+
+2) `instr_siblings`: When enabled, only loops with siblings are instrumented and rest are
+ignored. The inner-most loops are always excluded from instrumentation unless overwritten
+using `lwp_min_height`. This is done to minimize the adverse effect of instrumentation on
+actual performance. By default, it's set to 'True'.
+
+3) `lwp_max_depth`: To instrument loops upto a certain depth. This flag is effective
+only when `instr_siblings` is disabled. By default, it's set to 0.
+
+4) `lwp_min_height`: To exclude inner loops upto a certain height from instrumentation.
+By default, it's set to 1.
+
+For additional usage information on various config flags, please refer to the tests in
+`tests/python/unittest/test_tir_transform_profiling_instr.py`
+
+
+## How to use lightweight profiling with RPC Launcher:
+
+`tests/python/contrib/test_hexagon/test_launcher.py` contains two tests, `test_lwp` and
+`test_lwp_multiple_conv2d`, to demonstrate lightweight profiling usage.
+
+The steps involved are as follows:
+
+1) While building a model, set `tir.instrument_lwp` to `True`.
+   By default, the builtin calls will only be inserted for the loops with siblings. But, it 
+   can be altered using LWP config options as described above.
+2) Save the binary file as it'll be needed to process the profiling data (lwp.json) later.
+3) Create `HexagonProfiler` object. It's passed to `get_profile_output` to check if the model was
+built with profiling enabled before copying the data from the device.
+
+```
+with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+    lowered = tvm.relay.build(
+        relay_mod,
+        tvm.target.Target(target_hexagon, host=target_hexagon),
+        ...
+    )
+
+    # Save binary file to post-process lwp output
+    lowered.get_lib().save(dso_binary_path)
+
+    # Create HexagonProfiler object. It sets the profiling mode based on the PassContext config.
+    profiler = HexagonProfiler()
+```
+
+4) Run the model and get profile data (`lwp.json`) from the device (or the simulator):
+
+**Note:**
+
+- For on-device runs, 'lwp.json' is genrated in the same remote directory where 'tvm_rpc_android'
+is copied. This remote path is needed to copy the file from the device and can be found in
+'hexagon_server_process["launcher"].workspace'.
+
+- For the simulator runs, the remote path is not needed as the 'lwp.json' file is generated in the
+simulator test output directory.
+
+```
+    remote_path = ""
+    if android_serial_number is not None and android_serial_number != "simulator":
+        # Get the workspace on the device to extract lwp output
+        remote_path = hexagon_server_process["launcher"]._workspace
+
+    # Get profile data (lwp.json) from the device
+    prof_out = hexagon_launcher.get_profile_output(profiler, hexagon_session, remote_path, temp)
+
+```
+
+5) Process `lwp.json` and construct an easy-to-read csv file.
+
+This step requires several parameters as explained below:
+
+- Path of the binary file
+- android_serial_number
+- Path of the lwp json file (lwp.json) which gets created in the current directory
+- Path to the run log depending on the environment:
+  - For on-device runs:
+     Use logcat output as the run log
+     To get the logcat output:
+     - Create /vendor/lib/rfsa/adsp/tvm_rpc_android.farf on the device
+     - Run logcat command in the background or in a separate terminal while
+       running the test:
+       adb -s <device-id> logcat -c && adb -s <device-id> logcat 2>&1 | tee /tmp//logcat.log
+  - For simulator runs:
+     Use "stdout.txt" as the run log. There is no need to specify the full path to
+     "stdout.txt" as it will be inferred based on 'prof_out' location.
+- lwp processed output file -  "lwp.csv"
+
+**Note:** For on-device run, the logcat output needs to be collected manually and its path
+must be passed to 'process_lwp_output' as mentioned above.
+
+```
+    lwp_csv = temp.relpath("lwp.csv")
+    if android_serial_number == "simulator":
+        process_lwp_output(dso_binary_path, android_serial_number, prof_out, "stdout.txt", lwp_csv)
+    else:
+        # For on-device run
+        if os.path.exists("/tmp/logcat.log"):
+            process_lwp_output(
+                dso_binary_path, android_serial_number, prof_out, "/tmp/logcat.log", lwp_csv
+            )
+        else:
+            print("WARNING: Error processing lwp output - missing logcat file")
+```
+
+**Helpful Hints:**
+
+1) The above code snippet generates 'lwp.csv' in the temporary directory which gets deleted when the
+test exits. To keep the temp directory, set `keep_for_debug` to `True` while creating it. Alternatively,
+you can set `lwp_csv` to "/tmp/lwp.csv".
+
+```
+temp = utils.tempdir(keep_for_debug=True)
+```
+
+2) To prevent the test directories on the Hexagon device from being deleted, pass `--hexagon-debug` to pytest.
+
+```
+python -m pytest --hexagon-debug tests/python/contrib/test_hexagon/test_launcher.py::test_lwp
+```
diff --git a/src/runtime/hexagon/profiler/lwp_handler.S b/src/runtime/hexagon/profiler/lwp_handler.S
new file mode 100644
index 000000000000..d457ad4f4378
--- /dev/null
+++ b/src/runtime/hexagon/profiler/lwp_handler.S
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+  .text
+  .globl  lwp_handler
+  .falign
+  .type  lwp_handler,@function
+lwp_handler:
+  { allocframe(#24)
+    memd(r29+#-16) = r5:4
+  }
+  {
+    memd(r29+#8) = r3:2
+    memd(r29+#0) = r1:0
+    r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+  }
+  {
+    r5 = memw(r2+##__lwp_counter@GOT)
+    r3 = memw(r2+##__lwp_buffer_count@GOT)
+  }
+  {
+    r5 = memw(r5+#0)
+    r3 = memw(r3+#0)
+  }
+  {
+    r4 = memw(r5+r0<<#2)
+    r1 = memw(r2+##__lwp_buffer_size@GOT)
+  }
+  {
+    r4 = add(r4,#1)
+    memw(r5+r0<<#2) = r4.new
+    r1 = memw(r1+#0)
+  }
+  {
+    p0 = cmp.gtu(r4,#100)
+    if (p0.new) jump:nt .LBB0_3
+    r5 = memw(r2+##__lwp_buffer_ptr@GOT)
+  }
+  {
+    r5 = memw(r5+#0)
+    r2 = memw(r2+##__lwp_buffer_count@GOT)
+  }
+  {
+    r4 = add(r3,#4)
+    if (!cmp.gtu(r1,r4.new)) jump:t .LBB0_3
+  }
+  {
+    r5 = addasl(r5,r3,#2)
+    memw(r2+#0) = r4
+  }
+  {
+    memw(r5+#0) = r31
+    r1:0 = C15:14
+    memw(r5+#4) = r0 // id
+  }
+  {
+    memw(r5+#12) = r1 // pcyclehi
+    memw(r5+#8) = r0 // pcyclelo
+  }
+  .falign
+.LBB0_3:
+  {
+    r5:4 = memd(r29+#16)
+    r3:2 = memd(r29+#8)
+  }
+  {
+    r1:0 = memd(r29+#0)
+    dealloc_return
+  }
+.Lfunc_end0:
+  .size  lwp_handler, .Lfunc_end0-lwp_handler
diff --git a/src/runtime/hexagon/profiler/prof_utils.cc b/src/runtime/hexagon/profiler/prof_utils.cc
new file mode 100644
index 000000000000..3682c40699e5
--- /dev/null
+++ b/src/runtime/hexagon/profiler/prof_utils.cc
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+// The max loop/function id used among all lwp_handler calls. Since
+// the id is used to index into the lwp_counter buffer, the size of the
+// buffer must be equal or greater than the max possible id.
+#define LWP_COUNTER_SIZE 5000
+
+// LWP_BUFFER_SIZE needs to be at most 100 * LWP_COUNTER_SIZE since 100 is
+// the max number of entries recorded for each instrumented location.
+#define LWP_BUFFER_SIZE (LWP_COUNTER_SIZE * 100)
+
+unsigned int lwp_counter[LWP_COUNTER_SIZE] = {0};
+unsigned int lwp_buffer[LWP_BUFFER_SIZE];
+unsigned int* __lwp_counter = lwp_counter;
+unsigned int* __lwp_buffer_ptr = lwp_buffer;
+unsigned int __lwp_buffer_size = LWP_BUFFER_SIZE;
+unsigned int __lwp_enable_flag = 1;
+unsigned int __lwp_buffer_count = 0;
+
+bool WriteLWPOutput(const std::string& out_json) {
+  std::ostringstream s;
+  s << "{\n";
+  s << "\t\"entries\":[\n";
+  for (size_t i = 0; i < __lwp_buffer_count; i += 4) {
+    s << "\t{\n";
+    s << "\t\t\"ret\":" << std::dec << lwp_buffer[i] << ",\n";
+    s << "\t\t\"id\":" << std::dec << lwp_buffer[i + 1] << ",\n";
+    uint64_t pcycles = (static_cast<uint64_t>(lwp_buffer[i + 3]) << 32) + lwp_buffer[i + 2];
+    s << "\t\t\"cyc\":" << std::dec << pcycles << "\n";
+    s << "\t}";
+    if (i < __lwp_buffer_count - 4) {
+      s << ",\n";
+    }
+  }
+  s << "\t],\n\n";
+  s << "\t\"loop_counts\":[\n";
+  bool need_comma = false;
+  for (size_t i = 0; i < LWP_COUNTER_SIZE; i++) {
+    s << "\t\t" << lwp_counter[i] / 2;
+    if (i < LWP_COUNTER_SIZE - 1)
+      s << ",\n";
+    else
+      s << "\n";
+    need_comma = true;
+  }
+  s << "\t]\n}\n";
+  std::ofstream ofc(out_json);
+  if (!ofc.is_open()) {
+    return false;
+  }
+
+  ofc << s.str() << "\n";
+
+  if (!ofc) {
+    return false;
+  }
+  ofc.close();
+  return true;
+}
diff --git a/src/runtime/hexagon/profiler/prof_utils.h b/src/runtime/hexagon/profiler/prof_utils.h
new file mode 100644
index 000000000000..5c17ad58ad0a
--- /dev/null
+++ b/src/runtime/hexagon/profiler/prof_utils.h
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+bool WriteLWPOutput(const std::string&);
diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
index 22a54043cd9f..b3570d02c456 100644
--- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
@@ -38,6 +38,7 @@ extern "C" {
 #include "../../../library_module.h"
 #include "../../../minrpc/minrpc_server.h"
 #include "../../hexagon/hexagon_common.h"
+#include "../../profiler/prof_utils.h"
 #include "hexagon_rpc.h"
 
 namespace tvm {
@@ -299,3 +300,15 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.load_module")
       tvm::ObjectPtr<tvm::runtime::Library> n = tvm::runtime::CreateDSOLibraryObject(soname);
       *rv = CreateModuleFromLibrary(n);
     });
+
+TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output")
+    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
+      std::string profiling_mode = args[0];
+      std::string out_file = args[1];
+      if (profiling_mode.compare("lwp") == 0) {
+        *rv = WriteLWPOutput(out_file);
+      } else {
+        HEXAGON_PRINT(ERROR, "ERROR: Unsupported profiling mode: %s", profiling_mode.c_str());
+        *rv = false;
+      }
+    });
diff --git a/src/runtime/hexagon/rpc/simulator/rpc_server.cc b/src/runtime/hexagon/rpc/simulator/rpc_server.cc
index 119e999276f8..41bb2da6f8b1 100644
--- a/src/runtime/hexagon/rpc/simulator/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/simulator/rpc_server.cc
@@ -29,6 +29,7 @@
 #include "../../../library_module.h"
 #include "../../../minrpc/minrpc_server.h"
 #include "../../hexagon_common.h"
+#include "../../profiler/prof_utils.h"
 #include "hexagon_sim_proto.h"
 #include "tvm/runtime/packed_func.h"
 #include "tvm/runtime/registry.h"
@@ -336,3 +337,15 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.load_module")
       tvm::ObjectPtr<tvm::runtime::Library> n = tvm::runtime::CreateDSOLibraryObject(soname);
       *rv = CreateModuleFromLibrary(n);
     });
+
+TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output")
+    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
+      std::string profiling_mode = args[0];
+      std::string out_file = args[1];
+      if (profiling_mode.compare("lwp") == 0) {
+        *rv = WriteLWPOutput(out_file);
+      } else {
+        HEXAGON_PRINT(ERROR, "ERROR: Unsupported profiling mode: %s", profiling_mode.c_str());
+        *rv = false;
+      }
+    });
diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc
index ff59dfcceb8d..943a09922437 100644
--- a/src/target/llvm/codegen_hexagon.cc
+++ b/src/target/llvm/codegen_hexagon.cc
@@ -75,6 +75,7 @@ class CodeGenHexagon final : public CodeGenCPU {
 
   using CodeGenCPU::VisitStmt_;
   llvm::Value* VisitExpr_(const BufferLoadNode* op) override;
+  llvm::Value* CreateIntrinsic(const CallNode* op) override;
 
   llvm::Value* CreateCallExtern(Type ret_type, String global_symbol, const Array<PrimExpr>& args,
                                 bool skip_first_arg) override;
@@ -193,6 +194,25 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) {
   return CodeGenCPU::VisitExpr_(op);
 }
 
+llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) {
+  if (op->op.same_as(builtin::profile_intrinsic())) {
+    llvm::Value* id = MakeValue(op->args[0]);
+    auto instrprof_id = llvm::Intrinsic::hexagon_instrprof_custom;
+    llvm::Function* func = llvm::Intrinsic::getDeclaration(module_.get(), instrprof_id);
+    llvm::GlobalVariable* name_var = module_->getGlobalVariable("handler_name");
+    if (!name_var) {
+      llvm::StringRef init_str = "lwp_handler";
+      llvm::Constant* init = llvm::ConstantDataArray::getString(module_->getContext(), init_str);
+
+      name_var = new llvm::GlobalVariable(*module_, init->getType(), true,
+                                          llvm::GlobalValue::InternalLinkage, init, "handler_name");
+    }
+    llvm::Type* t_int8_p_ = t_int8_->getPointerTo();
+    return builder_->CreateCall(func, {llvm::ConstantExpr::getBitCast(name_var, t_int8_p_), id});
+  }
+  return CodeGenCPU::CreateIntrinsic(op);
+}
+
 void CodeGenHexagon::CreatePrintf(const std::string& format,
                                   llvm::ArrayRef<llvm::Value*> format_args) {
   // This function generates LLVM instructions to call HAP_debug_v2,
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 6a50dd4534c2..2798631d8d0f 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1318,6 +1318,9 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     // TODO(masahi): Support atomic for CPU backend
     LOG(FATAL) << "CPU backend does not support atomic add yet.";
     return nullptr;
+  } else if (op->op.same_as(builtin::profile_intrinsic())) {
+    LOG(INFO) << "Ignoring profile_intrinsic ... " << op->op;
+    return nullptr;
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 1e2d790c76e1..2359fd3e7b10 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -302,6 +302,9 @@ TIR_DEFINE_BUILTIN_FUNC(undef)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kReadState))
     .set_num_inputs(0);
 
+TIR_DEFINE_BUILTIN_FUNC(profile_intrinsic)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
+
 }  // namespace builtin
 }  // namespace tir
 }  // namespace tvm
diff --git a/src/tir/transforms/profile_instrumentation.cc b/src/tir/transforms/profile_instrumentation.cc
new file mode 100644
index 000000000000..c6041aaa5f6b
--- /dev/null
+++ b/src/tir/transforms/profile_instrumentation.cc
@@ -0,0 +1,287 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file profile_instrumentation.cc
+ */
+// Insert profile intrinsic at loop and function level. During codegen,
+// these intruction can be replaced with a call to a target specific handler
+// and can be used to capture profiling information such as processor cycles.
+
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/transform.h>
+
+namespace tvm {
+namespace tir {
+namespace lwp {
+
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_disable_func_prof", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_max_depth", Integer);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_min_height", Integer);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.instr_siblings", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.reset_start_id", Bool);
+
+static int32_t start_id = 0;
+
+struct LoopInfo {
+  LoopInfo() = default;
+  LoopInfo(unsigned i, unsigned d, unsigned h = 0) : id(i), depth(d), height(h) {
+    has_siblings = false;
+    has_parallel = false;
+  }
+  unsigned id;
+  unsigned depth;
+  int32_t height;
+  bool has_siblings;
+  // Set to 'true' if ForKind::kParallel is set for the current loop or one of its ancestor
+  bool has_parallel;
+};
+
+using LoopInfoMap = std::unordered_map<const ForNode*, LoopInfo>;
+// Traverse loops depth first and assign them a unique number.
+class LoopAnalyzer : public StmtExprVisitor {
+ public:
+  LoopInfoMap Analyze(const Stmt& stmt) {
+    this->VisitStmt(stmt);
+    return loops;
+  }
+  void VisitStmt_(const ForNode* op) final {
+    LoopInfo loop_info(start_id, 0);
+    start_id++;
+    loop_info.height = TraverseLoop(op->body, 0);
+    loops[op] = loop_info;
+  }
+
+  unsigned TraverseLoop(const Stmt& stmt, unsigned parent_depth, bool has_parallel = false) {
+    if (stmt->IsInstance<SeqStmtNode>()) {
+      std::vector<const ForNode*> siblings;
+      unsigned height = 0;
+      bool has_loop = false;
+      const SeqStmtNode* n = stmt.as<SeqStmtNode>();
+      for (Stmt s : n->seq) {
+        if (s->IsInstance<ForNode>()) {
+          has_loop = true;
+          const ForNode* f = s.as<ForNode>();
+          LoopInfo loop_info(start_id, parent_depth + 1);
+          start_id++;
+          bool parent_parallel = false;
+          if (has_parallel) {
+            loop_info.has_parallel = true;
+            parent_parallel = true;
+          } else if (f->kind == ForKind::kParallel) {
+            // has_parallel for the current loop is being set to 'false' since the
+            // intrinsic is added outside of the loop. The instrumentation isn't
+            // allowed for the subsequent nested loops.
+            loop_info.has_parallel = false;
+            parent_parallel = true;
+          }
+          siblings.push_back(f);
+          height = std::max(height, TraverseLoop(f->body, parent_depth + 1, parent_parallel));
+          loop_info.height = height;
+          loops[f] = loop_info;
+        }
+      }
+      if (siblings.size() > 1) {
+        for (auto* l : siblings) {
+          loops[l].has_siblings = true;
+        }
+      }
+      height = has_loop ? height + 1 : height;
+      return height;  // Parent's height : max of all children's height
+    } else if (stmt->IsInstance<IfThenElseNode>()) {
+      const IfThenElseNode* n = stmt.as<IfThenElseNode>();
+      unsigned height = TraverseLoop(n->then_case, parent_depth, has_parallel);
+      if (n->else_case.defined())
+        height = std::max(height, TraverseLoop(n->else_case, parent_depth, has_parallel));
+      return height;
+    } else if (stmt->IsInstance<ForNode>()) {
+      const ForNode* f = stmt.as<ForNode>();
+      LoopInfo loop_info(start_id, parent_depth + 1);
+      start_id++;
+      bool parent_parallel = false;
+      if (has_parallel) {
+        loop_info.has_parallel = true;
+        parent_parallel = true;
+      } else if (f->kind == ForKind::kParallel) {
+        // has_parallel for the current loop is being set to 'false' since the
+        // intrinsic is added outside of the loop. The instrumentation isn't
+        // allowed for the subsequent nested loops.
+        loop_info.has_parallel = false;
+        parent_parallel = true;
+      }
+      unsigned height = TraverseLoop(f->body, parent_depth + 1, parent_parallel);
+      loop_info.height = height;
+      loops[f] = loop_info;
+      return height + 1;
+    } else if (stmt->IsInstance<LetStmtNode>()) {
+      const LetStmtNode* n = stmt.as<LetStmtNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else if (stmt->IsInstance<AttrStmtNode>()) {
+      const AttrStmtNode* n = stmt.as<AttrStmtNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else if (stmt->IsInstance<AllocateNode>()) {
+      const AllocateNode* n = stmt.as<AllocateNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else {
+      return 0;  // inner-most loop
+    }
+  }
+
+ private:
+  LoopInfoMap loops;
+};
+
+class InstrumentIntrin : public StmtMutator {
+ public:
+  InstrumentIntrin(int32_t max_depth, int32_t min_height, bool instr_siblings)
+      : max_instr_depth_(max_depth),
+        min_instr_height_(min_height),
+        instr_siblings_(instr_siblings) {}
+
+  void GetLoopInfo(PrimFuncNode* op) {
+    LoopAnalyzer analzer;
+    loops_ = std::move(analzer.Analyze(op->body));
+  }
+
+  Stmt VisitStmt_(const SeqStmtNode* op) final {
+    Stmt stmt = StmtMutator::VisitStmt_(op);
+    return SeqStmt::Flatten(stmt);
+  }
+
+  Stmt VisitStmt_(const ForNode* op) final {
+    Stmt stmt = StmtMutator::VisitStmt_(op);
+    if (loops_.count(op) < 1) return stmt;
+
+    LoopInfo loop_info = loops_[op];
+
+    if (loop_info.has_parallel) {
+      return stmt;
+    }
+
+    // Exclude inner-most loops from instrumentation. The inner-most loop has
+    // height '0' and it increases as we move outward in the loop nest.
+    if (loop_info.height < min_instr_height_) {
+      return stmt;
+    }
+
+    // Only instrument loops with a sibling
+    if (instr_siblings_ && !loop_info.has_siblings) {
+      return stmt;
+    }
+
+    // If instr_siblings_ is set, ignore max depth for instrumentation
+    if (!instr_siblings_ && loop_info.depth > max_instr_depth_) {
+      return stmt;
+    }
+    PrimExpr id = static_cast<int32_t>(loop_info.id);
+    PrimExpr call = Call(DataType::Handle(), builtin::profile_intrinsic(), {id});
+    const Stmt profile = Evaluate(call);
+    Stmt new_stmt = SeqStmt({profile, stmt, profile});
+    return new_stmt;
+  }
+
+ private:
+  LoopInfoMap loops_;
+  int32_t max_instr_depth_;
+  int32_t min_instr_height_;
+  bool instr_siblings_;
+};
+
+class CheckParallelLoops : public StmtExprVisitor {
+ public:
+  bool HasParallelLoops(const Stmt& stmt) {
+    this->VisitStmt(stmt);
+    return has_parallel;
+  }
+
+ private:
+  void VisitStmt_(const ForNode* op) final {
+    if (op->kind == ForKind::kParallel)
+      has_parallel = true;
+    else
+      StmtExprVisitor::VisitStmt_(op);
+  }
+
+  bool has_parallel = false;
+};
+
+PrimFunc AddProfileBuiltins(PrimFunc func, int32_t max_instr_depth, int32_t min_instr_height,
+                            bool instr_siblings, bool disable_func_instrumentation) {
+  auto* func_ptr = func.CopyOnWrite();
+
+  PrimExpr e = start_id++;
+  if (!disable_func_instrumentation) {
+    PrimExpr call = Call(DataType::Handle(), builtin::profile_intrinsic(), {e});
+    const Stmt profile = Evaluate(call);
+    func_ptr->body = SeqStmt({profile, std::move(func_ptr->body), profile});
+  }
+  InstrumentIntrin p(max_instr_depth, min_instr_height, instr_siblings);
+  p.GetLoopInfo(func_ptr);
+  func_ptr->body = p(std::move(func_ptr->body));
+  return std::move(func);
+}
+
+}  // namespace lwp
+
+namespace transform {
+Pass InstrumentProfileIntrinsics() {
+  auto pass_func = [](IRModule m, PassContext ctx) {
+    auto* mptr = m.CopyOnWrite();
+
+    // All loops with depth <= max_instr_depth are instrumented. By default,
+    // only outer-most loops are instrumented which has a depth of 0.
+    // In addition, loops with siblings are also instrumented provided
+    // their loop depth is >= min_instr_height. This is done to avoid
+    // instrumenting inner-most loops.
+    auto max_instr_depth = ctx->GetConfig<Integer>("tir.lwp_max_depth", Integer(0)).value();
+    auto min_instr_height = ctx->GetConfig<Integer>("tir.lwp_min_height", Integer(1)).value();
+    bool instr_siblings = ctx->GetConfig<Bool>("tir.instr_siblings", Bool(true)).value();
+    bool disable_func_instrumentation =
+        ctx->GetConfig<Bool>("tir.lwp_disable_func_prof", Bool(false)).value();
+    bool reset_start_id = ctx->GetConfig<Bool>("tir.reset_start_id", Bool(false)).value();
+    if (reset_start_id) lwp::start_id = 0;
+    std::vector<std::pair<GlobalVar, PrimFunc>> updates;
+    for (const auto& kv : mptr->functions) {
+      if (auto* n = kv.second.as<PrimFuncNode>()) {
+        PrimFunc func = GetRef<PrimFunc>(n);
+        auto updated_func =
+            lwp::AddProfileBuiltins(func, max_instr_depth.IntValue(), min_instr_height.IntValue(),
+                                    instr_siblings, disable_func_instrumentation);
+        updates.push_back({kv.first, updated_func});
+      }
+    }
+    for (const auto& pair : updates) {
+      mptr->AddUnchecked(pair.first, pair.second);
+    }
+    return m;
+  };
+
+  return tvm::transform::CreateModulePass(pass_func, 0, "tir.InstrumentProfileIntrinsics", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.InstrumentProfileIntrinsics")
+    .set_body_typed(InstrumentProfileIntrinsics);
+
+}  // namespace transform
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 9321ddf71d3b..180fa2b8e32c 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -18,11 +18,17 @@
 """ Test rpc based launcher for hexagon """
 
 import numpy as np
+import os
 
 import tvm.testing
 from tvm import relay, te
 from tvm.contrib.hexagon.session import Session
 from tvm.relay.backend import Executor, Runtime
+from tvm.contrib import utils
+from tvm.contrib.hexagon.build import HexagonLauncherRPC
+
+from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
+from tvm.contrib.hexagon.profiling.process_lwp_data import process_lwp_output
 
 
 @tvm.testing.requires_hexagon
@@ -424,5 +430,244 @@ def test_aot_executor_multiple_conv2d(hexagon_session: Session, aot_host_target,
     tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
 
 
+@tvm.testing.requires_hexagon
+def test_lwp(
+    hexagon_server_process,
+    hexagon_launcher: HexagonLauncherRPC,
+    hexagon_session: Session,
+):
+    dtype = "float32"
+    data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
+    weight = relay.var("weight", relay.TensorType((5, 5, 3, 8), dtype))
+    y = relay.nn.conv2d(
+        data,
+        weight,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+
+    f = relay.Function([data, weight], y)
+    relay_mod = tvm.IRModule.from_expr(f)
+    relay_mod = relay.transform.InferType()(relay_mod)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    runtime = Runtime("cpp")
+    executor = Executor("graph")
+
+    weight_in = np.random.rand(5, 5, 3, 8).astype(dtype=dtype)
+    data_in = np.random.rand(1, 64, 64, 3).astype(dtype=dtype)
+    params = {"weight": weight_in}
+    inputs = {"data": data_in}
+    temp = utils.tempdir(keep_for_debug=True)
+    dso_binary = "test_binary.so"
+    dso_binary_path = temp.relpath(dso_binary)
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+        lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            runtime=runtime,
+            executor=executor,
+        )
+        # Save binary file to post-process lwp output
+        lowered.get_lib().save(dso_binary_path)
+        profiler = HexagonProfiler()
+
+    graph_mod = hexagon_session.get_executor_from_factory(lowered)
+    graph_mod.set_input(**params)
+    graph_mod.run(**inputs)
+    hexagon_output = graph_mod.get_output(0).numpy()
+
+    # Get profiling data
+    remote_path = ""
+    android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER")
+    if android_serial_number is not None and android_serial_number != "simulator":
+        # Get the workspace on the device to extract lwp output
+        remote_path = hexagon_server_process["launcher"]._workspace
+
+    prof_out = hexagon_launcher.get_profile_output(profiler, hexagon_session, remote_path, temp)
+
+    # Process lightweight profiling output into an easily readable csv file
+    # The post-processing requires following parameters:
+    # 1) Path of the binary file
+    # 2) android_serial_number
+    # 3) Path of the lwp json file (lwp.json) which gets created in the current directory
+    # 4) Path to the run log depending on the environment:
+    #    a) For on-device runs:
+    #       Use logcat output as the run log
+    #       To get the logcat output:
+    #       - Create /vendor/lib/rfsa/adsp/tvm_rpc_android.farf on the device
+    #       - Run logcat command in the background or in a separate terminal while
+    #         running the test:
+    #         adb -s <device-id> logcat -c && adb -s <device-id> logcat 2>&1 | tee /tmp//logcat.log
+    #    b) For simulator runs:
+    #       Use "stdout.txt" as the run log. There is no need to specify the full path to
+    #       "stdout.txt" as it will be inferred based on 'prof_out' location.
+    # 5) lwp processed output file -  "lwp.csv"
+    #
+    # NOTE: For on-device run, the logcat output needs to be collected manually and its path
+    # must be passed to 'process_lwp_output' as mentioned above.
+    #
+    lwp_csv = temp.relpath("lwp.csv")
+    if android_serial_number == "simulator":
+        process_lwp_output(dso_binary_path, android_serial_number, prof_out, "stdout.txt", lwp_csv)
+    else:
+        # For on-device run
+        if os.path.exists("/tmp/logcat.log"):
+            process_lwp_output(
+                dso_binary_path, android_serial_number, prof_out, "/tmp/logcat.log", lwp_csv
+            )
+        else:
+            print("WARNING: Error processing lwp output - missing logcat file")
+
+    target_llvm = tvm.target.Target("llvm")
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            runtime=runtime,
+            executor=executor,
+        )
+    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+    llvm_graph_mod.set_input(weight=weight_in)
+    llvm_graph_mod.run(data=data_in)
+    expected_output = llvm_graph_mod.get_output(0).numpy()
+
+    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
+
+
+@tvm.testing.requires_hexagon
+def test_lwp_multiple_conv2d(
+    hexagon_server_process,
+    hexagon_launcher: HexagonLauncherRPC,
+    hexagon_session: Session,
+):
+    dtype = "float32"
+    input_shape = (1, 8, 8, 3)
+    w1_shape = (5, 5, 3, 1)
+    w2_shape = (5, 5, 1, 3)
+    data = relay.var("data", relay.TensorType(input_shape, dtype))
+    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
+    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
+    y1 = relay.nn.conv2d(
+        data,
+        weight1,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+    y2 = relay.nn.conv2d(
+        y1,
+        weight2,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+    f = relay.Function([data, weight1, weight2], y2)
+    relay_mod = tvm.IRModule.from_expr(f)
+    relay_mod = relay.transform.InferType()(relay_mod)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    runtime = Runtime("cpp")
+    executor = Executor("graph")
+
+    temp = utils.tempdir()
+    dso_binary = "test_binary.so"
+    dso_binary_path = temp.relpath(dso_binary)
+
+    weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2], w1_shape[3]).astype(
+        dtype=dtype
+    )
+    weight2_data = np.random.rand(w2_shape[0], w2_shape[1], w2_shape[2], w2_shape[3]).astype(
+        dtype=dtype
+    )
+    input_data = np.random.rand(
+        input_shape[0], input_shape[1], input_shape[2], input_shape[3]
+    ).astype(dtype=dtype)
+
+    params = {"weight1": weight1_data, "weight2": weight2_data}
+    inputs = {"data": input_data}
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+        lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            runtime=runtime,
+            executor=executor,
+        )
+        # Save binary file to post-process lwp output
+        lowered.get_lib().save(dso_binary_path)
+        profiler = HexagonProfiler()
+
+    graph_mod = hexagon_session.get_executor_from_factory(lowered)
+    graph_mod.set_input(**params)
+    graph_mod.run(**inputs)
+    hexagon_output = graph_mod.get_output(0).numpy()
+
+    # Get profiling data
+    remote_path = ""
+    android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER")
+    if android_serial_number is not None and android_serial_number != "simulator":
+        # Get the workspace on the device to extract lwp output
+        remote_path = hexagon_server_process["launcher"]._workspace
+
+    prof_out = hexagon_launcher.get_profile_output(profiler, hexagon_session, remote_path, temp)
+
+    # Process lightweight profiling output into an easily readable csv file
+    # The post-processing requires following parameters:
+    # 1) Path of the binary file
+    # 2) android_serial_number
+    # 3) Path of the lwp json file (lwp.json) which gets created in the current directory
+    # 4) Path to the run log depending on the environment:
+    #    a) For on-device runs:
+    #       Use logcat output as the run log
+    #       To get the logcat output:
+    #       - Create /vendor/lib/rfsa/adsp/tvm_rpc_android.farf on the device
+    #       - Run logcat command in the background or in a separate terminal while
+    #         running the test:
+    #         adb -s <device-id> logcat -c && adb -s <device-id> logcat 2>&1 | tee /tmp//logcat.log
+    #    b) For simulator runs:
+    #       Use "stdout.txt" as the run log. There is no need to specify the full path to
+    #       "stdout.txt" as it will be inferred based on 'prof_out' location.
+    # 5) lwp processed output file -  "lwp.csv"
+    #
+    # NOTE: For on-device run, the logcat output needs to be collected manually and its path
+    # must be passed to 'process_lwp_output' as mentioned above.
+    #
+    lwp_csv = temp.relpath("lwp.csv")
+    if android_serial_number == "simulator":
+        process_lwp_output(dso_binary_path, android_serial_number, prof_out, "stdout.txt", lwp_csv)
+    else:
+        # For on-device run
+        if os.path.exists("/tmp/logcat.log"):
+            process_lwp_output(
+                dso_binary_path, android_serial_number, prof_out, "/tmp/logcat.log", lwp_csv
+            )
+        else:
+            print("WARNING: Error processing lwp output - missing logcat file")
+
+    target_llvm = tvm.target.Target("llvm")
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            runtime=runtime,
+            executor=executor,
+        )
+    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+    llvm_graph_mod.set_input(**params)
+    llvm_graph_mod.run(**inputs)
+    expected_output = llvm_graph_mod.get_output(0).numpy()
+
+    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_profiling_instr.py b/tests/python/unittest/test_tir_transform_profiling_instr.py
new file mode 100644
index 000000000000..3a63963d5e2b
--- /dev/null
+++ b/tests/python/unittest/test_tir_transform_profiling_instr.py
@@ -0,0 +1,340 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+import tvm.testing
+from tvm import te
+from tvm.ir.module import IRModule
+from tvm.script import tir as T
+import numpy
+
+default_lwp_test_config = {
+    "tir.instrument_lwp": True,
+    "tir.lwp_disable_func_prof": True,
+    "tir.reset_start_id": True,
+}
+
+
+@T.prim_func
+def input1(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    for i, j in T.grid(8, 8):
+        for k, l in T.grid(8, 16):
+            with T.block("B"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+        for k, l in T.grid(8, 16):
+            with T.block("C"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+
+
+@T.prim_func
+def input2(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        for j in T.serial(0, 8):
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        for j in T.serial(0, 8):
+            for k, l in T.grid(8, 16):
+                with T.block("C"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+
+
+@T.prim_func
+def input3(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        for j in T.parallel(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        for j in T.serial(0, 8):
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+
+
+@T.prim_func
+def test1_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    for i, j in T.grid(8, 8):
+        T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+        for k, l in T.grid(8, 16):
+            with T.block("B"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+        T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+        for k, l in T.grid(8, 16):
+            with T.block("C"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+
+
+@T.prim_func
+def test2_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test3_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test4_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.profile_intrinsic(7, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.profile_intrinsic(8, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("C"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            T.evaluate(T.profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.profile_intrinsic(10, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.profile_intrinsic(10, dtype="handle"))
+        T.evaluate(T.profile_intrinsic(7, dtype="handle"))
+
+
+@T.prim_func
+def test5_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test6_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        for j in T.parallel(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.profile_intrinsic(7, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.profile_intrinsic(8, dtype="handle"))
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            T.evaluate(T.profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.profile_intrinsic(10, dtype="handle"))
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.profile_intrinsic(10, dtype="handle"))
+        T.evaluate(T.profile_intrinsic(7, dtype="handle"))
+
+
+# By default, only loops with siblings are instrumented.
+def test1():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test1_expected_output)
+
+
+# By default, only loops with siblings are instrumented. Here, 'lwp_max_depth'
+# doesn't have any effect unless 'instr_siblings' is set to False (ex: test3).
+def test2():
+    test2_config = default_lwp_test_config.copy()
+    test2_config.update({"tir.lwp_max_depth": 3})
+    with tvm.transform.PassContext(config=test2_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test1_expected_output)
+
+
+# test3: Use 'lwp_max_depth' to instrument loops upto a certain depth. This flag
+# is effective only when 'instr_siblings' is disabled. Also, note that inner-most
+# loops are always excluded from instrumentation unless overwritten using
+# 'lwp_min_height' (ex: test5)
+def test3():
+    test3_config = default_lwp_test_config.copy()
+    test3_config.update({"tir.lwp_max_depth": 3, "tir.instr_siblings": False})
+    with tvm.transform.PassContext(config=test3_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test3_expected_output)
+
+
+# test4: Use 'lwp_min_height' to exclude inner loops upto a certain height from
+# instrumentation.
+def test4():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input2)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test4_expected_output)
+
+
+# test5: Use both 'lwp_min_height' and 'lwp_max_depth'.
+# instrumentation.
+def test5():
+    test5_config = default_lwp_test_config.copy()
+    test5_config.update(
+        {"tir.lwp_max_depth": 3, "tir.instr_siblings": False, "tir.lwp_min_height": 2}
+    )
+    with tvm.transform.PassContext(config=test5_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test5_expected_output)
+
+
+# test6: Tests instrumentation for the parallel loops
+def test6():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input3)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test6_expected_output)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 5d39093e7c0feb14fbcdb0fd720df605cc42afe1 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Mon, 3 Oct 2022 16:08:52 -0500
Subject: [PATCH 02/13] Fix typos

---
 apps/hexagon_launcher/README.md               |  2 +-
 .../hexagon/profiling/process_lwp_data.py     |  5 +++--
 src/runtime/hexagon/profiler/README.md        | 22 +++++++++----------
 src/tir/transforms/profile_instrumentation.cc |  2 +-
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md
index 48c76a16ae96..943a6266a3b4 100644
--- a/apps/hexagon_launcher/README.md
+++ b/apps/hexagon_launcher/README.md
@@ -205,7 +205,7 @@ with tvm.transform.PassContext(config={'tir.instrument_lwp':True} ):
 Here, `instrument_lwp` is used to enable the tir pass which instruments the code with the builtin calls.
 
 During codegen, profiling builtin calls can be replaced with a target specific handler to record runtime
-information into a buffer. This buffer is written into a JSON file which is proccessed to construct
+information into a buffer. This buffer is written into a JSON file which is processed to construct
 function and loop level profiling information.
 
 To generate LWP JSON file, add `--gen_lwp_json` flag to launcher_android:
diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
index ef20c43f8a83..4aa5b7bb8fe2 100644
--- a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
+++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
@@ -26,12 +26,13 @@
 
 ENABLE_DEBUG = False
 """
-Process lightweight profiling output and generate a .csv filea
+Process lightweight profiling output and generate a CSV file with processor
+cycles for the instrumented functions and loops.
 
 Please note that some assumptions have been made while processing
 the lightweight profiling output. They are as follows:
 
-1) We don't expect profiled functions to call another profiled functions.
+1) We don't expect profiled functions to call another profiled function.
   This constraint can be relaxed if needed but it simplifies the processing
   significantly without introducing any limitations for our use case.
 2) For now, it's also assumed that every unique section (loop) ID has same start
diff --git a/src/runtime/hexagon/profiler/README.md b/src/runtime/hexagon/profiler/README.md
index c629382b4010..6a4cbf451008 100644
--- a/src/runtime/hexagon/profiler/README.md
+++ b/src/runtime/hexagon/profiler/README.md
@@ -18,7 +18,7 @@
 # Hexagon lightweight instrumentation based profiling (LWP)
 
 For Hexagon, LWP can be used to get function and loop level processor cycle count.
-This's done by instrumenting the code with profiling builtin calls using a TIR pass.
+This is done by instrumenting the code with profiling builtin calls using a TIR pass.
 During codegen, these builtin calls are replaced with the calls to a hexagon specific
 handler which records the runtime information into a buffer.
 This buffer is written into a JSON file ('lwp.json') which is processed to construct
@@ -29,19 +29,19 @@ function and loop level profiling information as a csv file.
 The TIR pass offers several config flags to control the level of instrumentation
 as mentioned below:
 
-1) `lwp_disable_func_prof`: To disable function level profiling. By default, it's
+1) `lwp_disable_func_prof`: To disable function level profiling. By default, it is
 set to 'False', i.e., the function level profiling is enabled.
 
 2) `instr_siblings`: When enabled, only loops with siblings are instrumented and rest are
 ignored. The inner-most loops are always excluded from instrumentation unless overwritten
 using `lwp_min_height`. This is done to minimize the adverse effect of instrumentation on
-actual performance. By default, it's set to 'True'.
+actual performance. By default, it is set to 'True'.
 
-3) `lwp_max_depth`: To instrument loops upto a certain depth. This flag is effective
-only when `instr_siblings` is disabled. By default, it's set to 0.
+3) `lwp_max_depth`: To instrument loops up to a certain depth. This flag is effective
+only when `instr_siblings` is disabled. By default, it is set to 0.
 
-4) `lwp_min_height`: To exclude inner loops upto a certain height from instrumentation.
-By default, it's set to 1.
+4) `lwp_min_height`: To exclude inner loops up to a certain height from instrumentation.
+By default, it is set to 1.
 
 For additional usage information on various config flags, please refer to the tests in
 `tests/python/unittest/test_tir_transform_profiling_instr.py`
@@ -55,10 +55,10 @@ For additional usage information on various config flags, please refer to the te
 The steps involved are as follows:
 
 1) While building a model, set `tir.instrument_lwp` to `True`.
-   By default, the builtin calls will only be inserted for the loops with siblings. But, it 
+   By default, the builtin calls will only be inserted for the loops with siblings. But it
    can be altered using LWP config options as described above.
-2) Save the binary file as it'll be needed to process the profiling data (lwp.json) later.
-3) Create `HexagonProfiler` object. It's passed to `get_profile_output` to check if the model was
+2) Save the binary file as it will be needed to process the profiling data (lwp.json) later.
+3) Create `HexagonProfiler` object. It is passed to `get_profile_output` to check if the model was
 built with profiling enabled before copying the data from the device.
 
 ```
@@ -80,7 +80,7 @@ with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True})
 
 **Note:**
 
-- For on-device runs, 'lwp.json' is genrated in the same remote directory where 'tvm_rpc_android'
+- For on-device runs, 'lwp.json' is generated in the same remote directory where 'tvm_rpc_android'
 is copied. This remote path is needed to copy the file from the device and can be found in
 'hexagon_server_process["launcher"].workspace'.
 
diff --git a/src/tir/transforms/profile_instrumentation.cc b/src/tir/transforms/profile_instrumentation.cc
index c6041aaa5f6b..bcf296478aff 100644
--- a/src/tir/transforms/profile_instrumentation.cc
+++ b/src/tir/transforms/profile_instrumentation.cc
@@ -21,7 +21,7 @@
  * \file profile_instrumentation.cc
  */
 // Insert profile intrinsic at loop and function level. During codegen,
-// these intruction can be replaced with a call to a target specific handler
+// these instruction can be replaced with a call to a target specific handler
 // and can be used to capture profiling information such as processor cycles.
 
 #include <tvm/tir/builtin.h>

From ca5e6f3468d0de6debb7a76d5824f335645e3f32 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <73191103+jverma-quic@users.noreply.github.com>
Date: Wed, 5 Oct 2022 15:59:08 -0500
Subject: [PATCH 03/13] Update python/tvm/contrib/hexagon/build.py

Add type hint

Co-authored-by: Tristan Konolige <tristan.konolige@gmail.com>
---
 python/tvm/contrib/hexagon/build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index 62ae145d9271..595e9aab7536 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -345,7 +345,7 @@ def get_profile_output(
         session: Session,
         remote_path: Union[str, pathlib.Path] = None,
         temp_dir: TempDirectory = None,
-    ):
+    ) -> str:
         """Extract profile output.
 
         Parameters

From 8b8e3b951696527c369b648f1639511ac6fb1875 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Thu, 13 Oct 2022 14:32:22 -0500
Subject: [PATCH 04/13] Address review comments

Simplify the interface to the lightweight profiling.
---
 cmake/modules/Hexagon.cmake                   |  12 +-
 include/tvm/tir/builtin.h                     |   7 +-
 python/tvm/contrib/hexagon/build.py           |  13 +--
 .../tvm/contrib/hexagon/hexagon_profiler.py   |  82 ++++++++++++-
 .../hexagon/profiling/process_lwp_data.py     |   4 +-
 src/driver/driver_api.cc                      |   4 +
 src/runtime/hexagon/profiler/README.md        |  85 +++-----------
 src/runtime/hexagon/profiler/lwp_handler.S    |  86 +++++++++-----
 src/runtime/hexagon/profiler/prof_utils.cc    |  16 ++-
 src/target/llvm/codegen_hexagon.cc            |   2 +-
 src/target/llvm/codegen_llvm.cc               |   3 +-
 src/tir/op/builtin.cc                         |   5 +-
 src/tir/transforms/profile_instrumentation.cc |  26 +++--
 .../contrib/test_hexagon/test_launcher.py     | 109 ++----------------
 14 files changed, 210 insertions(+), 244 deletions(-)

diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 75ccb75435f3..0233fc5d8109 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -244,14 +244,14 @@ if(USE_HEXAGON_RPC)
       # TODO(masahi): Remove rpc_local_session.cc after verifying that things work without it
       "${TVMRT_SOURCE_DIR}/rpc/rpc_local_session.cc"
     )
-    set(PROFILER_DIR "${TVMRT_SOURCE_DIR}/hexagon/profiler")
+    set(HEXAGON_PROFILER_DIR "${TVMRT_SOURCE_DIR}/hexagon/profiler")
     # Add the hardware-specific RPC code into the skel library.
-    set_property(SOURCE ${PROFILER_DIR}/lwp_handler.S PROPERTY LANGUAGE C)
+    set_property(SOURCE ${HEXAGON_PROFILER_DIR}/lwp_handler.S PROPERTY LANGUAGE C)
     add_library(hexagon_rpc_skel SHARED
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon/rpc_server.cc"
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon_rpc_skel.c"
-      "${PROFILER_DIR}/prof_utils.cc"
-      "${PROFILER_DIR}/lwp_handler.S"
+      "${HEXAGON_PROFILER_DIR}/prof_utils.cc"
+      "${HEXAGON_PROFILER_DIR}/lwp_handler.S"
     )
     target_include_directories(hexagon_rpc_skel
       SYSTEM PRIVATE "${TVMRT_SOURCE_DIR}/hexagon/rpc"
@@ -260,8 +260,8 @@ if(USE_HEXAGON_RPC)
     # executed via run_main_on_sim.
     add_library(hexagon_rpc_sim SHARED
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/simulator/rpc_server.cc"
-      "${PROFILER_DIR}/prof_utils.cc"
-      "${PROFILER_DIR}/lwp_handler.S"
+      "${HEXAGON_PROFILER_DIR}/prof_utils.cc"
+      "${HEXAGON_PROFILER_DIR}/lwp_handler.S"
     )
     target_link_libraries(hexagon_rpc_sim
       -Wl,--whole-archive tvm_runtime -Wl,--no-whole-archive
diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index eee26a3c7efa..9f6b7f9ce5d1 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -750,7 +750,12 @@ TVM_DLL const Op& undef();
 /*!
  * \brief Profiling intrinsic
  */
-TVM_DLL const Op& profile_intrinsic();
+TVM_DLL const Op& start_profile_intrinsic();
+
+/*!
+ * \brief Profiling intrinsic
+ */
+TVM_DLL const Op& end_profile_intrinsic();
 
 /*! \brief The kind of structure field info used in intrinsic */
 enum TVMStructFieldKind : int {
diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index 595e9aab7536..f41ea39b0965 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -343,8 +343,6 @@ def get_profile_output(
         self,
         hex_profiler: HexagonProfiler,
         session: Session,
-        remote_path: Union[str, pathlib.Path] = None,
-        temp_dir: TempDirectory = None,
     ) -> str:
         """Extract profile output.
 
@@ -355,10 +353,6 @@ def get_profile_output(
         session : Session
             Remote session. The session must be established (via __enter__)
             prior to calling this function.
-        remote_path: Union[str, pathlib.Path]
-            Remote path for on-device runs (ignored for the simulator run)
-        temp_dir : TempDirectory
-            Local directory where profile output can be saved (ignored for the simulator run)
 
         Returns
         -------
@@ -407,6 +401,7 @@ def _copy_to_remote(
         self, local_path: Union[str, pathlib.Path], remote_path: Union[str, pathlib.Path]
     ):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
+
         _check_call_verbose(self._adb_device_sub_cmd + ["push", str(local_path), str(remote_path)])
 
     def _create_remote_directory(self, remote_path: Union[str, pathlib.Path]) -> pathlib.Path:
@@ -551,12 +546,12 @@ def get_profile_output(
         self,
         hex_profiler: HexagonProfiler,
         session: Session,
-        remote_path: Union[str, pathlib.Path] = None,
-        temp_dir: TempDirectory = None,
     ):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         profile_data = ""
         if hex_profiler.is_lwp_enabled():
+            temp_dir = hex_profiler.get_temp_dir()
+            remote_path = hex_profiler.get_remote_path()
             if not temp_dir:
                 raise RuntimeError("tempdir not passed")
             fname = "lwp.json"
@@ -683,8 +678,6 @@ def get_profile_output(
         self,
         hex_profiler: HexagonProfiler,
         session: Session,
-        remote_path: Union[str, pathlib.Path] = None,
-        temp_dir: TempDirectory = None,
     ):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         profile_data = ""
diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
index 51f768b71f07..7ee7798964fe 100755
--- a/python/tvm/contrib/hexagon/hexagon_profiler.py
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -15,22 +15,92 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import os
+import subprocess
 from tvm.ir.transform import PassContext
+from tvm.contrib.hexagon.session import Session
+from tvm.contrib.hexagon.profiling.process_lwp_data import process_lwp_output
+from tvm.relay.backend.executor_factory import ExecutorFactoryModule
+from tvm.contrib import utils
 
 class HexagonProfiler:
     """Hexagon Profiler"""
 
-    def __init__(self):
+    def __init__(self, module: ExecutorFactoryModule, hexagon_server_process, enable_debug):
         """Configure HexagonProfiler"""
-        self.profiling_mode = None
+        # Save test .so to process profiling data
+        dso_binary = "test_binary.so"
+        self._temp_dir = utils.tempdir(keep_for_debug=enable_debug)
+        self._dso_binary_path = self._temp_dir.relpath(dso_binary)
+        module.get_lib().save(self._dso_binary_path)
+
+        self._android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER")
+        self._remote_path = ""
+        self._logcat_path = ""
+
+        self._profiling_mode = None
         config = PassContext.current().config
+        if self._android_serial_number is None:
+            raise RuntimeError("ANDROID_SERIAL_NUMBER must be set for profiling")
+
         if ("tir.instrument_lwp", True) in config.items():
-            self.profiling_mode = "lwp"
-        if self.profiling_mode is None:
+            # Set profiling mode
+            self._profiling_mode = "lwp"
+
+            launcher = hexagon_server_process["launcher"]
+            if self._android_serial_number != "simulator":
+              # Clear the logcat buffer and create a child process to redirect logcat output into a file.
+              subprocess.check_call(launcher._adb_device_sub_cmd + ["logcat", "-c"])
+              self._logcat_path = self._temp_dir.relpath('logcat.log')
+              f=open(self._logcat_path, 'w')
+              self._proc = subprocess.Popen(launcher._adb_device_sub_cmd + ["logcat"], stdout = f)
+
+              # Get the remote workspace on the device from where the lwp data needs to be copied.
+              self._remote_path = launcher._workspace
+
+        if self._profiling_mode is None:
             raise "Profiling mode was not set or was not a valid one."
 
     def get_mode(self):
-        return self.profiling_mode
+        return self._profiling_mode
 
     def is_lwp_enabled(self):
-        return self.profiling_mode == "lwp"
+        return self._profiling_mode == "lwp"
+
+    def get_temp_dir(self):
+        return self._temp_dir
+
+    def get_remote_path(self):
+        return self._remote_path
+
+    def get_profile_output(self, hexagon_launcher, hexagon_session, hexagon_server_process):
+        # Get runtime profiling data
+        prof_out = hexagon_launcher.get_profile_output(self, hexagon_session)
+
+        print("lwp json can be found at -- ", prof_out)
+
+        # Process lightweight profiling output into an easily readable csv file
+        # The post-processing requires following parameters:
+        # 1) Path of the binary file
+        # 2) android_serial_number
+        # 3) Path of the lwp json file (lwp.json) which gets created in the current directory
+        # 4) Path to the run log depending on the environment:
+        #    a) For on-device runs:
+        #       Use logcat output as the run log
+        #    b) For simulator runs:
+        #       Use "stdout.txt" as the run log. There is no need to specify the full path to
+        #       "stdout.txt" as it will be inferred based on 'prof_out' location.
+        # 5) lwp processed output file -  "lwp.csv"
+        #
+        lwp_csv = self._temp_dir.relpath("lwp.csv")
+        if self._android_serial_number == "simulator":
+            process_lwp_output(self._dso_binary_path, self._android_serial_number, prof_out, "stdout.txt", lwp_csv)
+        else:
+            # For on-device run
+            self._proc.kill() # End the child process for logcat
+            if os.path.exists(self._logcat_path):
+                process_lwp_output(
+                    self._dso_binary_path, self._android_serial_number, prof_out, self._logcat_path, lwp_csv
+                )
+            else:
+                raise RuntimeError("Error processing lwp output - missing logcat file")
diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
index 4aa5b7bb8fe2..5842da9359d1 100644
--- a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
+++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
@@ -22,7 +22,6 @@
 import os
 from re import search, compile
 from collections import OrderedDict
-from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
 
 ENABLE_DEBUG = False
 """
@@ -317,7 +316,8 @@ def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log:
         lines = f.read()
         a = pattern.search(lines)
         load_addr = int(a.group(1), 16)
-
+    if ENABLE_DEBUG:
+      print('load_addr : ', load_addr)
     return load_addr
 
 
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 43c1ae49ea3a..2b9a354f5c7e 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -256,6 +256,10 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
   pass_list.push_back(
       tir::transform::CommonSubexprElimTIR(!disable_cse_tir, enable_equiv_terms_in_cse_tir));
 
+  // This pass instruments the loops with the profile builtin calls to capture the runtime
+  // performance data (only enabled for Hexagon at the moment). To ensure that no other
+  // optimizations are performed on the instrumented code, this pass must be added at the end
+  // of the list.
   if (instrument_lwp) {
     pass_list.push_back(tir::transform::InstrumentProfileIntrinsics());
   }
diff --git a/src/runtime/hexagon/profiler/README.md b/src/runtime/hexagon/profiler/README.md
index 6a4cbf451008..492e45c98498 100644
--- a/src/runtime/hexagon/profiler/README.md
+++ b/src/runtime/hexagon/profiler/README.md
@@ -57,9 +57,7 @@ The steps involved are as follows:
 1) While building a model, set `tir.instrument_lwp` to `True`.
    By default, the builtin calls will only be inserted for the loops with siblings. But it
    can be altered using LWP config options as described above.
-2) Save the binary file as it will be needed to process the profiling data (lwp.json) later.
-3) Create `HexagonProfiler` object. It is passed to `get_profile_output` to check if the model was
-built with profiling enabled before copying the data from the device.
+2) Create `HexagonProfiler` object
 
 ```
 with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
@@ -69,83 +67,32 @@ with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True})
         ...
     )
 
-    # Save binary file to post-process lwp output
-    lowered.get_lib().save(dso_binary_path)
-
     # Create HexagonProfiler object. It sets the profiling mode based on the PassContext config.
-    profiler = HexagonProfiler()
+    # '--hexagon-debug' to pytest can be used to retain any temp or test directories to
+    # inspect the profiling data.
+    profiler = HexagonProfiler(lowered, hexagon_server_process, hexagon_debug)
 ```
 
-4) Run the model and get profile data (`lwp.json`) from the device (or the simulator):
-
-**Note:**
-
-- For on-device runs, 'lwp.json' is generated in the same remote directory where 'tvm_rpc_android'
-is copied. This remote path is needed to copy the file from the device and can be found in
-'hexagon_server_process["launcher"].workspace'.
-
-- For the simulator runs, the remote path is not needed as the 'lwp.json' file is generated in the
-simulator test output directory.
+4) Run the model and get the profiling data as a CSV file. It is done by post-processing
+   'lwp.json' file generated during runtime.
 
 ```
-    remote_path = ""
-    if android_serial_number is not None and android_serial_number != "simulator":
-        # Get the workspace on the device to extract lwp output
-        remote_path = hexagon_server_process["launcher"]._workspace
-
-    # Get profile data (lwp.json) from the device
-    prof_out = hexagon_launcher.get_profile_output(profiler, hexagon_session, remote_path, temp)
+    graph_mod.run(**inputs)
 
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session, hexagon_server_process)
 ```
+**Note:**
 
-5) Process `lwp.json` and construct an easy-to-read csv file.
-
-This step requires several parameters as explained below:
-
-- Path of the binary file
-- android_serial_number
-- Path of the lwp json file (lwp.json) which gets created in the current directory
-- Path to the run log depending on the environment:
-  - For on-device runs:
-     Use logcat output as the run log
-     To get the logcat output:
-     - Create /vendor/lib/rfsa/adsp/tvm_rpc_android.farf on the device
-     - Run logcat command in the background or in a separate terminal while
-       running the test:
-       adb -s <device-id> logcat -c && adb -s <device-id> logcat 2>&1 | tee /tmp//logcat.log
-  - For simulator runs:
-     Use "stdout.txt" as the run log. There is no need to specify the full path to
-     "stdout.txt" as it will be inferred based on 'prof_out' location.
-- lwp processed output file -  "lwp.csv"
-
-**Note:** For on-device run, the logcat output needs to be collected manually and its path
-must be passed to 'process_lwp_output' as mentioned above.
-
-```
-    lwp_csv = temp.relpath("lwp.csv")
-    if android_serial_number == "simulator":
-        process_lwp_output(dso_binary_path, android_serial_number, prof_out, "stdout.txt", lwp_csv)
-    else:
-        # For on-device run
-        if os.path.exists("/tmp/logcat.log"):
-            process_lwp_output(
-                dso_binary_path, android_serial_number, prof_out, "/tmp/logcat.log", lwp_csv
-            )
-        else:
-            print("WARNING: Error processing lwp output - missing logcat file")
-```
+- For on-device runs, 'lwp.json' is copied into a temp directory along with the test .so and the processed
+  CSV file
+- For the simulator runs, the file is generated in the simulator test output directory. Test  .so
+  will still be in a separate temp directory. lwp CSV file will also be in the same directory.
 
 **Helpful Hints:**
 
-1) The above code snippet generates 'lwp.csv' in the temporary directory which gets deleted when the
-test exits. To keep the temp directory, set `keep_for_debug` to `True` while creating it. Alternatively,
-you can set `lwp_csv` to "/tmp/lwp.csv".
-
-```
-temp = utils.tempdir(keep_for_debug=True)
-```
-
-2) To prevent the test directories on the Hexagon device from being deleted, pass `--hexagon-debug` to pytest.
+- To prevent the test directories on the Hexagon device as well as temporary test directory on x86
+from being deleted for profiling related runs, pass `--hexagon-debug` to pytest.
 
 ```
 python -m pytest --hexagon-debug tests/python/contrib/test_hexagon/test_launcher.py::test_lwp
diff --git a/src/runtime/hexagon/profiler/lwp_handler.S b/src/runtime/hexagon/profiler/lwp_handler.S
index d457ad4f4378..611c0713111a 100644
--- a/src/runtime/hexagon/profiler/lwp_handler.S
+++ b/src/runtime/hexagon/profiler/lwp_handler.S
@@ -17,71 +17,99 @@
  * under the License.
  */
 
+/*
+Lightweight profiling handler to record processor cycles in a buffer
+(pointed by __lwp_buffer_ptr) for a given invocation of the handler. To keep the
+buffer size within a resonable limit, we only recond data for the first 100
+invocation of the handler for a given loop or function ID (passed in R0 register).
+The buffer size wouldn't be a concern if the loops with only siblings are getting
+profiled. However, since the instrumentation provides several different profiling
+options, this approach ensures that they all function as expexted. We use second
+buffer (pointed by __lwp_counter) to keep count of the calls made to lwp_handler
+function for each function/loop.
+
+Brief explanation of all the global variables used:
+1) __lwp_counter : Pointer to the buffer that keeps count of the number of times handler
+ is called for a given ID. To reduce the complexity of the handler, __lwp_counter is
+ indexed using the ID itself.
+2) __lwp_buffer_ptr : Pointer to the buffer that records loop/function ID, processor cycles
+ and return addresss of the handler. Return address is used to reconstruct the call graph
+ (loop-nest) to make it easier to analyze the profiling data.
+3) __lwp_buffer_size : Size of the buffer
+4) __lwp_buffer_count : Offset into main lwp buffer where data for the current handler
+invocation needs to be written.
+
+NOTE: The handler function saves and restores R0-R5 registers which are caller saved registers
+on Hexagon and should be handled at the callsite. However, to reduce the codegen impact
+of the handler calls on the caller functions, we decided to move this part into the
+handler itself.
+
+*/
   .text
   .globl  lwp_handler
   .falign
   .type  lwp_handler,@function
 lwp_handler:
-  { allocframe(#24)
-    memd(r29+#-16) = r5:4
+  { allocframe(#24)                            // Allocate 24 bytes on the stack to save R0-R5 registers
+    memd(r29+#-16) = r5:4                      // Save R5,R4
   }
   {
-    memd(r29+#8) = r3:2
-    memd(r29+#0) = r1:0
-    r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+    memd(r29+#8) = r3:2                        // Save R3,R2
+    memd(r29+#0) = r1:0                        // Save R1, R0
+    r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL) // Get GOT address
   }
   {
-    r5 = memw(r2+##__lwp_counter@GOT)
-    r3 = memw(r2+##__lwp_buffer_count@GOT)
+    r5 = memw(r2+##__lwp_counter@GOT)         // Get address of the pointer to __lwp_counter
+    r3 = memw(r2+##__lwp_buffer_count@GOT)    // Get the address of __lwp_buffer_count
   }
   {
-    r5 = memw(r5+#0)
-    r3 = memw(r3+#0)
+    r5 = memw(r5+#0)                          // Get the address of __lwp_counter (address of the main lwp buffer)
+    r3 = memw(r3+#0)                          // Get the __lwp_buffer_count value (offset into the main buffer)
   }
   {
-    r4 = memw(r5+r0<<#2)
-    r1 = memw(r2+##__lwp_buffer_size@GOT)
+    r4 = memw(r5+r0<<#2)                      // Get the handler invocation count for the ID (passed in R0)
+    r1 = memw(r2+##__lwp_buffer_size@GOT)     // Get the address of __lwp_buffer_size
   }
   {
-    r4 = add(r4,#1)
-    memw(r5+r0<<#2) = r4.new
-    r1 = memw(r1+#0)
+    r4 = add(r4,#1)                           // Increment count
+    memw(r5+r0<<#2) = r4.new                  // Update count in __lwp_counter for a given ID
+    r1 = memw(r1+#0)                          // Get the buffer size
   }
   {
-    p0 = cmp.gtu(r4,#100)
+    p0 = cmp.gtu(r4,#100)                     // Exit if count for a given ID is greater than 100
     if (p0.new) jump:nt .LBB0_3
-    r5 = memw(r2+##__lwp_buffer_ptr@GOT)
+    r5 = memw(r2+##__lwp_buffer_ptr@GOT)      // Get address of the pointer to __lwp_buffer_ptr
   }
   {
-    r5 = memw(r5+#0)
-    r2 = memw(r2+##__lwp_buffer_count@GOT)
+    r5 = memw(r5+#0)                          // Get address of __lwp_buffer_ptr
+    r2 = memw(r2+##__lwp_buffer_count@GOT)    // Get address of __lwp_buffer_count
   }
   {
-    r4 = add(r3,#4)
-    if (!cmp.gtu(r1,r4.new)) jump:t .LBB0_3
+    r4 = add(r3,#4)                           // Increment the offset by 4 since 4 int32 values are stored for each invocation
+    if (!cmp.gtu(r1,r4.new)) jump:t .LBB0_3   // Exit if the main lwp buffer has run out of space
   }
   {
-    r5 = addasl(r5,r3,#2)
-    memw(r2+#0) = r4
+    r5 = addasl(r5,r3,#2)                     // Get the address where the data needs to be recorded
+    memw(r2+#0) = r4                          // Save next offset into __lwp_buffer_count
   }
   {
-    memw(r5+#0) = r31
-    r1:0 = C15:14
-    memw(r5+#4) = r0 // id
+    memw(r5+#0) = r31                         // Save return address of this function
+    r1:0 = C15:14                             // Control registers that keep processor cycle count (64-bits)
+    memw(r5+#4) = r0                          // Save loop/function ID
   }
   {
-    memw(r5+#12) = r1 // pcyclehi
-    memw(r5+#8) = r0 // pcyclelo
+    memw(r5+#12) = r1                         // Save upper 32 bits
+    memw(r5+#8) = r0                          // Save lower 32 bits
   }
   .falign
 .LBB0_3:
   {
-    r5:4 = memd(r29+#16)
+    r5:4 = memd(r29+#16)                     // Restore the registers from the stack
     r3:2 = memd(r29+#8)
   }
   {
     r1:0 = memd(r29+#0)
-    dealloc_return
+    dealloc_return                          // Deallocate the stack and return
   }
 .Lfunc_end0:
   .size  lwp_handler, .Lfunc_end0-lwp_handler
diff --git a/src/runtime/hexagon/profiler/prof_utils.cc b/src/runtime/hexagon/profiler/prof_utils.cc
index 3682c40699e5..45cbe607031d 100644
--- a/src/runtime/hexagon/profiler/prof_utils.cc
+++ b/src/runtime/hexagon/profiler/prof_utils.cc
@@ -30,13 +30,13 @@
 // the max number of entries recorded for each instrumented location.
 #define LWP_BUFFER_SIZE (LWP_COUNTER_SIZE * 100)
 
-unsigned int lwp_counter[LWP_COUNTER_SIZE] = {0};
-unsigned int lwp_buffer[LWP_BUFFER_SIZE];
-unsigned int* __lwp_counter = lwp_counter;
-unsigned int* __lwp_buffer_ptr = lwp_buffer;
-unsigned int __lwp_buffer_size = LWP_BUFFER_SIZE;
-unsigned int __lwp_enable_flag = 1;
-unsigned int __lwp_buffer_count = 0;
+uint32_t lwp_counter[LWP_COUNTER_SIZE] = {0};
+uint32_t lwp_buffer[LWP_BUFFER_SIZE];
+uint32_t* __lwp_counter = lwp_counter;
+uint32_t* __lwp_buffer_ptr = lwp_buffer;
+uint32_t __lwp_buffer_size = LWP_BUFFER_SIZE;
+uint32_t __lwp_enable_flag = 1;
+uint32_t __lwp_buffer_count = 0;
 
 bool WriteLWPOutput(const std::string& out_json) {
   std::ostringstream s;
@@ -55,14 +55,12 @@ bool WriteLWPOutput(const std::string& out_json) {
   }
   s << "\t],\n\n";
   s << "\t\"loop_counts\":[\n";
-  bool need_comma = false;
   for (size_t i = 0; i < LWP_COUNTER_SIZE; i++) {
     s << "\t\t" << lwp_counter[i] / 2;
     if (i < LWP_COUNTER_SIZE - 1)
       s << ",\n";
     else
       s << "\n";
-    need_comma = true;
   }
   s << "\t]\n}\n";
   std::ofstream ofc(out_json);
diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc
index 943a09922437..ddfe1b9173ac 100644
--- a/src/target/llvm/codegen_hexagon.cc
+++ b/src/target/llvm/codegen_hexagon.cc
@@ -195,7 +195,7 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) {
 }
 
 llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) {
-  if (op->op.same_as(builtin::profile_intrinsic())) {
+  if (op->op.same_as(builtin::start_profile_intrinsic()) || op->op.same_as(builtin::end_profile_intrinsic())) {
     llvm::Value* id = MakeValue(op->args[0]);
     auto instrprof_id = llvm::Intrinsic::hexagon_instrprof_custom;
     llvm::Function* func = llvm::Intrinsic::getDeclaration(module_.get(), instrprof_id);
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 2798631d8d0f..143e2d176c2f 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1318,7 +1318,8 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     // TODO(masahi): Support atomic for CPU backend
     LOG(FATAL) << "CPU backend does not support atomic add yet.";
     return nullptr;
-  } else if (op->op.same_as(builtin::profile_intrinsic())) {
+  } else if (op->op.same_as(builtin::start_profile_intrinsic()) ||
+    op->op.same_as(builtin::end_profile_intrinsic())) {
     LOG(INFO) << "Ignoring profile_intrinsic ... " << op->op;
     return nullptr;
   } else {
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 2359fd3e7b10..b605b9de1e02 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -302,7 +302,10 @@ TIR_DEFINE_BUILTIN_FUNC(undef)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kReadState))
     .set_num_inputs(0);
 
-TIR_DEFINE_BUILTIN_FUNC(profile_intrinsic)
+TIR_DEFINE_BUILTIN_FUNC(start_profile_intrinsic)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_BUILTIN_FUNC(end_profile_intrinsic)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
 
 }  // namespace builtin
diff --git a/src/tir/transforms/profile_instrumentation.cc b/src/tir/transforms/profile_instrumentation.cc
index bcf296478aff..5f52fc6630bc 100644
--- a/src/tir/transforms/profile_instrumentation.cc
+++ b/src/tir/transforms/profile_instrumentation.cc
@@ -49,7 +49,7 @@ struct LoopInfo {
     has_parallel = false;
   }
   unsigned id;
-  unsigned depth;
+  int32_t depth;
   int32_t height;
   bool has_siblings;
   // Set to 'true' if ForKind::kParallel is set for the current loop or one of its ancestor
@@ -110,8 +110,9 @@ class LoopAnalyzer : public StmtExprVisitor {
     } else if (stmt->IsInstance<IfThenElseNode>()) {
       const IfThenElseNode* n = stmt.as<IfThenElseNode>();
       unsigned height = TraverseLoop(n->then_case, parent_depth, has_parallel);
-      if (n->else_case.defined())
+      if (n->else_case.defined()) {
         height = std::max(height, TraverseLoop(n->else_case, parent_depth, has_parallel));
+      }
       return height;
     } else if (stmt->IsInstance<ForNode>()) {
       const ForNode* f = stmt.as<ForNode>();
@@ -193,9 +194,11 @@ class InstrumentIntrin : public StmtMutator {
       return stmt;
     }
     PrimExpr id = static_cast<int32_t>(loop_info.id);
-    PrimExpr call = Call(DataType::Handle(), builtin::profile_intrinsic(), {id});
-    const Stmt profile = Evaluate(call);
-    Stmt new_stmt = SeqStmt({profile, stmt, profile});
+    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {id});
+    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {id});
+    const Stmt start_profile = Evaluate(start_call);
+    const Stmt end_profile = Evaluate(end_call);
+    Stmt new_stmt = SeqStmt({start_profile, stmt, end_profile});
     return new_stmt;
   }
 
@@ -215,10 +218,11 @@ class CheckParallelLoops : public StmtExprVisitor {
 
  private:
   void VisitStmt_(const ForNode* op) final {
-    if (op->kind == ForKind::kParallel)
+    if (op->kind == ForKind::kParallel) {
       has_parallel = true;
-    else
+    } else {
       StmtExprVisitor::VisitStmt_(op);
+    }
   }
 
   bool has_parallel = false;
@@ -230,9 +234,11 @@ PrimFunc AddProfileBuiltins(PrimFunc func, int32_t max_instr_depth, int32_t min_
 
   PrimExpr e = start_id++;
   if (!disable_func_instrumentation) {
-    PrimExpr call = Call(DataType::Handle(), builtin::profile_intrinsic(), {e});
-    const Stmt profile = Evaluate(call);
-    func_ptr->body = SeqStmt({profile, std::move(func_ptr->body), profile});
+    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {e});
+    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {e});
+    const Stmt start_profile = Evaluate(start_call);
+    const Stmt end_profile = Evaluate(end_call);
+    func_ptr->body = SeqStmt({start_profile, std::move(func_ptr->body), end_profile});
   }
   InstrumentIntrin p(max_instr_depth, min_instr_height, instr_siblings);
   p.GetLoopInfo(func_ptr);
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 5555303862ab..7da69b872f8d 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -19,17 +19,13 @@
 import pytest
 
 import numpy as np
-import os
 
 import tvm.testing
 from tvm import relay, te
 from tvm.contrib.hexagon.session import Session
 from tvm.relay.backend import Executor, Runtime
-from tvm.contrib import utils
 from tvm.contrib.hexagon.build import HexagonLauncherRPC
-
 from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
-from tvm.contrib.hexagon.profiling.process_lwp_data import process_lwp_output
 
 from .infrastructure import get_hexagon_target
 
@@ -579,6 +575,7 @@ def test_lwp(
     hexagon_server_process,
     hexagon_launcher: HexagonLauncherRPC,
     hexagon_session: Session,
+    hexagon_debug
 ):
     dtype = "float32"
     data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
@@ -605,9 +602,6 @@ def test_lwp(
     data_in = np.random.rand(1, 64, 64, 3).astype(dtype=dtype)
     params = {"weight": weight_in}
     inputs = {"data": data_in}
-    temp = utils.tempdir(keep_for_debug=True)
-    dso_binary = "test_binary.so"
-    dso_binary_path = temp.relpath(dso_binary)
 
     with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
         lowered = tvm.relay.build(
@@ -616,56 +610,16 @@ def test_lwp(
             runtime=runtime,
             executor=executor,
         )
-        # Save binary file to post-process lwp output
-        lowered.get_lib().save(dso_binary_path)
-        profiler = HexagonProfiler()
+        # Create HexagonProfiler object
+        profiler = HexagonProfiler(lowered, hexagon_server_process, hexagon_debug)
 
     graph_mod = hexagon_session.get_executor_from_factory(lowered)
     graph_mod.set_input(**params)
     graph_mod.run(**inputs)
     hexagon_output = graph_mod.get_output(0).numpy()
 
-    # Get profiling data
-    remote_path = ""
-    android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER")
-    if android_serial_number is not None and android_serial_number != "simulator":
-        # Get the workspace on the device to extract lwp output
-        remote_path = hexagon_server_process["launcher"]._workspace
-
-    prof_out = hexagon_launcher.get_profile_output(profiler, hexagon_session, remote_path, temp)
-
-    # Process lightweight profiling output into an easily readable csv file
-    # The post-processing requires following parameters:
-    # 1) Path of the binary file
-    # 2) android_serial_number
-    # 3) Path of the lwp json file (lwp.json) which gets created in the current directory
-    # 4) Path to the run log depending on the environment:
-    #    a) For on-device runs:
-    #       Use logcat output as the run log
-    #       To get the logcat output:
-    #       - Create /vendor/lib/rfsa/adsp/tvm_rpc_android.farf on the device
-    #       - Run logcat command in the background or in a separate terminal while
-    #         running the test:
-    #         adb -s <device-id> logcat -c && adb -s <device-id> logcat 2>&1 | tee /tmp//logcat.log
-    #    b) For simulator runs:
-    #       Use "stdout.txt" as the run log. There is no need to specify the full path to
-    #       "stdout.txt" as it will be inferred based on 'prof_out' location.
-    # 5) lwp processed output file -  "lwp.csv"
-    #
-    # NOTE: For on-device run, the logcat output needs to be collected manually and its path
-    # must be passed to 'process_lwp_output' as mentioned above.
-    #
-    lwp_csv = temp.relpath("lwp.csv")
-    if android_serial_number == "simulator":
-        process_lwp_output(dso_binary_path, android_serial_number, prof_out, "stdout.txt", lwp_csv)
-    else:
-        # For on-device run
-        if os.path.exists("/tmp/logcat.log"):
-            process_lwp_output(
-                dso_binary_path, android_serial_number, prof_out, "/tmp/logcat.log", lwp_csv
-            )
-        else:
-            print("WARNING: Error processing lwp output - missing logcat file")
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session, hexagon_server_process)
 
     target_llvm = tvm.target.Target("llvm")
     with tvm.transform.PassContext(opt_level=3):
@@ -688,6 +642,7 @@ def test_lwp_multiple_conv2d(
     hexagon_server_process,
     hexagon_launcher: HexagonLauncherRPC,
     hexagon_session: Session,
+    hexagon_debug
 ):
     dtype = "float32"
     input_shape = (1, 8, 8, 3)
@@ -722,10 +677,6 @@ def test_lwp_multiple_conv2d(
     runtime = Runtime("cpp")
     executor = Executor("graph")
 
-    temp = utils.tempdir()
-    dso_binary = "test_binary.so"
-    dso_binary_path = temp.relpath(dso_binary)
-
     weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2], w1_shape[3]).astype(
         dtype=dtype
     )
@@ -746,56 +697,16 @@ def test_lwp_multiple_conv2d(
             runtime=runtime,
             executor=executor,
         )
-        # Save binary file to post-process lwp output
-        lowered.get_lib().save(dso_binary_path)
-        profiler = HexagonProfiler()
+        # Create HexagonProfiler object
+        profiler = HexagonProfiler(lowered, hexagon_server_process, hexagon_debug)
 
     graph_mod = hexagon_session.get_executor_from_factory(lowered)
     graph_mod.set_input(**params)
     graph_mod.run(**inputs)
     hexagon_output = graph_mod.get_output(0).numpy()
 
-    # Get profiling data
-    remote_path = ""
-    android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER")
-    if android_serial_number is not None and android_serial_number != "simulator":
-        # Get the workspace on the device to extract lwp output
-        remote_path = hexagon_server_process["launcher"]._workspace
-
-    prof_out = hexagon_launcher.get_profile_output(profiler, hexagon_session, remote_path, temp)
-
-    # Process lightweight profiling output into an easily readable csv file
-    # The post-processing requires following parameters:
-    # 1) Path of the binary file
-    # 2) android_serial_number
-    # 3) Path of the lwp json file (lwp.json) which gets created in the current directory
-    # 4) Path to the run log depending on the environment:
-    #    a) For on-device runs:
-    #       Use logcat output as the run log
-    #       To get the logcat output:
-    #       - Create /vendor/lib/rfsa/adsp/tvm_rpc_android.farf on the device
-    #       - Run logcat command in the background or in a separate terminal while
-    #         running the test:
-    #         adb -s <device-id> logcat -c && adb -s <device-id> logcat 2>&1 | tee /tmp//logcat.log
-    #    b) For simulator runs:
-    #       Use "stdout.txt" as the run log. There is no need to specify the full path to
-    #       "stdout.txt" as it will be inferred based on 'prof_out' location.
-    # 5) lwp processed output file -  "lwp.csv"
-    #
-    # NOTE: For on-device run, the logcat output needs to be collected manually and its path
-    # must be passed to 'process_lwp_output' as mentioned above.
-    #
-    lwp_csv = temp.relpath("lwp.csv")
-    if android_serial_number == "simulator":
-        process_lwp_output(dso_binary_path, android_serial_number, prof_out, "stdout.txt", lwp_csv)
-    else:
-        # For on-device run
-        if os.path.exists("/tmp/logcat.log"):
-            process_lwp_output(
-                dso_binary_path, android_serial_number, prof_out, "/tmp/logcat.log", lwp_csv
-            )
-        else:
-            print("WARNING: Error processing lwp output - missing logcat file")
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session, hexagon_server_process)
 
     target_llvm = tvm.target.Target("llvm")
     with tvm.transform.PassContext(opt_level=3):

From 2b6b15dffc45bdc385b479287bbb70f183756378 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Fri, 14 Oct 2022 14:43:09 -0500
Subject: [PATCH 05/13] Ignore profile builtins if llvm version < 15.0

---
 src/target/llvm/codegen_hexagon.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc
index ddfe1b9173ac..170d0682fa85 100644
--- a/src/target/llvm/codegen_hexagon.cc
+++ b/src/target/llvm/codegen_hexagon.cc
@@ -195,6 +195,7 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) {
 }
 
 llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) {
+#if TVM_LLVM_VERSION >= 150
   if (op->op.same_as(builtin::start_profile_intrinsic()) || op->op.same_as(builtin::end_profile_intrinsic())) {
     llvm::Value* id = MakeValue(op->args[0]);
     auto instrprof_id = llvm::Intrinsic::hexagon_instrprof_custom;
@@ -210,6 +211,7 @@ llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) {
     llvm::Type* t_int8_p_ = t_int8_->getPointerTo();
     return builder_->CreateCall(func, {llvm::ConstantExpr::getBitCast(name_var, t_int8_p_), id});
   }
+#endif
   return CodeGenCPU::CreateIntrinsic(op);
 }
 

From b9bd7c37d8cfbff658398d8fd5eedc43a7d07b39 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Tue, 18 Oct 2022 11:50:04 -0500
Subject: [PATCH 06/13] Add src/runtime/hexagon/profiler/lwp_handler.S to
 allowed list

---
 tests/lint/check_file_type.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 162e4a1cc7a1..b895b05bd71b 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -157,6 +157,7 @@
     "apps/microtvm/reference-vm/base-box/Vagrantfile.packer-template",
     # Hexagon
     "src/runtime/hexagon/rpc/android_bash.sh.template",
+    "src/runtime/hexagon/profiler/lwp_handler.S"
 }
 
 

From 9ab2ccdc6087bf24fd0aaab3e4e47da118982082 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Tue, 18 Oct 2022 12:16:53 -0500
Subject: [PATCH 07/13] Address reformatting issues

---
 .../tvm/contrib/hexagon/hexagon_profiler.py   | 27 +++++++++------
 .../hexagon/profiling/process_lwp_data.py     | 33 +++++++++----------
 tests/lint/check_file_type.py                 |  2 +-
 .../contrib/test_hexagon/test_launcher.py     |  4 +--
 4 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
index 7ee7798964fe..a16b5528d539 100755
--- a/python/tvm/contrib/hexagon/hexagon_profiler.py
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -23,6 +23,7 @@
 from tvm.relay.backend.executor_factory import ExecutorFactoryModule
 from tvm.contrib import utils
 
+
 class HexagonProfiler:
     """Hexagon Profiler"""
 
@@ -49,14 +50,14 @@ def __init__(self, module: ExecutorFactoryModule, hexagon_server_process, enable
 
             launcher = hexagon_server_process["launcher"]
             if self._android_serial_number != "simulator":
-              # Clear the logcat buffer and create a child process to redirect logcat output into a file.
-              subprocess.check_call(launcher._adb_device_sub_cmd + ["logcat", "-c"])
-              self._logcat_path = self._temp_dir.relpath('logcat.log')
-              f=open(self._logcat_path, 'w')
-              self._proc = subprocess.Popen(launcher._adb_device_sub_cmd + ["logcat"], stdout = f)
+                # Clear the logcat buffer and create a child process to redirect logcat output into a file.
+                subprocess.check_call(launcher._adb_device_sub_cmd + ["logcat", "-c"])
+                self._logcat_path = self._temp_dir.relpath("logcat.log")
+                f = open(self._logcat_path, "w")
+                self._proc = subprocess.Popen(launcher._adb_device_sub_cmd + ["logcat"], stdout=f)
 
-              # Get the remote workspace on the device from where the lwp data needs to be copied.
-              self._remote_path = launcher._workspace
+                # Get the remote workspace on the device from where the lwp data needs to be copied.
+                self._remote_path = launcher._workspace
 
         if self._profiling_mode is None:
             raise "Profiling mode was not set or was not a valid one."
@@ -94,13 +95,19 @@ def get_profile_output(self, hexagon_launcher, hexagon_session, hexagon_server_p
         #
         lwp_csv = self._temp_dir.relpath("lwp.csv")
         if self._android_serial_number == "simulator":
-            process_lwp_output(self._dso_binary_path, self._android_serial_number, prof_out, "stdout.txt", lwp_csv)
+            process_lwp_output(
+                self._dso_binary_path, self._android_serial_number, prof_out, "stdout.txt", lwp_csv
+            )
         else:
             # For on-device run
-            self._proc.kill() # End the child process for logcat
+            self._proc.kill()  # End the child process for logcat
             if os.path.exists(self._logcat_path):
                 process_lwp_output(
-                    self._dso_binary_path, self._android_serial_number, prof_out, self._logcat_path, lwp_csv
+                    self._dso_binary_path,
+                    self._android_serial_number,
+                    prof_out,
+                    self._logcat_path,
+                    lwp_csv,
                 )
             else:
                 raise RuntimeError("Error processing lwp output - missing logcat file")
diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
index 5842da9359d1..cee65bba44e6 100644
--- a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
+++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
@@ -83,9 +83,9 @@ def find_func(func_info, offset):
         ms = func_info[midx]["start"]
         me = func_info[midx]["end"]
         if fidx == lidx:
-            assert offset >= ms and offset <= me, (
-                f"Couldn't find a function for this offset: {offset}"
-            )
+            assert (
+                offset >= ms and offset <= me
+            ), f"Couldn't find a function for this offset: {offset}"
             return fidx
         else:
             if offset > me:
@@ -102,9 +102,9 @@ def accumulate_cycles(overall_cycles, func_cycles, func_name):
     acc_cycles = overall_cycles[func_name]
     for id in func_cycles:
         assert id in acc_cycles, f"id [{id}] missing in the existing function record"
-        assert acc_cycles[id]["start"] == func_cycles[id]["start"], (
-            "Offset value doesn't match with the existing function record."
-        )
+        assert (
+            acc_cycles[id]["start"] == func_cycles[id]["start"]
+        ), "Offset value doesn't match with the existing function record."
         acc_cycles[id]["cycles"] += func_cycles[id]["cycles"]
         acc_cycles[id]["count"] += func_cycles[id]["count"]
     overall_cycles.update({func_name: acc_cycles})
@@ -248,9 +248,9 @@ def process_data(data, func_info, so_ld_addr):
             )
             ordered_visited_list.pop()
             entry_node = visited_set.pop(id)
-            assert entry_node["func_idx"] == func_idx, (
-                f'Error - Found under a different function name : {entry_node["func_idx"]}'
-            )
+            assert (
+                entry_node["func_idx"] == func_idx
+            ), f'Error - Found under a different function name : {entry_node["func_idx"]}'
             cycles = entry["cyc"] - entry_node["cyc"]
             parent = -1
             if ordered_visited_list:
@@ -283,8 +283,7 @@ def process_data(data, func_info, so_ld_addr):
         f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n"
         f"\t Possible reasons -- \n"
         f"\t\t1) Mismatch between model .so and json file.\n"
-        f"\t\t2) LWP buffer may have overflowed resulting into missing entries!"
-        % prev_func_name
+        f"\t\t2) LWP buffer may have overflowed resulting into missing entries!" % prev_func_name
     )
 
     overall_cycles = adjust_per_loop_counts(overall_cycles, data)
@@ -301,9 +300,9 @@ def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log:
             # If the directory name is specified for the run_log of the
             # simulator (stdout.txt) then it must be same as lwp_json.
             run_log_dir = os.path.dirname(run_log)
-            assert run_log_dir == "" or run_log_dir == basedir, (
-                f"stdout.txt and {os.path.basename(lwp_json)} must be in the same directory"
-            )
+            assert (
+                run_log_dir == "" or run_log_dir == basedir
+            ), f"stdout.txt and {os.path.basename(lwp_json)} must be in the same directory"
             run_log = os.path.join(basedir, os.path.basename(run_log))
         # To extract load address for the simulator run
         pattern = compile(r"Model.*: (\w+):")
@@ -317,7 +316,7 @@ def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log:
         a = pattern.search(lines)
         load_addr = int(a.group(1), 16)
     if ENABLE_DEBUG:
-      print('load_addr : ', load_addr)
+        print("load_addr : ", load_addr)
     return load_addr
 
 
@@ -338,8 +337,8 @@ def process_lwp_output(
     load_addr = get_load_addr(binary_path, serial_number, lwp_json, run_log)
     # Opening JSON file
     with open(lwp_json, "r") as f:
-      # Returns JSON object as a dictionary
-      data = json.load(f)
+        # Returns JSON object as a dictionary
+        data = json.load(f)
 
     # Get function names, and their start and end offsets from the model .so
     func_info = get_func_info(binary_path)
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index b895b05bd71b..527c79754796 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -157,7 +157,7 @@
     "apps/microtvm/reference-vm/base-box/Vagrantfile.packer-template",
     # Hexagon
     "src/runtime/hexagon/rpc/android_bash.sh.template",
-    "src/runtime/hexagon/profiler/lwp_handler.S"
+    "src/runtime/hexagon/profiler/lwp_handler.S",
 }
 
 
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 7da69b872f8d..a7215736ed02 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -575,7 +575,7 @@ def test_lwp(
     hexagon_server_process,
     hexagon_launcher: HexagonLauncherRPC,
     hexagon_session: Session,
-    hexagon_debug
+    hexagon_debug,
 ):
     dtype = "float32"
     data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
@@ -642,7 +642,7 @@ def test_lwp_multiple_conv2d(
     hexagon_server_process,
     hexagon_launcher: HexagonLauncherRPC,
     hexagon_session: Session,
-    hexagon_debug
+    hexagon_debug,
 ):
     dtype = "float32"
     input_shape = (1, 8, 8, 3)

From fd52d05fc42a6c018969bea49771acf411694bb2 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Wed, 19 Oct 2022 11:02:17 -0500
Subject: [PATCH 08/13] Fix pylint errors

---
 .../tvm/contrib/hexagon/hexagon_profiler.py   | 19 ++++++++++++-------
 .../contrib/test_hexagon/test_launcher.py     |  4 ++--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
index a16b5528d539..1cdc0a7663c9 100755
--- a/python/tvm/contrib/hexagon/hexagon_profiler.py
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -15,10 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
+"""Define HexagonProfiler class to enable profiling for Hexagon"""
+
 import os
 import subprocess
 from tvm.ir.transform import PassContext
-from tvm.contrib.hexagon.session import Session
 from tvm.contrib.hexagon.profiling.process_lwp_data import process_lwp_output
 from tvm.relay.backend.executor_factory import ExecutorFactoryModule
 from tvm.contrib import utils
@@ -50,17 +51,20 @@ def __init__(self, module: ExecutorFactoryModule, hexagon_server_process, enable
 
             launcher = hexagon_server_process["launcher"]
             if self._android_serial_number != "simulator":
-                # Clear the logcat buffer and create a child process to redirect logcat output into a file.
+                # Clear the logcat buffer and create a child process to redirect logcat output
+                # into a file.
                 subprocess.check_call(launcher._adb_device_sub_cmd + ["logcat", "-c"])
                 self._logcat_path = self._temp_dir.relpath("logcat.log")
-                f = open(self._logcat_path, "w")
-                self._proc = subprocess.Popen(launcher._adb_device_sub_cmd + ["logcat"], stdout=f)
+                self._fo = open(self._logcat_path, "w")
+                self._proc = subprocess.Popen(
+                    launcher._adb_device_sub_cmd + ["logcat"], stdout=self._fo
+                )
 
                 # Get the remote workspace on the device from where the lwp data needs to be copied.
                 self._remote_path = launcher._workspace
 
         if self._profiling_mode is None:
-            raise "Profiling mode was not set or was not a valid one."
+            raise RuntimeError("Profiling mode was not set or was not a valid one.")
 
     def get_mode(self):
         return self._profiling_mode
@@ -74,8 +78,8 @@ def get_temp_dir(self):
     def get_remote_path(self):
         return self._remote_path
 
-    def get_profile_output(self, hexagon_launcher, hexagon_session, hexagon_server_process):
-        # Get runtime profiling data
+    def get_profile_output(self, hexagon_launcher, hexagon_session):
+        """Get runtime profiling data"""
         prof_out = hexagon_launcher.get_profile_output(self, hexagon_session)
 
         print("lwp json can be found at -- ", prof_out)
@@ -101,6 +105,7 @@ def get_profile_output(self, hexagon_launcher, hexagon_session, hexagon_server_p
         else:
             # For on-device run
             self._proc.kill()  # End the child process for logcat
+            self._fo.close()
             if os.path.exists(self._logcat_path):
                 process_lwp_output(
                     self._dso_binary_path,
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index a7215736ed02..12df1798859e 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -619,7 +619,7 @@ def test_lwp(
     hexagon_output = graph_mod.get_output(0).numpy()
 
     # Get lightweight profiling output as a CSV file
-    profiler.get_profile_output(hexagon_launcher, hexagon_session, hexagon_server_process)
+    profiler.get_profile_output(hexagon_launcher, hexagon_session)
 
     target_llvm = tvm.target.Target("llvm")
     with tvm.transform.PassContext(opt_level=3):
@@ -706,7 +706,7 @@ def test_lwp_multiple_conv2d(
     hexagon_output = graph_mod.get_output(0).numpy()
 
     # Get lightweight profiling output as a CSV file
-    profiler.get_profile_output(hexagon_launcher, hexagon_session, hexagon_server_process)
+    profiler.get_profile_output(hexagon_launcher, hexagon_session)
 
     target_llvm = tvm.target.Target("llvm")
     with tvm.transform.PassContext(opt_level=3):

From 39204ae459ee0a459acfd2034aa66fd86a53d692 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Wed, 19 Oct 2022 12:10:05 -0500
Subject: [PATCH 09/13] Address remaining linter failures

---
 python/tvm/contrib/hexagon/build.py       | 11 +++++------
 src/runtime/hexagon/profiler/prof_utils.h | 10 ++++++++++
 src/target/llvm/codegen_hexagon.cc        |  3 ++-
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index e5799bb30c11..c0e6439d0357 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -32,10 +32,9 @@
 from typing import Union
 
 import tvm
+from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
 from ..._ffi import libinfo
 from .session import Session
-from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
-from tvm.contrib.utils import TempDirectory
 
 
 HEXAGON_RPC_LIB_DIR = os.environ.get("HEXAGON_RPC_LIB_DIR")
@@ -670,8 +669,8 @@ def get_profile_output(
             fname = "lwp.json"
             out_path = os.path.join(remote_path, fname)
             profile_data = temp_dir.relpath(fname)
-            rv = session.get_profile_output(hex_profiler.get_mode(), fname)
-            if rv:
+            ret = session.get_profile_output(hex_profiler.get_mode(), fname)
+            if ret:
                 subprocess.check_call(self._adb_device_sub_cmd + ["pull", out_path, profile_data])
             else:
                 raise RuntimeError("Error generating profile output")
@@ -797,8 +796,8 @@ def get_profile_output(
         if hex_profiler.is_lwp_enabled():
             fname = "lwp.json"
             profile_data = f"{self._workspace}/{fname}"
-            rv = session.get_profile_output(hex_profiler.get_mode(), fname)
-            if not rv:
+            ret = session.get_profile_output(hex_profiler.get_mode(), fname)
+            if not ret:
                 raise RuntimeError("Error generating profile output")
         elif hex_profiler.profiling_mode == "etm":
             raise RuntimeError("ETM Profiling not supported on the simulator")
diff --git a/src/runtime/hexagon/profiler/prof_utils.h b/src/runtime/hexagon/profiler/prof_utils.h
index 5c17ad58ad0a..e086f7a26b36 100644
--- a/src/runtime/hexagon/profiler/prof_utils.h
+++ b/src/runtime/hexagon/profiler/prof_utils.h
@@ -17,4 +17,14 @@
  * under the License.
  */
 
+/*!
+ * \file prof_utils.h
+ */
+#ifndef TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
+#define TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
+
+#include <string>
+
 bool WriteLWPOutput(const std::string&);
+
+#endif  // TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc
index 170d0682fa85..9552a45a60f9 100644
--- a/src/target/llvm/codegen_hexagon.cc
+++ b/src/target/llvm/codegen_hexagon.cc
@@ -196,7 +196,8 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) {
 
 llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) {
 #if TVM_LLVM_VERSION >= 150
-  if (op->op.same_as(builtin::start_profile_intrinsic()) || op->op.same_as(builtin::end_profile_intrinsic())) {
+  if (op->op.same_as(builtin::start_profile_intrinsic()) ||
+      op->op.same_as(builtin::end_profile_intrinsic())) {
     llvm::Value* id = MakeValue(op->args[0]);
     auto instrprof_id = llvm::Intrinsic::hexagon_instrprof_custom;
     llvm::Function* func = llvm::Intrinsic::getDeclaration(module_.get(), instrprof_id);

From d1aae5734f2916476dbdd09683bb7455b5e4b663 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Wed, 19 Oct 2022 13:03:03 -0500
Subject: [PATCH 10/13] clang-format issue

---
 src/target/llvm/codegen_llvm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 143e2d176c2f..ea8a5ff5106a 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1319,7 +1319,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     LOG(FATAL) << "CPU backend does not support atomic add yet.";
     return nullptr;
   } else if (op->op.same_as(builtin::start_profile_intrinsic()) ||
-    op->op.same_as(builtin::end_profile_intrinsic())) {
+             op->op.same_as(builtin::end_profile_intrinsic())) {
     LOG(INFO) << "Ignoring profile_intrinsic ... " << op->op;
     return nullptr;
   } else {

From c93cc2311abbcd764263921914ddc54d158cd032 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Wed, 19 Oct 2022 14:33:08 -0500
Subject: [PATCH 11/13] Fix builtin names

---
 .../test_tir_transform_profiling_instr.py     | 80 +++++++++----------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/tests/python/unittest/test_tir_transform_profiling_instr.py b/tests/python/unittest/test_tir_transform_profiling_instr.py
index 3a63963d5e2b..d14e2a4c8925 100644
--- a/tests/python/unittest/test_tir_transform_profiling_instr.py
+++ b/tests/python/unittest/test_tir_transform_profiling_instr.py
@@ -109,18 +109,18 @@ def test1_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     for i, j in T.grid(8, 8):
-        T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
         for k, l in T.grid(8, 16):
             with T.block("B"):
                 vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                 B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
-        T.evaluate(T.profile_intrinsic(3, dtype="handle"))
-        T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
         for k, l in T.grid(8, 16):
             with T.block("C"):
                 vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                 C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
-        T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
 
 
 @T.prim_func
@@ -128,9 +128,9 @@ def test2_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
-    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
     for i in T.serial(0, 8):
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.serial(0, 8):
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
@@ -142,8 +142,8 @@ def test2_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
-    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
 
 
 @T.prim_func
@@ -151,26 +151,26 @@ def test3_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
-    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
     for i in T.serial(0, 8):
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.serial(0, 8):
-            T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
-            T.evaluate(T.profile_intrinsic(3, dtype="handle"))
-            T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+            T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
-            T.evaluate(T.profile_intrinsic(5, dtype="handle"))
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
-    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+            T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
 
 
 @T.prim_func
@@ -180,36 +180,36 @@ def test4_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) ->
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     D = T.match_buffer(d, (8, 8, 128), dtype="int32")
     for i in T.serial(0, 8):
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.serial(0, 8):
-            T.evaluate(T.profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
-            T.evaluate(T.profile_intrinsic(3, dtype="handle"))
-            T.evaluate(T.profile_intrinsic(5, dtype="handle"))
+            T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
-            T.evaluate(T.profile_intrinsic(5, dtype="handle"))
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
-        T.evaluate(T.profile_intrinsic(7, dtype="handle"))
+            T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
         for j in T.serial(0, 8):
-            T.evaluate(T.profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
             for k, l in T.grid(8, 16):
                 with T.block("C"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
-            T.evaluate(T.profile_intrinsic(8, dtype="handle"))
-            T.evaluate(T.profile_intrinsic(10, dtype="handle"))
+            T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
-            T.evaluate(T.profile_intrinsic(10, dtype="handle"))
-        T.evaluate(T.profile_intrinsic(7, dtype="handle"))
+            T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))
 
 
 @T.prim_func
@@ -217,9 +217,9 @@ def test5_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
-    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
     for i in T.serial(0, 8):
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.serial(0, 8):
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
@@ -231,8 +231,8 @@ def test5_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
-    T.evaluate(T.profile_intrinsic(1, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
 
 
 @T.prim_func
@@ -242,7 +242,7 @@ def test6_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) ->
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     D = T.match_buffer(d, (8, 8, 128), dtype="int32")
     for i in T.serial(0, 8):
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.parallel(0, 8):
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
@@ -254,24 +254,24 @@ def test6_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) ->
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
-        T.evaluate(T.profile_intrinsic(2, dtype="handle"))
-        T.evaluate(T.profile_intrinsic(7, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
         for j in T.serial(0, 8):
-            T.evaluate(T.profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
             for k in T.parallel(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
-            T.evaluate(T.profile_intrinsic(8, dtype="handle"))
-            T.evaluate(T.profile_intrinsic(10, dtype="handle"))
+            T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
             for k in T.parallel(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
-            T.evaluate(T.profile_intrinsic(10, dtype="handle"))
-        T.evaluate(T.profile_intrinsic(7, dtype="handle"))
+            T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))
 
 
 # By default, only loops with siblings are instrumented.

From 514f8cfea74d0129e521286be43f06e6431f75a7 Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Wed, 19 Oct 2022 22:39:53 -0500
Subject: [PATCH 12/13] Resolve test failure for the simulator run

---
 python/tvm/contrib/hexagon/hexagon_profiler.py           | 2 +-
 python/tvm/contrib/hexagon/profiling/process_lwp_data.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
index 1cdc0a7663c9..fdecc09c3164 100755
--- a/python/tvm/contrib/hexagon/hexagon_profiler.py
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -49,10 +49,10 @@ def __init__(self, module: ExecutorFactoryModule, hexagon_server_process, enable
             # Set profiling mode
             self._profiling_mode = "lwp"
 
-            launcher = hexagon_server_process["launcher"]
             if self._android_serial_number != "simulator":
                 # Clear the logcat buffer and create a child process to redirect logcat output
                 # into a file.
+                launcher = hexagon_server_process["launcher"]
                 subprocess.check_call(launcher._adb_device_sub_cmd + ["logcat", "-c"])
                 self._logcat_path = self._temp_dir.relpath("logcat.log")
                 self._fo = open(self._logcat_path, "w")
diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
index cee65bba44e6..eb92228b7cf3 100644
--- a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
+++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
@@ -350,6 +350,7 @@ def process_lwp_output(
     overall_cycles = process_data(data, func_info, so_ld_addr)
     create_csv_report(overall_cycles, lwp_out)
     print("lwp processed output written to -- ", lwp_out)
+    print("[NOTE: Use '--hexagon-debug' to keep the temp directory]")
 
 
 def get_args():

From 80353b3bd04c2b47c63df02a94da153b84f7a40c Mon Sep 17 00:00:00 2001
From: Jyotsna Verma <jverma@quicinc.com>
Date: Thu, 20 Oct 2022 17:17:53 -0500
Subject: [PATCH 13/13] Allow for the tests to provide .so name

---
 python/tvm/contrib/hexagon/hexagon_profiler.py     | 5 +++--
 tests/python/contrib/test_hexagon/test_launcher.py | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
index fdecc09c3164..9a5df3d9b99a 100755
--- a/python/tvm/contrib/hexagon/hexagon_profiler.py
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -28,10 +28,11 @@
 class HexagonProfiler:
     """Hexagon Profiler"""
 
-    def __init__(self, module: ExecutorFactoryModule, hexagon_server_process, enable_debug):
+    def __init__(
+        self, dso_binary: str, module: ExecutorFactoryModule, hexagon_server_process, enable_debug
+    ):
         """Configure HexagonProfiler"""
         # Save test .so to process profiling data
-        dso_binary = "test_binary.so"
         self._temp_dir = utils.tempdir(keep_for_debug=enable_debug)
         self._dso_binary_path = self._temp_dir.relpath(dso_binary)
         module.get_lib().save(self._dso_binary_path)
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 12df1798859e..76d5cba60a1f 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -611,7 +611,8 @@ def test_lwp(
             executor=executor,
         )
         # Create HexagonProfiler object
-        profiler = HexagonProfiler(lowered, hexagon_server_process, hexagon_debug)
+        dso_binary = "test_binary.so"
+        profiler = HexagonProfiler(dso_binary, lowered, hexagon_server_process, hexagon_debug)
 
     graph_mod = hexagon_session.get_executor_from_factory(lowered)
     graph_mod.set_input(**params)
@@ -698,7 +699,8 @@ def test_lwp_multiple_conv2d(
             executor=executor,
         )
         # Create HexagonProfiler object
-        profiler = HexagonProfiler(lowered, hexagon_server_process, hexagon_debug)
+        dso_binary = "test_binary.so"
+        profiler = HexagonProfiler(dso_binary, lowered, hexagon_server_process, hexagon_debug)
 
     graph_mod = hexagon_session.get_executor_from_factory(lowered)
     graph_mod.set_input(**params)