diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md
index cc433f245759..943a6266a3b4 100644
--- a/apps/hexagon_launcher/README.md
+++ b/apps/hexagon_launcher/README.md
@@ -189,6 +189,46 @@ lowered = tvm.relay.build(
 lowered.export_library("model-aot.so", tvm.contrib.hexagon.link_shared)
 ```
 
+
+## Profiling using hexagon launcher
+
+### Enabling lightweight profiling (LWP) instrumentation
+
+This profiling option can be used to get function and loop level processor cycles.
+This needs to be enabled explicitly while compiling a model. For example:
+
+```
+with tvm.transform.PassContext(config={'tir.instrument_lwp':True} ):
+    lib = relay.build(...)
+```
+
+Here, `instrument_lwp` is used to enable the tir pass which instruments the code with the builtin calls.
+
+During codegen, profiling builtin calls can be replaced with a target specific handler to record runtime
+information into a buffer. This buffer is written into a JSON file which is processed to construct
+function and loop level profiling information.
+
+To generate LWP JSON file, add `--gen_lwp_json` flag to launcher_android:
+
+```
+./launcher_android --in_config input.json --out_config output.json --gen_lwp_json
+```
+
+Please note that `--gen_lwp_json` flag by itself doesn't enable profiling and is only used to dump
+the profiling data into a json file called lwp.json. This file will be created at the same location
+on the device where launcher_android is executed from. To generate the data, profiling instrumentation
+must be enabled while compiling a model as mentioned above.
+
+Use this command to pull `lwp.json` from the device:
+
+```
+adb -s <DEVICE-ID> pull /path/to/lwp.json
+```
+
+**Note:** Please refer to src/runtime/hexagon/profiler/README.md for information on how
+to enable profiling using Hexagon RPC launcher and also to learn about additional profiling related
+config options.
+
 # Disclaimer
 
 The launcher does not perform any correctness verification. In order to verify
diff --git a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
index 5fae6b0a4099..e8bd67dde7a2 100644
--- a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
+++ b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt
@@ -16,7 +16,7 @@
 # under the License.
 
 cmake_minimum_required(VERSION 3.2)
-project(HexagonLauncherRPCSkel C CXX)
+project(HexagonLauncherRPCSkel C CXX ASM)
 
 include("${CMAKE_CURRENT_SOURCE_DIR}/../HexagonLauncher.cmake")
 # From the include above get
@@ -68,11 +68,14 @@ set(SKEL_SRCS
   "${LAUNCHER_SRC}/launcher_core.cc"
   "${LAUNCHER_SRC}/launcher_hexagon.cc"
 )
+set(PROFILER_DIR "${TVM_SOURCE_DIR}/src/runtime/hexagon/profiler")
 
 add_library(launcher_rpc_skel SHARED
   "${LAUNCHER_RPC_H}"
   "${LAUNCHER_RPC_SKEL_C}"
   "${SKEL_SRCS}"
+  "${PROFILER_DIR}/prof_utils.cc"
+  "${PROFILER_DIR}/lwp_handler.S"
 )
 
 ExternalProject_Add(static_hexagon_tvm_runtime
diff --git a/apps/hexagon_launcher/launcher_android.cc b/apps/hexagon_launcher/launcher_android.cc
index 008e4fdfe1c4..34db0bdacb60 100644
--- a/apps/hexagon_launcher/launcher_android.cc
+++ b/apps/hexagon_launcher/launcher_android.cc
@@ -55,7 +55,8 @@ AEEResult set_remote_stack_size(int size) {
 }
 
 struct RPCChannel : public ExecutionSession {
-  explicit RPCChannel(const std::string& uri) {
+  explicit RPCChannel(const std::string& uri, bool gen_lwp_json = false)
+      : ExecutionSession(gen_lwp_json) {
     enable_unsigned_pd(true);
     set_remote_stack_size(128 * 1024);
 
@@ -127,7 +128,7 @@ struct RPCChannel : public ExecutionSession {
   }
 
   bool run(uint64_t* pcycles, uint64_t* usecs) override {
-    AEEResult rc = launcher_rpc_run(handle, pcycles, usecs);
+    AEEResult rc = launcher_rpc_run(handle, pcycles, usecs, gen_lwp_json);
     if (rc != AEE_SUCCESS) {
       std::cout << "error running model: " << std::hex << rc << '\n';
     }
@@ -158,8 +159,8 @@ struct RPCChannel : public ExecutionSession {
   std::vector<void*> allocations;
 };
 
-ExecutionSession* create_execution_session() {
-  auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN);
+ExecutionSession* create_execution_session(bool gen_lwp_json) {
+  auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN, gen_lwp_json);
   if (session->handle == -1) {
     delete session;
     session = nullptr;
diff --git a/apps/hexagon_launcher/launcher_core.h b/apps/hexagon_launcher/launcher_core.h
index a32bf937af58..da0dfcbbd5a6 100644
--- a/apps/hexagon_launcher/launcher_core.h
+++ b/apps/hexagon_launcher/launcher_core.h
@@ -94,6 +94,8 @@ struct Model {
 };
 
 struct ExecutionSession {
+  explicit ExecutionSession(bool lwp_json = false) : gen_lwp_json(lwp_json) {}
+
   template <typename T>
   T* alloc(size_t bytes, size_t align = 1) {
     return reinterpret_cast<T*>(alloc_mem(bytes, align));
@@ -111,6 +113,7 @@ struct ExecutionSession {
   virtual bool get_num_outputs(int* num_outputs) = 0;
   virtual bool get_output(int output_idx, tensor_meta* output_meta, int meta_size,
                           void* output_data, int data_size) = 0;
+  bool gen_lwp_json = false;
 };
 
 bool read_model_config(const std::string& file_name, ModelConfig* model_config);
diff --git a/apps/hexagon_launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc
index 03524661c4e6..2692caf90e66 100644
--- a/apps/hexagon_launcher/launcher_hexagon.cc
+++ b/apps/hexagon_launcher/launcher_hexagon.cc
@@ -35,6 +35,7 @@ extern "C" {
 #include "launcher_rpc.h"
 
 static std::unique_ptr<Model> TheModel;
+bool WriteLWPOutput(const std::string&);
 
 static AEEResult error_too_small(const std::string& func_name, const std::string& value_name,
                                  int given, int needed) {
@@ -203,7 +204,7 @@ AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int out
 }
 
 AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcycles,
-                                          uint64_t* usecs) {
+                                          uint64_t* usecs, int gen_lwp_json) {
   if (!TheModel) {
     // No model created.
     LOG(ERROR) << __func__ << ": no model created";
@@ -220,5 +221,12 @@ AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcyc
   *pcycles = pc_end - pc_begin;
   *usecs = us_end - us_begin;
 
+  if (gen_lwp_json) {
+    if (!WriteLWPOutput("lwp.json")) {
+      LOG(ERROR) << "ERROR: failed to generate lwp json file";
+      return AEE_EFAILED;
+    }
+  }
+
   return AEE_SUCCESS;
 }
diff --git a/apps/hexagon_launcher/launcher_main.cc b/apps/hexagon_launcher/launcher_main.cc
index 163d582db440..1ef3b5d2ff3c 100644
--- a/apps/hexagon_launcher/launcher_main.cc
+++ b/apps/hexagon_launcher/launcher_main.cc
@@ -27,12 +27,14 @@
 #include "launcher_core.h"
 #include "launcher_util.h"
 
-ExecutionSession* create_execution_session();
+ExecutionSession* create_execution_session(bool gen_lwp_json);
 
-int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path) {
+int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path,
+                       bool* gen_lwp_json) {
   static option long_options[] = {
       {"in_config", required_argument, nullptr, 0},
       {"out_config", required_argument, nullptr, 0},
+      {"gen_lwp_json", optional_argument, nullptr, 0},
   };
 
   bool show_usage = false;
@@ -49,6 +51,9 @@ int parse_command_line(int argc, char* argv[], std::string* in_path, std::string
       case 1:
         *out_path = std::string(optarg);
         break;
+      case 2:
+        *gen_lwp_json = true;
+        break;
     }
   }
   if (in_path->empty() || out_path->empty() || show_usage) {
@@ -61,7 +66,8 @@ int parse_command_line(int argc, char* argv[], std::string* in_path, std::string
 
 int main(int argc, char* argv[]) {
   std::string in_path, out_path;
-  if (parse_command_line(argc, argv, &in_path, &out_path) != 0) {
+  bool gen_lwp_json;
+  if (parse_command_line(argc, argv, &in_path, &out_path, &gen_lwp_json) != 0) {
     return 1;
   }
 
@@ -70,7 +76,7 @@ int main(int argc, char* argv[]) {
     return 1;
   }
 
-  ExecutionSession* session_ptr = create_execution_session();
+  ExecutionSession* session_ptr = create_execution_session(gen_lwp_json);
   if (session_ptr == nullptr) {
     return 1;
   }
diff --git a/apps/hexagon_launcher/launcher_rpc.idl b/apps/hexagon_launcher/launcher_rpc.idl
index 6677108a76f0..27e5d1d15d68 100644
--- a/apps/hexagon_launcher/launcher_rpc.idl
+++ b/apps/hexagon_launcher/launcher_rpc.idl
@@ -29,5 +29,5 @@ interface launcher_rpc : remote_handle64 {
   AEEResult set_input(in long input_idx, in buffer input_meta, in buffer input_value);
   AEEResult get_num_outputs(rout long num_outputs);
   AEEResult get_output(in long output_idx, rout buffer output_meta, rout buffer output_value);
-  AEEResult run(rout uint64_t pcycles, rout uint64_t usecs);
+  AEEResult run(rout uint64_t pcycles, rout uint64_t usecs, in long gen_lwp_json);
 };
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 399623ef1c3e..735d21e492b5 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -281,10 +281,14 @@ if(USE_HEXAGON_RPC)
       # TODO(masahi): Remove rpc_local_session.cc after verifying that things work without it
       "${TVMRT_SOURCE_DIR}/rpc/rpc_local_session.cc"
     )
+    set(HEXAGON_PROFILER_DIR "${TVMRT_SOURCE_DIR}/hexagon/profiler")
     # Add the hardware-specific RPC code into the skel library.
+    set_property(SOURCE ${HEXAGON_PROFILER_DIR}/lwp_handler.S PROPERTY LANGUAGE C)
     add_library(hexagon_rpc_skel SHARED
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon/rpc_server.cc"
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon_rpc_skel.c"
+      "${HEXAGON_PROFILER_DIR}/prof_utils.cc"
+      "${HEXAGON_PROFILER_DIR}/lwp_handler.S"
     )
     target_include_directories(hexagon_rpc_skel
       SYSTEM PRIVATE "${TVMRT_SOURCE_DIR}/hexagon/rpc"
@@ -293,6 +297,8 @@ if(USE_HEXAGON_RPC)
     # executed via run_main_on_sim.
     add_library(hexagon_rpc_sim SHARED
       "${TVMRT_SOURCE_DIR}/hexagon/rpc/simulator/rpc_server.cc"
+      "${HEXAGON_PROFILER_DIR}/prof_utils.cc"
+      "${HEXAGON_PROFILER_DIR}/lwp_handler.S"
     )
     target_link_libraries(hexagon_rpc_sim
       -Wl,--whole-archive tvm_runtime -Wl,--no-whole-archive
diff --git a/cmake/modules/HexagonSDK.cmake b/cmake/modules/HexagonSDK.cmake
index 173f0f3b2d67..ddb158cad95e 100644
--- a/cmake/modules/HexagonSDK.cmake
+++ b/cmake/modules/HexagonSDK.cmake
@@ -157,9 +157,15 @@ function(_get_hexagon_sdk_property_impl
     if(_property STREQUAL "SDK_INCLUDE")
       set(_dirs "${_hexagon_sdk_root}/incs" "${_hexagon_sdk_root}/incs/stddef")
     elseif(_property STREQUAL "QURT_INCLUDE")
+      # Set the Hexagon arch directory for runtime linker.
+      set(_rtld_dir "hexagon_toolv84_${_hexagon_arch}")
+      if(_hexagon_arch STREQUAL "v69")
+        set(_rtld_dir "hexagon_toolv84_v68") # Use hexagon_toolv84_v68 for v69
+      endif()
       set(_dirs
         "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/include/posix"
         "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/include/qurt"
+        "${_hexagon_sdk_root}/ipc/fastrpc/rtld/ship/${_rtld_dir}"
       )
     elseif(_property STREQUAL "QURT_LIB")
       set(_dirs "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/lib/pic")
diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index a1a97595bfd8..9f6b7f9ce5d1 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -747,6 +747,16 @@ TVM_DLL const Op& assume();
  */
 TVM_DLL const Op& undef();
 
+/*!
+ * \brief Profiling intrinsic
+ */
+TVM_DLL const Op& start_profile_intrinsic();
+
+/*!
+ * \brief Profiling intrinsic
+ */
+TVM_DLL const Op& end_profile_intrinsic();
+
 /*! \brief The kind of structure field info used in intrinsic */
 enum TVMStructFieldKind : int {
   // array head address
diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index e31919fbd223..48372565469b 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -690,6 +690,12 @@ TVM_DLL Pass RemoveWeightLayoutRewriteBlock(bool skip_ndarray_rewrite = false);
  */
 TVM_DLL Pass ManifestSharedMemoryLocalStage();
 
+/*!
+ * \brief Insert intrinsic calls to instrument function and loop level profiling.
+ * \return The pass.
+ */
+TVM_DLL Pass InstrumentProfileIntrinsics();
+
 }  // namespace transform
 }  // namespace tir
 }  // namespace tvm
diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py
index 8105e6e716c0..c0e6439d0357 100644
--- a/python/tvm/contrib/hexagon/build.py
+++ b/python/tvm/contrib/hexagon/build.py
@@ -32,6 +32,7 @@
 from typing import Union
 
 import tvm
+from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
 from ..._ffi import libinfo
 from .session import Session
 
@@ -336,6 +337,29 @@ def get_graph_debug_executor(
         """
         return session.get_graph_debug_executor(graph_json, module, dump_root=dump_root)
 
+    @abc.abstractmethod
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+    ) -> str:
+        """Extract profile output.
+
+        Parameters
+        ----------
+        hex_profiler : HexagonProfiler
+            HexagonProfiler object that contains the profiling related information.
+        session : Session
+            Remote session. The session must be established (via __enter__)
+            prior to calling this function.
+
+        Returns
+        -------
+        profile_data : str
+            Path of the profiling data file
+        """
+        ...
+
 
 class HexagonLauncherAndroid(HexagonLauncherRPC):
     """Hexagon Launcher for Android."""
@@ -392,6 +416,7 @@ def _copy_to_remote(
         self, local_path: Union[str, pathlib.Path], remote_path: Union[str, pathlib.Path]
     ):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
+
         _check_call_verbose(self._adb_device_sub_cmd + ["push", str(local_path), str(remote_path)])
 
     def _create_remote_directory(self, remote_path: Union[str, pathlib.Path]) -> pathlib.Path:
@@ -629,6 +654,32 @@ def stop_server(self):
         if not self._hexagon_debug:
             self.cleanup_directory()
 
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+    ):
+        """Abstract method implementation. See description in HexagonLauncherRPC."""
+        profile_data = ""
+        if hex_profiler.is_lwp_enabled():
+            temp_dir = hex_profiler.get_temp_dir()
+            remote_path = hex_profiler.get_remote_path()
+            if not temp_dir:
+                raise RuntimeError("tempdir not passed")
+            fname = "lwp.json"
+            out_path = os.path.join(remote_path, fname)
+            profile_data = temp_dir.relpath(fname)
+            ret = session.get_profile_output(hex_profiler.get_mode(), fname)
+            if ret:
+                subprocess.check_call(self._adb_device_sub_cmd + ["pull", out_path, profile_data])
+            else:
+                raise RuntimeError("Error generating profile output")
+        elif hex_profiler.profiling_mode == "etm":
+            hex_profiler.pull_files_for_etm_processing(self._workspace)
+        else:
+            raise RuntimeError("Profiling not enabled")
+        return profile_data
+
 
 class HexagonLauncherSimulator(HexagonLauncherRPC):
     """Hexagon Launcher for Hexagon simulator."""
@@ -735,6 +786,26 @@ def stop_server(self):
         """Abstract method implementation. See description in HexagonLauncherRPC."""
         self._server_process.terminate()
 
+    def get_profile_output(
+        self,
+        hex_profiler: HexagonProfiler,
+        session: Session,
+    ):
+        """Abstract method implementation. See description in HexagonLauncherRPC."""
+        profile_data = ""
+        if hex_profiler.is_lwp_enabled():
+            fname = "lwp.json"
+            profile_data = f"{self._workspace}/{fname}"
+            ret = session.get_profile_output(hex_profiler.get_mode(), fname)
+            if not ret:
+                raise RuntimeError("Error generating profile output")
+        elif hex_profiler.profiling_mode == "etm":
+            raise RuntimeError("ETM Profiling not supported on the simulator")
+        else:
+            raise RuntimeError("Profiling not enabled")
+
+        return profile_data
+
 
 # https://stackoverflow.com/a/52872579/2689797
 def _is_port_in_use(port: int) -> bool:
diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py
new file mode 100755
index 000000000000..9a5df3d9b99a
--- /dev/null
+++ b/python/tvm/contrib/hexagon/hexagon_profiler.py
@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Define HexagonProfiler class to enable profiling for Hexagon"""
+
+import os
+import subprocess
+from tvm.ir.transform import PassContext
+from tvm.contrib.hexagon.profiling.process_lwp_data import process_lwp_output
+from tvm.relay.backend.executor_factory import ExecutorFactoryModule
+from tvm.contrib import utils
+
+
+class HexagonProfiler:
+    """Hexagon Profiler"""
+
+    def __init__(
+        self, dso_binary: str, module: ExecutorFactoryModule, hexagon_server_process, enable_debug
+    ):
+        """Configure HexagonProfiler"""
+        # Save test .so to process profiling data
+        self._temp_dir = utils.tempdir(keep_for_debug=enable_debug)
+        self._dso_binary_path = self._temp_dir.relpath(dso_binary)
+        module.get_lib().save(self._dso_binary_path)
+
+        self._android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER")
+        self._remote_path = ""
+        self._logcat_path = ""
+
+        self._profiling_mode = None
+        config = PassContext.current().config
+        if self._android_serial_number is None:
+            raise RuntimeError("ANDROID_SERIAL_NUMBER must be set for profiling")
+
+        if ("tir.instrument_lwp", True) in config.items():
+            # Set profiling mode
+            self._profiling_mode = "lwp"
+
+            if self._android_serial_number != "simulator":
+                # Clear the logcat buffer and create a child process to redirect logcat output
+                # into a file.
+                launcher = hexagon_server_process["launcher"]
+                subprocess.check_call(launcher._adb_device_sub_cmd + ["logcat", "-c"])
+                self._logcat_path = self._temp_dir.relpath("logcat.log")
+                self._fo = open(self._logcat_path, "w")
+                self._proc = subprocess.Popen(
+                    launcher._adb_device_sub_cmd + ["logcat"], stdout=self._fo
+                )
+
+                # Get the remote workspace on the device from where the lwp data needs to be copied.
+                self._remote_path = launcher._workspace
+
+        if self._profiling_mode is None:
+            raise RuntimeError("Profiling mode was not set or was not a valid one.")
+
+    def get_mode(self):
+        return self._profiling_mode
+
+    def is_lwp_enabled(self):
+        return self._profiling_mode == "lwp"
+
+    def get_temp_dir(self):
+        return self._temp_dir
+
+    def get_remote_path(self):
+        return self._remote_path
+
+    def get_profile_output(self, hexagon_launcher, hexagon_session):
+        """Get runtime profiling data"""
+        prof_out = hexagon_launcher.get_profile_output(self, hexagon_session)
+
+        print("lwp json can be found at -- ", prof_out)
+
+        # Process lightweight profiling output into an easily readable csv file
+        # The post-processing requires following parameters:
+        # 1) Path of the binary file
+        # 2) android_serial_number
+        # 3) Path of the lwp json file (lwp.json) which gets created in the current directory
+        # 4) Path to the run log depending on the environment:
+        #    a) For on-device runs:
+        #       Use logcat output as the run log
+        #    b) For simulator runs:
+        #       Use "stdout.txt" as the run log. There is no need to specify the full path to
+        #       "stdout.txt" as it will be inferred based on 'prof_out' location.
+        # 5) lwp processed output file -  "lwp.csv"
+        #
+        lwp_csv = self._temp_dir.relpath("lwp.csv")
+        if self._android_serial_number == "simulator":
+            process_lwp_output(
+                self._dso_binary_path, self._android_serial_number, prof_out, "stdout.txt", lwp_csv
+            )
+        else:
+            # For on-device run
+            self._proc.kill()  # End the child process for logcat
+            self._fo.close()
+            if os.path.exists(self._logcat_path):
+                process_lwp_output(
+                    self._dso_binary_path,
+                    self._android_serial_number,
+                    prof_out,
+                    self._logcat_path,
+                    lwp_csv,
+                )
+            else:
+                raise RuntimeError("Error processing lwp output - missing logcat file")
diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
new file mode 100644
index 000000000000..eb92228b7cf3
--- /dev/null
+++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py
@@ -0,0 +1,388 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+import csv
+import subprocess
+import argparse
+import os
+from re import search, compile
+from collections import OrderedDict
+
+ENABLE_DEBUG = False
+"""
+Process lightweight profiling output and generate a CSV file with processor
+cycles for the instrumented functions and loops.
+
+Please note that some assumptions have been made while processing
+the lightweight profiling output. They are as follows:
+
+1) We don't expect profiled functions to call another profiled function.
+  This constraint can be relaxed if needed but it simplifies the processing
+  significantly without introducing any limitations for our use case.
+2) For now, it's also assumed that every unique section (loop) ID has same start
+  and end offset which will not be true while a loop gets unrolled as it will
+  create multiple profiling section with the same ID. The current
+  implementation doesn't handle this case.
+
+"""
+
+
+def get_func_info(model_so):
+    """Get all the .text sections along with their start and end offset values"""
+    hexagon_nm_path = os.environ["HEXAGON_TOOLCHAIN"] + "/bin/hexagon-nm"
+    out = subprocess.Popen(
+        [hexagon_nm_path, "--print-size", model_so],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+    )
+    stdo, stde = out.communicate()
+    stdo = stdo.decode("utf-8")
+
+    func_info = []
+    for l in stdo.split("\n"):
+        info = {}
+        if search(" (T|t) ", l):  # If .text section
+            parts = l.split(" ")
+            assert len(parts) == 4
+            info["start"] = int(parts[0], base=16)
+            info["end"] = int(parts[0], base=16) + int(parts[1], base=16)
+            info["name"] = parts[3]
+            func_info.append(info)
+
+    # Sort the entries in the increasing order of the start offset value.
+    func_info = sorted(func_info, key=lambda d: d["start"])
+
+    if ENABLE_DEBUG:
+        print("func_info :\n ")
+        for f in func_info:
+            print(f)
+    return func_info
+
+
+def find_func(func_info, offset):
+    """For a given offset, find the function it belongs to."""
+    fidx = 0
+    lidx = len(func_info) - 1
+    while fidx <= lidx:
+        midx = (fidx + lidx) // 2
+        ms = func_info[midx]["start"]
+        me = func_info[midx]["end"]
+        if fidx == lidx:
+            assert (
+                offset >= ms and offset <= me
+            ), f"Couldn't find a function for this offset: {offset}"
+            return fidx
+        else:
+            if offset > me:
+                fidx = midx + 1
+            elif offset < ms:
+                lidx = midx - 1
+            else:
+                return midx
+    assert False, "Possible mismatch between model .so and LWP data"
+
+
+def accumulate_cycles(overall_cycles, func_cycles, func_name):
+    """Accumulate function cycles"""
+    acc_cycles = overall_cycles[func_name]
+    for id in func_cycles:
+        assert id in acc_cycles, f"id [{id}] missing in the existing function record"
+        assert (
+            acc_cycles[id]["start"] == func_cycles[id]["start"]
+        ), "Offset value doesn't match with the existing function record."
+        acc_cycles[id]["cycles"] += func_cycles[id]["cycles"]
+        acc_cycles[id]["count"] += func_cycles[id]["count"]
+    overall_cycles.update({func_name: acc_cycles})
+    return overall_cycles
+
+
+def adjust_per_loop_counts(overall_cycles, data):
+    """
+    Use execution count and the number of entries recorded for each function/loop
+    to compute the overall cycles spent on them.
+    """
+    for func in overall_cycles:
+        func_cycles = overall_cycles[func]
+        for id in func_cycles:
+            exec_count = data["loop_counts"][id]
+            rec_count = func_cycles[id]["count"]
+            assert exec_count != 0, "Execution count should have been non-zero."
+            assert rec_count != 0, "Entry count should have been non-zero."
+            exec_cycles = ((int(func_cycles[id]["cycles"])) * exec_count) // rec_count
+            func_cycles[id]["cycles"] = exec_cycles
+            func_cycles[id]["count"] = exec_count
+        overall_cycles.update({func: OrderedDict(sorted(func_cycles.items()))})
+    return overall_cycles
+
+
+def create_csv_report(overall_cycles, fname):
+    """Create csv report"""
+    header = [
+        "function name",
+        "loop/function id",
+        "loop depth",
+        "start offset",
+        "end offset",
+        "pcycles",
+        "parent count",
+    ]
+    with open(fname, "w") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        for func in overall_cycles:
+            func_cycles = overall_cycles[func]
+            data = []
+            root = -1
+            outer_most = -1
+            for key, value in func_cycles.items():
+                if value["parent"] == -1:
+                    assert root == -1, "Can't have multiple root nodes."
+                    root = key
+
+                data.append(func)
+                data.append(key)
+                if value["parent"] == -1:
+                    data.append("-")  # Total cycles over all invocations of this function.
+                elif value["parent"] == root:
+                    data.append(0)
+                    outer_most = key
+                else:
+                    if outer_most > -1:
+                        data.append(key - outer_most)
+                    else:
+                        data.append(key - value["parent"])
+                data.append(hex(value["start"]))
+                data.append(hex(value["end"]))
+                data.append(value["cycles"])
+                data.append(value["count"])
+                writer.writerow(data)
+                data.clear()
+
+
+def process_data(data, func_info, so_ld_addr):
+    """Process data"""
+    # Keep an ordered list of loop IDs as they are being visited. This is used
+    # to match entry and exit pairs. Once the function/loop is processed, it's
+    # removed from the list.
+    ordered_visited_list = []
+    # Store information regarding visited nodes as they are being processed. Once
+    # the function/loop is processed, it's removed from the set.
+    visited_set = {}
+    # Dictionary to store cycles for the entire model which is grouped into functions.
+    overall_cycles = {}
+    func_cycles = {}
+
+    func_idx = -1
+    func_name = ""
+    prev_func_name = ""
+    func_start = 0
+    func_end = 0
+    save_data = False
+    # Iterate over all the entries in the LWP data file and process them
+    # to construct a report.
+    for entry in data["entries"]:
+        id = entry["id"]
+        offset = entry["ret"] - so_ld_addr
+
+        # Recorded return address should fall within the function begin and end
+        # offsets. If not, find the function it belongs to.
+        if offset < func_start or offset > func_end:
+            prev_func_name = func_name
+            if ENABLE_DEBUG:
+                print("offset : ", offset)
+                print("id : ", id)
+
+            func_idx = find_func(func_info, offset)
+            func_name = func_info[func_idx]["name"]
+            func_start = func_info[func_idx]["start"]
+            func_end = func_info[func_idx]["end"]
+            if ENABLE_DEBUG:
+                print("func_name : ", func_name)
+
+            if save_data:
+                # overall_cycles = save_func_cycles(prev_func_name, overall_cycles, func_cycles, ordered_visited_list)
+                # Done processing the previous function, copy its info into 'overall_cycles'.
+                if prev_func_name not in overall_cycles:
+                    overall_cycles[prev_func_name] = func_cycles.copy()
+                else:
+                    # Accumulate cycles into existing function entry.
+                    overall_cycles = accumulate_cycles(overall_cycles, func_cycles, prev_func_name)
+                # We don't allow for fused operators (functions) calling another operator.
+                if ENABLE_DEBUG:
+                    print("ordered_visited_list : ", ordered_visited_list)
+
+                assert len(ordered_visited_list) == 0, (
+                    f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n"
+                    f"\t Possible reasons -- \n"
+                    f"\t\t1) Mismatch between model .so and json file.\n"
+                    f"\t\t2) LWP buffer may have overflowed resulting into missing entries!"
+                )
+                func_cycles.clear()
+
+            save_data = True
+
+        if id not in visited_set:  # Found 'entry' record
+            visited_info = {"func_idx": func_idx, "ret": offset, "cyc": entry["cyc"]}
+            visited_set[id] = visited_info
+            ordered_visited_list.append(id)
+        else:  # Found 'exit' record
+            # This should be the last entry in the ordered_visited_list. If not, error out.
+            assert ordered_visited_list[-1] == id, (
+                "Problem with LWP output - Interleaved handler calls found."
+                f"Loop [{ordered_visited_list[-1]}] hasn't exited yet."
+            )
+            ordered_visited_list.pop()
+            entry_node = visited_set.pop(id)
+            assert (
+                entry_node["func_idx"] == func_idx
+            ), f'Error - Found under a different function name : {entry_node["func_idx"]}'
+            cycles = entry["cyc"] - entry_node["cyc"]
+            parent = -1
+            if ordered_visited_list:
+                parent = int(ordered_visited_list[-1])
+            if id in func_cycles:
+                fcycles = func_cycles[id]
+                fcycles["cycles"] += cycles
+                fcycles["count"] += 1
+                func_cycles[id] = fcycles
+            else:
+                func_cycles[id] = {
+                    "cycles": cycles,
+                    "start": entry_node["ret"],
+                    "end": offset,
+                    "parent": parent,
+                    "count": 1,
+                }
+
+    # Done processing the previous function, copy its info into 'overall_cycles'.
+    if func_name not in overall_cycles:
+        overall_cycles[func_name] = func_cycles.copy()
+    else:
+        # Accumulate cycles into existing function entry.
+        overall_cycles = accumulate_cycles(overall_cycles, func_cycles, func_name)
+    # We don't allow for fused operators (functions) calling another operator.
+    if ENABLE_DEBUG:
+        print("ordered_visited_list : ", ordered_visited_list)
+
+    assert len(ordered_visited_list) == 0, (
+        f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n"
+        f"\t Possible reasons -- \n"
+        f"\t\t1) Mismatch between model .so and json file.\n"
+        f"\t\t2) LWP buffer may have overflowed resulting into missing entries!" % prev_func_name
+    )
+
+    overall_cycles = adjust_per_loop_counts(overall_cycles, data)
+    return overall_cycles
+
+
+def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log: str):
+    """Get load address of the binary file"""
+    if serial_number == "simulator":
+        basedir = os.path.dirname(lwp_json)
+        if run_log is None:
+            run_log = os.path.join(basedir, "stdout.txt")
+        else:
+            # If the directory name is specified for the run_log of the
+            # simulator (stdout.txt) then it must be same as lwp_json.
+            run_log_dir = os.path.dirname(run_log)
+            assert (
+                run_log_dir == "" or run_log_dir == basedir
+            ), f"stdout.txt and {os.path.basename(lwp_json)} must be in the same directory"
+            run_log = os.path.join(basedir, os.path.basename(run_log))
+        # To extract load address for the simulator run
+        pattern = compile(r"Model.*: (\w+):")
+    else:
+        # To extract load address for on-device run
+        binary_name = os.path.basename(binary_path)
+        pattern = compile(r"{}, len \w+, laddr (\w+)".format(binary_name))
+
+    with open(run_log, "r") as f:
+        lines = f.read()
+        a = pattern.search(lines)
+        load_addr = int(a.group(1), 16)
+    if ENABLE_DEBUG:
+        print("load_addr : ", load_addr)
+    return load_addr
+
+
+def process_lwp_output(
+    binary_path: str,
+    serial_number: str,
+    lwp_json: str,
+    run_log: str,
+    lwp_out: str,
+    enable_debug: bool = False,
+):
+    """Process lightweight profiling data"""
+    # Enable debug messages
+    global ENABLE_DEBUG
+    ENABLE_DEBUG = enable_debug
+
+    # Get load address for the binary
+    load_addr = get_load_addr(binary_path, serial_number, lwp_json, run_log)
+    # Opening JSON file
+    with open(lwp_json, "r") as f:
+        # Returns JSON object as a dictionary
+        data = json.load(f)
+
+    # Get function names, and their start and end offsets from the model .so
+    func_info = get_func_info(binary_path)
+
+    # Get the load address for model .so.
+    so_ld_addr = load_addr
+
+    # Process profiling data to construct a CSV report.
+    overall_cycles = process_data(data, func_info, so_ld_addr)
+    create_csv_report(overall_cycles, lwp_out)
+    print("lwp processed output written to -- ", lwp_out)
+    print("[NOTE: Use '--hexagon-debug' to keep the temp directory]")
+
+
+def get_args():
+    """Add commandline arguments to run the script manually if needed"""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lwp-json", help="LWP json file", required=True)
+    parser.add_argument("--serial-num", help="device-id/simulator", required=True)
+    parser.add_argument("--test-so", help="Test shared library", required=True)
+    parser.add_argument(
+        "--run-log",
+        help="Logcat file for on-device run and stdout.txt for simulator run",
+        required=True,
+    )
+    parser.add_argument("--lwp-out", help="LWP output file name", required=True)
+    parser.add_argument(
+        "--debug",
+        help="Enable debug output from the script",
+        dest="debug",
+        action="store_true",
+        required=False,
+    )
+    parser.set_defaults(debug=False)
+    args = parser.parse_args()
+
+    global ENABLE_DEBUG
+    ENABLE_DEBUG = args.debug
+
+    return args
+
+
+if __name__ == "__main__":
+    args = get_args()
+    process_lwp_output(
+        args.test_so, args.serial_num, args.lwp_json, args.run_log, args.lwp_out, args.debug
+    )
diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index b69382fe1290..d6ea51b53e17 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -393,3 +393,8 @@ def _aot_executor_from_factory(
             remote_file_path = self.upload(binary_path, binary_name)
 
         return self.get_aot_executor(remote_file_path)
+
+    def get_profile_output(self, mode: str, path: str):
+        assert isinstance(mode, str), f"Invalid mode type, {type(mode)} != str"
+        assert isinstance(path, str), f"Invalid path type, {type(path)} != str"
+        return self._rpc.get_function("tvm.hexagon.get_profile_output")(mode, path)
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index 7b3a81acc525..82533a2f9f5a 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -995,3 +995,14 @@ def ManifestSharedMemoryLocalStage():
         The result pass
     """
     return _ffi_api.ManifestSharedMemoryLocalStage()  # type: ignore
+
+
+def InstrumentProfileIntrinsics():
+    """Insert intrinsic calls to instrument function and loop level profiling.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.InstrumentProfileIntrinsics()  # type: ignore
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 5f8c8742695d..2b9a354f5c7e 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -52,6 +52,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array<Array<ObjectRef>>);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool);
 
 using runtime::PackedFunc;
 using runtime::TVMArgs;
@@ -157,6 +158,8 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
       pass_ctx->GetConfig<Array<Array<ObjectRef>>>("tir.add_lower_pass", Array<Array<ObjectRef>>())
           .value();
 
+  bool instrument_lwp = pass_ctx->GetConfig<Bool>("tir.instrument_lwp", Bool(false)).value();
+
   Array<transform::Pass> user_lower_phase0 = Array<transform::Pass>();
   Array<transform::Pass> user_lower_phase1 = Array<transform::Pass>();
   Array<transform::Pass> user_lower_phase2 = Array<transform::Pass>();
@@ -253,6 +256,14 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
   pass_list.push_back(
       tir::transform::CommonSubexprElimTIR(!disable_cse_tir, enable_equiv_terms_in_cse_tir));
 
+  // This pass instruments the loops with the profile builtin calls to capture the runtime
+  // performance data (only enabled for Hexagon at the moment). To ensure that no other
+  // optimizations are performed on the instrumented code, this pass must be added at the end
+  // of the list.
+  if (instrument_lwp) {
+    pass_list.push_back(tir::transform::InstrumentProfileIntrinsics());
+  }
+
   return pass_list;
 }
 
diff --git a/src/runtime/dso_library.cc b/src/runtime/dso_library.cc
index 81eb30ee12d2..a0c6c48b5e44 100644
--- a/src/runtime/dso_library.cc
+++ b/src/runtime/dso_library.cc
@@ -34,6 +34,12 @@
 #include <dlfcn.h>
 #endif
 
+#if defined(__hexagon__)
+extern "C" {
+#include <HAP_farf.h>
+}
+#endif
+
 namespace tvm {
 namespace runtime {
 
@@ -118,6 +124,14 @@ void DSOLibrary::Load(const std::string& name) {
   lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
   ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " "
                                  << dlerror();
+#if defined(__hexagon__)
+  int p;
+  int rc = dlinfo(lib_handle_, RTLD_DI_LOAD_ADDR, &p);
+  if (rc)
+    FARF(ERROR, "error getting model .so start address : %u", rc);
+  else
+    FARF(ALWAYS, "Model .so Start Address : %x", p);
+#endif
 }
 
 void* DSOLibrary::GetSymbol_(const char* name) { return dlsym(lib_handle_, name); }
diff --git a/src/runtime/hexagon/profiler/README.md b/src/runtime/hexagon/profiler/README.md
new file mode 100644
index 000000000000..492e45c98498
--- /dev/null
+++ b/src/runtime/hexagon/profiler/README.md
@@ -0,0 +1,99 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Hexagon lightweight instrumentation based profiling (LWP)
+
+For Hexagon, LWP can be used to get function and loop level processor cycle count.
+This is done by instrumenting the code with profiling builtin calls using a TIR pass.
+During codegen, these builtin calls are replaced with the calls to a hexagon specific
+handler which records the runtime information into a buffer.
+This buffer is written into a JSON file ('lwp.json') which is processed to construct
+function and loop level profiling information as a csv file.
+
+**Note:** During codegen, the profiling builtin calls are ignored for other targets.
+
+The TIR pass offers several config flags to control the level of instrumentation
+as mentioned below:
+
+1) `lwp_disable_func_prof`: To disable function level profiling. By default, it is
+set to 'False', i.e., the function level profiling is enabled.
+
+2) `instr_siblings`: When enabled, only loops with siblings are instrumented and rest are
+ignored. The inner-most loops are always excluded from instrumentation unless overwritten
+using `lwp_min_height`. This is done to minimize the adverse effect of instrumentation on
+actual performance. By default, it is set to 'True'.
+
+3) `lwp_max_depth`: To instrument loops up to a certain depth. This flag is effective
+only when `instr_siblings` is disabled. By default, it is set to 0.
+
+4) `lwp_min_height`: To exclude inner loops up to a certain height from instrumentation.
+By default, it is set to 1.
+
+For additional usage information on various config flags, please refer to the tests in
+`tests/python/unittest/test_tir_transform_profiling_instr.py`
+
+
+## How to use lightweight profiling with RPC Launcher:
+
+`tests/python/contrib/test_hexagon/test_launcher.py` contains two tests, `test_lwp` and
+`test_lwp_multiple_conv2d`, to demonstrate lightweight profiling usage.
+
+The steps involved are as follows:
+
+1) While building a model, set `tir.instrument_lwp` to `True`.
+   By default, the builtin calls will only be inserted for the loops with siblings. But it
+   can be altered using LWP config options as described above.
+2) Create `HexagonProfiler` object
+
+```
+with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+    lowered = tvm.relay.build(
+        relay_mod,
+        tvm.target.Target(target_hexagon, host=target_hexagon),
+        ...
+    )
+
+    # Create HexagonProfiler object. It sets the profiling mode based on the PassContext config.
+    # '--hexagon-debug' to pytest can be used to retain any temp or test directories to
+    # inspect the profiling data.
+    profiler = HexagonProfiler(lowered, hexagon_server_process, hexagon_debug)
+```
+
+4) Run the model and get the profiling data as a CSV file. It is done by post-processing
+   'lwp.json' file generated during runtime.
+
+```
+    graph_mod.run(**inputs)
+
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session, hexagon_server_process)
+```
+**Note:**
+
+- For on-device runs, 'lwp.json' is copied into a temp directory along with the test .so and the processed
+  CSV file
+- For the simulator runs, the file is generated in the simulator test output directory. Test  .so
+  will still be in a separate temp directory. lwp CSV file will also be in the same directory.
+
+**Helpful Hints:**
+
+- To prevent the test directories on the Hexagon device as well as temporary test directory on x86
+from being deleted for profiling related runs, pass `--hexagon-debug` to pytest.
+
+```
+python -m pytest --hexagon-debug tests/python/contrib/test_hexagon/test_launcher.py::test_lwp
+```
diff --git a/src/runtime/hexagon/profiler/lwp_handler.S b/src/runtime/hexagon/profiler/lwp_handler.S
new file mode 100644
index 000000000000..611c0713111a
--- /dev/null
+++ b/src/runtime/hexagon/profiler/lwp_handler.S
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+Lightweight profiling handler to record processor cycles in a buffer
+(pointed by __lwp_buffer_ptr) for a given invocation of the handler. To keep the
+buffer size within a resonable limit, we only recond data for the first 100
+invocation of the handler for a given loop or function ID (passed in R0 register).
+The buffer size wouldn't be a concern if the loops with only siblings are getting
+profiled. However, since the instrumentation provides several different profiling
+options, this approach ensures that they all function as expexted. We use second
+buffer (pointed by __lwp_counter) to keep count of the calls made to lwp_handler
+function for each function/loop.
+
+Brief explanation of all the global variables used:
+1) __lwp_counter : Pointer to the buffer that keeps count of the number of times handler
+ is called for a given ID. To reduce the complexity of the handler, __lwp_counter is
+ indexed using the ID itself.
+2) __lwp_buffer_ptr : Pointer to the buffer that records loop/function ID, processor cycles
+ and return addresss of the handler. Return address is used to reconstruct the call graph
+ (loop-nest) to make it easier to analyze the profiling data.
+3) __lwp_buffer_size : Size of the buffer
+4) __lwp_buffer_count : Offset into main lwp buffer where data for the current handler
+invocation needs to be written.
+
+NOTE: The handler function saves and restores R0-R5 registers which are caller saved registers
+on Hexagon and should be handled at the callsite. However, to reduce the codegen impact
+of the handler calls on the caller functions, we decided to move this part into the
+handler itself.
+
+*/
+  .text
+  .globl  lwp_handler
+  .falign
+  .type  lwp_handler,@function
+lwp_handler:
+  { allocframe(#24)                            // Allocate 24 bytes on the stack to save R0-R5 registers
+    memd(r29+#-16) = r5:4                      // Save R5,R4
+  }
+  {
+    memd(r29+#8) = r3:2                        // Save R3,R2
+    memd(r29+#0) = r1:0                        // Save R1, R0
+    r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL) // Get GOT address
+  }
+  {
+    r5 = memw(r2+##__lwp_counter@GOT)         // Get address of the pointer to __lwp_counter
+    r3 = memw(r2+##__lwp_buffer_count@GOT)    // Get the address of __lwp_buffer_count
+  }
+  {
+    r5 = memw(r5+#0)                          // Get the address of __lwp_counter (address of the main lwp buffer)
+    r3 = memw(r3+#0)                          // Get the __lwp_buffer_count value (offset into the main buffer)
+  }
+  {
+    r4 = memw(r5+r0<<#2)                      // Get the handler invocation count for the ID (passed in R0)
+    r1 = memw(r2+##__lwp_buffer_size@GOT)     // Get the address of __lwp_buffer_size
+  }
+  {
+    r4 = add(r4,#1)                           // Increment count
+    memw(r5+r0<<#2) = r4.new                  // Update count in __lwp_counter for a given ID
+    r1 = memw(r1+#0)                          // Get the buffer size
+  }
+  {
+    p0 = cmp.gtu(r4,#100)                     // Exit if count for a given ID is greater than 100
+    if (p0.new) jump:nt .LBB0_3
+    r5 = memw(r2+##__lwp_buffer_ptr@GOT)      // Get address of the pointer to __lwp_buffer_ptr
+  }
+  {
+    r5 = memw(r5+#0)                          // Get address of __lwp_buffer_ptr
+    r2 = memw(r2+##__lwp_buffer_count@GOT)    // Get address of __lwp_buffer_count
+  }
+  {
+    r4 = add(r3,#4)                           // Increment the offset by 4 since 4 int32 values are stored for each invocation
+    if (!cmp.gtu(r1,r4.new)) jump:t .LBB0_3   // Exit if the main lwp buffer has run out of space
+  }
+  {
+    r5 = addasl(r5,r3,#2)                     // Get the address where the data needs to be recorded
+    memw(r2+#0) = r4                          // Save next offset into __lwp_buffer_count
+  }
+  {
+    memw(r5+#0) = r31                         // Save return address of this function
+    r1:0 = C15:14                             // Control registers that keep processor cycle count (64-bits)
+    memw(r5+#4) = r0                          // Save loop/function ID
+  }
+  {
+    memw(r5+#12) = r1                         // Save upper 32 bits
+    memw(r5+#8) = r0                          // Save lower 32 bits
+  }
+  .falign
+.LBB0_3:
+  {
+    r5:4 = memd(r29+#16)                     // Restore the registers from the stack
+    r3:2 = memd(r29+#8)
+  }
+  {
+    r1:0 = memd(r29+#0)
+    dealloc_return                          // Deallocate the stack and return
+  }
+.Lfunc_end0:
+  .size  lwp_handler, .Lfunc_end0-lwp_handler
diff --git a/src/runtime/hexagon/profiler/prof_utils.cc b/src/runtime/hexagon/profiler/prof_utils.cc
new file mode 100644
index 000000000000..45cbe607031d
--- /dev/null
+++ b/src/runtime/hexagon/profiler/prof_utils.cc
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+// The max loop/function id used among all lwp_handler calls. Since
+// the id is used to index into the lwp_counter buffer, the size of the
+// buffer must be equal or greater than the max possible id.
+#define LWP_COUNTER_SIZE 5000
+
+// LWP_BUFFER_SIZE needs to be at most 100 * LWP_COUNTER_SIZE since 100 is
+// the max number of entries recorded for each instrumented location.
+#define LWP_BUFFER_SIZE (LWP_COUNTER_SIZE * 100)
+
+uint32_t lwp_counter[LWP_COUNTER_SIZE] = {0};
+uint32_t lwp_buffer[LWP_BUFFER_SIZE];
+uint32_t* __lwp_counter = lwp_counter;
+uint32_t* __lwp_buffer_ptr = lwp_buffer;
+uint32_t __lwp_buffer_size = LWP_BUFFER_SIZE;
+uint32_t __lwp_enable_flag = 1;
+uint32_t __lwp_buffer_count = 0;
+
+bool WriteLWPOutput(const std::string& out_json) {
+  std::ostringstream s;
+  s << "{\n";
+  s << "\t\"entries\":[\n";
+  for (size_t i = 0; i < __lwp_buffer_count; i += 4) {
+    s << "\t{\n";
+    s << "\t\t\"ret\":" << std::dec << lwp_buffer[i] << ",\n";
+    s << "\t\t\"id\":" << std::dec << lwp_buffer[i + 1] << ",\n";
+    uint64_t pcycles = (static_cast<uint64_t>(lwp_buffer[i + 3]) << 32) + lwp_buffer[i + 2];
+    s << "\t\t\"cyc\":" << std::dec << pcycles << "\n";
+    s << "\t}";
+    if (i < __lwp_buffer_count - 4) {
+      s << ",\n";
+    }
+  }
+  s << "\t],\n\n";
+  s << "\t\"loop_counts\":[\n";
+  for (size_t i = 0; i < LWP_COUNTER_SIZE; i++) {
+    s << "\t\t" << lwp_counter[i] / 2;
+    if (i < LWP_COUNTER_SIZE - 1)
+      s << ",\n";
+    else
+      s << "\n";
+  }
+  s << "\t]\n}\n";
+  std::ofstream ofc(out_json);
+  if (!ofc.is_open()) {
+    return false;
+  }
+
+  ofc << s.str() << "\n";
+
+  if (!ofc) {
+    return false;
+  }
+  ofc.close();
+  return true;
+}
diff --git a/src/runtime/hexagon/profiler/prof_utils.h b/src/runtime/hexagon/profiler/prof_utils.h
new file mode 100644
index 000000000000..e086f7a26b36
--- /dev/null
+++ b/src/runtime/hexagon/profiler/prof_utils.h
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file prof_utils.h
+ */
+#ifndef TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
+#define TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
+
+#include <string>
+
+bool WriteLWPOutput(const std::string&);
+
+#endif  // TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_
diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
index b4799d5d7127..41c63d0affeb 100644
--- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc
@@ -39,6 +39,7 @@ extern "C" {
 #include "../../../minrpc/minrpc_server.h"
 #include "../../hexagon/hexagon_common.h"
 #include "../../hexagon/hexagon_device_api.h"
+#include "../../profiler/prof_utils.h"
 #include "hexagon_rpc.h"
 
 namespace tvm {
@@ -329,3 +330,15 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.load_module")
       tvm::ObjectPtr<tvm::runtime::Library> n = tvm::runtime::CreateDSOLibraryObject(soname);
       *rv = CreateModuleFromLibrary(n);
     });
+
+TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output")
+    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
+      std::string profiling_mode = args[0];
+      std::string out_file = args[1];
+      if (profiling_mode.compare("lwp") == 0) {
+        *rv = WriteLWPOutput(out_file);
+      } else {
+        HEXAGON_PRINT(ERROR, "ERROR: Unsupported profiling mode: %s", profiling_mode.c_str());
+        *rv = false;
+      }
+    });
diff --git a/src/runtime/hexagon/rpc/simulator/rpc_server.cc b/src/runtime/hexagon/rpc/simulator/rpc_server.cc
index 119e999276f8..41bb2da6f8b1 100644
--- a/src/runtime/hexagon/rpc/simulator/rpc_server.cc
+++ b/src/runtime/hexagon/rpc/simulator/rpc_server.cc
@@ -29,6 +29,7 @@
 #include "../../../library_module.h"
 #include "../../../minrpc/minrpc_server.h"
 #include "../../hexagon_common.h"
+#include "../../profiler/prof_utils.h"
 #include "hexagon_sim_proto.h"
 #include "tvm/runtime/packed_func.h"
 #include "tvm/runtime/registry.h"
@@ -336,3 +337,15 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.load_module")
       tvm::ObjectPtr<tvm::runtime::Library> n = tvm::runtime::CreateDSOLibraryObject(soname);
       *rv = CreateModuleFromLibrary(n);
     });
+
+TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output")
+    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
+      std::string profiling_mode = args[0];
+      std::string out_file = args[1];
+      if (profiling_mode.compare("lwp") == 0) {
+        *rv = WriteLWPOutput(out_file);
+      } else {
+        HEXAGON_PRINT(ERROR, "ERROR: Unsupported profiling mode: %s", profiling_mode.c_str());
+        *rv = false;
+      }
+    });
diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc
index ff59dfcceb8d..9552a45a60f9 100644
--- a/src/target/llvm/codegen_hexagon.cc
+++ b/src/target/llvm/codegen_hexagon.cc
@@ -75,6 +75,7 @@ class CodeGenHexagon final : public CodeGenCPU {
 
   using CodeGenCPU::VisitStmt_;
   llvm::Value* VisitExpr_(const BufferLoadNode* op) override;
+  llvm::Value* CreateIntrinsic(const CallNode* op) override;
 
   llvm::Value* CreateCallExtern(Type ret_type, String global_symbol, const Array<PrimExpr>& args,
                                 bool skip_first_arg) override;
@@ -193,6 +194,28 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) {
   return CodeGenCPU::VisitExpr_(op);
 }
 
+llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) {
+#if TVM_LLVM_VERSION >= 150
+  if (op->op.same_as(builtin::start_profile_intrinsic()) ||
+      op->op.same_as(builtin::end_profile_intrinsic())) {
+    llvm::Value* id = MakeValue(op->args[0]);
+    auto instrprof_id = llvm::Intrinsic::hexagon_instrprof_custom;
+    llvm::Function* func = llvm::Intrinsic::getDeclaration(module_.get(), instrprof_id);
+    llvm::GlobalVariable* name_var = module_->getGlobalVariable("handler_name");
+    if (!name_var) {
+      llvm::StringRef init_str = "lwp_handler";
+      llvm::Constant* init = llvm::ConstantDataArray::getString(module_->getContext(), init_str);
+
+      name_var = new llvm::GlobalVariable(*module_, init->getType(), true,
+                                          llvm::GlobalValue::InternalLinkage, init, "handler_name");
+    }
+    llvm::Type* t_int8_p_ = t_int8_->getPointerTo();
+    return builder_->CreateCall(func, {llvm::ConstantExpr::getBitCast(name_var, t_int8_p_), id});
+  }
+#endif
+  return CodeGenCPU::CreateIntrinsic(op);
+}
+
 void CodeGenHexagon::CreatePrintf(const std::string& format,
                                   llvm::ArrayRef<llvm::Value*> format_args) {
   // This function generates LLVM instructions to call HAP_debug_v2,
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 6a50dd4534c2..ea8a5ff5106a 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1318,6 +1318,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     // TODO(masahi): Support atomic for CPU backend
     LOG(FATAL) << "CPU backend does not support atomic add yet.";
     return nullptr;
+  } else if (op->op.same_as(builtin::start_profile_intrinsic()) ||
+             op->op.same_as(builtin::end_profile_intrinsic())) {
+    LOG(INFO) << "Ignoring profile_intrinsic ... " << op->op;
+    return nullptr;
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 1e2d790c76e1..b605b9de1e02 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -302,6 +302,12 @@ TIR_DEFINE_BUILTIN_FUNC(undef)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kReadState))
     .set_num_inputs(0);
 
+TIR_DEFINE_BUILTIN_FUNC(start_profile_intrinsic)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_BUILTIN_FUNC(end_profile_intrinsic)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
+
 }  // namespace builtin
 }  // namespace tir
 }  // namespace tvm
diff --git a/src/tir/transforms/profile_instrumentation.cc b/src/tir/transforms/profile_instrumentation.cc
new file mode 100644
index 000000000000..5f52fc6630bc
--- /dev/null
+++ b/src/tir/transforms/profile_instrumentation.cc
@@ -0,0 +1,293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file profile_instrumentation.cc
+ */
+// Insert profile intrinsic at loop and function level. During codegen,
+// these instruction can be replaced with a call to a target specific handler
+// and can be used to capture profiling information such as processor cycles.
+
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/transform.h>
+
+namespace tvm {
+namespace tir {
+namespace lwp {
+
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_disable_func_prof", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_max_depth", Integer);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_min_height", Integer);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.instr_siblings", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.reset_start_id", Bool);
+
+static int32_t start_id = 0;
+
+struct LoopInfo {
+  LoopInfo() = default;
+  LoopInfo(unsigned i, unsigned d, unsigned h = 0) : id(i), depth(d), height(h) {
+    has_siblings = false;
+    has_parallel = false;
+  }
+  unsigned id;
+  int32_t depth;
+  int32_t height;
+  bool has_siblings;
+  // Set to 'true' if ForKind::kParallel is set for the current loop or one of its ancestor
+  bool has_parallel;
+};
+
+using LoopInfoMap = std::unordered_map<const ForNode*, LoopInfo>;
+// Traverse loops depth first and assign them a unique number.
+class LoopAnalyzer : public StmtExprVisitor {
+ public:
+  LoopInfoMap Analyze(const Stmt& stmt) {
+    this->VisitStmt(stmt);
+    return loops;
+  }
+  void VisitStmt_(const ForNode* op) final {
+    LoopInfo loop_info(start_id, 0);
+    start_id++;
+    loop_info.height = TraverseLoop(op->body, 0);
+    loops[op] = loop_info;
+  }
+
+  unsigned TraverseLoop(const Stmt& stmt, unsigned parent_depth, bool has_parallel = false) {
+    if (stmt->IsInstance<SeqStmtNode>()) {
+      std::vector<const ForNode*> siblings;
+      unsigned height = 0;
+      bool has_loop = false;
+      const SeqStmtNode* n = stmt.as<SeqStmtNode>();
+      for (Stmt s : n->seq) {
+        if (s->IsInstance<ForNode>()) {
+          has_loop = true;
+          const ForNode* f = s.as<ForNode>();
+          LoopInfo loop_info(start_id, parent_depth + 1);
+          start_id++;
+          bool parent_parallel = false;
+          if (has_parallel) {
+            loop_info.has_parallel = true;
+            parent_parallel = true;
+          } else if (f->kind == ForKind::kParallel) {
+            // has_parallel for the current loop is being set to 'false' since the
+            // intrinsic is added outside of the loop. The instrumentation isn't
+            // allowed for the subsequent nested loops.
+            loop_info.has_parallel = false;
+            parent_parallel = true;
+          }
+          siblings.push_back(f);
+          height = std::max(height, TraverseLoop(f->body, parent_depth + 1, parent_parallel));
+          loop_info.height = height;
+          loops[f] = loop_info;
+        }
+      }
+      if (siblings.size() > 1) {
+        for (auto* l : siblings) {
+          loops[l].has_siblings = true;
+        }
+      }
+      height = has_loop ? height + 1 : height;
+      return height;  // Parent's height : max of all children's height
+    } else if (stmt->IsInstance<IfThenElseNode>()) {
+      const IfThenElseNode* n = stmt.as<IfThenElseNode>();
+      unsigned height = TraverseLoop(n->then_case, parent_depth, has_parallel);
+      if (n->else_case.defined()) {
+        height = std::max(height, TraverseLoop(n->else_case, parent_depth, has_parallel));
+      }
+      return height;
+    } else if (stmt->IsInstance<ForNode>()) {
+      const ForNode* f = stmt.as<ForNode>();
+      LoopInfo loop_info(start_id, parent_depth + 1);
+      start_id++;
+      bool parent_parallel = false;
+      if (has_parallel) {
+        loop_info.has_parallel = true;
+        parent_parallel = true;
+      } else if (f->kind == ForKind::kParallel) {
+        // has_parallel for the current loop is being set to 'false' since the
+        // intrinsic is added outside of the loop. The instrumentation isn't
+        // allowed for the subsequent nested loops.
+        loop_info.has_parallel = false;
+        parent_parallel = true;
+      }
+      unsigned height = TraverseLoop(f->body, parent_depth + 1, parent_parallel);
+      loop_info.height = height;
+      loops[f] = loop_info;
+      return height + 1;
+    } else if (stmt->IsInstance<LetStmtNode>()) {
+      const LetStmtNode* n = stmt.as<LetStmtNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else if (stmt->IsInstance<AttrStmtNode>()) {
+      const AttrStmtNode* n = stmt.as<AttrStmtNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else if (stmt->IsInstance<AllocateNode>()) {
+      const AllocateNode* n = stmt.as<AllocateNode>();
+      return TraverseLoop(n->body, parent_depth, has_parallel);
+    } else {
+      return 0;  // inner-most loop
+    }
+  }
+
+ private:
+  LoopInfoMap loops;
+};
+
+class InstrumentIntrin : public StmtMutator {
+ public:
+  InstrumentIntrin(int32_t max_depth, int32_t min_height, bool instr_siblings)
+      : max_instr_depth_(max_depth),
+        min_instr_height_(min_height),
+        instr_siblings_(instr_siblings) {}
+
+  void GetLoopInfo(PrimFuncNode* op) {
+    LoopAnalyzer analzer;
+    loops_ = std::move(analzer.Analyze(op->body));
+  }
+
+  Stmt VisitStmt_(const SeqStmtNode* op) final {
+    Stmt stmt = StmtMutator::VisitStmt_(op);
+    return SeqStmt::Flatten(stmt);
+  }
+
+  Stmt VisitStmt_(const ForNode* op) final {
+    Stmt stmt = StmtMutator::VisitStmt_(op);
+    if (loops_.count(op) < 1) return stmt;
+
+    LoopInfo loop_info = loops_[op];
+
+    if (loop_info.has_parallel) {
+      return stmt;
+    }
+
+    // Exclude inner-most loops from instrumentation. The inner-most loop has
+    // height '0' and it increases as we move outward in the loop nest.
+    if (loop_info.height < min_instr_height_) {
+      return stmt;
+    }
+
+    // Only instrument loops with a sibling
+    if (instr_siblings_ && !loop_info.has_siblings) {
+      return stmt;
+    }
+
+    // If instr_siblings_ is set, ignore max depth for instrumentation
+    if (!instr_siblings_ && loop_info.depth > max_instr_depth_) {
+      return stmt;
+    }
+    PrimExpr id = static_cast<int32_t>(loop_info.id);
+    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {id});
+    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {id});
+    const Stmt start_profile = Evaluate(start_call);
+    const Stmt end_profile = Evaluate(end_call);
+    Stmt new_stmt = SeqStmt({start_profile, stmt, end_profile});
+    return new_stmt;
+  }
+
+ private:
+  LoopInfoMap loops_;
+  int32_t max_instr_depth_;
+  int32_t min_instr_height_;
+  bool instr_siblings_;
+};
+
+class CheckParallelLoops : public StmtExprVisitor {
+ public:
+  bool HasParallelLoops(const Stmt& stmt) {
+    this->VisitStmt(stmt);
+    return has_parallel;
+  }
+
+ private:
+  void VisitStmt_(const ForNode* op) final {
+    if (op->kind == ForKind::kParallel) {
+      has_parallel = true;
+    } else {
+      StmtExprVisitor::VisitStmt_(op);
+    }
+  }
+
+  bool has_parallel = false;
+};
+
+PrimFunc AddProfileBuiltins(PrimFunc func, int32_t max_instr_depth, int32_t min_instr_height,
+                            bool instr_siblings, bool disable_func_instrumentation) {
+  auto* func_ptr = func.CopyOnWrite();
+
+  PrimExpr e = start_id++;
+  if (!disable_func_instrumentation) {
+    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {e});
+    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {e});
+    const Stmt start_profile = Evaluate(start_call);
+    const Stmt end_profile = Evaluate(end_call);
+    func_ptr->body = SeqStmt({start_profile, std::move(func_ptr->body), end_profile});
+  }
+  InstrumentIntrin p(max_instr_depth, min_instr_height, instr_siblings);
+  p.GetLoopInfo(func_ptr);
+  func_ptr->body = p(std::move(func_ptr->body));
+  return std::move(func);
+}
+
+}  // namespace lwp
+
+namespace transform {
+Pass InstrumentProfileIntrinsics() {
+  auto pass_func = [](IRModule m, PassContext ctx) {
+    auto* mptr = m.CopyOnWrite();
+
+    // All loops with depth <= max_instr_depth are instrumented. By default,
+    // only outer-most loops are instrumented which has a depth of 0.
+    // In addition, loops with siblings are also instrumented provided
+    // their loop depth is >= min_instr_height. This is done to avoid
+    // instrumenting inner-most loops.
+    auto max_instr_depth = ctx->GetConfig<Integer>("tir.lwp_max_depth", Integer(0)).value();
+    auto min_instr_height = ctx->GetConfig<Integer>("tir.lwp_min_height", Integer(1)).value();
+    bool instr_siblings = ctx->GetConfig<Bool>("tir.instr_siblings", Bool(true)).value();
+    bool disable_func_instrumentation =
+        ctx->GetConfig<Bool>("tir.lwp_disable_func_prof", Bool(false)).value();
+    bool reset_start_id = ctx->GetConfig<Bool>("tir.reset_start_id", Bool(false)).value();
+    if (reset_start_id) lwp::start_id = 0;
+    std::vector<std::pair<GlobalVar, PrimFunc>> updates;
+    for (const auto& kv : mptr->functions) {
+      if (auto* n = kv.second.as<PrimFuncNode>()) {
+        PrimFunc func = GetRef<PrimFunc>(n);
+        auto updated_func =
+            lwp::AddProfileBuiltins(func, max_instr_depth.IntValue(), min_instr_height.IntValue(),
+                                    instr_siblings, disable_func_instrumentation);
+        updates.push_back({kv.first, updated_func});
+      }
+    }
+    for (const auto& pair : updates) {
+      mptr->AddUnchecked(pair.first, pair.second);
+    }
+    return m;
+  };
+
+  return tvm::transform::CreateModulePass(pass_func, 0, "tir.InstrumentProfileIntrinsics", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.InstrumentProfileIntrinsics")
+    .set_body_typed(InstrumentProfileIntrinsics);
+
+}  // namespace transform
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 162e4a1cc7a1..527c79754796 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -157,6 +157,7 @@
     "apps/microtvm/reference-vm/base-box/Vagrantfile.packer-template",
     # Hexagon
     "src/runtime/hexagon/rpc/android_bash.sh.template",
+    "src/runtime/hexagon/profiler/lwp_handler.S",
 }
 
 
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 565999c32957..76d5cba60a1f 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -24,6 +24,8 @@
 from tvm import relay, te
 from tvm.contrib.hexagon.session import Session
 from tvm.relay.backend import Executor, Runtime
+from tvm.contrib.hexagon.build import HexagonLauncherRPC
+from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler
 
 from .infrastructure import get_hexagon_target
 
@@ -568,5 +570,161 @@ def test_dense_relay_vrmpy(hexagon_session, data_dtype, weight_dtype):
     np.testing.assert_equal(out, ref)
 
 
+@tvm.testing.requires_hexagon
+def test_lwp(
+    hexagon_server_process,
+    hexagon_launcher: HexagonLauncherRPC,
+    hexagon_session: Session,
+    hexagon_debug,
+):
+    dtype = "float32"
+    data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
+    weight = relay.var("weight", relay.TensorType((5, 5, 3, 8), dtype))
+    y = relay.nn.conv2d(
+        data,
+        weight,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+
+    f = relay.Function([data, weight], y)
+    relay_mod = tvm.IRModule.from_expr(f)
+    relay_mod = relay.transform.InferType()(relay_mod)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    runtime = Runtime("cpp")
+    executor = Executor("graph")
+
+    weight_in = np.random.rand(5, 5, 3, 8).astype(dtype=dtype)
+    data_in = np.random.rand(1, 64, 64, 3).astype(dtype=dtype)
+    params = {"weight": weight_in}
+    inputs = {"data": data_in}
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+        lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            runtime=runtime,
+            executor=executor,
+        )
+        # Create HexagonProfiler object
+        dso_binary = "test_binary.so"
+        profiler = HexagonProfiler(dso_binary, lowered, hexagon_server_process, hexagon_debug)
+
+    graph_mod = hexagon_session.get_executor_from_factory(lowered)
+    graph_mod.set_input(**params)
+    graph_mod.run(**inputs)
+    hexagon_output = graph_mod.get_output(0).numpy()
+
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session)
+
+    target_llvm = tvm.target.Target("llvm")
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            runtime=runtime,
+            executor=executor,
+        )
+    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+    llvm_graph_mod.set_input(weight=weight_in)
+    llvm_graph_mod.run(data=data_in)
+    expected_output = llvm_graph_mod.get_output(0).numpy()
+
+    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
+
+
+@tvm.testing.requires_hexagon
+def test_lwp_multiple_conv2d(
+    hexagon_server_process,
+    hexagon_launcher: HexagonLauncherRPC,
+    hexagon_session: Session,
+    hexagon_debug,
+):
+    dtype = "float32"
+    input_shape = (1, 8, 8, 3)
+    w1_shape = (5, 5, 3, 1)
+    w2_shape = (5, 5, 1, 3)
+    data = relay.var("data", relay.TensorType(input_shape, dtype))
+    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
+    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
+    y1 = relay.nn.conv2d(
+        data,
+        weight1,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+    y2 = relay.nn.conv2d(
+        y1,
+        weight2,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype="float32",
+    )
+    f = relay.Function([data, weight1, weight2], y2)
+    relay_mod = tvm.IRModule.from_expr(f)
+    relay_mod = relay.transform.InferType()(relay_mod)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    runtime = Runtime("cpp")
+    executor = Executor("graph")
+
+    weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2], w1_shape[3]).astype(
+        dtype=dtype
+    )
+    weight2_data = np.random.rand(w2_shape[0], w2_shape[1], w2_shape[2], w2_shape[3]).astype(
+        dtype=dtype
+    )
+    input_data = np.random.rand(
+        input_shape[0], input_shape[1], input_shape[2], input_shape[3]
+    ).astype(dtype=dtype)
+
+    params = {"weight1": weight1_data, "weight2": weight2_data}
+    inputs = {"data": input_data}
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}):
+        lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_hexagon, host=target_hexagon),
+            runtime=runtime,
+            executor=executor,
+        )
+        # Create HexagonProfiler object
+        dso_binary = "test_binary.so"
+        profiler = HexagonProfiler(dso_binary, lowered, hexagon_server_process, hexagon_debug)
+
+    graph_mod = hexagon_session.get_executor_from_factory(lowered)
+    graph_mod.set_input(**params)
+    graph_mod.run(**inputs)
+    hexagon_output = graph_mod.get_output(0).numpy()
+
+    # Get lightweight profiling output as a CSV file
+    profiler.get_profile_output(hexagon_launcher, hexagon_session)
+
+    target_llvm = tvm.target.Target("llvm")
+    with tvm.transform.PassContext(opt_level=3):
+        llvm_lowered = tvm.relay.build(
+            relay_mod,
+            tvm.target.Target(target_llvm, host=target_llvm),
+            runtime=runtime,
+            executor=executor,
+        )
+    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+    llvm_graph_mod.set_input(**params)
+    llvm_graph_mod.run(**inputs)
+    expected_output = llvm_graph_mod.get_output(0).numpy()
+
+    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/unittest/test_tir_transform_profiling_instr.py b/tests/python/unittest/test_tir_transform_profiling_instr.py
new file mode 100644
index 000000000000..d14e2a4c8925
--- /dev/null
+++ b/tests/python/unittest/test_tir_transform_profiling_instr.py
@@ -0,0 +1,340 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+import tvm.testing
+from tvm import te
+from tvm.ir.module import IRModule
+from tvm.script import tir as T
+import numpy
+
+default_lwp_test_config = {
+    "tir.instrument_lwp": True,
+    "tir.lwp_disable_func_prof": True,
+    "tir.reset_start_id": True,
+}
+
+
+@T.prim_func
+def input1(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    for i, j in T.grid(8, 8):
+        for k, l in T.grid(8, 16):
+            with T.block("B"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+        for k, l in T.grid(8, 16):
+            with T.block("C"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+
+
+@T.prim_func
+def input2(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        for j in T.serial(0, 8):
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        for j in T.serial(0, 8):
+            for k, l in T.grid(8, 16):
+                with T.block("C"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+
+
+@T.prim_func
+def input3(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        for j in T.parallel(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        for j in T.serial(0, 8):
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+
+
+@T.prim_func
+def test1_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    for i, j in T.grid(8, 8):
+        T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
+        for k, l in T.grid(8, 16):
+            with T.block("B"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
+        for k, l in T.grid(8, 16):
+            with T.block("C"):
+                vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
+
+
+@T.prim_func
+def test2_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test3_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test4_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("C"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
+            for k, l in T.grid(8, 16):
+                with T.block("B"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))
+
+
+@T.prim_func
+def test5_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.serial(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+    T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))
+
+
+@T.prim_func
+def test6_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
+    A = T.match_buffer(a, (8, 8, 128), dtype="int32")
+    B = T.match_buffer(b, (8, 8, 128), dtype="int32")
+    C = T.match_buffer(c, (8, 8, 128), dtype="int32")
+    D = T.match_buffer(d, (8, 8, 128), dtype="int32")
+    for i in T.serial(0, 8):
+        T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
+        for j in T.parallel(0, 8):
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
+            for k in T.serial(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+        T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
+        T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
+        for j in T.serial(0, 8):
+            T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("C"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
+            T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
+            T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
+            for k in T.parallel(0, 8):
+                for l in T.serial(0, 16):
+                    with T.block("B"):
+                        vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                        C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
+            T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
+        T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))
+
+
+# By default, only loops with siblings are instrumented.
+def test1():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test1_expected_output)
+
+
+# By default, only loops with siblings are instrumented. Here, 'lwp_max_depth'
+# doesn't have any effect unless 'instr_siblings' is set to False (ex: test3).
+def test2():
+    test2_config = default_lwp_test_config.copy()
+    test2_config.update({"tir.lwp_max_depth": 3})
+    with tvm.transform.PassContext(config=test2_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test1_expected_output)
+
+
+# test3: Use 'lwp_max_depth' to instrument loops upto a certain depth. This flag
+# is effective only when 'instr_siblings' is disabled. Also, note that inner-most
+# loops are always excluded from instrumentation unless overwritten using
+# 'lwp_min_height' (ex: test5)
+def test3():
+    test3_config = default_lwp_test_config.copy()
+    test3_config.update({"tir.lwp_max_depth": 3, "tir.instr_siblings": False})
+    with tvm.transform.PassContext(config=test3_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test3_expected_output)
+
+
+# test4: Use 'lwp_min_height' to exclude inner loops upto a certain height from
+# instrumentation.
+def test4():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input2)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test4_expected_output)
+
+
+# test5: Use both 'lwp_min_height' and 'lwp_max_depth'.
+# instrumentation.
+def test5():
+    test5_config = default_lwp_test_config.copy()
+    test5_config.update(
+        {"tir.lwp_max_depth": 3, "tir.instr_siblings": False, "tir.lwp_min_height": 2}
+    )
+    with tvm.transform.PassContext(config=test5_config):
+        mod = tvm.IRModule.from_expr(input1)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test5_expected_output)
+
+
+# test6: Tests instrumentation for the parallel loops
+def test6():
+    with tvm.transform.PassContext(config=default_lwp_test_config):
+        mod = tvm.IRModule.from_expr(input3)
+        mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
+    tvm.ir.assert_structural_equal(mod["main"], test6_expected_output)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()