diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md index cc433f245759..943a6266a3b4 100644 --- a/apps/hexagon_launcher/README.md +++ b/apps/hexagon_launcher/README.md @@ -189,6 +189,46 @@ lowered = tvm.relay.build( lowered.export_library("model-aot.so", tvm.contrib.hexagon.link_shared) ``` + +## Profiling using hexagon launcher + +### Enabling lightweight profiling (LWP) instrumentation + +This profiling option can be used to get function and loop level processor cycles. +This needs to be enabled explicitly while compiling a model. For example: + +``` +with tvm.transform.PassContext(config={'tir.instrument_lwp':True} ): + lib = relay.build(...) +``` + +Here, `instrument_lwp` is used to enable the tir pass which instruments the code with the builtin calls. + +During codegen, profiling builtin calls can be replaced with a target specific handler to record runtime +information into a buffer. This buffer is written into a JSON file which is processed to construct +function and loop level profiling information. + +To generate LWP JSON file, add `--gen_lwp_json` flag to launcher_android: + +``` +./launcher_android --in_config input.json --out_config output.json --gen_lwp_json +``` + +Please note that `--gen_lwp_json` flag by itself doesn't enable profiling and is only used to dump +the profiling data into a json file called lwp.json. This file will be created at the same location +on the device where launcher_android is executed from. To generate the data, profiling instrumentation +must be enabled while compiling a model as mentioned above. + +Use this command to pull `lwp.json` from the device: + +``` +adb -s pull /path/to/lwp.json +``` + +**Note:** Please refer to src/runtime/hexagon/profiler/README.md for information on how +to enable profiling using Hexagon RPC launcher and also to learn about additional profiling related +config options. + # Disclaimer The launcher does not perform any correctness verification. In order to verify diff --git a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt index 5fae6b0a4099..e8bd67dde7a2 100644 --- a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt +++ b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. cmake_minimum_required(VERSION 3.2) -project(HexagonLauncherRPCSkel C CXX) +project(HexagonLauncherRPCSkel C CXX ASM) include("${CMAKE_CURRENT_SOURCE_DIR}/../HexagonLauncher.cmake") # From the include above get @@ -68,11 +68,14 @@ set(SKEL_SRCS "${LAUNCHER_SRC}/launcher_core.cc" "${LAUNCHER_SRC}/launcher_hexagon.cc" ) +set(PROFILER_DIR "${TVM_SOURCE_DIR}/src/runtime/hexagon/profiler") add_library(launcher_rpc_skel SHARED "${LAUNCHER_RPC_H}" "${LAUNCHER_RPC_SKEL_C}" "${SKEL_SRCS}" + "${PROFILER_DIR}/prof_utils.cc" + "${PROFILER_DIR}/lwp_handler.S" ) ExternalProject_Add(static_hexagon_tvm_runtime diff --git a/apps/hexagon_launcher/launcher_android.cc b/apps/hexagon_launcher/launcher_android.cc index 008e4fdfe1c4..34db0bdacb60 100644 --- a/apps/hexagon_launcher/launcher_android.cc +++ b/apps/hexagon_launcher/launcher_android.cc @@ -55,7 +55,8 @@ AEEResult set_remote_stack_size(int size) { } struct RPCChannel : public ExecutionSession { - explicit RPCChannel(const std::string& uri) { + explicit RPCChannel(const std::string& uri, bool gen_lwp_json = false) + : ExecutionSession(gen_lwp_json) { enable_unsigned_pd(true); set_remote_stack_size(128 * 1024); @@ -127,7 +128,7 @@ struct RPCChannel : public ExecutionSession { } bool run(uint64_t* pcycles, uint64_t* usecs) override { - AEEResult rc = launcher_rpc_run(handle, pcycles, usecs); + AEEResult rc = launcher_rpc_run(handle, pcycles, usecs, gen_lwp_json); if (rc != AEE_SUCCESS) { std::cout << "error running model: " << std::hex << rc << '\n'; } @@ -158,8 +159,8 @@ struct RPCChannel : public ExecutionSession { std::vector allocations; }; -ExecutionSession* create_execution_session() { - auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN); +ExecutionSession* create_execution_session(bool gen_lwp_json) { + auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN, gen_lwp_json); if (session->handle == -1) { delete session; session = nullptr; diff --git a/apps/hexagon_launcher/launcher_core.h b/apps/hexagon_launcher/launcher_core.h index a32bf937af58..da0dfcbbd5a6 100644 --- a/apps/hexagon_launcher/launcher_core.h +++ b/apps/hexagon_launcher/launcher_core.h @@ -94,6 +94,8 @@ struct Model { }; struct ExecutionSession { + explicit ExecutionSession(bool lwp_json = false) : gen_lwp_json(lwp_json) {} + template T* alloc(size_t bytes, size_t align = 1) { return reinterpret_cast(alloc_mem(bytes, align)); @@ -111,6 +113,7 @@ struct ExecutionSession { virtual bool get_num_outputs(int* num_outputs) = 0; virtual bool get_output(int output_idx, tensor_meta* output_meta, int meta_size, void* output_data, int data_size) = 0; + bool gen_lwp_json = false; }; bool read_model_config(const std::string& file_name, ModelConfig* model_config); diff --git a/apps/hexagon_launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc index 03524661c4e6..2692caf90e66 100644 --- a/apps/hexagon_launcher/launcher_hexagon.cc +++ b/apps/hexagon_launcher/launcher_hexagon.cc @@ -35,6 +35,7 @@ extern "C" { #include "launcher_rpc.h" static std::unique_ptr TheModel; +bool WriteLWPOutput(const std::string&); static AEEResult error_too_small(const std::string& func_name, const std::string& value_name, int given, int needed) { @@ -203,7 +204,7 @@ AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int out } AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcycles, - uint64_t* usecs) { + uint64_t* usecs, int gen_lwp_json) { if (!TheModel) { // No model created. LOG(ERROR) << __func__ << ": no model created"; @@ -220,5 +221,12 @@ AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcyc *pcycles = pc_end - pc_begin; *usecs = us_end - us_begin; + if (gen_lwp_json) { + if (!WriteLWPOutput("lwp.json")) { + LOG(ERROR) << "ERROR: failed to generate lwp json file"; + return AEE_EFAILED; + } + } + return AEE_SUCCESS; } diff --git a/apps/hexagon_launcher/launcher_main.cc b/apps/hexagon_launcher/launcher_main.cc index 163d582db440..1ef3b5d2ff3c 100644 --- a/apps/hexagon_launcher/launcher_main.cc +++ b/apps/hexagon_launcher/launcher_main.cc @@ -27,12 +27,14 @@ #include "launcher_core.h" #include "launcher_util.h" -ExecutionSession* create_execution_session(); +ExecutionSession* create_execution_session(bool gen_lwp_json); -int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path) { +int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path, + bool* gen_lwp_json) { static option long_options[] = { {"in_config", required_argument, nullptr, 0}, {"out_config", required_argument, nullptr, 0}, + {"gen_lwp_json", optional_argument, nullptr, 0}, }; bool show_usage = false; @@ -49,6 +51,9 @@ int parse_command_line(int argc, char* argv[], std::string* in_path, std::string case 1: *out_path = std::string(optarg); break; + case 2: + *gen_lwp_json = true; + break; } } if (in_path->empty() || out_path->empty() || show_usage) { @@ -61,7 +66,8 @@ int parse_command_line(int argc, char* argv[], std::string* in_path, std::string int main(int argc, char* argv[]) { std::string in_path, out_path; - if (parse_command_line(argc, argv, &in_path, &out_path) != 0) { + bool gen_lwp_json; + if (parse_command_line(argc, argv, &in_path, &out_path, &gen_lwp_json) != 0) { return 1; } @@ -70,7 +76,7 @@ int main(int argc, char* argv[]) { return 1; } - ExecutionSession* session_ptr = create_execution_session(); + ExecutionSession* session_ptr = create_execution_session(gen_lwp_json); if (session_ptr == nullptr) { return 1; } diff --git a/apps/hexagon_launcher/launcher_rpc.idl b/apps/hexagon_launcher/launcher_rpc.idl index 6677108a76f0..27e5d1d15d68 100644 --- a/apps/hexagon_launcher/launcher_rpc.idl +++ b/apps/hexagon_launcher/launcher_rpc.idl @@ -29,5 +29,5 @@ interface launcher_rpc : remote_handle64 { AEEResult set_input(in long input_idx, in buffer input_meta, in buffer input_value); AEEResult get_num_outputs(rout long num_outputs); AEEResult get_output(in long output_idx, rout buffer output_meta, rout buffer output_value); - AEEResult run(rout uint64_t pcycles, rout uint64_t usecs); + AEEResult run(rout uint64_t pcycles, rout uint64_t usecs, in long gen_lwp_json); }; diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index 399623ef1c3e..735d21e492b5 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -281,10 +281,14 @@ if(USE_HEXAGON_RPC) # TODO(masahi): Remove rpc_local_session.cc after verifying that things work without it "${TVMRT_SOURCE_DIR}/rpc/rpc_local_session.cc" ) + set(HEXAGON_PROFILER_DIR "${TVMRT_SOURCE_DIR}/hexagon/profiler") # Add the hardware-specific RPC code into the skel library. + set_property(SOURCE ${HEXAGON_PROFILER_DIR}/lwp_handler.S PROPERTY LANGUAGE C) add_library(hexagon_rpc_skel SHARED "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon/rpc_server.cc" "${TVMRT_SOURCE_DIR}/hexagon/rpc/hexagon_rpc_skel.c" + "${HEXAGON_PROFILER_DIR}/prof_utils.cc" + "${HEXAGON_PROFILER_DIR}/lwp_handler.S" ) target_include_directories(hexagon_rpc_skel SYSTEM PRIVATE "${TVMRT_SOURCE_DIR}/hexagon/rpc" @@ -293,6 +297,8 @@ if(USE_HEXAGON_RPC) # executed via run_main_on_sim. add_library(hexagon_rpc_sim SHARED "${TVMRT_SOURCE_DIR}/hexagon/rpc/simulator/rpc_server.cc" + "${HEXAGON_PROFILER_DIR}/prof_utils.cc" + "${HEXAGON_PROFILER_DIR}/lwp_handler.S" ) target_link_libraries(hexagon_rpc_sim -Wl,--whole-archive tvm_runtime -Wl,--no-whole-archive diff --git a/cmake/modules/HexagonSDK.cmake b/cmake/modules/HexagonSDK.cmake index 173f0f3b2d67..ddb158cad95e 100644 --- a/cmake/modules/HexagonSDK.cmake +++ b/cmake/modules/HexagonSDK.cmake @@ -157,9 +157,15 @@ function(_get_hexagon_sdk_property_impl if(_property STREQUAL "SDK_INCLUDE") set(_dirs "${_hexagon_sdk_root}/incs" "${_hexagon_sdk_root}/incs/stddef") elseif(_property STREQUAL "QURT_INCLUDE") + # Set the Hexagon arch directory for runtime linker. + set(_rtld_dir "hexagon_toolv84_${_hexagon_arch}") + if(_hexagon_arch STREQUAL "v69") + set(_rtld_dir "hexagon_toolv84_v68") # Use hexagon_toolv84_v68 for v69 + endif() set(_dirs "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/include/posix" "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/include/qurt" + "${_hexagon_sdk_root}/ipc/fastrpc/rtld/ship/${_rtld_dir}" ) elseif(_property STREQUAL "QURT_LIB") set(_dirs "${_hexagon_sdk_root}/rtos/qurt/${_hexarch_dir}/lib/pic") diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index a1a97595bfd8..9f6b7f9ce5d1 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -747,6 +747,16 @@ TVM_DLL const Op& assume(); */ TVM_DLL const Op& undef(); +/*! + * \brief Profiling intrinsic + */ +TVM_DLL const Op& start_profile_intrinsic(); + +/*! + * \brief Profiling intrinsic + */ +TVM_DLL const Op& end_profile_intrinsic(); + /*! \brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { // array head address diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index e31919fbd223..48372565469b 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -690,6 +690,12 @@ TVM_DLL Pass RemoveWeightLayoutRewriteBlock(bool skip_ndarray_rewrite = false); */ TVM_DLL Pass ManifestSharedMemoryLocalStage(); +/*! + * \brief Insert intrinsic calls to instrument function and loop level profiling. + * \return The pass. + */ +TVM_DLL Pass InstrumentProfileIntrinsics(); + } // namespace transform } // namespace tir } // namespace tvm diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py index 8105e6e716c0..c0e6439d0357 100644 --- a/python/tvm/contrib/hexagon/build.py +++ b/python/tvm/contrib/hexagon/build.py @@ -32,6 +32,7 @@ from typing import Union import tvm +from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler from ..._ffi import libinfo from .session import Session @@ -336,6 +337,29 @@ def get_graph_debug_executor( """ return session.get_graph_debug_executor(graph_json, module, dump_root=dump_root) + @abc.abstractmethod + def get_profile_output( + self, + hex_profiler: HexagonProfiler, + session: Session, + ) -> str: + """Extract profile output. + + Parameters + ---------- + hex_profiler : HexagonProfiler + HexagonProfiler object that contains the profiling related information. + session : Session + Remote session. The session must be established (via __enter__) + prior to calling this function. + + Returns + ------- + profile_data : str + Path of the profiling data file + """ + ... + class HexagonLauncherAndroid(HexagonLauncherRPC): """Hexagon Launcher for Android.""" @@ -392,6 +416,7 @@ def _copy_to_remote( self, local_path: Union[str, pathlib.Path], remote_path: Union[str, pathlib.Path] ): """Abstract method implementation. See description in HexagonLauncherRPC.""" + _check_call_verbose(self._adb_device_sub_cmd + ["push", str(local_path), str(remote_path)]) def _create_remote_directory(self, remote_path: Union[str, pathlib.Path]) -> pathlib.Path: @@ -629,6 +654,32 @@ def stop_server(self): if not self._hexagon_debug: self.cleanup_directory() + def get_profile_output( + self, + hex_profiler: HexagonProfiler, + session: Session, + ): + """Abstract method implementation. See description in HexagonLauncherRPC.""" + profile_data = "" + if hex_profiler.is_lwp_enabled(): + temp_dir = hex_profiler.get_temp_dir() + remote_path = hex_profiler.get_remote_path() + if not temp_dir: + raise RuntimeError("tempdir not passed") + fname = "lwp.json" + out_path = os.path.join(remote_path, fname) + profile_data = temp_dir.relpath(fname) + ret = session.get_profile_output(hex_profiler.get_mode(), fname) + if ret: + subprocess.check_call(self._adb_device_sub_cmd + ["pull", out_path, profile_data]) + else: + raise RuntimeError("Error generating profile output") + elif hex_profiler.profiling_mode == "etm": + hex_profiler.pull_files_for_etm_processing(self._workspace) + else: + raise RuntimeError("Profiling not enabled") + return profile_data + class HexagonLauncherSimulator(HexagonLauncherRPC): """Hexagon Launcher for Hexagon simulator.""" @@ -735,6 +786,26 @@ def stop_server(self): """Abstract method implementation. See description in HexagonLauncherRPC.""" self._server_process.terminate() + def get_profile_output( + self, + hex_profiler: HexagonProfiler, + session: Session, + ): + """Abstract method implementation. See description in HexagonLauncherRPC.""" + profile_data = "" + if hex_profiler.is_lwp_enabled(): + fname = "lwp.json" + profile_data = f"{self._workspace}/{fname}" + ret = session.get_profile_output(hex_profiler.get_mode(), fname) + if not ret: + raise RuntimeError("Error generating profile output") + elif hex_profiler.profiling_mode == "etm": + raise RuntimeError("ETM Profiling not supported on the simulator") + else: + raise RuntimeError("Profiling not enabled") + + return profile_data + # https://stackoverflow.com/a/52872579/2689797 def _is_port_in_use(port: int) -> bool: diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py new file mode 100755 index 000000000000..9a5df3d9b99a --- /dev/null +++ b/python/tvm/contrib/hexagon/hexagon_profiler.py @@ -0,0 +1,119 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Define HexagonProfiler class to enable profiling for Hexagon""" + +import os +import subprocess +from tvm.ir.transform import PassContext +from tvm.contrib.hexagon.profiling.process_lwp_data import process_lwp_output +from tvm.relay.backend.executor_factory import ExecutorFactoryModule +from tvm.contrib import utils + + +class HexagonProfiler: + """Hexagon Profiler""" + + def __init__( + self, dso_binary: str, module: ExecutorFactoryModule, hexagon_server_process, enable_debug + ): + """Configure HexagonProfiler""" + # Save test .so to process profiling data + self._temp_dir = utils.tempdir(keep_for_debug=enable_debug) + self._dso_binary_path = self._temp_dir.relpath(dso_binary) + module.get_lib().save(self._dso_binary_path) + + self._android_serial_number = os.environ.get("ANDROID_SERIAL_NUMBER") + self._remote_path = "" + self._logcat_path = "" + + self._profiling_mode = None + config = PassContext.current().config + if self._android_serial_number is None: + raise RuntimeError("ANDROID_SERIAL_NUMBER must be set for profiling") + + if ("tir.instrument_lwp", True) in config.items(): + # Set profiling mode + self._profiling_mode = "lwp" + + if self._android_serial_number != "simulator": + # Clear the logcat buffer and create a child process to redirect logcat output + # into a file. + launcher = hexagon_server_process["launcher"] + subprocess.check_call(launcher._adb_device_sub_cmd + ["logcat", "-c"]) + self._logcat_path = self._temp_dir.relpath("logcat.log") + self._fo = open(self._logcat_path, "w") + self._proc = subprocess.Popen( + launcher._adb_device_sub_cmd + ["logcat"], stdout=self._fo + ) + + # Get the remote workspace on the device from where the lwp data needs to be copied. + self._remote_path = launcher._workspace + + if self._profiling_mode is None: + raise RuntimeError("Profiling mode was not set or was not a valid one.") + + def get_mode(self): + return self._profiling_mode + + def is_lwp_enabled(self): + return self._profiling_mode == "lwp" + + def get_temp_dir(self): + return self._temp_dir + + def get_remote_path(self): + return self._remote_path + + def get_profile_output(self, hexagon_launcher, hexagon_session): + """Get runtime profiling data""" + prof_out = hexagon_launcher.get_profile_output(self, hexagon_session) + + print("lwp json can be found at -- ", prof_out) + + # Process lightweight profiling output into an easily readable csv file + # The post-processing requires following parameters: + # 1) Path of the binary file + # 2) android_serial_number + # 3) Path of the lwp json file (lwp.json) which gets created in the current directory + # 4) Path to the run log depending on the environment: + # a) For on-device runs: + # Use logcat output as the run log + # b) For simulator runs: + # Use "stdout.txt" as the run log. There is no need to specify the full path to + # "stdout.txt" as it will be inferred based on 'prof_out' location. + # 5) lwp processed output file - "lwp.csv" + # + lwp_csv = self._temp_dir.relpath("lwp.csv") + if self._android_serial_number == "simulator": + process_lwp_output( + self._dso_binary_path, self._android_serial_number, prof_out, "stdout.txt", lwp_csv + ) + else: + # For on-device run + self._proc.kill() # End the child process for logcat + self._fo.close() + if os.path.exists(self._logcat_path): + process_lwp_output( + self._dso_binary_path, + self._android_serial_number, + prof_out, + self._logcat_path, + lwp_csv, + ) + else: + raise RuntimeError("Error processing lwp output - missing logcat file") diff --git a/python/tvm/contrib/hexagon/profiling/process_lwp_data.py b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py new file mode 100644 index 000000000000..eb92228b7cf3 --- /dev/null +++ b/python/tvm/contrib/hexagon/profiling/process_lwp_data.py @@ -0,0 +1,388 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json +import csv +import subprocess +import argparse +import os +from re import search, compile +from collections import OrderedDict + +ENABLE_DEBUG = False +""" +Process lightweight profiling output and generate a CSV file with processor +cycles for the instrumented functions and loops. + +Please note that some assumptions have been made while processing +the lightweight profiling output. They are as follows: + +1) We don't expect profiled functions to call another profiled function. + This constraint can be relaxed if needed but it simplifies the processing + significantly without introducing any limitations for our use case. +2) For now, it's also assumed that every unique section (loop) ID has same start + and end offset which will not be true while a loop gets unrolled as it will + create multiple profiling section with the same ID. The current + implementation doesn't handle this case. + +""" + + +def get_func_info(model_so): + """Get all the .text sections along with their start and end offset values""" + hexagon_nm_path = os.environ["HEXAGON_TOOLCHAIN"] + "/bin/hexagon-nm" + out = subprocess.Popen( + [hexagon_nm_path, "--print-size", model_so], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + stdo, stde = out.communicate() + stdo = stdo.decode("utf-8") + + func_info = [] + for l in stdo.split("\n"): + info = {} + if search(" (T|t) ", l): # If .text section + parts = l.split(" ") + assert len(parts) == 4 + info["start"] = int(parts[0], base=16) + info["end"] = int(parts[0], base=16) + int(parts[1], base=16) + info["name"] = parts[3] + func_info.append(info) + + # Sort the entries in the increasing order of the start offset value. + func_info = sorted(func_info, key=lambda d: d["start"]) + + if ENABLE_DEBUG: + print("func_info :\n ") + for f in func_info: + print(f) + return func_info + + +def find_func(func_info, offset): + """For a given offset, find the function it belongs to.""" + fidx = 0 + lidx = len(func_info) - 1 + while fidx <= lidx: + midx = (fidx + lidx) // 2 + ms = func_info[midx]["start"] + me = func_info[midx]["end"] + if fidx == lidx: + assert ( + offset >= ms and offset <= me + ), f"Couldn't find a function for this offset: {offset}" + return fidx + else: + if offset > me: + fidx = midx + 1 + elif offset < ms: + lidx = midx - 1 + else: + return midx + assert False, "Possible mismatch between model .so and LWP data" + + +def accumulate_cycles(overall_cycles, func_cycles, func_name): + """Accumulate function cycles""" + acc_cycles = overall_cycles[func_name] + for id in func_cycles: + assert id in acc_cycles, f"id [{id}] missing in the existing function record" + assert ( + acc_cycles[id]["start"] == func_cycles[id]["start"] + ), "Offset value doesn't match with the existing function record." + acc_cycles[id]["cycles"] += func_cycles[id]["cycles"] + acc_cycles[id]["count"] += func_cycles[id]["count"] + overall_cycles.update({func_name: acc_cycles}) + return overall_cycles + + +def adjust_per_loop_counts(overall_cycles, data): + """ + Use execution count and the number of entries recorded for each function/loop + to compute the overall cycles spent on them. + """ + for func in overall_cycles: + func_cycles = overall_cycles[func] + for id in func_cycles: + exec_count = data["loop_counts"][id] + rec_count = func_cycles[id]["count"] + assert exec_count != 0, "Execution count should have been non-zero." + assert rec_count != 0, "Entry count should have been non-zero." + exec_cycles = ((int(func_cycles[id]["cycles"])) * exec_count) // rec_count + func_cycles[id]["cycles"] = exec_cycles + func_cycles[id]["count"] = exec_count + overall_cycles.update({func: OrderedDict(sorted(func_cycles.items()))}) + return overall_cycles + + +def create_csv_report(overall_cycles, fname): + """Create csv report""" + header = [ + "function name", + "loop/function id", + "loop depth", + "start offset", + "end offset", + "pcycles", + "parent count", + ] + with open(fname, "w") as f: + writer = csv.writer(f) + writer.writerow(header) + for func in overall_cycles: + func_cycles = overall_cycles[func] + data = [] + root = -1 + outer_most = -1 + for key, value in func_cycles.items(): + if value["parent"] == -1: + assert root == -1, "Can't have multiple root nodes." + root = key + + data.append(func) + data.append(key) + if value["parent"] == -1: + data.append("-") # Total cycles over all invocations of this function. + elif value["parent"] == root: + data.append(0) + outer_most = key + else: + if outer_most > -1: + data.append(key - outer_most) + else: + data.append(key - value["parent"]) + data.append(hex(value["start"])) + data.append(hex(value["end"])) + data.append(value["cycles"]) + data.append(value["count"]) + writer.writerow(data) + data.clear() + + +def process_data(data, func_info, so_ld_addr): + """Process data""" + # Keep an ordered list of loop IDs as they are being visited. This is used + # to match entry and exit pairs. Once the function/loop is processed, it's + # removed from the list. + ordered_visited_list = [] + # Store information regarding visited nodes as they are being processed. Once + # the function/loop is processed, it's removed from the set. + visited_set = {} + # Dictionary to store cycles for the entire model which is grouped into functions. + overall_cycles = {} + func_cycles = {} + + func_idx = -1 + func_name = "" + prev_func_name = "" + func_start = 0 + func_end = 0 + save_data = False + # Iterate over all the entries in the LWP data file and process them + # to construct a report. + for entry in data["entries"]: + id = entry["id"] + offset = entry["ret"] - so_ld_addr + + # Recorded return address should fall within the function begin and end + # offsets. If not, find the function it belongs to. + if offset < func_start or offset > func_end: + prev_func_name = func_name + if ENABLE_DEBUG: + print("offset : ", offset) + print("id : ", id) + + func_idx = find_func(func_info, offset) + func_name = func_info[func_idx]["name"] + func_start = func_info[func_idx]["start"] + func_end = func_info[func_idx]["end"] + if ENABLE_DEBUG: + print("func_name : ", func_name) + + if save_data: + # overall_cycles = save_func_cycles(prev_func_name, overall_cycles, func_cycles, ordered_visited_list) + # Done processing the previous function, copy its info into 'overall_cycles'. + if prev_func_name not in overall_cycles: + overall_cycles[prev_func_name] = func_cycles.copy() + else: + # Accumulate cycles into existing function entry. + overall_cycles = accumulate_cycles(overall_cycles, func_cycles, prev_func_name) + # We don't allow for fused operators (functions) calling another operator. + if ENABLE_DEBUG: + print("ordered_visited_list : ", ordered_visited_list) + + assert len(ordered_visited_list) == 0, ( + f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n" + f"\t Possible reasons -- \n" + f"\t\t1) Mismatch between model .so and json file.\n" + f"\t\t2) LWP buffer may have overflowed resulting into missing entries!" + ) + func_cycles.clear() + + save_data = True + + if id not in visited_set: # Found 'entry' record + visited_info = {"func_idx": func_idx, "ret": offset, "cyc": entry["cyc"]} + visited_set[id] = visited_info + ordered_visited_list.append(id) + else: # Found 'exit' record + # This should be the last entry in the ordered_visited_list. If not, error out. + assert ordered_visited_list[-1] == id, ( + "Problem with LWP output - Interleaved handler calls found." + f"Loop [{ordered_visited_list[-1]}] hasn't exited yet." + ) + ordered_visited_list.pop() + entry_node = visited_set.pop(id) + assert ( + entry_node["func_idx"] == func_idx + ), f'Error - Found under a different function name : {entry_node["func_idx"]}' + cycles = entry["cyc"] - entry_node["cyc"] + parent = -1 + if ordered_visited_list: + parent = int(ordered_visited_list[-1]) + if id in func_cycles: + fcycles = func_cycles[id] + fcycles["cycles"] += cycles + fcycles["count"] += 1 + func_cycles[id] = fcycles + else: + func_cycles[id] = { + "cycles": cycles, + "start": entry_node["ret"], + "end": offset, + "parent": parent, + "count": 1, + } + + # Done processing the previous function, copy its info into 'overall_cycles'. + if func_name not in overall_cycles: + overall_cycles[func_name] = func_cycles.copy() + else: + # Accumulate cycles into existing function entry. + overall_cycles = accumulate_cycles(overall_cycles, func_cycles, func_name) + # We don't allow for fused operators (functions) calling another operator. + if ENABLE_DEBUG: + print("ordered_visited_list : ", ordered_visited_list) + + assert len(ordered_visited_list) == 0, ( + f"\nDone processing function [{prev_func_name}] but ordered_visited_list not empty.\n" + f"\t Possible reasons -- \n" + f"\t\t1) Mismatch between model .so and json file.\n" + f"\t\t2) LWP buffer may have overflowed resulting into missing entries!" % prev_func_name + ) + + overall_cycles = adjust_per_loop_counts(overall_cycles, data) + return overall_cycles + + +def get_load_addr(binary_path: str, serial_number: str, lwp_json: str, run_log: str): + """Get load address of the binary file""" + if serial_number == "simulator": + basedir = os.path.dirname(lwp_json) + if run_log is None: + run_log = os.path.join(basedir, "stdout.txt") + else: + # If the directory name is specified for the run_log of the + # simulator (stdout.txt) then it must be same as lwp_json. + run_log_dir = os.path.dirname(run_log) + assert ( + run_log_dir == "" or run_log_dir == basedir + ), f"stdout.txt and {os.path.basename(lwp_json)} must be in the same directory" + run_log = os.path.join(basedir, os.path.basename(run_log)) + # To extract load address for the simulator run + pattern = compile(r"Model.*: (\w+):") + else: + # To extract load address for on-device run + binary_name = os.path.basename(binary_path) + pattern = compile(r"{}, len \w+, laddr (\w+)".format(binary_name)) + + with open(run_log, "r") as f: + lines = f.read() + a = pattern.search(lines) + load_addr = int(a.group(1), 16) + if ENABLE_DEBUG: + print("load_addr : ", load_addr) + return load_addr + + +def process_lwp_output( + binary_path: str, + serial_number: str, + lwp_json: str, + run_log: str, + lwp_out: str, + enable_debug: bool = False, +): + """Process lightweight profiling data""" + # Enable debug messages + global ENABLE_DEBUG + ENABLE_DEBUG = enable_debug + + # Get load address for the binary + load_addr = get_load_addr(binary_path, serial_number, lwp_json, run_log) + # Opening JSON file + with open(lwp_json, "r") as f: + # Returns JSON object as a dictionary + data = json.load(f) + + # Get function names, and their start and end offsets from the model .so + func_info = get_func_info(binary_path) + + # Get the load address for model .so. + so_ld_addr = load_addr + + # Process profiling data to construct a CSV report. + overall_cycles = process_data(data, func_info, so_ld_addr) + create_csv_report(overall_cycles, lwp_out) + print("lwp processed output written to -- ", lwp_out) + print("[NOTE: Use '--hexagon-debug' to keep the temp directory]") + + +def get_args(): + """Add commandline arguments to run the script manually if needed""" + parser = argparse.ArgumentParser() + parser.add_argument("--lwp-json", help="LWP json file", required=True) + parser.add_argument("--serial-num", help="device-id/simulator", required=True) + parser.add_argument("--test-so", help="Test shared library", required=True) + parser.add_argument( + "--run-log", + help="Logcat file for on-device run and stdout.txt for simulator run", + required=True, + ) + parser.add_argument("--lwp-out", help="LWP output file name", required=True) + parser.add_argument( + "--debug", + help="Enable debug output from the script", + dest="debug", + action="store_true", + required=False, + ) + parser.set_defaults(debug=False) + args = parser.parse_args() + + global ENABLE_DEBUG + ENABLE_DEBUG = args.debug + + return args + + +if __name__ == "__main__": + args = get_args() + process_lwp_output( + args.test_so, args.serial_num, args.lwp_json, args.run_log, args.lwp_out, args.debug + ) diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py index b69382fe1290..d6ea51b53e17 100644 --- a/python/tvm/contrib/hexagon/session.py +++ b/python/tvm/contrib/hexagon/session.py @@ -393,3 +393,8 @@ def _aot_executor_from_factory( remote_file_path = self.upload(binary_path, binary_name) return self.get_aot_executor(remote_file_path) + + def get_profile_output(self, mode: str, path: str): + assert isinstance(mode, str), f"Invalid mode type, {type(mode)} != str" + assert isinstance(path, str), f"Invalid path type, {type(path)} != str" + return self._rpc.get_function("tvm.hexagon.get_profile_output")(mode, path) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 7b3a81acc525..82533a2f9f5a 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -995,3 +995,14 @@ def ManifestSharedMemoryLocalStage(): The result pass """ return _ffi_api.ManifestSharedMemoryLocalStage() # type: ignore + + +def InstrumentProfileIntrinsics(): + """Insert intrinsic calls to instrument function and loop level profiling. + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.InstrumentProfileIntrinsics() # type: ignore diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 5f8c8742695d..2b9a354f5c7e 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -52,6 +52,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array>); TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool); using runtime::PackedFunc; using runtime::TVMArgs; @@ -157,6 +158,8 @@ Array CreatePassList(bool disable_loop_partition) { pass_ctx->GetConfig>>("tir.add_lower_pass", Array>()) .value(); + bool instrument_lwp = pass_ctx->GetConfig("tir.instrument_lwp", Bool(false)).value(); + Array user_lower_phase0 = Array(); Array user_lower_phase1 = Array(); Array user_lower_phase2 = Array(); @@ -253,6 +256,14 @@ Array CreatePassList(bool disable_loop_partition) { pass_list.push_back( tir::transform::CommonSubexprElimTIR(!disable_cse_tir, enable_equiv_terms_in_cse_tir)); + // This pass instruments the loops with the profile builtin calls to capture the runtime + // performance data (only enabled for Hexagon at the moment). To ensure that no other + // optimizations are performed on the instrumented code, this pass must be added at the end + // of the list. + if (instrument_lwp) { + pass_list.push_back(tir::transform::InstrumentProfileIntrinsics()); + } + return pass_list; } diff --git a/src/runtime/dso_library.cc b/src/runtime/dso_library.cc index 81eb30ee12d2..a0c6c48b5e44 100644 --- a/src/runtime/dso_library.cc +++ b/src/runtime/dso_library.cc @@ -34,6 +34,12 @@ #include #endif +#if defined(__hexagon__) +extern "C" { +#include +} +#endif + namespace tvm { namespace runtime { @@ -118,6 +124,14 @@ void DSOLibrary::Load(const std::string& name) { lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " " << dlerror(); +#if defined(__hexagon__) + int p; + int rc = dlinfo(lib_handle_, RTLD_DI_LOAD_ADDR, &p); + if (rc) + FARF(ERROR, "error getting model .so start address : %u", rc); + else + FARF(ALWAYS, "Model .so Start Address : %x", p); +#endif } void* DSOLibrary::GetSymbol_(const char* name) { return dlsym(lib_handle_, name); } diff --git a/src/runtime/hexagon/profiler/README.md b/src/runtime/hexagon/profiler/README.md new file mode 100644 index 000000000000..492e45c98498 --- /dev/null +++ b/src/runtime/hexagon/profiler/README.md @@ -0,0 +1,99 @@ + + + + + + + + + + + + + + + + + +# Hexagon lightweight instrumentation based profiling (LWP) + +For Hexagon, LWP can be used to get function and loop level processor cycle count. +This is done by instrumenting the code with profiling builtin calls using a TIR pass. +During codegen, these builtin calls are replaced with the calls to a hexagon specific +handler which records the runtime information into a buffer. +This buffer is written into a JSON file ('lwp.json') which is processed to construct +function and loop level profiling information as a csv file. + +**Note:** During codegen, the profiling builtin calls are ignored for other targets. + +The TIR pass offers several config flags to control the level of instrumentation +as mentioned below: + +1) `lwp_disable_func_prof`: To disable function level profiling. By default, it is +set to 'False', i.e., the function level profiling is enabled. + +2) `instr_siblings`: When enabled, only loops with siblings are instrumented and rest are +ignored. The inner-most loops are always excluded from instrumentation unless overwritten +using `lwp_min_height`. This is done to minimize the adverse effect of instrumentation on +actual performance. By default, it is set to 'True'. + +3) `lwp_max_depth`: To instrument loops up to a certain depth. This flag is effective +only when `instr_siblings` is disabled. By default, it is set to 0. + +4) `lwp_min_height`: To exclude inner loops up to a certain height from instrumentation. +By default, it is set to 1. + +For additional usage information on various config flags, please refer to the tests in +`tests/python/unittest/test_tir_transform_profiling_instr.py` + + +## How to use lightweight profiling with RPC Launcher: + +`tests/python/contrib/test_hexagon/test_launcher.py` contains two tests, `test_lwp` and +`test_lwp_multiple_conv2d`, to demonstrate lightweight profiling usage. + +The steps involved are as follows: + +1) While building a model, set `tir.instrument_lwp` to `True`. + By default, the builtin calls will only be inserted for the loops with siblings. But it + can be altered using LWP config options as described above. +2) Create `HexagonProfiler` object + +``` +with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}): + lowered = tvm.relay.build( + relay_mod, + tvm.target.Target(target_hexagon, host=target_hexagon), + ... + ) + + # Create HexagonProfiler object. It sets the profiling mode based on the PassContext config. + # '--hexagon-debug' to pytest can be used to retain any temp or test directories to + # inspect the profiling data. + profiler = HexagonProfiler(lowered, hexagon_server_process, hexagon_debug) +``` + +4) Run the model and get the profiling data as a CSV file. It is done by post-processing + 'lwp.json' file generated during runtime. + +``` + graph_mod.run(**inputs) + + # Get lightweight profiling output as a CSV file + profiler.get_profile_output(hexagon_launcher, hexagon_session, hexagon_server_process) +``` +**Note:** + +- For on-device runs, 'lwp.json' is copied into a temp directory along with the test .so and the processed + CSV file +- For the simulator runs, the file is generated in the simulator test output directory. Test .so + will still be in a separate temp directory. lwp CSV file will also be in the same directory. + +**Helpful Hints:** + +- To prevent the test directories on the Hexagon device as well as temporary test directory on x86 +from being deleted for profiling related runs, pass `--hexagon-debug` to pytest. + +``` +python -m pytest --hexagon-debug tests/python/contrib/test_hexagon/test_launcher.py::test_lwp +``` diff --git a/src/runtime/hexagon/profiler/lwp_handler.S b/src/runtime/hexagon/profiler/lwp_handler.S new file mode 100644 index 000000000000..611c0713111a --- /dev/null +++ b/src/runtime/hexagon/profiler/lwp_handler.S @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* +Lightweight profiling handler to record processor cycles in a buffer +(pointed by __lwp_buffer_ptr) for a given invocation of the handler. To keep the +buffer size within a resonable limit, we only recond data for the first 100 +invocation of the handler for a given loop or function ID (passed in R0 register). +The buffer size wouldn't be a concern if the loops with only siblings are getting +profiled. However, since the instrumentation provides several different profiling +options, this approach ensures that they all function as expexted. We use second +buffer (pointed by __lwp_counter) to keep count of the calls made to lwp_handler +function for each function/loop. + +Brief explanation of all the global variables used: +1) __lwp_counter : Pointer to the buffer that keeps count of the number of times handler + is called for a given ID. To reduce the complexity of the handler, __lwp_counter is + indexed using the ID itself. +2) __lwp_buffer_ptr : Pointer to the buffer that records loop/function ID, processor cycles + and return addresss of the handler. Return address is used to reconstruct the call graph + (loop-nest) to make it easier to analyze the profiling data. +3) __lwp_buffer_size : Size of the buffer +4) __lwp_buffer_count : Offset into main lwp buffer where data for the current handler +invocation needs to be written. + +NOTE: The handler function saves and restores R0-R5 registers which are caller saved registers +on Hexagon and should be handled at the callsite. However, to reduce the codegen impact +of the handler calls on the caller functions, we decided to move this part into the +handler itself. + +*/ + .text + .globl lwp_handler + .falign + .type lwp_handler,@function +lwp_handler: + { allocframe(#24) // Allocate 24 bytes on the stack to save R0-R5 registers + memd(r29+#-16) = r5:4 // Save R5,R4 + } + { + memd(r29+#8) = r3:2 // Save R3,R2 + memd(r29+#0) = r1:0 // Save R1, R0 + r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL) // Get GOT address + } + { + r5 = memw(r2+##__lwp_counter@GOT) // Get address of the pointer to __lwp_counter + r3 = memw(r2+##__lwp_buffer_count@GOT) // Get the address of __lwp_buffer_count + } + { + r5 = memw(r5+#0) // Get the address of __lwp_counter (address of the main lwp buffer) + r3 = memw(r3+#0) // Get the __lwp_buffer_count value (offset into the main buffer) + } + { + r4 = memw(r5+r0<<#2) // Get the handler invocation count for the ID (passed in R0) + r1 = memw(r2+##__lwp_buffer_size@GOT) // Get the address of __lwp_buffer_size + } + { + r4 = add(r4,#1) // Increment count + memw(r5+r0<<#2) = r4.new // Update count in __lwp_counter for a given ID + r1 = memw(r1+#0) // Get the buffer size + } + { + p0 = cmp.gtu(r4,#100) // Exit if count for a given ID is greater than 100 + if (p0.new) jump:nt .LBB0_3 + r5 = memw(r2+##__lwp_buffer_ptr@GOT) // Get address of the pointer to __lwp_buffer_ptr + } + { + r5 = memw(r5+#0) // Get address of __lwp_buffer_ptr + r2 = memw(r2+##__lwp_buffer_count@GOT) // Get address of __lwp_buffer_count + } + { + r4 = add(r3,#4) // Increment the offset by 4 since 4 int32 values are stored for each invocation + if (!cmp.gtu(r1,r4.new)) jump:t .LBB0_3 // Exit if the main lwp buffer has run out of space + } + { + r5 = addasl(r5,r3,#2) // Get the address where the data needs to be recorded + memw(r2+#0) = r4 // Save next offset into __lwp_buffer_count + } + { + memw(r5+#0) = r31 // Save return address of this function + r1:0 = C15:14 // Control registers that keep processor cycle count (64-bits) + memw(r5+#4) = r0 // Save loop/function ID + } + { + memw(r5+#12) = r1 // Save upper 32 bits + memw(r5+#8) = r0 // Save lower 32 bits + } + .falign +.LBB0_3: + { + r5:4 = memd(r29+#16) // Restore the registers from the stack + r3:2 = memd(r29+#8) + } + { + r1:0 = memd(r29+#0) + dealloc_return // Deallocate the stack and return + } +.Lfunc_end0: + .size lwp_handler, .Lfunc_end0-lwp_handler diff --git a/src/runtime/hexagon/profiler/prof_utils.cc b/src/runtime/hexagon/profiler/prof_utils.cc new file mode 100644 index 000000000000..45cbe607031d --- /dev/null +++ b/src/runtime/hexagon/profiler/prof_utils.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +// The max loop/function id used among all lwp_handler calls. Since +// the id is used to index into the lwp_counter buffer, the size of the +// buffer must be equal or greater than the max possible id. +#define LWP_COUNTER_SIZE 5000 + +// LWP_BUFFER_SIZE needs to be at most 100 * LWP_COUNTER_SIZE since 100 is +// the max number of entries recorded for each instrumented location. +#define LWP_BUFFER_SIZE (LWP_COUNTER_SIZE * 100) + +uint32_t lwp_counter[LWP_COUNTER_SIZE] = {0}; +uint32_t lwp_buffer[LWP_BUFFER_SIZE]; +uint32_t* __lwp_counter = lwp_counter; +uint32_t* __lwp_buffer_ptr = lwp_buffer; +uint32_t __lwp_buffer_size = LWP_BUFFER_SIZE; +uint32_t __lwp_enable_flag = 1; +uint32_t __lwp_buffer_count = 0; + +bool WriteLWPOutput(const std::string& out_json) { + std::ostringstream s; + s << "{\n"; + s << "\t\"entries\":[\n"; + for (size_t i = 0; i < __lwp_buffer_count; i += 4) { + s << "\t{\n"; + s << "\t\t\"ret\":" << std::dec << lwp_buffer[i] << ",\n"; + s << "\t\t\"id\":" << std::dec << lwp_buffer[i + 1] << ",\n"; + uint64_t pcycles = (static_cast(lwp_buffer[i + 3]) << 32) + lwp_buffer[i + 2]; + s << "\t\t\"cyc\":" << std::dec << pcycles << "\n"; + s << "\t}"; + if (i < __lwp_buffer_count - 4) { + s << ",\n"; + } + } + s << "\t],\n\n"; + s << "\t\"loop_counts\":[\n"; + for (size_t i = 0; i < LWP_COUNTER_SIZE; i++) { + s << "\t\t" << lwp_counter[i] / 2; + if (i < LWP_COUNTER_SIZE - 1) + s << ",\n"; + else + s << "\n"; + } + s << "\t]\n}\n"; + std::ofstream ofc(out_json); + if (!ofc.is_open()) { + return false; + } + + ofc << s.str() << "\n"; + + if (!ofc) { + return false; + } + ofc.close(); + return true; +} diff --git a/src/runtime/hexagon/profiler/prof_utils.h b/src/runtime/hexagon/profiler/prof_utils.h new file mode 100644 index 000000000000..e086f7a26b36 --- /dev/null +++ b/src/runtime/hexagon/profiler/prof_utils.h @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prof_utils.h + */ +#ifndef TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_ +#define TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_ + +#include + +bool WriteLWPOutput(const std::string&); + +#endif // TVM_RUNTIME_HEXAGON_PROFILER_PROF_UTILS_H_ diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc index b4799d5d7127..41c63d0affeb 100644 --- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc +++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc @@ -39,6 +39,7 @@ extern "C" { #include "../../../minrpc/minrpc_server.h" #include "../../hexagon/hexagon_common.h" #include "../../hexagon/hexagon_device_api.h" +#include "../../profiler/prof_utils.h" #include "hexagon_rpc.h" namespace tvm { @@ -329,3 +330,15 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.load_module") tvm::ObjectPtr n = tvm::runtime::CreateDSOLibraryObject(soname); *rv = CreateModuleFromLibrary(n); }); + +TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output") + .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) { + std::string profiling_mode = args[0]; + std::string out_file = args[1]; + if (profiling_mode.compare("lwp") == 0) { + *rv = WriteLWPOutput(out_file); + } else { + HEXAGON_PRINT(ERROR, "ERROR: Unsupported profiling mode: %s", profiling_mode.c_str()); + *rv = false; + } + }); diff --git a/src/runtime/hexagon/rpc/simulator/rpc_server.cc b/src/runtime/hexagon/rpc/simulator/rpc_server.cc index 119e999276f8..41bb2da6f8b1 100644 --- a/src/runtime/hexagon/rpc/simulator/rpc_server.cc +++ b/src/runtime/hexagon/rpc/simulator/rpc_server.cc @@ -29,6 +29,7 @@ #include "../../../library_module.h" #include "../../../minrpc/minrpc_server.h" #include "../../hexagon_common.h" +#include "../../profiler/prof_utils.h" #include "hexagon_sim_proto.h" #include "tvm/runtime/packed_func.h" #include "tvm/runtime/registry.h" @@ -336,3 +337,15 @@ TVM_REGISTER_GLOBAL("tvm.hexagon.load_module") tvm::ObjectPtr n = tvm::runtime::CreateDSOLibraryObject(soname); *rv = CreateModuleFromLibrary(n); }); + +TVM_REGISTER_GLOBAL("tvm.hexagon.get_profile_output") + .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) { + std::string profiling_mode = args[0]; + std::string out_file = args[1]; + if (profiling_mode.compare("lwp") == 0) { + *rv = WriteLWPOutput(out_file); + } else { + HEXAGON_PRINT(ERROR, "ERROR: Unsupported profiling mode: %s", profiling_mode.c_str()); + *rv = false; + } + }); diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index ff59dfcceb8d..9552a45a60f9 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -75,6 +75,7 @@ class CodeGenHexagon final : public CodeGenCPU { using CodeGenCPU::VisitStmt_; llvm::Value* VisitExpr_(const BufferLoadNode* op) override; + llvm::Value* CreateIntrinsic(const CallNode* op) override; llvm::Value* CreateCallExtern(Type ret_type, String global_symbol, const Array& args, bool skip_first_arg) override; @@ -193,6 +194,28 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) { return CodeGenCPU::VisitExpr_(op); } +llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) { +#if TVM_LLVM_VERSION >= 150 + if (op->op.same_as(builtin::start_profile_intrinsic()) || + op->op.same_as(builtin::end_profile_intrinsic())) { + llvm::Value* id = MakeValue(op->args[0]); + auto instrprof_id = llvm::Intrinsic::hexagon_instrprof_custom; + llvm::Function* func = llvm::Intrinsic::getDeclaration(module_.get(), instrprof_id); + llvm::GlobalVariable* name_var = module_->getGlobalVariable("handler_name"); + if (!name_var) { + llvm::StringRef init_str = "lwp_handler"; + llvm::Constant* init = llvm::ConstantDataArray::getString(module_->getContext(), init_str); + + name_var = new llvm::GlobalVariable(*module_, init->getType(), true, + llvm::GlobalValue::InternalLinkage, init, "handler_name"); + } + llvm::Type* t_int8_p_ = t_int8_->getPointerTo(); + return builder_->CreateCall(func, {llvm::ConstantExpr::getBitCast(name_var, t_int8_p_), id}); + } +#endif + return CodeGenCPU::CreateIntrinsic(op); +} + void CodeGenHexagon::CreatePrintf(const std::string& format, llvm::ArrayRef format_args) { // This function generates LLVM instructions to call HAP_debug_v2, diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 6a50dd4534c2..ea8a5ff5106a 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -1318,6 +1318,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { // TODO(masahi): Support atomic for CPU backend LOG(FATAL) << "CPU backend does not support atomic add yet."; return nullptr; + } else if (op->op.same_as(builtin::start_profile_intrinsic()) || + op->op.same_as(builtin::end_profile_intrinsic())) { + LOG(INFO) << "Ignoring profile_intrinsic ... " << op->op; + return nullptr; } else { LOG(FATAL) << "unknown intrinsic " << op->op; return nullptr; diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 1e2d790c76e1..b605b9de1e02 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -302,6 +302,12 @@ TIR_DEFINE_BUILTIN_FUNC(undef) .set_attr("TCallEffectKind", Integer(CallEffectKind::kReadState)) .set_num_inputs(0); +TIR_DEFINE_BUILTIN_FUNC(start_profile_intrinsic) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)); + +TIR_DEFINE_BUILTIN_FUNC(end_profile_intrinsic) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)); + } // namespace builtin } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/profile_instrumentation.cc b/src/tir/transforms/profile_instrumentation.cc new file mode 100644 index 000000000000..5f52fc6630bc --- /dev/null +++ b/src/tir/transforms/profile_instrumentation.cc @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file profile_instrumentation.cc + */ +// Insert profile intrinsic at loop and function level. During codegen, +// these instruction can be replaced with a call to a target specific handler +// and can be used to capture profiling information such as processor cycles. + +#include +#include +#include +#include +#include + +namespace tvm { +namespace tir { +namespace lwp { + +TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_disable_func_prof", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_max_depth", Integer); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.lwp_min_height", Integer); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.instr_siblings", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.reset_start_id", Bool); + +static int32_t start_id = 0; + +struct LoopInfo { + LoopInfo() = default; + LoopInfo(unsigned i, unsigned d, unsigned h = 0) : id(i), depth(d), height(h) { + has_siblings = false; + has_parallel = false; + } + unsigned id; + int32_t depth; + int32_t height; + bool has_siblings; + // Set to 'true' if ForKind::kParallel is set for the current loop or one of its ancestor + bool has_parallel; +}; + +using LoopInfoMap = std::unordered_map; +// Traverse loops depth first and assign them a unique number. +class LoopAnalyzer : public StmtExprVisitor { + public: + LoopInfoMap Analyze(const Stmt& stmt) { + this->VisitStmt(stmt); + return loops; + } + void VisitStmt_(const ForNode* op) final { + LoopInfo loop_info(start_id, 0); + start_id++; + loop_info.height = TraverseLoop(op->body, 0); + loops[op] = loop_info; + } + + unsigned TraverseLoop(const Stmt& stmt, unsigned parent_depth, bool has_parallel = false) { + if (stmt->IsInstance()) { + std::vector siblings; + unsigned height = 0; + bool has_loop = false; + const SeqStmtNode* n = stmt.as(); + for (Stmt s : n->seq) { + if (s->IsInstance()) { + has_loop = true; + const ForNode* f = s.as(); + LoopInfo loop_info(start_id, parent_depth + 1); + start_id++; + bool parent_parallel = false; + if (has_parallel) { + loop_info.has_parallel = true; + parent_parallel = true; + } else if (f->kind == ForKind::kParallel) { + // has_parallel for the current loop is being set to 'false' since the + // intrinsic is added outside of the loop. The instrumentation isn't + // allowed for the subsequent nested loops. + loop_info.has_parallel = false; + parent_parallel = true; + } + siblings.push_back(f); + height = std::max(height, TraverseLoop(f->body, parent_depth + 1, parent_parallel)); + loop_info.height = height; + loops[f] = loop_info; + } + } + if (siblings.size() > 1) { + for (auto* l : siblings) { + loops[l].has_siblings = true; + } + } + height = has_loop ? height + 1 : height; + return height; // Parent's height : max of all children's height + } else if (stmt->IsInstance()) { + const IfThenElseNode* n = stmt.as(); + unsigned height = TraverseLoop(n->then_case, parent_depth, has_parallel); + if (n->else_case.defined()) { + height = std::max(height, TraverseLoop(n->else_case, parent_depth, has_parallel)); + } + return height; + } else if (stmt->IsInstance()) { + const ForNode* f = stmt.as(); + LoopInfo loop_info(start_id, parent_depth + 1); + start_id++; + bool parent_parallel = false; + if (has_parallel) { + loop_info.has_parallel = true; + parent_parallel = true; + } else if (f->kind == ForKind::kParallel) { + // has_parallel for the current loop is being set to 'false' since the + // intrinsic is added outside of the loop. The instrumentation isn't + // allowed for the subsequent nested loops. + loop_info.has_parallel = false; + parent_parallel = true; + } + unsigned height = TraverseLoop(f->body, parent_depth + 1, parent_parallel); + loop_info.height = height; + loops[f] = loop_info; + return height + 1; + } else if (stmt->IsInstance()) { + const LetStmtNode* n = stmt.as(); + return TraverseLoop(n->body, parent_depth, has_parallel); + } else if (stmt->IsInstance()) { + const AttrStmtNode* n = stmt.as(); + return TraverseLoop(n->body, parent_depth, has_parallel); + } else if (stmt->IsInstance()) { + const AllocateNode* n = stmt.as(); + return TraverseLoop(n->body, parent_depth, has_parallel); + } else { + return 0; // inner-most loop + } + } + + private: + LoopInfoMap loops; +}; + +class InstrumentIntrin : public StmtMutator { + public: + InstrumentIntrin(int32_t max_depth, int32_t min_height, bool instr_siblings) + : max_instr_depth_(max_depth), + min_instr_height_(min_height), + instr_siblings_(instr_siblings) {} + + void GetLoopInfo(PrimFuncNode* op) { + LoopAnalyzer analzer; + loops_ = std::move(analzer.Analyze(op->body)); + } + + Stmt VisitStmt_(const SeqStmtNode* op) final { + Stmt stmt = StmtMutator::VisitStmt_(op); + return SeqStmt::Flatten(stmt); + } + + Stmt VisitStmt_(const ForNode* op) final { + Stmt stmt = StmtMutator::VisitStmt_(op); + if (loops_.count(op) < 1) return stmt; + + LoopInfo loop_info = loops_[op]; + + if (loop_info.has_parallel) { + return stmt; + } + + // Exclude inner-most loops from instrumentation. The inner-most loop has + // height '0' and it increases as we move outward in the loop nest. + if (loop_info.height < min_instr_height_) { + return stmt; + } + + // Only instrument loops with a sibling + if (instr_siblings_ && !loop_info.has_siblings) { + return stmt; + } + + // If instr_siblings_ is set, ignore max depth for instrumentation + if (!instr_siblings_ && loop_info.depth > max_instr_depth_) { + return stmt; + } + PrimExpr id = static_cast(loop_info.id); + PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {id}); + PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {id}); + const Stmt start_profile = Evaluate(start_call); + const Stmt end_profile = Evaluate(end_call); + Stmt new_stmt = SeqStmt({start_profile, stmt, end_profile}); + return new_stmt; + } + + private: + LoopInfoMap loops_; + int32_t max_instr_depth_; + int32_t min_instr_height_; + bool instr_siblings_; +}; + +class CheckParallelLoops : public StmtExprVisitor { + public: + bool HasParallelLoops(const Stmt& stmt) { + this->VisitStmt(stmt); + return has_parallel; + } + + private: + void VisitStmt_(const ForNode* op) final { + if (op->kind == ForKind::kParallel) { + has_parallel = true; + } else { + StmtExprVisitor::VisitStmt_(op); + } + } + + bool has_parallel = false; +}; + +PrimFunc AddProfileBuiltins(PrimFunc func, int32_t max_instr_depth, int32_t min_instr_height, + bool instr_siblings, bool disable_func_instrumentation) { + auto* func_ptr = func.CopyOnWrite(); + + PrimExpr e = start_id++; + if (!disable_func_instrumentation) { + PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {e}); + PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {e}); + const Stmt start_profile = Evaluate(start_call); + const Stmt end_profile = Evaluate(end_call); + func_ptr->body = SeqStmt({start_profile, std::move(func_ptr->body), end_profile}); + } + InstrumentIntrin p(max_instr_depth, min_instr_height, instr_siblings); + p.GetLoopInfo(func_ptr); + func_ptr->body = p(std::move(func_ptr->body)); + return std::move(func); +} + +} // namespace lwp + +namespace transform { +Pass InstrumentProfileIntrinsics() { + auto pass_func = [](IRModule m, PassContext ctx) { + auto* mptr = m.CopyOnWrite(); + + // All loops with depth <= max_instr_depth are instrumented. By default, + // only outer-most loops are instrumented which has a depth of 0. + // In addition, loops with siblings are also instrumented provided + // their loop depth is >= min_instr_height. This is done to avoid + // instrumenting inner-most loops. + auto max_instr_depth = ctx->GetConfig("tir.lwp_max_depth", Integer(0)).value(); + auto min_instr_height = ctx->GetConfig("tir.lwp_min_height", Integer(1)).value(); + bool instr_siblings = ctx->GetConfig("tir.instr_siblings", Bool(true)).value(); + bool disable_func_instrumentation = + ctx->GetConfig("tir.lwp_disable_func_prof", Bool(false)).value(); + bool reset_start_id = ctx->GetConfig("tir.reset_start_id", Bool(false)).value(); + if (reset_start_id) lwp::start_id = 0; + std::vector> updates; + for (const auto& kv : mptr->functions) { + if (auto* n = kv.second.as()) { + PrimFunc func = GetRef(n); + auto updated_func = + lwp::AddProfileBuiltins(func, max_instr_depth.IntValue(), min_instr_height.IntValue(), + instr_siblings, disable_func_instrumentation); + updates.push_back({kv.first, updated_func}); + } + } + for (const auto& pair : updates) { + mptr->AddUnchecked(pair.first, pair.second); + } + return m; + }; + + return tvm::transform::CreateModulePass(pass_func, 0, "tir.InstrumentProfileIntrinsics", {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.InstrumentProfileIntrinsics") + .set_body_typed(InstrumentProfileIntrinsics); + +} // namespace transform + +} // namespace tir +} // namespace tvm diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index 162e4a1cc7a1..527c79754796 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -157,6 +157,7 @@ "apps/microtvm/reference-vm/base-box/Vagrantfile.packer-template", # Hexagon "src/runtime/hexagon/rpc/android_bash.sh.template", + "src/runtime/hexagon/profiler/lwp_handler.S", } diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py index 565999c32957..76d5cba60a1f 100644 --- a/tests/python/contrib/test_hexagon/test_launcher.py +++ b/tests/python/contrib/test_hexagon/test_launcher.py @@ -24,6 +24,8 @@ from tvm import relay, te from tvm.contrib.hexagon.session import Session from tvm.relay.backend import Executor, Runtime +from tvm.contrib.hexagon.build import HexagonLauncherRPC +from tvm.contrib.hexagon.hexagon_profiler import HexagonProfiler from .infrastructure import get_hexagon_target @@ -568,5 +570,161 @@ def test_dense_relay_vrmpy(hexagon_session, data_dtype, weight_dtype): np.testing.assert_equal(out, ref) +@tvm.testing.requires_hexagon +def test_lwp( + hexagon_server_process, + hexagon_launcher: HexagonLauncherRPC, + hexagon_session: Session, + hexagon_debug, +): + dtype = "float32" + data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype)) + weight = relay.var("weight", relay.TensorType((5, 5, 3, 8), dtype)) + y = relay.nn.conv2d( + data, + weight, + padding=(2, 2), + kernel_size=(5, 5), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="float32", + ) + + f = relay.Function([data, weight], y) + relay_mod = tvm.IRModule.from_expr(f) + relay_mod = relay.transform.InferType()(relay_mod) + + target_hexagon = tvm.target.hexagon("v68") + runtime = Runtime("cpp") + executor = Executor("graph") + + weight_in = np.random.rand(5, 5, 3, 8).astype(dtype=dtype) + data_in = np.random.rand(1, 64, 64, 3).astype(dtype=dtype) + params = {"weight": weight_in} + inputs = {"data": data_in} + + with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}): + lowered = tvm.relay.build( + relay_mod, + tvm.target.Target(target_hexagon, host=target_hexagon), + runtime=runtime, + executor=executor, + ) + # Create HexagonProfiler object + dso_binary = "test_binary.so" + profiler = HexagonProfiler(dso_binary, lowered, hexagon_server_process, hexagon_debug) + + graph_mod = hexagon_session.get_executor_from_factory(lowered) + graph_mod.set_input(**params) + graph_mod.run(**inputs) + hexagon_output = graph_mod.get_output(0).numpy() + + # Get lightweight profiling output as a CSV file + profiler.get_profile_output(hexagon_launcher, hexagon_session) + + target_llvm = tvm.target.Target("llvm") + with tvm.transform.PassContext(opt_level=3): + llvm_lowered = tvm.relay.build( + relay_mod, + tvm.target.Target(target_llvm, host=target_llvm), + runtime=runtime, + executor=executor, + ) + llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0))) + llvm_graph_mod.set_input(weight=weight_in) + llvm_graph_mod.run(data=data_in) + expected_output = llvm_graph_mod.get_output(0).numpy() + + tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5) + + +@tvm.testing.requires_hexagon +def test_lwp_multiple_conv2d( + hexagon_server_process, + hexagon_launcher: HexagonLauncherRPC, + hexagon_session: Session, + hexagon_debug, +): + dtype = "float32" + input_shape = (1, 8, 8, 3) + w1_shape = (5, 5, 3, 1) + w2_shape = (5, 5, 1, 3) + data = relay.var("data", relay.TensorType(input_shape, dtype)) + weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype)) + weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype)) + y1 = relay.nn.conv2d( + data, + weight1, + padding=(2, 2), + kernel_size=(5, 5), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="float32", + ) + y2 = relay.nn.conv2d( + y1, + weight2, + padding=(2, 2), + kernel_size=(5, 5), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="float32", + ) + f = relay.Function([data, weight1, weight2], y2) + relay_mod = tvm.IRModule.from_expr(f) + relay_mod = relay.transform.InferType()(relay_mod) + + target_hexagon = tvm.target.hexagon("v68") + runtime = Runtime("cpp") + executor = Executor("graph") + + weight1_data = np.random.rand(w1_shape[0], w1_shape[1], w1_shape[2], w1_shape[3]).astype( + dtype=dtype + ) + weight2_data = np.random.rand(w2_shape[0], w2_shape[1], w2_shape[2], w2_shape[3]).astype( + dtype=dtype + ) + input_data = np.random.rand( + input_shape[0], input_shape[1], input_shape[2], input_shape[3] + ).astype(dtype=dtype) + + params = {"weight1": weight1_data, "weight2": weight2_data} + inputs = {"data": input_data} + + with tvm.transform.PassContext(opt_level=3, config={"tir.instrument_lwp": True}): + lowered = tvm.relay.build( + relay_mod, + tvm.target.Target(target_hexagon, host=target_hexagon), + runtime=runtime, + executor=executor, + ) + # Create HexagonProfiler object + dso_binary = "test_binary.so" + profiler = HexagonProfiler(dso_binary, lowered, hexagon_server_process, hexagon_debug) + + graph_mod = hexagon_session.get_executor_from_factory(lowered) + graph_mod.set_input(**params) + graph_mod.run(**inputs) + hexagon_output = graph_mod.get_output(0).numpy() + + # Get lightweight profiling output as a CSV file + profiler.get_profile_output(hexagon_launcher, hexagon_session) + + target_llvm = tvm.target.Target("llvm") + with tvm.transform.PassContext(opt_level=3): + llvm_lowered = tvm.relay.build( + relay_mod, + tvm.target.Target(target_llvm, host=target_llvm), + runtime=runtime, + executor=executor, + ) + llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0))) + llvm_graph_mod.set_input(**params) + llvm_graph_mod.run(**inputs) + expected_output = llvm_graph_mod.get_output(0).numpy() + + tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5) + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_profiling_instr.py b/tests/python/unittest/test_tir_transform_profiling_instr.py new file mode 100644 index 000000000000..d14e2a4c8925 --- /dev/null +++ b/tests/python/unittest/test_tir_transform_profiling_instr.py @@ -0,0 +1,340 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +import tvm.testing +from tvm import te +from tvm.ir.module import IRModule +from tvm.script import tir as T +import numpy + +default_lwp_test_config = { + "tir.instrument_lwp": True, + "tir.lwp_disable_func_prof": True, + "tir.reset_start_id": True, +} + + +@T.prim_func +def input1(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + for i, j in T.grid(8, 8): + for k, l in T.grid(8, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + for k, l in T.grid(8, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2 + + +@T.prim_func +def input2(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + D = T.match_buffer(d, (8, 8, 128), dtype="int32") + for i in T.serial(0, 8): + for j in T.serial(0, 8): + for k, l in T.grid(8, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + for k, l in T.grid(8, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl] + for j in T.serial(0, 8): + for k, l in T.grid(8, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2 + for k, l in T.grid(8, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl] + + +@T.prim_func +def input3(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + D = T.match_buffer(d, (8, 8, 128), dtype="int32") + for i in T.serial(0, 8): + for j in T.parallel(0, 8): + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl] + for j in T.serial(0, 8): + for k in T.parallel(0, 8): + for l in T.serial(0, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2 + for k in T.parallel(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl] + + +@T.prim_func +def test1_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + for i, j in T.grid(8, 8): + T.evaluate(T.start_profile_intrinsic(3, dtype="handle")) + for k, l in T.grid(8, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + T.evaluate(T.end_profile_intrinsic(3, dtype="handle")) + T.evaluate(T.start_profile_intrinsic(5, dtype="handle")) + for k, l in T.grid(8, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2 + T.evaluate(T.end_profile_intrinsic(5, dtype="handle")) + + +@T.prim_func +def test2_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + T.evaluate(T.start_profile_intrinsic(1, dtype="handle")) + for i in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(2, dtype="handle")) + for j in T.serial(0, 8): + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2 + T.evaluate(T.end_profile_intrinsic(2, dtype="handle")) + T.evaluate(T.end_profile_intrinsic(1, dtype="handle")) + + +@T.prim_func +def test3_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + T.evaluate(T.start_profile_intrinsic(1, dtype="handle")) + for i in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(2, dtype="handle")) + for j in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(3, dtype="handle")) + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + T.evaluate(T.end_profile_intrinsic(3, dtype="handle")) + T.evaluate(T.start_profile_intrinsic(5, dtype="handle")) + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2 + T.evaluate(T.end_profile_intrinsic(5, dtype="handle")) + T.evaluate(T.end_profile_intrinsic(2, dtype="handle")) + T.evaluate(T.end_profile_intrinsic(1, dtype="handle")) + + +@T.prim_func +def test4_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + D = T.match_buffer(d, (8, 8, 128), dtype="int32") + for i in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(2, dtype="handle")) + for j in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(3, dtype="handle")) + for k, l in T.grid(8, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + T.evaluate(T.end_profile_intrinsic(3, dtype="handle")) + T.evaluate(T.start_profile_intrinsic(5, dtype="handle")) + for k, l in T.grid(8, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl] + T.evaluate(T.end_profile_intrinsic(5, dtype="handle")) + T.evaluate(T.end_profile_intrinsic(2, dtype="handle")) + T.evaluate(T.start_profile_intrinsic(7, dtype="handle")) + for j in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(8, dtype="handle")) + for k, l in T.grid(8, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2 + T.evaluate(T.end_profile_intrinsic(8, dtype="handle")) + T.evaluate(T.start_profile_intrinsic(10, dtype="handle")) + for k, l in T.grid(8, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl] + T.evaluate(T.end_profile_intrinsic(10, dtype="handle")) + T.evaluate(T.end_profile_intrinsic(7, dtype="handle")) + + +@T.prim_func +def test5_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + T.evaluate(T.start_profile_intrinsic(1, dtype="handle")) + for i in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(2, dtype="handle")) + for j in T.serial(0, 8): + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2 + T.evaluate(T.end_profile_intrinsic(2, dtype="handle")) + T.evaluate(T.end_profile_intrinsic(1, dtype="handle")) + + +@T.prim_func +def test6_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None: + A = T.match_buffer(a, (8, 8, 128), dtype="int32") + B = T.match_buffer(b, (8, 8, 128), dtype="int32") + C = T.match_buffer(c, (8, 8, 128), dtype="int32") + D = T.match_buffer(d, (8, 8, 128), dtype="int32") + for i in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(2, dtype="handle")) + for j in T.parallel(0, 8): + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2 + for k in T.serial(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl] + T.evaluate(T.end_profile_intrinsic(2, dtype="handle")) + T.evaluate(T.start_profile_intrinsic(7, dtype="handle")) + for j in T.serial(0, 8): + T.evaluate(T.start_profile_intrinsic(8, dtype="handle")) + for k in T.parallel(0, 8): + for l in T.serial(0, 16): + with T.block("C"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2 + T.evaluate(T.end_profile_intrinsic(8, dtype="handle")) + T.evaluate(T.start_profile_intrinsic(10, dtype="handle")) + for k in T.parallel(0, 8): + for l in T.serial(0, 16): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl] + T.evaluate(T.end_profile_intrinsic(10, dtype="handle")) + T.evaluate(T.end_profile_intrinsic(7, dtype="handle")) + + +# By default, only loops with siblings are instrumented. +def test1(): + with tvm.transform.PassContext(config=default_lwp_test_config): + mod = tvm.IRModule.from_expr(input1) + mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod) + tvm.ir.assert_structural_equal(mod["main"], test1_expected_output) + + +# By default, only loops with siblings are instrumented. Here, 'lwp_max_depth' +# doesn't have any effect unless 'instr_siblings' is set to False (ex: test3). +def test2(): + test2_config = default_lwp_test_config.copy() + test2_config.update({"tir.lwp_max_depth": 3}) + with tvm.transform.PassContext(config=test2_config): + mod = tvm.IRModule.from_expr(input1) + mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod) + tvm.ir.assert_structural_equal(mod["main"], test1_expected_output) + + +# test3: Use 'lwp_max_depth' to instrument loops upto a certain depth. This flag +# is effective only when 'instr_siblings' is disabled. Also, note that inner-most +# loops are always excluded from instrumentation unless overwritten using +# 'lwp_min_height' (ex: test5) +def test3(): + test3_config = default_lwp_test_config.copy() + test3_config.update({"tir.lwp_max_depth": 3, "tir.instr_siblings": False}) + with tvm.transform.PassContext(config=test3_config): + mod = tvm.IRModule.from_expr(input1) + mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod) + tvm.ir.assert_structural_equal(mod["main"], test3_expected_output) + + +# test4: Use 'lwp_min_height' to exclude inner loops upto a certain height from +# instrumentation. +def test4(): + with tvm.transform.PassContext(config=default_lwp_test_config): + mod = tvm.IRModule.from_expr(input2) + mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod) + tvm.ir.assert_structural_equal(mod["main"], test4_expected_output) + + +# test5: Use both 'lwp_min_height' and 'lwp_max_depth'. +# instrumentation. +def test5(): + test5_config = default_lwp_test_config.copy() + test5_config.update( + {"tir.lwp_max_depth": 3, "tir.instr_siblings": False, "tir.lwp_min_height": 2} + ) + with tvm.transform.PassContext(config=test5_config): + mod = tvm.IRModule.from_expr(input1) + mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod) + tvm.ir.assert_structural_equal(mod["main"], test5_expected_output) + + +# test6: Tests instrumentation for the parallel loops +def test6(): + with tvm.transform.PassContext(config=default_lwp_test_config): + mod = tvm.IRModule.from_expr(input3) + mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod) + tvm.ir.assert_structural_equal(mod["main"], test6_expected_output) + + +if __name__ == "__main__": + tvm.testing.main()