diff --git a/CMakeLists.txt b/CMakeLists.txt index c56a929e276d..9e35b8e4bbad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF) tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON) tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO) tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF) +tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to read performance counters" OFF) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -407,6 +408,7 @@ include(cmake/modules/contrib/ArmComputeLib.cmake) include(cmake/modules/contrib/TensorRT.cmake) include(cmake/modules/contrib/VitisAI.cmake) include(cmake/modules/contrib/Verilator.cmake) +include(cmake/modules/contrib/PAPI.cmake) include(cmake/modules/Git.cmake) include(cmake/modules/LibInfo.cmake) include(cmake/modules/RustExt.cmake) diff --git a/cmake/modules/contrib/PAPI.cmake b/cmake/modules/contrib/PAPI.cmake new file mode 100644 index 000000000000..257591451ca8 --- /dev/null +++ b/cmake/modules/contrib/PAPI.cmake @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_PAPI) + find_package(PkgConfig REQUIRED) + + set(ENV{PKG_CONFIG_PATH} "${USE_PAPI}:$ENV{PKG_CONFIG_PATH}") + pkg_check_modules(PAPI REQUIRED IMPORTED_TARGET papi>=6.0) + list(APPEND TVM_RUNTIME_LINKER_LIBS PkgConfig::PAPI) + list(APPEND RUNTIME_SRCS src/runtime/contrib/papi/papi.cc) +endif() diff --git a/docs/index.rst b/docs/index.rst index a7ae68c87b01..491c42712e9a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -78,6 +78,7 @@ For Developers :caption: MISC vta/index + profiling/index Index diff --git a/docs/profiling/index.rst b/docs/profiling/index.rst new file mode 100644 index 000000000000..9443fef25ea6 --- /dev/null +++ b/docs/profiling/index.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +Profiling Deep Learning Models +==================================== + +.. toctree:: + :maxdepth: 1 + + papi diff --git a/docs/profiling/papi.rst b/docs/profiling/papi.rst new file mode 100644 index 000000000000..b7c23b2c0c73 --- /dev/null +++ b/docs/profiling/papi.rst @@ -0,0 +1,114 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +Getting Started With PAPI +========================= + +The Performance Application Programming Interface (PAPI) is a library that +provides performance counters on a variety of platforms. Performance counters +provide accurate low-level information about processor behavior during a given +execution run. This information can contain simple metrics like total cycle +count, cache misses, and instructions executed as well as higher-level +information like total FLOPS and warp occupancy. PAPI makes these metrics +available while profiling. + +Installing PAPI +--------------- + +PAPI can either be installed using your package manager (``apt-get install libpapi-dev`` +on Ubuntu), or built from source here: +https://bitbucket.org/icl/papi/src/master/. + + +Building TVM With PAPI +---------------------- + +To include PAPI in your build of TVM, set the following line in your ``config.cmake``: + +.. code:: + + set(USE_PAPI ON) + +If PAPI is installed in a non-standard place, you can specify where it is like so: + +.. code:: + + set(USE_PAPI path/to/papi.pc) + + +Using PAPI While Profiling +-------------------------- + +If TVM has been built with PAPI (see above), then you can pass a +:py:class:`tvm.runtime.profiling.PAPIMetricCollector` to +:py:meth:`tvm.runtime.GraphModule.profile` to collect performance metrics. Here +is an example using the VM profiler: + +.. code:: python + + target = "llvm" + dev = tvm.cpu() + mod, params = mlp.get_workload(1) + + exe = relay.vm.compile(mod, target, params=params) + vm = profiler_vm.VirtualMachineProfiler(exe, dev) + + data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev) + report = vm.profile( + [data], + func_name="main", + collectors=[tvm.runtime.profiling.PAPIMetricCollector()], + ) + print(report) + +.. code:: + + Name perf::CACHE-MISSES perf::CYCLES perf::STALLED-CYCLES-BACKEND perf::INSTRUCTIONS perf::STALLED-CYCLES-FRONTEND + fused_nn_dense_nn_bias_add_nn_relu 2,494 1,570,698 85,608 675,564 39,583 + fused_nn_dense_nn_bias_add_nn_relu_1 1,149 655,101 13,278 202,297 21,380 + fused_nn_dense_nn_bias_add 288 600,184 8,321 163,446 19,513 + fused_nn_batch_flatten 301 587,049 4,636 158,636 18,565 + fused_nn_softmax 154 575,143 8,018 160,738 18,995 + ---------- + Sum 4,386 3,988,175 119,861 1,360,681 118,036 + Total 10,644 8,327,360 179,310 2,660,569 270,044 + +You can also change which metrics are collected: + +.. 
code:: python + + report = vm.profile( + [data], + func_name="main", + collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"]})], + ) + +.. code:: + + Name PAPI_FP_OPS + fused_nn_dense_nn_bias_add_nn_relu 200,832 + fused_nn_dense_nn_bias_add_nn_relu_1 16,448 + fused_nn_dense_nn_bias_add 1,548 + fused_nn_softmax 160 + fused_nn_batch_flatten 0 + ---------- + Sum 218,988 + Total 218,988 + +You can find a list of available metrics by running the ``papi_avail`` and +``papi_native_avail`` commands. diff --git a/include/tvm/runtime/contrib/papi.h b/include/tvm/runtime/contrib/papi.h new file mode 100644 index 000000000000..ff2d75c483eb --- /dev/null +++ b/include/tvm/runtime/contrib/papi.h @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * \brief Performance counters for profiling via the PAPI library. + */ +#ifndef TVM_RUNTIME_CONTRIB_PAPI_H_ +#define TVM_RUNTIME_CONTRIB_PAPI_H_ + +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace profiling { + +/*! \brief Construct a metric collector that collects data from hardware + * performance counters using the Performance Application Programming Interface + * (PAPI). + * + * \param metrics A mapping from a device type to the metrics that should be + * collected on that device. You can find the names of available metrics by + * running `papi_native_avail`. + */ +TVM_DLL MetricCollector CreatePAPIMetricCollector(Map> metrics); } // namespace profiling } // namespace runtime } // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_PAPI_H_ diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 5b44e020f4e4..b1e8d98c1441 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -37,6 +37,7 @@ #include namespace tvm { + namespace runtime { /*! \brief Base class for all implementations. @@ -150,6 +151,26 @@ class Timer : public ObjectRef { Timer DefaultTimer(Device dev); namespace profiling { +/*! \brief Wrapper for `Device` because `Device` is not passable across the + * PackedFunc interface. + */ +struct DeviceWrapperNode : public Object { + /*! The device */ + Device device; + + /*! Constructor */ + explicit DeviceWrapperNode(Device device) : device(device) {} + + static constexpr const char* _type_key = "runtime.profiling.DeviceWrapper"; + TVM_DECLARE_BASE_OBJECT_INFO(DeviceWrapperNode, Object); +}; + +/*! \brief Wrapper for `Device`. */ +class DeviceWrapper : public ObjectRef { + public: + explicit DeviceWrapper(Device dev) { data_ = make_object(dev); } + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(DeviceWrapper, ObjectRef, DeviceWrapperNode); +}; /*! \brief Data collected from a profiling run. Includes per-call metrics and per-device metrics. 
*/ @@ -200,6 +221,57 @@ class Report : public ObjectRef { TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Report, ObjectRef, ReportNode); }; +/*! \brief Interface for user defined profiling metric collection. + * + * Users can register their own collector by registering a packed function with + * the name "runtime.profiling.metrics.my_collector_name" where + * "my_collector_name" is the name of their collector. This function should + * take an Array of Device as input which contains the devices the collector + * will be run on. + * + * `MetricCollectorNode`s will be called in the following fashion. + * \code + * MetricCollector mc; + * for (auto op : model) { + * auto o = mc.Start(); + * op(); + * auto metrics = mc.Stop(o); // metrics are added to the profiling report + * } + * \endcode + */ +class MetricCollectorNode : public Object { + public: + /*! \brief Initialization call. Called before profiling has started. Any + * expensive precomputation should happen here. + * \param devs The list of devices this collector will be run on. + */ + virtual void Init(Array devs) = 0; + /*! \brief Start collecting metrics for a function call. + * \param dev The device the call will be run on. + * \returns An object used to maintain state of the metric collection. This + * object will be passed to the corresponding `Stop` call. If the device is + * not supported, this function will return a nullptr ObjectRef. + */ + virtual ObjectRef Start(Device dev) = 0; + /*! \brief Stop collecting metrics. + * \param obj The object created by the corresponding `Start` call. + * \returns A set of metric names and the associated values. Values must be + * one of DurationNode, PercentNode, CountNode, or StringObj. + */ + virtual Map Stop(ObjectRef obj) = 0; + + virtual ~MetricCollectorNode() {} + + static constexpr const char* _type_key = "runtime.profiling.MetricCollector"; + TVM_DECLARE_BASE_OBJECT_INFO(MetricCollectorNode, Object); +}; + +/*! \brief Wrapper for `MetricCollectorNode`. */ +class MetricCollector : public ObjectRef { + public: + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(MetricCollector, ObjectRef, MetricCollectorNode); +}; + /*! Information about a single function or operator call. */ struct CallFrame { /*! Device on which the call was made */ @@ -210,6 +282,10 @@ struct CallFrame { Timer timer; /*! Extra performance metrics */ std::unordered_map extra_metrics; + /*! User defined metric collectors. Each pair is the MetricCollector and its + * associated data (returned from MetricCollector.Start). + */ + std::vector> extra_collectors; }; /*! Runtime profiler for function and/or operator calls. Used in the graph @@ -217,9 +293,10 @@ struct CallFrame { * * Example usage: * \code{.cpp} - * Profiler prof; * Device cpu, gpu; - * prof.Start({cpu, gpu}); + * Profiler prof({cpu, gpu}); + * my_gpu_kernel(); // do a warmup iteration + * prof.Start(); * prof.StartCall("my_gpu_kernel", gpu); * my_gpu_kernel(); * prof.StopCall(); @@ -232,13 +309,24 @@ */ class Profiler { public: - /*! \brief Start the profiler. + /*! Constructor. + * + * The profiler should be constructed before you do any warmup iterations. + * + * \note + * Calling this constructor will reset the TVM threadpool. It is necessary in + * order to install thread handlers required by certain collectors. + * * \param devs The list of devices the profiler will be running on. Should * include all devices used by profiled operators. + * \param metric_collectors Additional `MetricCollector`s to use with this profiler. 
+ */ + explicit Profiler(std::vector devs, std::vector metric_collectors); + /*! \brief Start the profiler. * * This function should only be called once per object. */ - void Start(const std::vector& devs); + void Start(); /*! \brief Stop the profiler. * * This function should only be called once per object after start has been called. @@ -270,12 +358,14 @@ class Profiler { /*! \brief Check if the profiler is currently running. * \returns Whether or not the profiler is running. */ - bool IsRunning() const { return !global_timers_.empty(); } + bool IsRunning() const { return is_running_; } private: - std::vector> global_timers_; + std::vector devs_; + bool is_running_{false}; std::vector calls_; std::stack in_flight_; + std::vector collectors_; }; /* \brief A duration in time. */ diff --git a/include/tvm/runtime/threading_backend.h b/include/tvm/runtime/threading_backend.h index 95a64049fd45..43636ddbdb1f 100644 --- a/include/tvm/runtime/threading_backend.h +++ b/include/tvm/runtime/threading_backend.h @@ -94,6 +94,14 @@ void Yield(); */ int MaxConcurrency(); +/*! + * \brief Reset the threads in the pool. All current threads are destroyed and + * new ones are created. + * + * Note that this does nothing when openmp is used. + */ +void ResetThreadPool(); + } // namespace threading } // namespace runtime } // namespace tvm diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py index dc043353c475..622f27c358b6 100644 --- a/python/tvm/contrib/debugger/debug_executor.py +++ b/python/tvm/contrib/debugger/debug_executor.py @@ -268,23 +268,28 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0): ret = self._run_individual(number, repeat, min_repeat_ms) return ret.strip(",").split(",") if ret else [] - def profile(self, **input_dict): + def profile(self, collectors=None, **input_dict): """Run forward execution of the graph and collect overall and per-op performance metrics. Parameters ---------- + collectors : Optional[Sequence[MetricCollector]] + Extra metrics to collect. + input_dict : dict of str to NDArray List of input values to be feed to + Return ------ timing_results : str Per-operator and whole graph timing results in a table format. """ + collectors = [] if collectors is None else collectors if input_dict: self.set_input(**input_dict) - return self._profile() + return self._profile(collectors) def exit(self): """Exits the dump folder and all its contents""" diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py index e1c3dc66a360..b3043d8b8760 100644 --- a/python/tvm/runtime/profiler_vm.py +++ b/python/tvm/runtime/profiler_vm.py @@ -50,7 +50,7 @@ def get_stat(self, sort_by_time=True): # pylint: disable=unused-argument warnings.warn("get_stat has been removed, use profile instead") return "" - def profile(self, *args, func_name="main", **kwargs): + def profile(self, *args, func_name="main", collectors=None, **kwargs): """Profile a function call. Parameters @@ -58,6 +58,9 @@ def profile(self, *args, func_name="main", **kwargs): func_name : str The name of the function. + collectors : Optional[Sequence[MetricCollector]] + Extra metrics to collect. + args : list[tvm.runtime.NDArray] or list[np.ndarray] The arguments to the function. @@ -69,6 +72,7 @@ def profile(self, *args, func_name="main", **kwargs): timing_results : str Overall and per-op timing results formatted in a table. 
""" + collectors = [] if collectors is None else collectors if args or kwargs: self.set_input(func_name, *args, **kwargs) - return self._profile(func_name) + return self._profile(func_name, collectors) diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py new file mode 100644 index 000000000000..8857cf419602 --- /dev/null +++ b/python/tvm/runtime/profiling/__init__.py @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Registration of profiling objects in python.""" + +from typing import Dict, Sequence, Optional +from ... import _ffi +from . import _ffi_api +from .. import Object, Device + + +@_ffi.register_object("runtime.profiling.Report") +class Report(Object): + """A container for information gathered during a profiling run. + + Attributes + ---------- + calls : Array[Dict[str, Object]] + Per-call profiling metrics (function name, runtime, device, ...). + + device_metrics : Dict[Device, Dict[str, Object]] + Per-device metrics collected over the entire run. + """ + + def csv(self): + """Convert this profiling report into CSV format. + + This only includes calls and not overall metrics. + + Returns + ------- + csv : str + `calls` in CSV format. + """ + return _ffi_api.AsCSV(self) + + +@_ffi.register_object("runtime.profiling.MetricCollector") +class MetricCollector(Object): + """Interface for user defined profiling metric collection.""" + + +@_ffi.register_object("runtime.profiling.DeviceWrapper") +class DeviceWrapper(Object): + """Wraps a tvm.runtime.Device""" + + def __init__(self, dev: Device): + self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev) + + +# We only enable this class when TVM is build with PAPI support +if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is not None: + + @_ffi.register_object("runtime.profiling.PAPIMetricCollector") + class PAPIMetricCollector(MetricCollector): + """Collects performance counter information using the Performance + Application Programming Interface (PAPI). + """ + + def __init__(self, metric_names: Optional[Dict[Device, Sequence[str]]] = None): + """ + Parameters + ---------- + metric_names : Optional[Dict[Device, Sequence[str]]] + List of per-device metrics to collect. You can find a list of valid + metrics by runing `papi_native_avail` from the command line. 
+ """ + metric_names = {} if metric_names is None else metric_names + wrapped = dict() + for dev, names in metric_names.items(): + wrapped[DeviceWrapper(dev)] = names + self.__init_handle_by_constructor__(_ffi_api.PAPIMetricCollector, wrapped) diff --git a/python/tvm/runtime/profiling.py b/python/tvm/runtime/profiling/_ffi_api.py similarity index 52% rename from python/tvm/runtime/profiling.py rename to python/tvm/runtime/profiling/_ffi_api.py index 5a1cd6796b64..d26b847a699f 100644 --- a/python/tvm/runtime/profiling.py +++ b/python/tvm/runtime/profiling/_ffi_api.py @@ -14,35 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Registration of profiling objects in python.""" - -from .. import _ffi -from . import Object +"""FFI for profiling""" +from ... import _ffi _ffi._init_api("runtime.profiling", __name__) - - -@_ffi.register_object("runtime.profiling.Report") -class Report(Object): - """A container for information gathered during a profiling run. - - Attributes - ---------- - calls : Array[Dict[str, Object]] - Per-call profiling metrics (function name, runtime, device, ...). - - device_metrics : Dict[Device, Dict[str, Object]] - Per-device metrics collected over the entire run. - """ - - def csv(self): - """Convert this profiling report into CSV format. - - This only includes calls and not overall metrics. - - Returns - ------- - csv : str - `calls` in CSV format. - """ - return AsCSV(self) diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc new file mode 100644 index 000000000000..b9ba8f9984e9 --- /dev/null +++ b/src/runtime/contrib/papi/papi.cc @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include + +#include +#include + +namespace tvm { +namespace runtime { +namespace profiling { + +#define PAPI_CALL(func) \ + { \ + int e = (func); \ + if (e != PAPI_OK) { \ + LOG(FATAL) << "PAPIError: in function " #func " " << e << " " \ + << std::string(PAPI_strerror(e)); \ + } \ + } + +static const std::unordered_map> default_metric_names = { + {kDLCPU, + {"perf::CYCLES", "perf::STALLED-CYCLES-FRONTEND", "perf::STALLED-CYCLES-BACKEND", + "perf::INSTRUCTIONS", "perf::CACHE-MISSES"}}, + {kDLCUDA, {"cuda:::event:elapsed_cycles_sm:device=0"}}}; + +/*! \brief Object that holds the values of counters at the start of a function call. */ +struct PAPIEventSetNode : public Object { + /*! \brief The starting values of counters for all metrics of a specific device. */ + std::vector start_values; + /*! \brief The device these counters are for. 
*/ + Device dev; + + explicit PAPIEventSetNode(std::vector start_values, Device dev) + : start_values(start_values), dev(dev) {} + + static constexpr const char* _type_key = "PAPIEventSetNode"; + TVM_DECLARE_FINAL_OBJECT_INFO(PAPIEventSetNode, Object); +}; + +/* Get the PAPI component id for the given device. + * \param dev The device to get the component for. + * \returns PAPI component id for the device. Returns -1 if the device is not + * supported by PAPI. + */ +int component_for_device(Device dev) { + std::string component_name; + switch (dev.device_type) { + case kDLCPU: + component_name = "perf_event"; + break; + case kDLCUDA: + component_name = "cuda"; + break; + case kDLROCM: + component_name = "rocm"; + break; + default: + LOG(WARNING) << "PAPI does not support device " << DeviceName(dev.device_type); + return -1; + } + int cidx = PAPI_get_component_index(component_name.c_str()); + if (cidx < 0) { + LOG(FATAL) << "Cannot find PAPI component \"" << component_name + << "\". Maybe you need to build PAPI with support for this component (use " + "`./configure --with-components=" + << component_name << "`)."; + } + return cidx; +} + +/*! \brief MetricCollectorNode for PAPI metrics. + * + * PAPI (Performance Application Programming Interface) collects metrics on a + * variety of platforms including CPU, CUDA, and ROCm. + * + * PAPI is available at https://bitbucket.org/icl/papi/src/master/. + */ +struct PAPIMetricCollectorNode final : public MetricCollectorNode { + /*! \brief Construct a metric collector that collects a specific set of metrics. + * + * \param metrics A mapping from a device type to the metrics that should be + * collected on that device. You can find the names of available metrics by + * running `papi_native_avail`. + */ + explicit PAPIMetricCollectorNode(Map> metrics) { + for (auto& p : metrics) { + papi_metric_names[p.first->device] = {}; + for (auto& metric : p.second) { + papi_metric_names[p.first->device].push_back(metric); + } + } + } + explicit PAPIMetricCollectorNode() {} + + /*! \brief Initialization call. + * \param devices The devices this collector will be running on + */ + void Init(Array devices) { + if (!PAPI_is_initialized()) { + if (sizeof(long_long) > sizeof(int64_t)) { + LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when " + "reporting metrics."; + } + CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT) + << "Error while initializing PAPI"; + } + + // If no metrics were provided, we use the default set. The names were not + // initialized in the constructor because we did not know which devices we + // were running on. 
+ if (papi_metric_names.size() == 0) { + for (auto wrapped_device : devices) { + Device device = wrapped_device->device; + auto it = default_metric_names.find(device.device_type); + if (it != default_metric_names.end()) { + papi_metric_names[device] = it->second; + } + } + } + + // create event sets for each device + for (auto wrapped_device : devices) { + Device device = wrapped_device->device; + int cidx = component_for_device(device); + // unknown device, skipping + if (cidx < 0) { + continue; + } + + auto it = papi_metric_names.find(device); + // skip devices with no metrics defined + if (it == papi_metric_names.end() || it->second.size() == 0) { + continue; + } + auto& metric_names = it->second; + + const PAPI_component_info_t* component = PAPI_get_component_info(cidx); + if (component->disabled) { + std::string help_message = ""; + switch (device.device_type) { + case kDLCPU: + help_message = + "Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`"; + break; + case kDLCUDA: + help_message = + "Try enabling gpu profiling with `modprobe nvidia " + "NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding " + "`options nvidia \"NVreg_RestrictProfilingToAdminUsers=0\"` to " + "`/etc/modprobe.d/nvidia-kernel-common.conf`."; + break; + default: + break; + } + LOG(WARNING) << "PAPI could not initialize counters for " << DeviceName(device.device_type) + << ": " << component->disabled_reason << "\n" + << help_message; + continue; + } + + int event_set = PAPI_NULL; + PAPI_CALL(PAPI_create_eventset(&event_set)); + PAPI_CALL(PAPI_assign_eventset_component(event_set, cidx)); + if (device.device_type == kDLCPU) { + // we set PAPI_INHERIT to make it so threads created after this inherit the event_set. + PAPI_option_t opt; + memset(&opt, 0x0, sizeof(PAPI_option_t)); + opt.inherit.inherit = PAPI_INHERIT_ALL; + opt.inherit.eventset = event_set; + PAPI_CALL(PAPI_set_opt(PAPI_INHERIT, &opt)); + } + + if (static_cast(metric_names.size()) > PAPI_num_cmp_hwctrs(cidx)) { + PAPI_CALL(PAPI_set_multiplex(event_set)); + } + + // add all the metrics + for (auto metric : metric_names) { + int e = PAPI_add_named_event(event_set, metric.c_str()); + if (e != PAPI_OK) { + LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric + << "."; + } + } + // Because we may have multiple calls in flight at the same time, we + // start all the timers when we initialize. Then we calculate the metrics + // counts for a call by comparing counter values at the start vs end of + // the call. + PAPI_CALL(PAPI_start(event_set)); + event_sets[device] = event_set; + } + } + /*! \brief Called right before a function call. Reads starting values of the + * measured metrics. + * + * \param dev The device the function will be run on. + * \returns A `PAPIEventSetNode` containing values for the counters at the + * start of the call. Passed to a corresponding `Stop` call. + */ + ObjectRef Start(Device dev) final { + // Record counter values at the start of the call, so we can calculate the + // metrics for the call by comparing the values at the end of the call. + auto it = event_sets.find(dev); + if (it != event_sets.end()) { + int event_set = it->second; + std::vector values(papi_metric_names[dev].size()); + PAPI_CALL(PAPI_read(event_set, values.data())); + return ObjectRef(make_object(values, dev)); + } else { + return ObjectRef(nullptr); + } + } + /*! \brief Called right after a function call. Reads ending values of the + * measured metrics. 
Computes the change in each metric from the + * corresponding `Start` call. + * + * \param obj `PAPIEventSetNode` created by a call to `Start`. + * \returns A mapping from metric name to value. + */ + Map Stop(ObjectRef obj) final { + const PAPIEventSetNode* event_set_node = obj.as(); + std::vector end_values(papi_metric_names[event_set_node->dev].size()); + PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data())); + std::unordered_map reported_metrics; + for (size_t i = 0; i < end_values.size(); i++) { + if (end_values[i] < event_set_node->start_values[i]) { + LOG(WARNING) << "Detected overflow when reading performance counter, setting value to -1."; + reported_metrics[papi_metric_names[event_set_node->dev][i]] = + ObjectRef(make_object(-1)); + } else { + reported_metrics[papi_metric_names[event_set_node->dev][i]] = + ObjectRef(make_object(end_values[i] - event_set_node->start_values[i])); + } + } + return reported_metrics; + } + + ~PAPIMetricCollectorNode() final { + for (auto p : event_sets) { + PAPI_CALL(PAPI_stop(p.second, NULL)); + PAPI_CALL(PAPI_cleanup_eventset(p.second)); + PAPI_CALL(PAPI_destroy_eventset(&p.second)); + } + } + + /*! \brief Device-specific event sets. Contains the running counters (the int values) for that + * device. */ + std::unordered_map event_sets; + /*! \brief Device-specific metric names. Order of names matches the order in the corresponding + * `event_set`. */ + std::unordered_map> papi_metric_names; + + static constexpr const char* _type_key = "runtime.profiling.PAPIMetricCollector"; + TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode); +}; + +/*! \brief Wrapper for `PAPIMetricCollectorNode`. */ +class PAPIMetricCollector : public MetricCollector { + public: + explicit PAPIMetricCollector(Map> metrics) { + data_ = make_object(metrics); + } + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PAPIMetricCollector, MetricCollector, + PAPIMetricCollectorNode); +}; + +MetricCollector CreatePAPIMetricCollector(Map> metrics) { + return PAPIMetricCollector(metrics); +} + +TVM_REGISTER_OBJECT_TYPE(PAPIEventSetNode); +TVM_REGISTER_OBJECT_TYPE(PAPIMetricCollectorNode); + +TVM_REGISTER_GLOBAL("runtime.profiling.PAPIMetricCollector") + .set_body_typed([](Map> metrics) { + return PAPIMetricCollector(metrics); + }); + +} // namespace profiling +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 1ea01b19e8aa..2fa73971d000 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -276,16 +276,20 @@ class GraphExecutorDebug : public GraphExecutor { * the module compared to GraphRuntimeDebug::RunIndividual as it runs the * entire graph in order. * + * \param collectors Optional user defined `MetricCollector`s to use with this profiling run. + * * \returns A table of per-op runtimes and total times. */ - profiling::Report Profile() { + profiling::Report Profile(Array collectors) { + std::vector cs(collectors.begin(), collectors.end()); + profiling::Profiler prof(devices_, cs); + // warm up. 1 iteration does not seem enough. 
for (int i = 0; i < 3; i++) { GraphExecutor::Run(); } - profiling::Profiler prof; - prof.Start(devices_); + prof.Start(); for (size_t i = 0; i < op_execs_.size(); ++i) { if (op_execs_[i]) { // get argument shapes @@ -359,7 +363,10 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, *rv = this->RunIndividual(number, repeat, min_repeat_ms); }); } else if (name == "profile") { - return TypedPackedFunc([sptr_to_self, this]() { return this->Profile(); }); + return TypedPackedFunc)>( + [sptr_to_self, this](Array collectors) { + return this->Profile(collectors); + }); } else { return GraphExecutor::GetFunction(name, sptr_to_self); } diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index ab9d674fad50..c149298f11a5 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -23,8 +23,10 @@ */ #include +#include #include #include +#include #include #include @@ -100,16 +102,37 @@ TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start); namespace profiling { -void Profiler::Start(const std::vector& devs) { - CHECK(global_timers_.empty()) << "You can only call Start once per Profiler."; +Profiler::Profiler(std::vector devs, std::vector metric_collectors) + : devs_(devs), collectors_(metric_collectors) { + is_running_ = false; + std::vector wrapped_devs; for (auto dev : devs) { - global_timers_.emplace_back(dev, Timer::Start(dev)); + wrapped_devs.push_back(DeviceWrapper(make_object(dev))); + } + for (auto& x : collectors_) { + x->Init(wrapped_devs); + } + // reset the thread pool so that PAPI eventset hooks are set in all threads. + threading::ResetThreadPool(); +} + +void Profiler::Start() { + is_running_ = true; + for (auto dev : devs_) { + StartCall("Total", dev, {}); } } void Profiler::StartCall(String name, Device dev, std::unordered_map extra_metrics) { - in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics}); + std::vector> objs; + for (auto& collector : collectors_) { + ObjectRef obj = collector->Start(dev); + if (obj.defined()) { + objs.emplace_back(collector, obj); + } + } + in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics, objs}); } void Profiler::StopCall(std::unordered_map extra_metrics) { @@ -118,14 +141,21 @@ void Profiler::StopCall(std::unordered_map extra_metrics for (auto& p : extra_metrics) { cf.extra_metrics[p.first] = p.second; } + // collect the extra metrics from user defined collectors + for (const auto& obj : cf.extra_collectors) { + auto collector_metrics = obj.first->Stop(obj.second); + for (auto& p : collector_metrics) { + cf.extra_metrics[p.first] = p.second; + } + } in_flight_.pop(); calls_.push_back(cf); } void Profiler::Stop() { - // Stop all global timers. We wait to synchronize until we are making the report. 
- for (auto p : global_timers_) { - p.second->Stop(); + is_running_ = false; + for (size_t i = 0; i < devs_.size(); i++) { + StopCall(); } } @@ -396,31 +426,11 @@ std::string DeviceString(Device dev) { } Report Profiler::Report(bool aggregate, bool sort) { - std::vector> global_times; - for (auto p : global_timers_) { - global_times.emplace_back(p.first, p.second->SyncAndGetElapsedNanos() / 1e3); - } - - double overall_time = 0; - for (auto p : global_times) { - overall_time = std::max(overall_time, p.second); - } - - std::unordered_map> device_metrics; - for (auto p : global_times) { - std::unordered_map row; - row["Name"] = String("Total"); - row["Duration (us)"] = ObjectRef(make_object(p.second)); - row["Percent"] = ObjectRef(make_object(p.second / overall_time * 100)); - row["Device"] = String(DeviceString(p.first)); - device_metrics[DeviceString(p.first)] = row; - } - - std::vector> rows; + // sync all timers and normalize rows + std::vector> rows; for (auto& cf : calls_) { std::unordered_map row; double us = cf.timer->SyncAndGetElapsedNanos() / 1e3; - row["Percent"] = ObjectRef(make_object(us / overall_time * 100)); row["Duration (us)"] = ObjectRef(make_object(us)); row["Count"] = ObjectRef(make_object(1)); row["Name"] = cf.name; @@ -431,7 +441,30 @@ Report Profiler::Report(bool aggregate, bool sort) { rows.push_back(row); } - return profiling::Report(rows, device_metrics); + // the last couple of call frames are the overall times + double overall_time_us = 0; + std::unordered_map> device_metrics; + for (size_t i = 0; i < devs_.size(); i++) { + auto row = rows[rows.size() - 1]; + rows.pop_back(); + device_metrics[Downcast(row["Device"])] = row; + overall_time_us = + std::max(overall_time_us, row["Duration (us)"].as()->microseconds); + } + + // Calculate percentages + for (auto& row : rows) { + row["Percent"] = ObjectRef(make_object( + row["Duration (us)"].as()->microseconds / overall_time_us * 100)); + } + + // convert to map + std::vector> converted_rows; + for (const auto& row : rows) { + converted_rows.push_back(row); + } + + return profiling::Report(converted_rows, device_metrics); } Report::Report(Array> calls, @@ -446,8 +479,13 @@ TVM_REGISTER_OBJECT_TYPE(DurationNode); TVM_REGISTER_OBJECT_TYPE(PercentNode); TVM_REGISTER_OBJECT_TYPE(CountNode); TVM_REGISTER_OBJECT_TYPE(ReportNode); +TVM_REGISTER_OBJECT_TYPE(DeviceWrapperNode); +TVM_REGISTER_OBJECT_TYPE(MetricCollectorNode); TVM_REGISTER_GLOBAL("runtime.profiling.AsCSV").set_body_typed([](Report n) { return n->AsCSV(); }); +TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device dev) { + return DeviceWrapper(dev); +}); } // namespace profiling } // namespace runtime } // namespace tvm diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index cab04ec0db4a..c11e9f7ac084 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -258,26 +258,29 @@ class SpscTaskQueue { class ThreadPool { public: ThreadPool() : num_workers_(tvm::runtime::threading::MaxConcurrency()) { - for (int i = 0; i < num_workers_; ++i) { - // The SpscTaskQueue only hosts ONE item at a time - queues_.emplace_back(std::unique_ptr(new SpscTaskQueue())); - } const char* exclude_worker0 = getenv("TVM_EXCLUDE_WORKER0"); if (exclude_worker0 && atoi(exclude_worker0) == 0) { exclude_worker0_ = false; } - threads_ = std::unique_ptr( - new tvm::runtime::threading::ThreadGroup( - num_workers_, [this](int worker_id) { this->RunWorker(worker_id); }, - exclude_worker0_ /* include_main_thread */)); - num_workers_used_ 
= threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_); + Init(); } + ~ThreadPool() { for (std::unique_ptr& q : queues_) { q->SignalForKill(); } threads_.reset(); } + + void Reset() { + for (std::unique_ptr& q : queues_) { + q->SignalForKill(); + } + queues_.clear(); + threads_.reset(); + Init(); + } + int Launch(FTVMParallelLambda flambda, void* cdata, int num_task, int need_sync) { ParallelLauncher* launcher = ParallelLauncher::ThreadLocal(); ICHECK(!launcher->is_worker) @@ -323,6 +326,19 @@ class ThreadPool { } private: + // Shared initialization code + void Init() { + for (int i = 0; i < num_workers_; ++i) { + // The SpscTaskQueue only hosts ONE item at a time + queues_.emplace_back(std::unique_ptr(new SpscTaskQueue())); + } + threads_ = std::unique_ptr( + new tvm::runtime::threading::ThreadGroup( + num_workers_, [this](int worker_id) { this->RunWorker(worker_id); }, + exclude_worker0_ /* include_main_thread */)); + num_workers_used_ = threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_); + } + // Internal worker function. void RunWorker(int worker_id) { SpscTaskQueue* queue = queues_[worker_id].get(); @@ -359,6 +375,10 @@ TVM_REGISTER_GLOBAL("runtime.config_threadpool").set_body([](TVMArgs args, TVMRe ThreadPool::ThreadLocal()->UpdateWorkerConfiguration(mode, nthreads); }); +namespace threading { +void ResetThreadPool() { tvm::runtime::ThreadPool::ThreadLocal()->Reset(); } +} // namespace threading + } // namespace runtime } // namespace tvm diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index e8b948d3d2ae..ca9b22a9099d 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -273,6 +273,13 @@ void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) { primitive_names[packed_index] = it.first; } strm->Write(primitive_names); + // TODO(tkonolige): cannot serialize ObjectRefs with dmlc's serializer. + // std::vector>> primitive_attrs; + // for (const auto& it : this->op_attrs) { + // auto packed_index = static_cast(it.first); + // primitive_attrs.push_back({packed_index, it.second}); + // } + // strm->Write(primitive_attrs); } // Serialize a virtual machine instruction. It creates a list that contains the @@ -569,6 +576,12 @@ void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) { for (size_t i = 0; i < primitive_names.size(); i++) { this->primitive_map.insert({primitive_names[i], i}); } + // TODO(tkonolige): cannot serialize ObjectRefs with dmlc's serializer. 
+ // std::vector>> primitive_attrs; + // STREAM_CHECK(strm->Read(&primitive_attrs), "primitive attrs"); + // for (auto p : primitive_attrs) { + // this->op_attrs.insert(p); + // } } // Extract the `cnt` number of fields started at `start` from the list diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index a7d65944d581..6d893114d623 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -43,26 +43,31 @@ namespace vm { PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "profile") { - return TypedPackedFunc([sptr_to_self, this](String arg_name) { - std::vector devices; - for (auto dev : devices_) { - if (dev.device_type > 0) { - devices.push_back(dev); - } - } - - auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self); - // warmup - for (int i = 0; i < 3; i++) { - invoke(arg_name); - } - - prof_ = profiling::Profiler(); // reset profiler - prof_.Start(devices); - invoke(arg_name); - prof_.Stop(); - return prof_.Report(); - }); + return TypedPackedFunc)>( + [sptr_to_self, this](String arg_name, Array collectors) { + std::vector devices; + for (auto dev : devices_) { + if (dev.device_type > 0) { + devices.push_back(dev); + } + } + + std::vector cs(collectors.begin(), collectors.end()); + prof_ = profiling::Profiler(devices, cs); + + auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self); + // warmup + for (int i = 0; i < 3; i++) { + invoke(arg_name); + } + + prof_.operator*().Start(); + invoke(arg_name); + prof_.operator*().Stop(); + auto report = prof_.operator*().Report(); + prof_ = dmlc::optional(); // releases hardware counters + return report; + }); } else { return VirtualMachine::GetFunction(name, sptr_to_self); } @@ -80,7 +85,7 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun Index output_size, const std::vector& args) { ICHECK(exec_); ICHECK(!devices_.empty()) << "Device has not been initialized yet."; - if (prof_.IsRunning()) { + if (prof_ && prof_.operator*().IsRunning()) { // The device of any input of the operator is used for synchronization. 
ICHECK_GT(arg_count, 0U); ObjectRef arg = args[0]; @@ -122,11 +127,11 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun } metrics["Argument Shapes"] = profiling::ShapeString(shapes); - prof_.StartCall(packed_index_map_[packed_index], dev, metrics); + prof_.operator*().StartCall(packed_index_map_[packed_index], dev, metrics); } VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args); - if (prof_.IsRunning()) { - prof_.StopCall(); + if (prof_ && prof_.operator*().IsRunning()) { + prof_.operator*().StopCall(); } } diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h index 521a9bd454e7..1efefda52b97 100644 --- a/src/runtime/vm/profiler/vm.h +++ b/src/runtime/vm/profiler/vm.h @@ -25,6 +25,7 @@ #ifndef TVM_RUNTIME_VM_PROFILER_VM_H_ #define TVM_RUNTIME_VM_PROFILER_VM_H_ +#include #include #include @@ -39,7 +40,7 @@ namespace vm { class VirtualMachineDebug : public VirtualMachine { public: - VirtualMachineDebug() : VirtualMachine() {} + VirtualMachineDebug() : VirtualMachine(), prof_({}) {} PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final; @@ -52,7 +53,7 @@ class VirtualMachineDebug : public VirtualMachine { const std::vector& args) final; std::unordered_map packed_index_map_; - profiling::Profiler prof_; + dmlc::optional prof_; }; } // namespace vm diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py index ee8032550b39..7420e32b5416 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -18,6 +18,7 @@ import pytest from io import StringIO import csv +import os import tvm.testing from tvm.runtime import profiler_vm @@ -26,6 +27,24 @@ from tvm.contrib.debugger import debug_executor +def read_csv(report): + f = StringIO(report.csv()) + headers = [] + rows = [] + reader = csv.reader(f, delimiter=",") + # force parsing + in_header = True + for row in reader: + if in_header: + headers = row + in_header = False + rows = [[] for x in headers] + else: + for i in range(len(row)): + rows[i].append(row[i]) + return dict(zip(headers, rows)) + + @pytest.mark.skipif(not profiler_vm.enabled(), reason="VM Profiler not enabled") @tvm.testing.parametrize_targets def test_vm(target, dev): @@ -39,14 +58,9 @@ def test_vm(target, dev): assert "fused_nn_softmax" in str(report) assert "Total" in str(report) - f = StringIO(report.csv()) - reader = csv.reader(f, delimiter=",") - # force parsing - in_header = True - for row in reader: - if in_header: - assert "Hash" in row - in_header = False + csv = read_csv(report) + assert "Hash" in csv.keys() + assert all([float(x) > 0 for x in csv["Duration (us)"]]) @tvm.testing.parametrize_targets @@ -61,3 +75,39 @@ def test_graph_executor(target, dev): assert "fused_nn_softmax" in str(report) assert "Total" in str(report) assert "Hash" in str(report) + + +@tvm.testing.parametrize_targets("cuda", "llvm") +@pytest.mark.skipif( + tvm.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is None, + reason="PAPI profiling not enabled", +) +def test_papi(target, dev): + target = tvm.target.Target(target) + if str(target.kind) == "llvm": + metric = "PAPI_FP_OPS" + elif str(target.kind) == "cuda": + metric = "cuda:::event:shared_load:device=0" + else: + pytest.skip(f"Target {target.kind} not supported by this test") + mod, params = mlp.get_workload(1) + + exe = relay.vm.compile(mod, target, params=params) + vm = 
profiler_vm.VirtualMachineProfiler(exe, dev) + + data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev) + report = vm.profile( + [data], + func_name="main", + collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})], + ) + print(report) + assert metric in str(report) + + csv = read_csv(report) + assert metric in csv.keys() + assert any([float(x) > 0 for x in csv[metric]]) + + +if __name__ == "__main__": + test_papi("llvm", tvm.cpu())
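
The documentation above only demonstrates the VM profiler. For completeness, the following is a minimal, illustrative sketch (not part of the patch itself) of the same ``collectors`` argument on the debug graph executor path that this change also touches. It assumes TVM was built with ``USE_PAPI`` and with the debug graph executor enabled; the ``mlp`` workload, the ``"data"`` input name, and the ``PAPI_FP_OPS`` metric are borrowed from the unit test above, and everything else uses ordinary TVM APIs.

.. code:: python

    # Illustrative sketch only (not part of this patch). Assumes a PAPI-enabled
    # build of TVM with the debug graph executor; the mlp workload and the
    # "data" input name mirror the unit test above.
    import numpy as np
    import tvm
    import tvm.runtime.profiling
    from tvm import relay
    from tvm.contrib.debugger import debug_executor
    from tvm.relay.testing import mlp

    target, dev = "llvm", tvm.cpu()
    mod, params = mlp.get_workload(1)

    # Build the model and create the debug ("profiling") graph executor.
    lib = relay.build(mod, target=target, params=params)
    gr = debug_executor.create(lib.get_graph_json(), lib.get_lib(), dev)
    gr.set_input(**lib.get_params())

    # Pass a MetricCollector through the new `collectors` keyword argument.
    data = np.random.rand(1, 1, 28, 28).astype("float32")
    report = gr.profile(
        data=data,
        collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"]})],
    )
    print(report)

The collector is configured the same way here as in the VM profiler example in ``docs/profiling/papi.rst`` (a mapping from ``Device`` to metric names), so the two entry points can share collector configuration.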