From 5d6b72402316c0e56926f60cb09d0ada2619bcc2 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 5 May 2021 14:01:06 -0700
Subject: [PATCH 01/23] [PROFILING] Use PAPI to collect hardware performance
 counters on CPU and CUDA

This PR adds an optional dependency on PAPI
(https://bitbucket.org/icl/papi/) in order to collect hardware
performance counters on CPU and CUDA. These performance counters include
data like total cycles, instructions executed, and cache misses. Users
can control which performance counters are collected by setting the
TVM_PAPI_${DEVICE}_METRICS environment variable to a semicolon separated
list of metrics.
---
 CMakeLists.txt                                |   2 +
 cmake/modules/contrib/PAPI.cmake              |  25 ++
 include/tvm/runtime/c_backend_api.h           |  10 +
 include/tvm/runtime/profiling.h               |  87 +++++-
 src/runtime/contrib/papi/papi.cc              | 275 ++++++++++++++++++
 .../debug/graph_executor_debug.cc             |   5 +-
 src/runtime/profiling.cc                      |  94 ++++--
 src/runtime/thread_pool.cc                    |  21 ++
 src/runtime/vm/executable.cc                  |  13 +
 src/runtime/vm/profiler/vm.cc                 |  19 +-
 src/runtime/vm/profiler/vm.h                  |   5 +-
 .../python/unittest/test_runtime_profiling.py |  53 +++-
 12 files changed, 553 insertions(+), 56 deletions(-)
 create mode 100644 cmake/modules/contrib/PAPI.cmake
 create mode 100644 src/runtime/contrib/papi/papi.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c56a929e276d..ae590aa2045d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,6 +50,7 @@ tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF)
 tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON)
 tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO)
 tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF)
+tvm_option(USE_PAPI "Use PAPI (The Performance Application Programming Interface) to read performance counters" OFF)
 
 # 3rdparty libraries
 tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include")
@@ -407,6 +408,7 @@ include(cmake/modules/contrib/ArmComputeLib.cmake)
 include(cmake/modules/contrib/TensorRT.cmake)
 include(cmake/modules/contrib/VitisAI.cmake)
 include(cmake/modules/contrib/Verilator.cmake)
+include(cmake/modules/contrib/PAPI.cmake)
 include(cmake/modules/Git.cmake)
 include(cmake/modules/LibInfo.cmake)
 include(cmake/modules/RustExt.cmake)
diff --git a/cmake/modules/contrib/PAPI.cmake b/cmake/modules/contrib/PAPI.cmake
new file mode 100644
index 000000000000..257591451ca8
--- /dev/null
+++ b/cmake/modules/contrib/PAPI.cmake
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(USE_PAPI)
+  find_package(PkgConfig REQUIRED)
+
+  set(ENV{PKG_CONFIG_PATH} "${USE_PAPI}:$ENV{PKG_CONFIG_PATH}")
+  pkg_check_modules(PAPI REQUIRED IMPORTED_TARGET papi>=6.0)
+  list(APPEND TVM_RUNTIME_LINKER_LIBS PkgConfig::PAPI)
+  list(APPEND RUNTIME_SRCS src/runtime/contrib/papi/papi.cc)
+endif()
diff --git a/include/tvm/runtime/c_backend_api.h b/include/tvm/runtime/c_backend_api.h
index bb6ff1de8f29..14be14d954fc 100644
--- a/include/tvm/runtime/c_backend_api.h
+++ b/include/tvm/runtime/c_backend_api.h
@@ -166,6 +166,16 @@ TVM_DLL int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv);
  */
 TVM_DLL int TVMBackendRunOnce(void** handle, int (*f)(void*), void* cdata, int nbytes);
 
+/*!
+ * \brief Reset the threads in the pool. All current threads are destroyed and
+ * new ones are created.
+ *
+ * Note that this does nothing when openmp is used.
+ *
+ * \return 0 when no error is thrown, -1 when failure happens
+ */
+TVM_DLL int TVMBackendResetPool();
+
 #ifdef __cplusplus
 }  // TVM_EXTERN_C
 #endif
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 5b44e020f4e4..aff1f4df24f8 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -200,6 +200,52 @@ class Report : public ObjectRef {
   TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Report, ObjectRef, ReportNode);
 };
 
+/*! \brief Interface for user defined profiling metric collection.
+ *
+ * Users can register their own collector by registering a packed function with
+ * the name "runtime.profiling.metrics.my_collector_name" where
+ * "my_collector_name" is the name of their collector. This function should
+ * take an Array of DeviceWrapper as input which contains the devices the collector
+ * will be run on.
+ *
+ * `MetricCollectorNode`s will be called in the following fashion.
+ * \code
+ * MetricCollector mc;
+ * for (auto op : model) {
+ *   auto o = mc.Start();
+ *   op();
+ *   auto metrics = mc.Stop(o); // metrics are added to the profiling report
+ * }
+ * \endcode
+ */
+class MetricCollectorNode : public Object {
+ public:
+  /*! \brief Start collecting metrics for a function call.
+   * \param dev The device the call will be run on.
+   * \returns An object used to maintain state of the metric collection. This
+   * object will be passed to the corresponding `Stop` call. If the device is
+   * not supported, this function will return a nullptr ObjectRef.
+   */
+  virtual ObjectRef Start(Device dev) = 0;
+  /*! \brief Stop collecting metrics.
+   * \param obj The object created by the corresponding `Start` call.
+   * \returns A set of metric names and the associated values. Values must be
+   * one of DurationNode, PercentNode, CountNode, or StringObj.
+   */
+  virtual Map<String, ObjectRef> Stop(ObjectRef obj) = 0;
+
+  virtual ~MetricCollectorNode() {}
+
+  static constexpr const char* _type_key = "runtime.profiling.MetricCollectorNode";
+  TVM_DECLARE_BASE_OBJECT_INFO(MetricCollectorNode, Object);
+};
+
+/*! \brief Wrapper for `MetricCollectorNode`. */
+class MetricCollector : public ObjectRef {
+ public:
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(MetricCollector, ObjectRef, MetricCollectorNode);
+};
+
 /*! Information about a single function or operator call. */
 struct CallFrame {
   /*! Device on which the call was made */
@@ -210,6 +256,8 @@ struct CallFrame {
   Timer timer;
   /*! Extra performance metrics */
   std::unordered_map<std::string, ObjectRef> extra_metrics;
+  /*! User defined metric collectors */
+  std::vector<std::pair<MetricCollector, ObjectRef>> extra_collectors;
 };
 
 /*! Runtime profiler for function and/or operator calls. Used in the graph
@@ -217,9 +265,10 @@ struct CallFrame {
  *
  * Example usage:
  * \code{.cpp}
- * Profiler prof;
  * Device cpu, gpu;
- * prof.Start({cpu, gpu});
+ * Profiler prof({cpu, gpu});
+ * my_gpu_kernel(); // do a warmup iteration
+ * prof.Start();
  * prof.StartCall("my_gpu_kernel", gpu);
  * my_gpu_kernel();
  * prof.StopCall();
@@ -232,13 +281,19 @@ struct CallFrame {
  */
 class Profiler {
  public:
-  /*! \brief Start the profiler.
+  /*! Constructor.
+   *
+   * The profiler should be constructed before you do any warmup iterations.
+   *
    * \param devs The list of devices the profiler will be running on. Should
    *             include all devices used by profiled operators.
+   */
+  explicit Profiler(std::vector<Device> devs);
+  /*! \brief Start the profiler.
    *
    * This function should only be called once per object.
    */
-  void Start(const std::vector<Device>& devs);
+  void Start();
   /*! \brief Stop the profiler.
    *
    * This function should only be called once per object after start has been called.
@@ -270,12 +325,14 @@ class Profiler {
   /*! \brief Check if the profiler is currently running.
    * \returns Whether or not the profiler is running.
    */
-  bool IsRunning() const { return !global_timers_.empty(); }
+  bool IsRunning() const { return is_running_; }
 
  private:
-  std::vector<std::pair<Device, Timer>> global_timers_;
+  std::vector<Device> devs_;
+  bool is_running_{false};
   std::vector<CallFrame> calls_;
   std::stack<CallFrame> in_flight_;
+  std::vector<MetricCollector> collectors_;
 };
 
 /* \brief A duration in time. */
@@ -329,6 +386,24 @@ class CountNode : public Object {
  */
 String ShapeString(const std::vector<NDArray>& shapes);
 
+/*! \brief Wrapper for `Device`. */
+struct DeviceWrapperNode : public Object {
+  /*! The device */
+  Device device;
+
+  /*! Constructor */
+  explicit DeviceWrapperNode(Device device) : device(device) {}
+
+  static constexpr const char* _type_key = "DeviceWrapperNode";
+  TVM_DECLARE_BASE_OBJECT_INFO(DeviceWrapperNode, Object);
+};
+
+/*! \brief Wrapper for `Device`. */
+class DeviceWrapper : public ObjectRef {
+ public:
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(DeviceWrapper, ObjectRef, DeviceWrapperNode);
+};
+
 }  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc
new file mode 100644
index 000000000000..dd5b3850f20e
--- /dev/null
+++ b/src/runtime/contrib/papi/papi.cc
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \brief Performance counters for profiling via the PAPI library.
+ */
+#ifndef TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
+#define TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
+
+#include <papi.h>
+#include <tvm/runtime/profiling.h>
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+namespace profiling {
+
+#define PAPI_CALL(func)                                                         \
+  {                                                                             \
+    int e = (func);                                                             \
+    if (e < 0) {                                                                \
+      LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)); \
+    }                                                                           \
+  }
+
+static const std::unordered_map<DLDeviceType, std::vector<std::string>> default_metrics = {
+    {kDLCPU,
+     {"perf::CYCLES", "perf::STALLED-CYCLES-FRONTEND", "perf::STALLED-CYCLES-BACKEND",
+      "perf::INSTRUCTIONS", "perf::CACHE-MISSES"}},
+    {kDLGPU, {"cuda:::event:elapsed_cycles_sm:device=0"}}};
+
+/*! \brief Object that holds the values of counters at the start of a function call. */
+struct PAPIEventSetNode : public Object {
+  /*! \brief The starting values of counters for all metrics of a specific device. */
+  std::vector<long_long> start_values;
+  /*! \brief The device these counters are for. */
+  Device dev;
+
+  explicit PAPIEventSetNode(std::vector<long_long> start_values, Device dev)
+      : start_values(start_values), dev(dev) {}
+
+  static constexpr const char* _type_key = "PAPIEventSetNode";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PAPIEventSetNode, Object);
+};
+
+int component_for_device(Device dev) {
+  std::string component_name;
+  switch (dev.device_type) {
+    case kDLCPU:
+    case kDLCPUPinned:
+      component_name = "perf_event";
+      break;
+    case kDLGPU:
+      component_name = "cuda";
+      break;
+    case kDLROCM:
+      component_name = "rocm";
+      break;
+    default:
+      LOG(WARNING) << "PAPI does not support device " << DeviceName(dev.device_type);
+      return -1;
+  }
+  int cidx = PAPI_get_component_index(component_name.c_str());
+  if (cidx < 0) {
+    LOG(FATAL) << "Cannot find PAPI component \"" << component_name
+               << "\". Maybe you need to build PAPI with support for this component (use "
+                  "`./configure --components="
+               << component_name << "`).";
+  }
+  return cidx;
+}
+
+/*! \brief MetricCollectorNode for PAPI metrics.
+ *
+ * PAPI (Performance Application Programming Interface) collects metrics on a
+ * variety of platforms including cpu, cuda and rocm.
+ *
+ * PAPI is available at https://bitbucket.org/icl/papi/src/master/.
+ *
+ * Users can change the metrics collected by setting the environment
+ * variable `TVM_PAPI_${device_name}_METRICS` to a semicolon-separated list
+ * of metrics. Use the `papi_native_avail` tool to find the name of all
+ * available metrics.
+ */
+struct PAPIMetricCollectorNode final : public MetricCollectorNode {
+  explicit PAPIMetricCollectorNode(Array<DeviceWrapper> devices) {
+    if (!PAPI_is_initialized()) {
+      PAPI_CALL(PAPI_library_init(PAPI_VER_CURRENT));
+    }
+
+    // create event sets for each device
+    for (auto wrapped_device : devices) {
+      Device device = wrapped_device->device;
+      int cidx = component_for_device(device);
+      // unknown device, skipping
+      if (cidx < 0) {
+        continue;
+      }
+
+      const PAPI_component_info_t* component;
+      component = PAPI_get_component_info(cidx);
+      if (component->disabled) {
+        std::string help_message = "";
+        switch (device.device_type) {
+          case kDLCPU:
+          case kDLCPUPinned:
+            help_message =
+                "Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`";
+            break;
+          case kDLGPU:
+            help_message =
+                "Try enabling gpu profiling with `modprobe nvidia "
+                "NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding  "
+                "`options nvidia \"NVreg_RestrictProfilingToAdminUsers=0\"` to "
+                "`/etc/modprobe.d/nvidia-kernel-common.conf`.";
+            break;
+          default:
+            break;
+        }
+        LOG(WARNING) << "PAPI could not initialize counters for " << DeviceName(device.device_type)
+                     << ": " << component->disabled_reason << "\n"
+                     << help_message;
+        continue;
+      }
+
+      int event_set = PAPI_NULL;
+      PAPI_CALL(PAPI_create_eventset(&event_set));
+      PAPI_CALL(PAPI_assign_eventset_component(event_set, cidx));
+      if (device.device_type == kDLCPU) {
+        // we set PAPI_INHERIT to make it so threads created after this inherit the event_set.
+        PAPI_option_t opt;
+        memset(&opt, 0x0, sizeof(PAPI_option_t));
+        opt.inherit.inherit = PAPI_INHERIT_ALL;
+        opt.inherit.eventset = event_set;
+        PAPI_CALL(PAPI_set_opt(PAPI_INHERIT, &opt));
+      }
+
+      // load default metrics for device or read them from an environment variable
+      std::vector<std::string> metrics;
+      std::string dev_name = DeviceName(device.device_type);
+      std::transform(dev_name.begin(), dev_name.end(), dev_name.begin(),
+                     [](unsigned char c) { return std::toupper(c); });
+      const char* env_p =
+          std::getenv((std::string("TVM_PAPI_") + dev_name + std::string("_METRICS")).c_str());
+      if (env_p != nullptr) {
+        std::string metric_string = env_p;
+        size_t loc = 0;
+        while (loc < metric_string.size()) {
+          size_t next = metric_string.find(';', loc);
+          if (next == metric_string.npos) {
+            next = metric_string.size();
+          }
+          metrics.push_back(metric_string.substr(loc, next - loc));
+          loc = next + 1;
+        }
+      } else {
+        auto it = default_metrics.find(device.device_type);
+        if (it != default_metrics.end()) {
+          metrics = it->second;
+        } else {
+          LOG(WARNING) << "No default metrics set for " << dev_name
+                       << ". You can specify metrics with the environment variable TVM_PAPI_"
+                       << dev_name << "_METRICS.";
+        }
+      }
+      // skip if no metrics exist
+      if (metrics.size() == 0) {
+        continue;
+      }
+      papi_metrics[device] = metrics;
+
+      if (static_cast<int>(metrics.size()) > PAPI_num_cmp_hwctrs(cidx)) {
+        PAPI_CALL(PAPI_set_multiplex(event_set));
+      }
+
+      // add all the metrics
+      for (auto metric : metrics) {
+        int e = PAPI_add_named_event(event_set, metric.c_str());
+        if (e != PAPI_OK) {
+          LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric
+                     << ".";
+        }
+      }
+      // Because we may have multiple calls in flight at the same time, we
+      // start all the counters when we initialize. Then we calculate the metric
+      // counts for a call by comparing counter values at the start vs end of
+      // the call.
+      PAPI_CALL(PAPI_start(event_set));
+      event_sets[device] = event_set;
+    }
+  }
+
+  /*! \brief Called right before a function call.
+   * \param dev The device the function will be run on.
+   * \returns A `PAPIEventSetNode` containing values for the counters at the
+   * start of the call. Passed to a corresponding `Stop` call.
+   */
+  ObjectRef Start(Device dev) final {
+    // Record counter values at the start of the call, so we can calculate the
+    // metrics for the call by comparing the values at the end of the call.
+    auto it = event_sets.find(dev);
+    if (it != event_sets.end()) {
+      int event_set = it->second;
+      std::vector<long_long> values(papi_metrics[dev].size());
+      PAPI_CALL(PAPI_read(event_set, values.data()));
+      return ObjectRef(make_object<PAPIEventSetNode>(values, dev));
+    } else {
+      return ObjectRef(nullptr);
+    }
+  }
+
+  /*! \brief Called right after a function call.
+   * \param obj `PAPIEventSetNode` created by a call to `Start`.
+   * \returns A mapping from metric name to value.
+   */
+  Map<String, ObjectRef> Stop(ObjectRef obj) final {
+    const PAPIEventSetNode* event_set_node = obj.as<PAPIEventSetNode>();
+    std::vector<long_long> end_values(papi_metrics[event_set_node->dev].size());
+    PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data()));
+    std::unordered_map<String, ObjectRef> reported_metrics;
+    for (size_t i = 0; i < end_values.size(); i++) {
+      reported_metrics[papi_metrics[event_set_node->dev][i]] =
+          ObjectRef(make_object<CountNode>(end_values[i] - event_set_node->start_values[i]));
+    }
+    return reported_metrics;
+  }
+
+  ~PAPIMetricCollectorNode() final {
+    for (auto p : event_sets) {
+      PAPI_CALL(PAPI_stop(p.second, NULL));
+      PAPI_CALL(PAPI_cleanup_eventset(p.second));
+      PAPI_CALL(PAPI_destroy_eventset(&p.second));
+    }
+  }
+
+  /*! \brief Device-specific event sets. Contains the running counters for that device. */
+  std::unordered_map<Device, int> event_sets;
+  /*! \brief Device-specific metric names. Order of names matches the order in the corresponding
+   * `event_set`. */
+  std::unordered_map<Device, std::vector<std::string>> papi_metrics;
+
+  static constexpr const char* _type_key = "PAPIMetricCollectorNode";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode);
+};
+
+TVM_REGISTER_GLOBAL("runtime.profiling.metrics.papi")
+    .set_body_typed([](Array<DeviceWrapper> devices) {
+      return MetricCollector(make_object<PAPIMetricCollectorNode>(devices));
+    });
+
+TVM_REGISTER_OBJECT_TYPE(PAPIEventSetNode);
+TVM_REGISTER_OBJECT_TYPE(PAPIMetricCollectorNode);
+
+}  // namespace profiling
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index 1ea01b19e8aa..b29df7c97f76 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -279,13 +279,14 @@ class GraphExecutorDebug : public GraphExecutor {
    * \returns A table of per-op runtimes and total times.
    */
   profiling::Report Profile() {
+    profiling::Profiler prof(devices_);
+
     // warm up. 1 iteration does not seem enough.
     for (int i = 0; i < 3; i++) {
       GraphExecutor::Run();
     }
 
-    profiling::Profiler prof;
-    prof.Start(devices_);
+    prof.Start();
     for (size_t i = 0; i < op_execs_.size(); ++i) {
       if (op_execs_[i]) {
         // get argument shapes
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index ab9d674fad50..a3e69883debc 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -23,6 +23,7 @@
  */
 
 #include <tvm/ir/expr.h>
+#include <tvm/runtime/c_backend_api.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/profiling.h>
 
@@ -100,16 +101,38 @@ TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start);
 
 namespace profiling {
 
-void Profiler::Start(const std::vector<Device>& devs) {
-  CHECK(global_timers_.empty()) << "You can only call Start once per Profiler.";
+Profiler::Profiler(std::vector<Device> devs) : devs_(devs) {
+  is_running_ = false;
+  std::vector<DeviceWrapper> wrapped_devs;
   for (auto dev : devs) {
-    global_timers_.emplace_back(dev, Timer::Start(dev));
+    wrapped_devs.push_back(DeviceWrapper(make_object<DeviceWrapperNode>(dev)));
+  }
+  for (const auto& name : Registry::ListNames()) {
+    if (name.find("runtime.profiling.metrics.") == 0) {
+      collectors_.push_back(Registry::Get(name)->operator()(Array<DeviceWrapper>(wrapped_devs)));
+    }
+  }
+  // reset the thread pool so that PAPI eventset hooks are set in all threads.
+  TVMBackendResetPool();
+}
+
+void Profiler::Start() {
+  is_running_ = true;
+  for (auto dev : devs_) {
+    StartCall("Total", dev, {});
   }
 }
 
 void Profiler::StartCall(String name, Device dev,
                          std::unordered_map<std::string, ObjectRef> extra_metrics) {
-  in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics});
+  std::vector<std::pair<MetricCollector, ObjectRef>> objs;
+  for (auto& collector : collectors_) {
+    ObjectRef obj = collector->Start(dev);
+    if (obj.defined()) {
+      objs.emplace_back(collector, obj);
+    }
+  }
+  in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics, objs});
 }
 
 void Profiler::StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics) {
@@ -118,14 +141,21 @@ void Profiler::StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics
   for (auto& p : extra_metrics) {
     cf.extra_metrics[p.first] = p.second;
   }
+  // collect the extra metrics from user defined collectors
+  for (const auto& obj : cf.extra_collectors) {
+    auto collector_metrics = obj.first->Stop(obj.second);
+    for (auto& p : collector_metrics) {
+      cf.extra_metrics[p.first] = p.second;
+    }
+  }
   in_flight_.pop();
   calls_.push_back(cf);
 }
 
 void Profiler::Stop() {
-  // Stop all global timers. We wait to synchronize until we are making the report.
-  for (auto p : global_timers_) {
-    p.second->Stop();
+  is_running_ = false;
+  for (size_t i = 0; i < devs_.size(); i++) {
+    StopCall();
   }
 }
 
@@ -396,31 +426,11 @@ std::string DeviceString(Device dev) {
 }
 
 Report Profiler::Report(bool aggregate, bool sort) {
-  std::vector<std::pair<Device, double>> global_times;
-  for (auto p : global_timers_) {
-    global_times.emplace_back(p.first, p.second->SyncAndGetElapsedNanos() / 1e3);
-  }
-
-  double overall_time = 0;
-  for (auto p : global_times) {
-    overall_time = std::max(overall_time, p.second);
-  }
-
-  std::unordered_map<String, Map<String, ObjectRef>> device_metrics;
-  for (auto p : global_times) {
-    std::unordered_map<String, ObjectRef> row;
-    row["Name"] = String("Total");
-    row["Duration (us)"] = ObjectRef(make_object<DurationNode>(p.second));
-    row["Percent"] = ObjectRef(make_object<PercentNode>(p.second / overall_time * 100));
-    row["Device"] = String(DeviceString(p.first));
-    device_metrics[DeviceString(p.first)] = row;
-  }
-
-  std::vector<Map<String, ObjectRef>> rows;
+  // sync all timers and normalize rows
+  std::vector<std::unordered_map<String, ObjectRef>> rows;
   for (auto& cf : calls_) {
     std::unordered_map<String, ObjectRef> row;
     double us = cf.timer->SyncAndGetElapsedNanos() / 1e3;
-    row["Percent"] = ObjectRef(make_object<PercentNode>(us / overall_time * 100));
     row["Duration (us)"] = ObjectRef(make_object<DurationNode>(us));
     row["Count"] = ObjectRef(make_object<CountNode>(1));
     row["Name"] = cf.name;
@@ -431,7 +441,29 @@ Report Profiler::Report(bool aggregate, bool sort) {
     rows.push_back(row);
   }
 
-  return profiling::Report(rows, device_metrics);
+  // the last couple of call frames are the overall times
+  double overall_time = 0;
+  std::unordered_map<String, Map<String, ObjectRef>> device_metrics;
+  for (size_t i = 0; i < devs_.size(); i++) {
+    auto row = rows[rows.size() - 1];
+    rows.pop_back();
+    device_metrics[Downcast<String>(row["Device"])] = row;
+    overall_time = std::max(overall_time, row["Duration (us)"].as<DurationNode>()->microseconds);
+  }
+
+  // Calculate percentages
+  for (auto& row : rows) {
+    row["Percent"] = ObjectRef(make_object<PercentNode>(
+        row["Duration (us)"].as<DurationNode>()->microseconds / overall_time * 100));
+  }
+
+  // convert to map
+  std::vector<Map<String, ObjectRef>> converted_rows;
+  for (const auto& row : rows) {
+    converted_rows.push_back(row);
+  }
+
+  return profiling::Report(converted_rows, device_metrics);
 }
 
 Report::Report(Array<Map<String, ObjectRef>> calls,
@@ -446,6 +478,8 @@ TVM_REGISTER_OBJECT_TYPE(DurationNode);
 TVM_REGISTER_OBJECT_TYPE(PercentNode);
 TVM_REGISTER_OBJECT_TYPE(CountNode);
 TVM_REGISTER_OBJECT_TYPE(ReportNode);
+TVM_REGISTER_OBJECT_TYPE(DeviceWrapperNode);
+TVM_REGISTER_OBJECT_TYPE(MetricCollectorNode);
 
 TVM_REGISTER_GLOBAL("runtime.profiling.AsCSV").set_body_typed([](Report n) { return n->AsCSV(); });
 }  // namespace profiling
diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc
index cab04ec0db4a..5d8735e7aae8 100644
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -278,6 +278,22 @@ class ThreadPool {
     }
     threads_.reset();
   }
+  void Reset() {
+    for (std::unique_ptr<SpscTaskQueue>& q : queues_) {
+      q->SignalForKill();
+    }
+    queues_.clear();
+    threads_.reset();
+    for (int i = 0; i < num_workers_; ++i) {
+      // The SpscTaskQueue only hosts ONE item at a time
+      queues_.emplace_back(std::unique_ptr<SpscTaskQueue>(new SpscTaskQueue()));
+    }
+    threads_ = std::unique_ptr<tvm::runtime::threading::ThreadGroup>(
+        new tvm::runtime::threading::ThreadGroup(
+            num_workers_, [this](int worker_id) { this->RunWorker(worker_id); },
+            exclude_worker0_ /* include_main_thread */));
+    num_workers_used_ = threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_);
+  }
   int Launch(FTVMParallelLambda flambda, void* cdata, int num_task, int need_sync) {
     ParallelLauncher* launcher = ParallelLauncher::ThreadLocal();
     ICHECK(!launcher->is_worker)
@@ -408,3 +424,8 @@ int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) {
 #endif
   return 0;
 }
+
+int TVMBackendResetPool() {
+  tvm::runtime::ThreadPool::ThreadLocal()->Reset();
+  return 0;
+}
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index e8b948d3d2ae..2791f0499267 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -273,6 +273,13 @@ void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) {
     primitive_names[packed_index] = it.first;
   }
   strm->Write(primitive_names);
+  // TODO(tkonolige): cannot serialize ObjectRefs
+  // std::vector<std::pair<size_t, Map<String, ObjectRef>>> primitive_attrs;
+  // for (const auto& it : this->op_attrs) {
+  //   auto packed_index = static_cast<size_t>(it.first);
+  //   primitive_attrs.push_back({packed_index, it.second});
+  // }
+  // strm->Write(primitive_attrs);
 }
 
 // Serialize a virtual machine instruction. It creates a list that contains the
@@ -569,6 +576,12 @@ void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) {
   for (size_t i = 0; i < primitive_names.size(); i++) {
     this->primitive_map.insert({primitive_names[i], i});
   }
+  // TODO(tkonolige): cannot serialize ObjectRefs
+  // std::vector<std::pair<size_t, Map<String, ObjectRef>>> primitive_attrs;
+  // STREAM_CHECK(strm->Read(&primitive_attrs), "primitive attrs");
+  // for (auto p : primitive_attrs) {
+  //   this->op_attrs.insert(p);
+  // }
 }
 
 // Extract the `cnt` number of fields started at `start` from the list
diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc
index a7d65944d581..b04889441a52 100644
--- a/src/runtime/vm/profiler/vm.cc
+++ b/src/runtime/vm/profiler/vm.cc
@@ -51,17 +51,20 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
         }
       }
 
+      prof_ = profiling::Profiler(devices);
+
       auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self);
       // warmup
       for (int i = 0; i < 3; i++) {
         invoke(arg_name);
       }
 
-      prof_ = profiling::Profiler();  // reset profiler
-      prof_.Start(devices);
+      prof_.operator*().Start();
       invoke(arg_name);
-      prof_.Stop();
-      return prof_.Report();
+      prof_.operator*().Stop();
+      auto report = prof_.operator*().Report();
+      prof_ = dmlc::optional<profiling::Profiler>();  // releases hardware counters
+      return report;
     });
   } else {
     return VirtualMachine::GetFunction(name, sptr_to_self);
@@ -80,7 +83,7 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun
                                        Index output_size, const std::vector<ObjectRef>& args) {
   ICHECK(exec_);
   ICHECK(!devices_.empty()) << "Device has not been initialized yet.";
-  if (prof_.IsRunning()) {
+  if (prof_ && prof_.operator*().IsRunning()) {
     // The device of any input of the operator is used for synchronization.
     ICHECK_GT(arg_count, 0U);
     ObjectRef arg = args[0];
@@ -122,11 +125,11 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun
     }
     metrics["Argument Shapes"] = profiling::ShapeString(shapes);
 
-    prof_.StartCall(packed_index_map_[packed_index], dev, metrics);
+    prof_.operator*().StartCall(packed_index_map_[packed_index], dev, metrics);
   }
   VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args);
-  if (prof_.IsRunning()) {
-    prof_.StopCall();
+  if (prof_ && prof_.operator*().IsRunning()) {
+    prof_.operator*().StopCall();
   }
 }
 
diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h
index 521a9bd454e7..1efefda52b97 100644
--- a/src/runtime/vm/profiler/vm.h
+++ b/src/runtime/vm/profiler/vm.h
@@ -25,6 +25,7 @@
 #ifndef TVM_RUNTIME_VM_PROFILER_VM_H_
 #define TVM_RUNTIME_VM_PROFILER_VM_H_
 
+#include <dmlc/optional.h>
 #include <tvm/runtime/profiling.h>
 #include <tvm/runtime/vm/vm.h>
 
@@ -39,7 +40,7 @@ namespace vm {
 
 class VirtualMachineDebug : public VirtualMachine {
  public:
-  VirtualMachineDebug() : VirtualMachine() {}
+  VirtualMachineDebug() : VirtualMachine(), prof_({}) {}
 
   PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) final;
 
@@ -52,7 +53,7 @@ class VirtualMachineDebug : public VirtualMachine {
                     const std::vector<ObjectRef>& args) final;
 
   std::unordered_map<Index, std::string> packed_index_map_;
-  profiling::Profiler prof_;
+  dmlc::optional<profiling::Profiler> prof_;
 };
 
 }  // namespace vm
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
index ee8032550b39..0f6c838fd8b0 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -18,6 +18,7 @@
 import pytest
 from io import StringIO
 import csv
+import os
 
 import tvm.testing
 from tvm.runtime import profiler_vm
@@ -26,6 +27,24 @@
 from tvm.contrib.debugger import debug_executor
 
 
+def read_csv(report):
+    f = StringIO(report.csv())
+    headers = []
+    rows = []
+    reader = csv.reader(f, delimiter=",")
+    # force parsing
+    in_header = True
+    for row in reader:
+        if in_header:
+            headers = row
+            in_header = False
+            rows = [[] for x in headers]
+        else:
+            for i in range(len(row)):
+                rows[i].append(row[i])
+    return dict(zip(headers, rows))
+
+
 @pytest.mark.skipif(not profiler_vm.enabled(), reason="VM Profiler not enabled")
 @tvm.testing.parametrize_targets
 def test_vm(target, dev):
@@ -39,14 +58,9 @@ def test_vm(target, dev):
     assert "fused_nn_softmax" in str(report)
     assert "Total" in str(report)
 
-    f = StringIO(report.csv())
-    reader = csv.reader(f, delimiter=",")
-    # force parsing
-    in_header = True
-    for row in reader:
-        if in_header:
-            assert "Hash" in row
-            in_header = False
+    csv = read_csv(report)
+    assert "Hash" in csv.keys()
+    assert all([float(x) > 0 for x in csv["Duration (us)"]])
 
 
 @tvm.testing.parametrize_targets
@@ -61,3 +75,26 @@ def test_graph_executor(target, dev):
     assert "fused_nn_softmax" in str(report)
     assert "Total" in str(report)
     assert "Hash" in str(report)
+
+
+@tvm.testing.requires_cuda
+@pytest.mark.skipif(
+    tvm.get_global_func("runtime.profiling.metrics.papi", allow_missing=True) is None,
+    reason="PAPI profiling not enabled",
+)
+def test_papi_gpu():
+    mod, params = mlp.get_workload(1)
+
+    exe = relay.vm.compile(mod, "cuda", params=params)
+    vm = profiler_vm.VirtualMachineProfiler(exe, tvm.gpu())
+
+    data = np.random.rand(1, 1, 28, 28).astype("float32")
+    report = vm.profile([data], func_name="main")
+    assert "cuda::" in str(report)
+
+    metric = "cuda:::event:shared_load:device=0"
+    os.environ["TVM_PAPI_GPU_METRICS"] = metric
+    report = vm.profile([data], func_name="main")
+    csv = read_csv(report)
+    assert metric in csv.keys()
+    assert any([float(x) > 0 for x in csv[metric]])

From d55c7296170de5cb6bc5784ab61d2de4484a37d7 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 6 May 2021 09:43:43 -0700
Subject: [PATCH 02/23] Update CMakeLists.txt

Co-authored-by: Leandro Nunes <leandro.nunes@arm.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae590aa2045d..9e35b8e4bbad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,7 +50,7 @@ tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF)
 tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON)
 tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO)
 tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF)
-tvm_option(USE_PAPI "Use PAPI (The Performance Application Programming Interface) to read performance counters" OFF)
+tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to read performance counters" OFF)
 
 # 3rdparty libraries
 tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include")

From 41dbd74f8b279af8eb5326663c4446c4052840e6 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 6 May 2021 11:26:54 -0700
Subject: [PATCH 03/23] move thread pool reset out of crt

---
 include/tvm/runtime/c_backend_api.h     | 10 ----------
 include/tvm/runtime/threading_backend.h |  8 ++++++++
 src/runtime/profiling.cc                |  3 ++-
 src/runtime/thread_pool.cc              |  9 ++++-----
 4 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/include/tvm/runtime/c_backend_api.h b/include/tvm/runtime/c_backend_api.h
index 14be14d954fc..bb6ff1de8f29 100644
--- a/include/tvm/runtime/c_backend_api.h
+++ b/include/tvm/runtime/c_backend_api.h
@@ -166,16 +166,6 @@ TVM_DLL int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv);
  */
 TVM_DLL int TVMBackendRunOnce(void** handle, int (*f)(void*), void* cdata, int nbytes);
 
-/*!
- * \brief Reset the threads in the pool. All current threads are destroyed and
- * new ones are created.
- *
- * Note that this does nothing when openmp is used.
- *
- * \return 0 when no error is thrown, -1 when failure happens
- */
-TVM_DLL int TVMBackendResetPool();
-
 #ifdef __cplusplus
 }  // TVM_EXTERN_C
 #endif
diff --git a/include/tvm/runtime/threading_backend.h b/include/tvm/runtime/threading_backend.h
index 95a64049fd45..43636ddbdb1f 100644
--- a/include/tvm/runtime/threading_backend.h
+++ b/include/tvm/runtime/threading_backend.h
@@ -94,6 +94,14 @@ void Yield();
  */
 int MaxConcurrency();
 
+/*!
+ * \brief Reset the threads in the pool. All current threads are destroyed and
+ * new ones are created.
+ *
+ * Note that this does nothing when openmp is used.
+ */
+void ResetThreadPool();
+
 }  // namespace threading
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index a3e69883debc..befb7478df98 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -26,6 +26,7 @@
 #include <tvm/runtime/c_backend_api.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/profiling.h>
+#include <tvm/runtime/threading_backend.h>
 
 #include <chrono>
 #include <iomanip>
@@ -113,7 +114,7 @@ Profiler::Profiler(std::vector<Device> devs) : devs_(devs) {
     }
   }
   // reset the thread pool so that PAPI eventset hooks are set in all threads.
-  TVMBackendResetPool();
+  threading::ResetThreadPool();
 }
 
 void Profiler::Start() {
diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc
index 5d8735e7aae8..4daf6b0688fe 100644
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -375,6 +375,10 @@ TVM_REGISTER_GLOBAL("runtime.config_threadpool").set_body([](TVMArgs args, TVMRe
   ThreadPool::ThreadLocal()->UpdateWorkerConfiguration(mode, nthreads);
 });
 
+namespace threading {
+void ResetThreadPool() { tvm::runtime::ThreadPool::ThreadLocal()->Reset(); }
+}  // namespace threading
+
 }  // namespace runtime
 }  // namespace tvm
 
@@ -424,8 +428,3 @@ int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) {
 #endif
   return 0;
 }
-
-int TVMBackendResetPool() {
-  tvm::runtime::ThreadPool::ThreadLocal()->Reset();
-  return 0;
-}

From 8fdb98158cbee6cdb39261f97908440031ea4873 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 6 May 2021 13:19:20 -0700
Subject: [PATCH 04/23] add docs

---
 docs/profiling/index.rst | 26 ++++++++++++++++
 docs/profiling/papi.rst  | 66 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 docs/profiling/index.rst
 create mode 100644 docs/profiling/papi.rst

diff --git a/docs/profiling/index.rst b/docs/profiling/index.rst
new file mode 100644
index 000000000000..19883ee76aae
--- /dev/null
+++ b/docs/profiling/index.rst
@@ -0,0 +1,26 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+..    http://www.apache.org/licenses/LICENSE-2.0
+
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+.. _profiling-index:
+
+Profiling Deep Learning Models
+====================================
+
+.. toctree::
+   :maxdepth: 1
+
+   papi
diff --git a/docs/profiling/papi.rst b/docs/profiling/papi.rst
new file mode 100644
index 000000000000..7dc5c34ded9e
--- /dev/null
+++ b/docs/profiling/papi.rst
@@ -0,0 +1,66 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+..    http://www.apache.org/licenses/LICENSE-2.0
+
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+
+Getting Started With PAPI
+=========================
+
+The Performance Application Programming Interface (PAPI) is a library that
+provides performance counters on a variety of platforms. Performance counters
+provide accurate low-level information about processor behavior during a given
+execution run. This information can contain simple metrics like total cycle
+count, cache misses, and instructions executed as well as more high level
+information like total FLOPS and warp occupancy. PAPI makes these metrics
+available while profiling.
+
+Installing PAPI
+---------------
+
+PAPI can either be installed using your package manager (``apt-get install
+libpapi-dev`` on Ubuntu), or from source here:
+https://bitbucket.org/icl/papi/src/master/.
+
+
+Building TVM With PAPI
+----------------------
+
+To include PAPI in your build of TVM, set the following line in your ``config.cmake``:
+
+.. code::
+
+   set(USE_PAPI ON)
+
+If PAPI is installed in a non-standard place, you can specify where it is like so:
+
+.. code::
+
+   set(USE_PAPI path/to/papi.pc)
+
+
+Using PAPI While Profiling
+--------------------------
+
+If TVM has been built with PAPI (see above), then calling
+:py:meth:`tvm.runtime.GraphModule.profile` will automatically include results
+from a default set of performance counters. To change which performance
+counters are reported, set the ``TVM_PAPI_${DEVICE}_METRICS`` environment
+variable (where ``${DEVICE}`` is the device you are running on; ``GPU`` if
+using a GPU, ``CPU`` for the CPU) to a semicolon-separated list of metrics. For
+example, ``TVM_PAPI_CPU_METRICS=perf::INSTRUCTIONS;perf::BRANCH-INSTRUCTIONS``
+would report the number of instructions executed and the number of branch
+instructions executed. You can find a list of available metrics by running the
+``papi_avail`` and ``papi_native_avail`` commands.

From 339dc7cb877b9f2a0878d51f8c2c68b305cdfa0a Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 6 May 2021 13:19:34 -0700
Subject: [PATCH 05/23] comments

---
 include/tvm/runtime/profiling.h  |  8 +++-
 src/runtime/contrib/papi/papi.cc | 64 ++++++++++++++++++++------------
 src/runtime/vm/executable.cc     |  4 +-
 3 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index aff1f4df24f8..c07ca8ecb3ae 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -256,7 +256,9 @@ struct CallFrame {
   Timer timer;
   /*! Extra performance metrics */
   std::unordered_map<std::string, ObjectRef> extra_metrics;
-  /*! User defined metric collectors */
+  /*! User defined metric collectors. Each pair is the MetricCollector and its
+   * associated data (returned from MetricCollector.Start).
+   */
   std::vector<std::pair<MetricCollector, ObjectRef>> extra_collectors;
 };
 
@@ -386,7 +388,9 @@ class CountNode : public Object {
  */
 String ShapeString(const std::vector<NDArray>& shapes);
 
-/*! \brief Wrapper for `Device`. */
+/*! \brief Wrapper for `Device` because `Device` is not passable across the
+ * PackedFunc interface.
+ */
 struct DeviceWrapperNode : public Object {
   /*! The device */
   Device device;
diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc
index dd5b3850f20e..44583de9a921 100644
--- a/src/runtime/contrib/papi/papi.cc
+++ b/src/runtime/contrib/papi/papi.cc
@@ -36,12 +36,12 @@ namespace profiling {
 #define PAPI_CALL(func)                                                         \
   {                                                                             \
     int e = (func);                                                             \
-    if (e < 0) {                                                                \
-      LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)); \
+    if (e != PAPI_OK) {                                                                \
+      LOG(FATAL) << "PAPIError: in function " #func " " << e << " " << std::string(PAPI_strerror(e)); \
     }                                                                           \
   }
 
-static const std::unordered_map<DLDeviceType, std::vector<std::string>> default_metrics = {
+static const std::unordered_map<DLDeviceType, std::vector<std::string>> default_metric_names = {
     {kDLCPU,
      {"perf::CYCLES", "perf::STALLED-CYCLES-FRONTEND", "perf::STALLED-CYCLES-BACKEND",
       "perf::INSTRUCTIONS", "perf::CACHE-MISSES"}},
@@ -61,6 +61,11 @@ struct PAPIEventSetNode : public Object {
   TVM_DECLARE_FINAL_OBJECT_INFO(PAPIEventSetNode, Object);
 };
 
+/* Get the PAPI component id for the given device.
+ * \param dev The device to get the component for.
+ * \returns PAPI component id for the device. Returns -1 if the device is not
+ * supported by PAPI.
+ */
 int component_for_device(Device dev) {
   std::string component_name;
   switch (dev.device_type) {
@@ -103,7 +108,10 @@ int component_for_device(Device dev) {
 struct PAPIMetricCollectorNode final : public MetricCollectorNode {
   explicit PAPIMetricCollectorNode(Array<DeviceWrapper> devices) {
     if (!PAPI_is_initialized()) {
-      PAPI_CALL(PAPI_library_init(PAPI_VER_CURRENT));
+      if(sizeof(long_long) > sizeof(int64_t)) {
+        LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when reporting metrics.";
+      }
+      CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT) << "Error while initializing PAPI";
     }
 
     // create event sets for each device
@@ -115,8 +123,7 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
         continue;
       }
 
-      const PAPI_component_info_t* component;
-      component = PAPI_get_component_info(cidx);
+      const PAPI_component_info_t* component = PAPI_get_component_info(cidx);
       if (component->disabled) {
         std::string help_message = "";
         switch (device.device_type) {
@@ -154,7 +161,7 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
       }
 
       // load default metrics for device or read them from an environment variable
-      std::vector<std::string> metrics;
+      std::vector<std::string> metric_names;
       std::string dev_name = DeviceName(device.device_type);
       std::transform(dev_name.begin(), dev_name.end(), dev_name.begin(),
                      [](unsigned char c) { return std::toupper(c); });
@@ -168,13 +175,13 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
           if (next == metric_string.npos) {
             next = metric_string.size();
           }
-          metrics.push_back(metric_string.substr(loc, next - loc));
+          metric_names.push_back(metric_string.substr(loc, next - loc));
           loc = next + 1;
         }
       } else {
-        auto it = default_metrics.find(device.device_type);
-        if (it != default_metrics.end()) {
-          metrics = it->second;
+        auto it = default_metric_names.find(device.device_type);
+        if (it != default_metric_names.end()) {
+          metric_names = it->second;
         } else {
           LOG(WARNING) << "No default metrics set for " << dev_name
                        << ". You can specify metrics with the environment variable TVM_PAPI_"
@@ -182,17 +189,17 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
         }
       }
       // skip if no metrics exist
-      if (metrics.size() == 0) {
+      if (metric_names.size() == 0) {
         continue;
       }
-      papi_metrics[device] = metrics;
+      papi_metric_names[device] = metric_names;
 
-      if (static_cast<int>(metrics.size()) > PAPI_num_cmp_hwctrs(cidx)) {
+      if (static_cast<int>(metric_names.size()) > PAPI_num_cmp_hwctrs(cidx)) {
         PAPI_CALL(PAPI_set_multiplex(event_set));
       }
 
       // add all the metrics
-      for (auto metric : metrics) {
+      for (auto metric : metric_names) {
         int e = PAPI_add_named_event(event_set, metric.c_str());
         if (e != PAPI_OK) {
           LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric
@@ -208,7 +215,9 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
     }
   }
 
-  /*! \brief Called right before a function call.
+  /*! \brief Called right before a function call. Reads starting values of the
+   * measured metrics.
+   *
    * \param dev The device the function will be run on.
    * \returns A `PAPIEventSetNode` containing values for the counters at the
    * start of the call. Passed to a corresponding `Stop` call.
@@ -219,7 +228,7 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
     auto it = event_sets.find(dev);
     if (it != event_sets.end()) {
       int event_set = it->second;
-      std::vector<long_long> values(papi_metrics[dev].size());
+      std::vector<long_long> values(papi_metric_names[dev].size());
       PAPI_CALL(PAPI_read(event_set, values.data()));
       return ObjectRef(make_object<PAPIEventSetNode>(values, dev));
     } else {
@@ -227,18 +236,27 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
     }
   }
 
-  /*! \brief Called right after a function call.
+  /*! \brief Called right after a function call. Reads ending values of the
+   * measured metrics. Computes the change in each metric from the
+   * corresponding `Start` call.
+   *
    * \param obj `PAPIEventSetNode` created by a call to `Start`.
    * \returns A mapping from metric name to value.
    */
   Map<String, ObjectRef> Stop(ObjectRef obj) final {
     const PAPIEventSetNode* event_set_node = obj.as<PAPIEventSetNode>();
-    std::vector<long_long> end_values(papi_metrics[event_set_node->dev].size());
+    std::vector<long_long> end_values(papi_metric_names[event_set_node->dev].size());
     PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data()));
     std::unordered_map<String, ObjectRef> reported_metrics;
     for (size_t i = 0; i < end_values.size(); i++) {
-      reported_metrics[papi_metrics[event_set_node->dev][i]] =
-          ObjectRef(make_object<CountNode>(end_values[i] - event_set_node->start_values[i]));
+      if (end_values[i] < event_set_node->start_values[i]) {
+        LOG(WARNING) << "Detected overflow when reading performance counter, setting value to -1.";
+        reported_metrics[papi_metric_names[event_set_node->dev][i]] =
+            ObjectRef(make_object<CountNode>(-1));
+      } else {
+        reported_metrics[papi_metric_names[event_set_node->dev][i]] =
+            ObjectRef(make_object<CountNode>(end_values[i] - event_set_node->start_values[i]));
+      }
     }
     return reported_metrics;
   }
@@ -251,11 +269,11 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
     }
   }
 
-  /*! \brief Device-specific event sets. Contains the running counters for that device. */
+  /*! \brief Device-specific event sets. Contains the running counters (the int values) for that device. */
   std::unordered_map<Device, int> event_sets;
   /*! \brief Device-specific metric names. Order of names matches the order in the corresponding
    * `event_set`. */
-  std::unordered_map<Device, std::vector<std::string>> papi_metrics;
+  std::unordered_map<Device, std::vector<std::string>> papi_metric_names;
 
   static constexpr const char* _type_key = "PAPIMetricCollectorNode";
   TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode);
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 2791f0499267..ca9b22a9099d 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -273,7 +273,7 @@ void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) {
     primitive_names[packed_index] = it.first;
   }
   strm->Write(primitive_names);
-  // TODO(tkonolige): cannot serialize ObjectRefs
+  // TODO(tkonolige): cannot serialize ObjectRefs with dmlc's serializer.
   // std::vector<std::pair<size_t, Map<String, ObjectRef>>> primitive_attrs;
   // for (const auto& it : this->op_attrs) {
   //   auto packed_index = static_cast<size_t>(it.first);
@@ -576,7 +576,7 @@ void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) {
   for (size_t i = 0; i < primitive_names.size(); i++) {
     this->primitive_map.insert({primitive_names[i], i});
   }
-  // TODO(tkonolige): cannot serialize ObjectRefs
+  // TODO(tkonolige): cannot serialize ObjectRefs with dmlc's serializer.
   // std::vector<std::pair<size_t, Map<String, ObjectRef>>> primitive_attrs;
   // STREAM_CHECK(strm->Read(&primitive_attrs), "primitive attrs");
   // for (auto p : primitive_attrs) {

From a1729cee1fd00a8bb9de43fafa779250a8cf7299 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 6 May 2021 14:15:06 -0700
Subject: [PATCH 06/23] formatting

---
 src/runtime/contrib/papi/papi.cc | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc
index 44583de9a921..0ebd441c6d4e 100644
--- a/src/runtime/contrib/papi/papi.cc
+++ b/src/runtime/contrib/papi/papi.cc
@@ -33,12 +33,13 @@ namespace tvm {
 namespace runtime {
 namespace profiling {
 
-#define PAPI_CALL(func)                                                         \
-  {                                                                             \
-    int e = (func);                                                             \
-    if (e != PAPI_OK) {                                                                \
-      LOG(FATAL) << "PAPIError: in function " #func " " << e << " " << std::string(PAPI_strerror(e)); \
-    }                                                                           \
+#define PAPI_CALL(func)                                             \
+  {                                                                 \
+    int e = (func);                                                 \
+    if (e != PAPI_OK) {                                             \
+      LOG(FATAL) << "PAPIError: in function " #func " " << e << " " \
+                 << std::string(PAPI_strerror(e));                  \
+    }                                                               \
   }
 
 static const std::unordered_map<DLDeviceType, std::vector<std::string>> default_metric_names = {
@@ -108,10 +109,12 @@ int component_for_device(Device dev) {
 struct PAPIMetricCollectorNode final : public MetricCollectorNode {
   explicit PAPIMetricCollectorNode(Array<DeviceWrapper> devices) {
     if (!PAPI_is_initialized()) {
-      if(sizeof(long_long) > sizeof(int64_t)) {
-        LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when reporting metrics.";
+      if (sizeof(long_long) > sizeof(int64_t)) {
+        LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when "
+                        "reporting metrics.";
       }
-      CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT) << "Error while initializing PAPI";
+      CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT)
+          << "Error while initializing PAPI";
     }
 
     // create event sets for each device
@@ -269,7 +272,8 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
     }
   }
 
-  /*! \brief Device-specific event sets. Contains the running counters (the int values) for that device. */
+  /*! \brief Device-specific event sets. Contains the running counters (the int values) for that
+   * device. */
   std::unordered_map<Device, int> event_sets;
   /*! \brief Device-specific metric names. Order of names matches the order in the corresponding
    * `event_set`. */

From 3866076c9c8a9c8b8ad321391087eff60543cda5 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 6 May 2021 16:16:34 -0700
Subject: [PATCH 07/23] forgot one doc

---
 docs/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/index.rst b/docs/index.rst
index a7ae68c87b01..491c42712e9a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -78,6 +78,7 @@ For Developers
    :caption: MISC
 
    vta/index
+   profiling/index
 
 
 Index

From 4c4378bc2d332457ec58994d5a5f974de2f37445 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 7 Jun 2021 12:42:53 -0700
Subject: [PATCH 08/23] kDLGPU -> kDLCUDA

---
 src/runtime/contrib/papi/papi.cc | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc
index 0ebd441c6d4e..59e1be601be1 100644
--- a/src/runtime/contrib/papi/papi.cc
+++ b/src/runtime/contrib/papi/papi.cc
@@ -46,7 +46,7 @@ static const std::unordered_map<DLDeviceType, std::vector<std::string>> default_
     {kDLCPU,
      {"perf::CYCLES", "perf::STALLED-CYCLES-FRONTEND", "perf::STALLED-CYCLES-BACKEND",
       "perf::INSTRUCTIONS", "perf::CACHE-MISSES"}},
-    {kDLGPU, {"cuda:::event:elapsed_cycles_sm:device=0"}}};
+    {kDLCUDA, {"cuda:::event:elapsed_cycles_sm:device=0"}}};
 
 /*! \brief Object that holds the values of counters at the start of a function call. */
 struct PAPIEventSetNode : public Object {
@@ -71,10 +71,9 @@ int component_for_device(Device dev) {
   std::string component_name;
   switch (dev.device_type) {
     case kDLCPU:
-    case kDLCPUPinned:
       component_name = "perf_event";
       break;
-    case kDLGPU:
+    case kDLCUDA:
       component_name = "cuda";
       break;
     case kDLROCM:
@@ -131,11 +130,10 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
         std::string help_message = "";
         switch (device.device_type) {
           case kDLCPU:
-          case kDLCPUPinned:
             help_message =
                 "Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`";
             break;
-          case kDLGPU:
+          case kDLCUDA:
             help_message =
                 "Try enabling gpu profiling with `modprobe nvidia "
                 "NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding  "

From 3c816be56254bc96ff6a3b3de34efecf885afdf6 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 10 Jun 2021 14:14:12 -0700
Subject: [PATCH 09/23] Refactor API to more closely match pass instrument's.

---
 include/tvm/runtime/profiling.h               |  51 +--
 python/tvm/contrib/debugger/debug_executor.py |   8 +-
 python/tvm/runtime/profiler_vm.py             |   7 +-
 python/tvm/runtime/profiling.py               |  48 ---
 src/runtime/contrib/papi/papi.cc              | 313 ++++++++----------
 .../debug/graph_executor_debug.cc             |  12 +-
 src/runtime/profiling.cc                      |  12 +-
 src/runtime/vm/profiler/vm.cc                 |  48 +--
 .../python/unittest/test_runtime_profiling.py |  36 +-
 9 files changed, 238 insertions(+), 297 deletions(-)
 delete mode 100644 python/tvm/runtime/profiling.py

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index c07ca8ecb3ae..3af3b62cb9f7 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -37,6 +37,7 @@
 #include <vector>
 
 namespace tvm {
+
 namespace runtime {
 
 /*! \brief Base class for all implementations.
@@ -150,6 +151,26 @@ class Timer : public ObjectRef {
 Timer DefaultTimer(Device dev);
 
 namespace profiling {
+/*! \brief Wrapper for `Device` because `Device` is not passable across the
+ * PackedFunc interface.
+ */
+struct DeviceWrapperNode : public Object {
+  /*! The device */
+  Device device;
+
+  /*! Constructor */
+  explicit DeviceWrapperNode(Device device) : device(device) {}
+
+  static constexpr const char* _type_key = "runtime.profiling.DeviceWrapper";
+  TVM_DECLARE_BASE_OBJECT_INFO(DeviceWrapperNode, Object);
+};
+
+/*! \brief Wrapper for `Device`. */
+class DeviceWrapper : public ObjectRef {
+ public:
+  explicit DeviceWrapper(Device dev) { data_ = make_object<DeviceWrapperNode>(dev); }
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(DeviceWrapper, ObjectRef, DeviceWrapperNode);
+};
 
 /*! \brief Data collected from a profiling run. Includes per-call metrics and per-device metrics.
  */
@@ -220,6 +241,11 @@ class Report : public ObjectRef {
  */
 class MetricCollectorNode : public Object {
  public:
+  /*! \brief Initialization call. Called before profiling has started. Any
+   * expensive precomputation should happen here.
+   * \param devs The list of devices this collector will be run on.
+   */
+  virtual void Init(Array<DeviceWrapper> devs) = 0;
   /*! \brief Start colling metrics for a function call.
    * \param dev The device the call will be run on.
    * \returns An object used to maintain state of the metric collection. This
@@ -236,7 +262,7 @@ class MetricCollectorNode : public Object {
 
   virtual ~MetricCollectorNode() {}
 
-  static constexpr const char* _type_key = "runtime.profiling.MetricCollectorNode";
+  static constexpr const char* _type_key = "runtime.profiling.MetricCollector";
   TVM_DECLARE_BASE_OBJECT_INFO(MetricCollectorNode, Object);
 };
 
@@ -289,8 +315,9 @@ class Profiler {
    *
    * \param devs The list of devices the profiler will be running on. Should
    *             include all devices used by profiled operators.
+   * \param metric_collectors Additional `MetricCollector`s to use with this profiler.
    */
-  explicit Profiler(std::vector<Device> devs);
+  explicit Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors);
   /*! \brief Start the profiler.
    *
    * This function should only be called once per object.
@@ -388,26 +415,6 @@ class CountNode : public Object {
  */
 String ShapeString(const std::vector<NDArray>& shapes);
 
-/*! \brief Wrapper for `Device` because `Device` is not passable across the
- * PackedFunc interface.
- */
-struct DeviceWrapperNode : public Object {
-  /*! The device */
-  Device device;
-
-  /*! Constructor */
-  explicit DeviceWrapperNode(Device device) : device(device) {}
-
-  static constexpr const char* _type_key = "DeviceWrapperNode";
-  TVM_DECLARE_BASE_OBJECT_INFO(DeviceWrapperNode, Object);
-};
-
-/*! \brief Wrapper for `Device`. */
-class DeviceWrapper : public ObjectRef {
- public:
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(DeviceWrapper, ObjectRef, DeviceWrapperNode);
-};
-
 }  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index dc043353c475..e3508499bb0c 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -268,14 +268,18 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []
 
-    def profile(self, **input_dict):
+    def profile(self, collectors=None, **input_dict):
         """Run forward execution of the graph and collect overall and per-op
         performance metrics.
 
         Parameters
         ----------
+        collectors : Optional[Sequence[MetricCollector]]
+            Extra metrics to collect. Defaults to no extra collectors.
+
         input_dict : dict of str to NDArray
             List of input values to be feed to
+
         Return
         ------
         timing_results : str
@@ -284,7 +288,7 @@ def profile(self, **input_dict):
         if input_dict:
             self.set_input(**input_dict)
 
-        return self._profile()
+        return self._profile([] if collectors is None else collectors)
 
     def exit(self):
         """Exits the dump folder and all its contents"""
diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py
index e1c3dc66a360..313a66c119f9 100644
--- a/python/tvm/runtime/profiler_vm.py
+++ b/python/tvm/runtime/profiler_vm.py
@@ -50,7 +50,7 @@ def get_stat(self, sort_by_time=True):  # pylint: disable=unused-argument
         warnings.warn("get_stat has been removed, use profile instead")
         return ""
 
-    def profile(self, *args, func_name="main", **kwargs):
+    def profile(self, *args, func_name="main", collectors=None, **kwargs):
         """Profile a function call.
 
         Parameters
@@ -58,6 +58,9 @@ def profile(self, *args, func_name="main", **kwargs):
         func_name : str
             The name of the function.
 
+        collectors : Optional[Sequence[MetricCollector]]
+            Extra metrics to collect. Defaults to no extra collectors.
+
         args : list[tvm.runtime.NDArray] or list[np.ndarray]
             The arguments to the function.
 
@@ -71,4 +74,4 @@ def profile(self, *args, func_name="main", **kwargs):
         """
         if args or kwargs:
             self.set_input(func_name, *args, **kwargs)
-        return self._profile(func_name)
+        return self._profile(func_name, [] if collectors is None else collectors)
diff --git a/python/tvm/runtime/profiling.py b/python/tvm/runtime/profiling.py
deleted file mode 100644
index 5a1cd6796b64..000000000000
--- a/python/tvm/runtime/profiling.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Registration of profiling objects in python."""
-
-from .. import _ffi
-from . import Object
-
-_ffi._init_api("runtime.profiling", __name__)
-
-
-@_ffi.register_object("runtime.profiling.Report")
-class Report(Object):
-    """A container for information gathered during a profiling run.
-
-    Attributes
-    ----------
-    calls : Array[Dict[str, Object]]
-        Per-call profiling metrics (function name, runtime, device, ...).
-
-    device_metrics : Dict[Device, Dict[str, Object]]
-        Per-device metrics collected over the entire run.
-    """
-
-    def csv(self):
-        """Convert this profiling report into CSV format.
-
-        This only includes calls and not overall metrics.
-
-        Returns
-        -------
-        csv : str
-            `calls` in CSV format.
-        """
-        return AsCSV(self)
diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc
index 59e1be601be1..503d817d58b7 100644
--- a/src/runtime/contrib/papi/papi.cc
+++ b/src/runtime/contrib/papi/papi.cc
@@ -16,17 +16,10 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-/*!
- * \brief Performance counters for profiling via the PAPI library.
- */
-#ifndef TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
-#define TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
-
 #include <papi.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/contrib/papi.h>
 
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 namespace tvm {
@@ -92,204 +85,162 @@ int component_for_device(Device dev) {
   }
   return cidx;
 }
+PAPIMetricCollectorNode::PAPIMetricCollectorNode(Map<DeviceWrapper, Array<String>> metrics) {
+  for (auto& p : metrics) {
+    papi_metric_names[p.first->device] = {};
+    for (auto& metric : p.second) {
+      papi_metric_names[p.first->device].push_back(metric);
+    }
+  }
+}
 
-/*! \brief MetricCollectorNode for PAPI metrics.
- *
- * PAPI (Performance Application Programming Interface) collects metrics on a
- * variety of platforms including cpu, cuda and rocm.
- *
- * PAPI is avaliable at https://bitbucket.org/icl/papi/src/master/.
- *
- * Users can change the metrics collected for by setting the environment
- * variable `TVM_PAPI_${device_name}_METRICS` with a semicolon seperated list
- * of metrics. Use the `papi_native_avail` tool to find the name of all
- * available metrics.
- */
-struct PAPIMetricCollectorNode final : public MetricCollectorNode {
-  explicit PAPIMetricCollectorNode(Array<DeviceWrapper> devices) {
-    if (!PAPI_is_initialized()) {
-      if (sizeof(long_long) > sizeof(int64_t)) {
-        LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when "
-                        "reporting metrics.";
-      }
-      CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT)
-          << "Error while initializing PAPI";
+PAPIMetricCollectorNode::~PAPIMetricCollectorNode() {
+  for (auto p : event_sets) {
+    PAPI_CALL(PAPI_stop(p.second, NULL));
+    PAPI_CALL(PAPI_cleanup_eventset(p.second));
+    PAPI_CALL(PAPI_destroy_eventset(&p.second));
+  }
+}
+
+void PAPIMetricCollectorNode::Init(Array<DeviceWrapper> devices) {
+  if (!PAPI_is_initialized()) {
+    if (sizeof(long_long) > sizeof(int64_t)) {
+      LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when "
+                      "reporting metrics.";
     }
+    CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT)
+        << "Error while initializing PAPI";
+  }
 
-    // create event sets for each device
+  // If no metrics were provided we use the default set. The names were not
+  // initialized in the constructor because we did not know which devices we
+  // were running on.
+  if (papi_metric_names.size() == 0) {
     for (auto wrapped_device : devices) {
       Device device = wrapped_device->device;
-      int cidx = component_for_device(device);
-      // unknown device, skipping
-      if (cidx < 0) {
-        continue;
-      }
-
-      const PAPI_component_info_t* component = PAPI_get_component_info(cidx);
-      if (component->disabled) {
-        std::string help_message = "";
-        switch (device.device_type) {
-          case kDLCPU:
-            help_message =
-                "Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`";
-            break;
-          case kDLCUDA:
-            help_message =
-                "Try enabling gpu profiling with `modprobe nvidia "
-                "NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding  "
-                "`options nvidia \"NVreg_RestrictProfilingToAdminUsers=0\"` to "
-                "`/etc/modprobe.d/nvidia-kernel-common.conf`.";
-            break;
-          default:
-            break;
-        }
-        LOG(WARNING) << "PAPI could not initialize counters for " << DeviceName(device.device_type)
-                     << ": " << component->disabled_reason << "\n"
-                     << help_message;
-        continue;
-      }
-
-      int event_set = PAPI_NULL;
-      PAPI_CALL(PAPI_create_eventset(&event_set));
-      PAPI_CALL(PAPI_assign_eventset_component(event_set, cidx));
-      if (device.device_type == kDLCPU) {
-        // we set PAPI_INHERIT to make it so threads created after this inherit the event_set.
-        PAPI_option_t opt;
-        memset(&opt, 0x0, sizeof(PAPI_option_t));
-        opt.inherit.inherit = PAPI_INHERIT_ALL;
-        opt.inherit.eventset = event_set;
-        PAPI_CALL(PAPI_set_opt(PAPI_INHERIT, &opt));
+      auto it = default_metric_names.find(device.device_type);
+      if (it != default_metric_names.end()) {
+        papi_metric_names[device] = it->second;
       }
+    }
+  }
 
-      // load default metrics for device or read them from an environment variable
-      std::vector<std::string> metric_names;
-      std::string dev_name = DeviceName(device.device_type);
-      std::transform(dev_name.begin(), dev_name.end(), dev_name.begin(),
-                     [](unsigned char c) { return std::toupper(c); });
-      const char* env_p =
-          std::getenv((std::string("TVM_PAPI_") + dev_name + std::string("_METRICS")).c_str());
-      if (env_p != nullptr) {
-        std::string metric_string = env_p;
-        size_t loc = 0;
-        while (loc < metric_string.size()) {
-          size_t next = metric_string.find(';', loc);
-          if (next == metric_string.npos) {
-            next = metric_string.size();
-          }
-          metric_names.push_back(metric_string.substr(loc, next - loc));
-          loc = next + 1;
-        }
-      } else {
-        auto it = default_metric_names.find(device.device_type);
-        if (it != default_metric_names.end()) {
-          metric_names = it->second;
-        } else {
-          LOG(WARNING) << "No default metrics set for " << dev_name
-                       << ". You can specify metrics with the environment variable TVM_PAPI_"
-                       << dev_name << "_METRICS.";
-        }
-      }
-      // skip if no metrics exist
-      if (metric_names.size() == 0) {
-        continue;
-      }
-      papi_metric_names[device] = metric_names;
+  // create event sets for each device
+  for (auto wrapped_device : devices) {
+    Device device = wrapped_device->device;
+    int cidx = component_for_device(device);
+    // unknown device, skipping
+    if (cidx < 0) {
+      continue;
+    }
 
-      if (static_cast<int>(metric_names.size()) > PAPI_num_cmp_hwctrs(cidx)) {
-        PAPI_CALL(PAPI_set_multiplex(event_set));
+    auto it = papi_metric_names.find(device);
+    // skip devices with no metrics defined
+    if (it == papi_metric_names.end() || it->second.size() == 0) {
+      continue;
+    }
+    auto& metric_names = it->second;
+
+    const PAPI_component_info_t* component = PAPI_get_component_info(cidx);
+    if (component->disabled) {
+      std::string help_message = "";
+      switch (device.device_type) {
+        case kDLCPU:
+          help_message = "Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`";
+          break;
+        case kDLCUDA:
+          help_message =
+              "Try enabling gpu profiling with `modprobe nvidia "
+              "NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding  "
+              "`options nvidia \"NVreg_RestrictProfilingToAdminUsers=0\"` to "
+              "`/etc/modprobe.d/nvidia-kernel-common.conf`.";
+          break;
+        default:
+          break;
       }
+      LOG(WARNING) << "PAPI could not initialize counters for " << DeviceName(device.device_type)
+                   << ": " << component->disabled_reason << "\n"
+                   << help_message;
+      continue;
+    }
 
-      // add all the metrics
-      for (auto metric : metric_names) {
-        int e = PAPI_add_named_event(event_set, metric.c_str());
-        if (e != PAPI_OK) {
-          LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric
-                     << ".";
-        }
-      }
-      // Because we may have multiple calls in flight at the same time, we
-      // start all the timers when we initialize. Then we calculate the metrics
-      // counts for a call by comparing counter values at the start vs end of
-      // the call.
-      PAPI_CALL(PAPI_start(event_set));
-      event_sets[device] = event_set;
+    int event_set = PAPI_NULL;
+    PAPI_CALL(PAPI_create_eventset(&event_set));
+    PAPI_CALL(PAPI_assign_eventset_component(event_set, cidx));
+    if (device.device_type == kDLCPU) {
+      // we set PAPI_INHERIT to make it so threads created after this inherit the event_set.
+      PAPI_option_t opt;
+      memset(&opt, 0x0, sizeof(PAPI_option_t));
+      opt.inherit.inherit = PAPI_INHERIT_ALL;
+      opt.inherit.eventset = event_set;
+      PAPI_CALL(PAPI_set_opt(PAPI_INHERIT, &opt));
     }
-  }
 
-  /*! \brief Called right before a function call. Reads starting values of the
-   * measured metrics.
-   *
-   * \param dev The device the function will be run on.
-   * \returns A `PAPIEventSetNode` containing values for the counters at the
-   * start of the call. Passed to a corresponding `Stop` call.
-   */
-  ObjectRef Start(Device dev) final {
-    // Record counter values at the start of the call, so we can calculate the
-    // metrics for the call by comparing the values at the end of the call.
-    auto it = event_sets.find(dev);
-    if (it != event_sets.end()) {
-      int event_set = it->second;
-      std::vector<long_long> values(papi_metric_names[dev].size());
-      PAPI_CALL(PAPI_read(event_set, values.data()));
-      return ObjectRef(make_object<PAPIEventSetNode>(values, dev));
-    } else {
-      return ObjectRef(nullptr);
+    if (static_cast<int>(metric_names.size()) > PAPI_num_cmp_hwctrs(cidx)) {
+      PAPI_CALL(PAPI_set_multiplex(event_set));
     }
-  }
 
-  /*! \brief Called right after a function call. Reads ending values of the
-   * measured metrics. Computes the change in each metric from the
-   * corresponding `Start` call.
-   *
-   * \param obj `PAPIEventSetNode` created by a call to `Start`.
-   * \returns A mapping from metric name to value.
-   */
-  Map<String, ObjectRef> Stop(ObjectRef obj) final {
-    const PAPIEventSetNode* event_set_node = obj.as<PAPIEventSetNode>();
-    std::vector<long_long> end_values(papi_metric_names[event_set_node->dev].size());
-    PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data()));
-    std::unordered_map<String, ObjectRef> reported_metrics;
-    for (size_t i = 0; i < end_values.size(); i++) {
-      if (end_values[i] < event_set_node->start_values[i]) {
-        LOG(WARNING) << "Detected overflow when reading performance counter, setting value to -1.";
-        reported_metrics[papi_metric_names[event_set_node->dev][i]] =
-            ObjectRef(make_object<CountNode>(-1));
-      } else {
-        reported_metrics[papi_metric_names[event_set_node->dev][i]] =
-            ObjectRef(make_object<CountNode>(end_values[i] - event_set_node->start_values[i]));
+    // add all the metrics
+    for (auto metric : metric_names) {
+      int e = PAPI_add_named_event(event_set, metric.c_str());
+      if (e != PAPI_OK) {
+        LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric
+                   << ".";
       }
     }
-    return reported_metrics;
+    // Because we may have multiple calls in flight at the same time, we
+    // start all the timers when we initialize. Then we calculate the metrics
+    // counts for a call by comparing counter values at the start vs end of
+    // the call.
+    PAPI_CALL(PAPI_start(event_set));
+    event_sets[device] = event_set;
   }
+}
 
-  ~PAPIMetricCollectorNode() final {
-    for (auto p : event_sets) {
-      PAPI_CALL(PAPI_stop(p.second, NULL));
-      PAPI_CALL(PAPI_cleanup_eventset(p.second));
-      PAPI_CALL(PAPI_destroy_eventset(&p.second));
-    }
+ObjectRef PAPIMetricCollectorNode::Start(Device dev) {
+  // Record counter values at the start of the call, so we can calculate the
+  // metrics for the call by comparing the values at the end of the call.
+  auto it = event_sets.find(dev);
+  if (it != event_sets.end()) {
+    int event_set = it->second;
+    std::vector<long_long> values(papi_metric_names[dev].size());
+    PAPI_CALL(PAPI_read(event_set, values.data()));
+    return ObjectRef(make_object<PAPIEventSetNode>(values, dev));
+  } else {
+    return ObjectRef(nullptr);
   }
+}
 
-  /*! \brief Device-specific event sets. Contains the running counters (the int values) for that
-   * device. */
-  std::unordered_map<Device, int> event_sets;
-  /*! \brief Device-specific metric names. Order of names matches the order in the corresponding
-   * `event_set`. */
-  std::unordered_map<Device, std::vector<std::string>> papi_metric_names;
-
-  static constexpr const char* _type_key = "PAPIMetricCollectorNode";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode);
-};
+Map<String, ObjectRef> PAPIMetricCollectorNode::Stop(ObjectRef obj) {
+  const PAPIEventSetNode* event_set_node = obj.as<PAPIEventSetNode>();
+  std::vector<long_long> end_values(papi_metric_names[event_set_node->dev].size());
+  PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data()));
+  std::unordered_map<String, ObjectRef> reported_metrics;
+  for (size_t i = 0; i < end_values.size(); i++) {
+    if (end_values[i] < event_set_node->start_values[i]) {
+      LOG(WARNING) << "Detected overflow when reading performance counter, setting value to -1.";
+      reported_metrics[papi_metric_names[event_set_node->dev][i]] =
+          ObjectRef(make_object<CountNode>(-1));
+    } else {
+      reported_metrics[papi_metric_names[event_set_node->dev][i]] =
+          ObjectRef(make_object<CountNode>(end_values[i] - event_set_node->start_values[i]));
+    }
+  }
+  return reported_metrics;
+}
 
-TVM_REGISTER_GLOBAL("runtime.profiling.metrics.papi")
-    .set_body_typed([](Array<DeviceWrapper> devices) {
-      return MetricCollector(make_object<PAPIMetricCollectorNode>(devices));
-    });
+PAPIMetricCollector::PAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics) {
+  data_ = make_object<PAPIMetricCollectorNode>(metrics);
+}
 
 TVM_REGISTER_OBJECT_TYPE(PAPIEventSetNode);
 TVM_REGISTER_OBJECT_TYPE(PAPIMetricCollectorNode);
 
+TVM_REGISTER_GLOBAL("runtime.profiling.PAPIMetricCollector")
+    .set_body_typed([](Map<DeviceWrapper, Array<String>> metrics) {
+      return PAPIMetricCollector(metrics);
+    });
+
 }  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
-#endif  // TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index b29df7c97f76..2fa73971d000 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -276,10 +276,13 @@ class GraphExecutorDebug : public GraphExecutor {
    * the module compared to GraphRuntimeDebug::RunIndividual as it runs the
    * entire graph in order.
    *
+   * \param collectors Optional user defined `MetricCollector`s to use with this profiling run.
+   *
    * \returns A table of per-op runtimes and total times.
    */
-  profiling::Report Profile() {
-    profiling::Profiler prof(devices_);
+  profiling::Report Profile(Array<profiling::MetricCollector> collectors) {
+    std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
+    profiling::Profiler prof(devices_, cs);
 
     // warm up. 1 iteration does not seem enough.
     for (int i = 0; i < 3; i++) {
@@ -360,7 +363,10 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
       *rv = this->RunIndividual(number, repeat, min_repeat_ms);
     });
   } else if (name == "profile") {
-    return TypedPackedFunc<profiling::Report()>([sptr_to_self, this]() { return this->Profile(); });
+    return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
+        [sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
+          return this->Profile(collectors);
+        });
   } else {
     return GraphExecutor::GetFunction(name, sptr_to_self);
   }
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index befb7478df98..84679ed579a2 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -102,16 +102,15 @@ TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start);
 
 namespace profiling {
 
-Profiler::Profiler(std::vector<Device> devs) : devs_(devs) {
+Profiler::Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors)
+    : devs_(devs), collectors_(metric_collectors) {
   is_running_ = false;
   std::vector<DeviceWrapper> wrapped_devs;
   for (auto dev : devs) {
     wrapped_devs.push_back(DeviceWrapper(make_object<DeviceWrapperNode>(dev)));
   }
-  for (const auto& name : Registry::ListNames()) {
-    if (name.find("runtime.profiling.metrics.") == 0) {
-      collectors_.push_back(Registry::Get(name)->operator()(Array<DeviceWrapper>(wrapped_devs)));
-    }
+  for (auto& x : collectors_) {
+    x->Init(wrapped_devs);
   }
   // reset the thread pool so that PAPI eventset hooks are set in all threads.
   threading::ResetThreadPool();
@@ -483,6 +482,9 @@ TVM_REGISTER_OBJECT_TYPE(DeviceWrapperNode);
 TVM_REGISTER_OBJECT_TYPE(MetricCollectorNode);
 
 TVM_REGISTER_GLOBAL("runtime.profiling.AsCSV").set_body_typed([](Report n) { return n->AsCSV(); });
+TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device dev) {
+  return DeviceWrapper(dev);
+});
 }  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc
index b04889441a52..6d893114d623 100644
--- a/src/runtime/vm/profiler/vm.cc
+++ b/src/runtime/vm/profiler/vm.cc
@@ -43,29 +43,31 @@ namespace vm {
 PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
                                             const ObjectPtr<Object>& sptr_to_self) {
   if (name == "profile") {
-    return TypedPackedFunc<profiling::Report(String)>([sptr_to_self, this](String arg_name) {
-      std::vector<Device> devices;
-      for (auto dev : devices_) {
-        if (dev.device_type > 0) {
-          devices.push_back(dev);
-        }
-      }
-
-      prof_ = profiling::Profiler(devices);
-
-      auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self);
-      // warmup
-      for (int i = 0; i < 3; i++) {
-        invoke(arg_name);
-      }
-
-      prof_.operator*().Start();
-      invoke(arg_name);
-      prof_.operator*().Stop();
-      auto report = prof_.operator*().Report();
-      prof_ = dmlc::optional<profiling::Profiler>();  // releases hardware counters
-      return report;
-    });
+    return TypedPackedFunc<profiling::Report(String, Array<profiling::MetricCollector>)>(
+        [sptr_to_self, this](String arg_name, Array<profiling::MetricCollector> collectors) {
+          std::vector<Device> devices;
+          for (auto dev : devices_) {
+            if (dev.device_type > 0) {
+              devices.push_back(dev);
+            }
+          }
+
+          std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
+          prof_ = profiling::Profiler(devices, cs);
+
+          auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self);
+          // warmup
+          for (int i = 0; i < 3; i++) {
+            invoke(arg_name);
+          }
+
+          prof_.operator*().Start();
+          invoke(arg_name);
+          prof_.operator*().Stop();
+          auto report = prof_.operator*().Report();
+          prof_ = dmlc::optional<profiling::Profiler>();  // releases hardware counters
+          return report;
+        });
   } else {
     return VirtualMachine::GetFunction(name, sptr_to_self);
   }
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
index 0f6c838fd8b0..824afcf237f7 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -77,24 +77,38 @@ def test_graph_executor(target, dev):
     assert "Hash" in str(report)
 
 
-@tvm.testing.requires_cuda
+@tvm.testing.parametrize_targets("cuda", "llvm")
 @pytest.mark.skipif(
-    tvm.get_global_func("runtime.profiling.metrics.papi", allow_missing=True) is None,
+    tvm.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is None,
     reason="PAPI profiling not enabled",
 )
-def test_papi_gpu():
+def test_papi(target, dev):
+    target = tvm.target.Target(target)
+    if str(target.kind) == "llvm":
+        metric = "PAPI_FP_OPS"
+    elif str(target.kind) == "cuda":
+        metric = "cuda:::event:shared_load:device=0"
+    else:
+        pytest.skip(f"Target {target.kind} not supported by this test")
     mod, params = mlp.get_workload(1)
 
-    exe = relay.vm.compile(mod, "cuda", params=params)
-    vm = profiler_vm.VirtualMachineProfiler(exe, tvm.gpu())
+    exe = relay.vm.compile(mod, target, params=params)
+    vm = profiler_vm.VirtualMachineProfiler(exe, dev)
 
-    data = np.random.rand(1, 1, 28, 28).astype("float32")
-    report = vm.profile([data], func_name="main")
-    assert "cuda::" in str(report)
+    print(dev.device_type)
+    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
+    report = vm.profile(
+        [data],
+        func_name="main",
+        collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})],
+    )
+    print(report)
+    assert metric in str(report)
 
-    metric = "cuda:::event:shared_load:device=0"
-    os.environ["TVM_PAPI_GPU_METRICS"] = metric
-    report = vm.profile([data], func_name="main")
     csv = read_csv(report)
     assert metric in csv.keys()
     assert any([float(x) > 0 for x in csv[metric]])
+
+
+if __name__ == "__main__":
+    test_papi("llvm", tvm.cpu())

From a4321af77c0f658e294e3e27c6e7f42ed69f4dce Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 10 Jun 2021 15:24:06 -0700
Subject: [PATCH 10/23] forgot files

---
 include/tvm/runtime/contrib/papi.h       | 95 ++++++++++++++++++++++++
 python/tvm/runtime/profiling/__init__.py | 83 +++++++++++++++++++++
 python/tvm/runtime/profiling/_ffi_api.py | 19 +++++
 3 files changed, 197 insertions(+)
 create mode 100644 include/tvm/runtime/contrib/papi.h
 create mode 100644 python/tvm/runtime/profiling/__init__.py
 create mode 100644 python/tvm/runtime/profiling/_ffi_api.py

diff --git a/include/tvm/runtime/contrib/papi.h b/include/tvm/runtime/contrib/papi.h
new file mode 100644
index 000000000000..2dc19127a4e2
--- /dev/null
+++ b/include/tvm/runtime/contrib/papi.h
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \brief Performance counters for profiling via the PAPI library.
+ */
+#ifndef TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
+#define TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
+
+#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/container/array.h>
+#include <tvm/runtime/container/map.h>
+
+#include <unordered_map>
+
+namespace tvm {
+namespace runtime {
+namespace profiling {
+
+/*! \brief MetricCollectorNode for PAPI metrics.
+ *
+ * PAPI (Performance Application Programming Interface) collects metrics on a
+ * variety of platforms including cpu, cuda and rocm.
+ *
+ * PAPI is available at https://bitbucket.org/icl/papi/src/master/.
+ *
+ * The metrics to collect are given per device to the constructor. When no
+ * metrics are given, `Init` selects a default set for each device it has
+ * defaults for. Use the `papi_native_avail` tool to find the names of all
+ * available metrics.
+ */
+struct PAPIMetricCollectorNode final : public MetricCollectorNode {
+  explicit PAPIMetricCollectorNode(Map<DeviceWrapper, Array<String>> metrics);
+  PAPIMetricCollectorNode() {}
+
+  /*! \brief Initialization call. Sets up PAPI and starts the counters.
+   * \param devices The devices this collector will be running on
+   */
+  void Init(Array<DeviceWrapper> devices) final;
+  /*! \brief Called right before a function call. Reads starting values of the
+   * measured metrics.
+   *
+   * \param dev The device the function will be run on.
+   * \returns A `PAPIEventSetNode` containing values for the counters at the
+   * start of the call. Passed to a corresponding `Stop` call.
+   */
+  ObjectRef Start(Device dev) final;
+  /*! \brief Called right after a function call. Reads ending values of the
+   * measured metrics. Computes the change in each metric from the
+   * corresponding `Start` call.
+   *
+   * \param obj `PAPIEventSetNode` created by a call to `Start`.
+   * \returns A mapping from metric name to value.
+   */
+  Map<String, ObjectRef> Stop(ObjectRef obj) final;
+
+  ~PAPIMetricCollectorNode() final;
+
+  /*! \brief Device-specific event sets. Contains the running counters (the int values) for that
+   * device. */
+  std::unordered_map<Device, int> event_sets;
+  /*! \brief Device-specific metric names. Order of names matches the order in the corresponding
+   * `event_set`. */
+  std::unordered_map<Device, std::vector<std::string>> papi_metric_names;
+
+  static constexpr const char* _type_key = "runtime.profiling.PAPIMetricCollector";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode);
+};
+
+/*! \brief Wrapper for `PAPIMetricCollectorNode`. */
+class PAPIMetricCollector : public MetricCollector {
+ public:
+  explicit PAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics);
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PAPIMetricCollector, MetricCollector, PAPIMetricCollectorNode);
+};
+}  // namespace profiling
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
new file mode 100644
index 000000000000..89cda498b322
--- /dev/null
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Registration of profiling objects in python."""
+
+from ... import _ffi
+from . import _ffi_api
+from .. import Object, Device
+from typing import Dict, Sequence
+
+
+@_ffi.register_object("runtime.profiling.Report")
+class Report(Object):
+    """A container for information gathered during a profiling run.
+
+    Fields
+    ----------
+    calls : Array[Dict[str, Object]]
+        Per-call profiling metrics (function name, runtime, device, ...).
+
+    device_metrics : Dict[Device, Dict[str, Object]]
+        Per-device metrics collected over the entire run.
+    """
+
+    def csv(self):
+        """Convert this profiling report into CSV format.
+
+        This only includes calls and not overall metrics.
+
+        Returns
+        -------
+        csv : str
+            `calls` in CSV format.
+        """
+        return _ffi_api.AsCSV(self)
+
+
+@_ffi.register_object("runtime.profiling.MetricCollector")
+class MetricCollector(Object):
+    """Interface for user defined profiling metric collection."""
+
+    pass
+
+
+@_ffi.register_object("runtime.profiling.DeviceWrapper")
+class DeviceWrapper(Object):
+    """Wraps a tvm.runtime.Device"""
+
+    def __init__(self, dev: Device):
+        self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)
+
+
+@_ffi.register_object("runtime.profiling.PAPIMetricCollector")
+class PAPIMetricCollector(MetricCollector):
+    """Collects performance counter information using the Performance
+    Application Programming Interface (PAPI).
+    """
+
+    def __init__(self, metric_names: Dict[Device, Sequence[str]]):
+        """
+        Parameters
+        ----------
+        metric_names : Dict[Device, Sequence[str]]
+            List of per-device metrics to collect. You can find a list of valid
+            metrics by runing `papi_native_avail` from the command line.
+        """
+        wrapped = dict()
+        for dev, names in metric_names.items():
+            wrapped[DeviceWrapper(dev)] = names
+        self.__init_handle_by_constructor__(_ffi_api.PAPIMetricCollector, wrapped)
diff --git a/python/tvm/runtime/profiling/_ffi_api.py b/python/tvm/runtime/profiling/_ffi_api.py
new file mode 100644
index 000000000000..d321e2e18a1b
--- /dev/null
+++ b/python/tvm/runtime/profiling/_ffi_api.py
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from ... import _ffi
+
+_ffi._init_api("runtime.profiling", __name__)

From 3342ff29a2ba1fffa29a76f83c566de0ec666727 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 14 Jun 2021 12:57:12 -0700
Subject: [PATCH 11/23] formatting

---
 include/tvm/runtime/contrib/papi.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/tvm/runtime/contrib/papi.h b/include/tvm/runtime/contrib/papi.h
index 2dc19127a4e2..9ffb7b4777d9 100644
--- a/include/tvm/runtime/contrib/papi.h
+++ b/include/tvm/runtime/contrib/papi.h
@@ -19,14 +19,16 @@
 /*!
  * \brief Performance counters for profiling via the PAPI library.
  */
-#ifndef TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
-#define TVM_RUNTIME_CONTRIB_PAPI_PAPI_H_
+#ifndef TVM_RUNTIME_CONTRIB_PAPI_H_
+#define TVM_RUNTIME_CONTRIB_PAPI_H_
 
-#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/container/array.h>
 #include <tvm/runtime/container/map.h>
+#include <tvm/runtime/profiling.h>
 
+#include <string>
 #include <unordered_map>
+#include <vector>
 
 namespace tvm {
 namespace runtime {
@@ -86,10 +88,11 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
 class PAPIMetricCollector : public MetricCollector {
  public:
   explicit PAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics);
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PAPIMetricCollector, MetricCollector, PAPIMetricCollectorNode);
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PAPIMetricCollector, MetricCollector,
+                                        PAPIMetricCollectorNode);
 };
 }  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
 
-#endif
+#endif  // TVM_RUNTIME_CONTRIB_PAPI_H_

From d2accca29891c6a2e0660d866b0879ba300ee5d6 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 14 Jun 2021 14:01:48 -0700
Subject: [PATCH 12/23] more lint

---
 python/tvm/contrib/debugger/debug_executor.py | 2 +-
 python/tvm/runtime/profiling/__init__.py      | 4 +---
 python/tvm/runtime/profiling/_ffi_api.py      | 1 +
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index e3508499bb0c..cd6eaf073089 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -268,7 +268,7 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []
 
-    def profile(self, collectors=[], **input_dict):
+    def profile(self, collectors=[], **input_dict):  # pylint: disable=dangerous-default-value
         """Run forward execution of the graph and collect overall and per-op
         performance metrics.
 
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index 89cda498b322..b955b364e783 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -16,10 +16,10 @@
 # under the License.
 """Registration of profiling objects in python."""
 
+from typing import Dict, Sequence
 from ... import _ffi
 from . import _ffi_api
 from .. import Object, Device
-from typing import Dict, Sequence
 
 
 @_ffi.register_object("runtime.profiling.Report")
@@ -52,8 +52,6 @@ def csv(self):
 class MetricCollector(Object):
     """Interface for user defined profiling metric collection."""
 
-    pass
-
 
 @_ffi.register_object("runtime.profiling.DeviceWrapper")
 class DeviceWrapper(Object):
diff --git a/python/tvm/runtime/profiling/_ffi_api.py b/python/tvm/runtime/profiling/_ffi_api.py
index d321e2e18a1b..d26b847a699f 100644
--- a/python/tvm/runtime/profiling/_ffi_api.py
+++ b/python/tvm/runtime/profiling/_ffi_api.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""FFI for profiling"""
 from ... import _ffi
 
 _ffi._init_api("runtime.profiling", __name__)

From 95836d217cf5e28ec98c160e560456e4ff3df815 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 14 Jun 2021 16:00:21 -0700
Subject: [PATCH 13/23] fix docs

---
 include/tvm/runtime/contrib/papi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/runtime/contrib/papi.h b/include/tvm/runtime/contrib/papi.h
index 9ffb7b4777d9..a2aeed5bf969 100644
--- a/include/tvm/runtime/contrib/papi.h
+++ b/include/tvm/runtime/contrib/papi.h
@@ -51,7 +51,7 @@ struct PAPIMetricCollectorNode final : public MetricCollectorNode {
   explicit PAPIMetricCollectorNode() {}
 
   /*! \brief Initialization call.
-   * \param device The devices this collector will be running on
+   * \param devices The devices this collector will be running on
    */
   void Init(Array<DeviceWrapper> devices);
   /*! \brief Called right before a function call. Reads starting values of the

From 51c73e5e5ada3105dded525090030bd971049478 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Tue, 15 Jun 2021 14:54:26 -0700
Subject: [PATCH 14/23] optional loading of papi metric collector in python

---
 python/tvm/runtime/profiling/__init__.py | 35 ++++++++++++------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index b955b364e783..d95fb532c058 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -61,21 +61,22 @@ def __init__(self, dev: Device):
         self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)
 
 
-@_ffi.register_object("runtime.profiling.PAPIMetricCollector")
-class PAPIMetricCollector(MetricCollector):
-    """Collects performance counter information using the Performance
-    Application Programming Interface (PAPI).
-    """
-
-    def __init__(self, metric_names: Dict[Device, Sequence[str]]):
+if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector"):
+    @_ffi.register_object("runtime.profiling.PAPIMetricCollector")
+    class PAPIMetricCollector(MetricCollector):
+        """Collects performance counter information using the Performance
+        Application Programming Interface (PAPI).
         """
-        Parameters
-        ----------
-        metric_names : Dict[Device, Sequence[str]]
-            List of per-device metrics to collect. You can find a list of valid
-            metrics by runing `papi_native_avail` from the command line.
-        """
-        wrapped = dict()
-        for dev, names in metric_names.items():
-            wrapped[DeviceWrapper(dev)] = names
-        self.__init_handle_by_constructor__(_ffi_api.PAPIMetricCollector, wrapped)
+
+        def __init__(self, metric_names: Dict[Device, Sequence[str]]):
+            """
+            Parameters
+            ----------
+            metric_names : Dict[Device, Sequence[str]]
+                List of per-device metrics to collect. You can find a list of valid
+                metrics by runing `papi_native_avail` from the command line.
+            """
+            wrapped = dict()
+            for dev, names in metric_names.items():
+                wrapped[DeviceWrapper(dev)] = names
+            self.__init_handle_by_constructor__(_ffi_api.PAPIMetricCollector, wrapped)

From 78f72859484c3b8670c96932a13b885df01b111c Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 16 Jun 2021 10:28:51 -0700
Subject: [PATCH 15/23] more formatting

---
 python/tvm/runtime/profiling/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index d95fb532c058..c6266530fef9 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -62,6 +62,7 @@ def __init__(self, dev: Device):
 
 
 if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector"):
+
     @_ffi.register_object("runtime.profiling.PAPIMetricCollector")
     class PAPIMetricCollector(MetricCollector):
         """Collects performance counter information using the Performance

From 8ab30be0738efa53ea18012e1aa5df937595e05b Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 16 Jun 2021 12:13:44 -0700
Subject: [PATCH 16/23] fix check

---
 python/tvm/runtime/profiling/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index c6266530fef9..aea07243fb63 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -61,7 +61,7 @@ def __init__(self, dev: Device):
         self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)
 
 
-if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector"):
+if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is not None:
 
     @_ffi.register_object("runtime.profiling.PAPIMetricCollector")
     class PAPIMetricCollector(MetricCollector):

From b17850dafa7da03e60e982e078ef58f84ff08c2a Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Tue, 29 Jun 2021 10:26:59 -0700
Subject: [PATCH 17/23] update docs and default value

---
 docs/profiling/papi.rst                       | 72 +++++++++++++++----
 include/tvm/runtime/contrib/papi.h            | 11 +--
 python/tvm/runtime/profiling/__init__.py      |  5 +-
 .../python/unittest/test_runtime_profiling.py |  1 -
 4 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/docs/profiling/papi.rst b/docs/profiling/papi.rst
index 7dc5c34ded9e..b7c23b2c0c73 100644
--- a/docs/profiling/papi.rst
+++ b/docs/profiling/papi.rst
@@ -30,8 +30,8 @@ available while profiling.
 Installing PAPI
 ---------------
 
-PAPI can either be installed using your package manager (``apt-get install
-libpapi-dev`` on Ubuntu), or from source here:
+PAPI can either be installed using your package manager (``apt-get install libpapi-dev``
+on Ubuntu), or from source here:
 https://bitbucket.org/icl/papi/src/master/.
 
 
@@ -54,13 +54,61 @@ If PAPI is installed in a non-standard place, you can specify where it is like s
 Using PAPI While Profiling
 --------------------------
 
-If TVM has been built with PAPI (see above), then calling the
-:py:meth:`tvm.runtime.GraphModule.profile` will automatically include results
-from a default set of performance counters. To change which performance
-counters are reported, set the ``TVM_PAPI_${DEVICE}_METRICS`` environment
-variable (where ``${DEVICE}`` is the device you are running on; ``GPU`` if
-using a gpu, ``CPU`` for the cpu) to a semicolon separated list of metrics. For
-example, ``TVM_PAPI_CPU_METRICS=perf::INSTRUCTIONS;perf::BRANCH-INSTRUCTIONS``
-would report the number of instructions executed and the number of branch
-instructions executed. You can find a list of available metrics by running the
-``papi_avail`` and ``papi_native_avail`` commands.
+If TVM has been built with PAPI (see above), then you can pass a
+:py:class:`tvm.runtime.profiling.PAPIMetricCollector` to
+:py:meth:`tvm.runtime.GraphModule.profile` to collect performance metrics. Here
+is an example:
+
+.. code:: python
+
+    target = "llvm"
+    dev = tvm.cpu()
+    mod, params = mlp.get_workload(1)
+
+    exe = relay.vm.compile(mod, target, params=params)
+    vm = profiler_vm.VirtualMachineProfiler(exe, dev)
+
+    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
+    report = vm.profile(
+        [data],
+        func_name="main",
+        collectors=[tvm.runtime.profiling.PAPIMetricCollector()],
+    )
+    print(report)
+
+.. code::
+
+   Name                                    perf::CACHE-MISSES   perf::CYCLES  perf::STALLED-CYCLES-BACKEND  perf::INSTRUCTIONS  perf::STALLED-CYCLES-FRONTEND
+   fused_nn_dense_nn_bias_add_nn_relu                   2,494      1,570,698                        85,608             675,564                         39,583
+   fused_nn_dense_nn_bias_add_nn_relu_1                 1,149        655,101                        13,278             202,297                         21,380
+   fused_nn_dense_nn_bias_add                             288        600,184                         8,321             163,446                         19,513
+   fused_nn_batch_flatten                                 301        587,049                         4,636             158,636                         18,565
+   fused_nn_softmax                                       154        575,143                         8,018             160,738                         18,995
+   ----------
+   Sum                                                  4,386      3,988,175                       119,861           1,360,681                        118,036
+   Total                                               10,644      8,327,360                       179,310           2,660,569                        270,044
+
+You can also change which metrics are collected:
+
+.. code:: python
+
+    report = vm.profile(
+        [data],
+        func_name="main",
+        collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"]})],
+    )
+
+.. code::
+
+   Name                                  PAPI_FP_OPS
+   fused_nn_dense_nn_bias_add_nn_relu        200,832
+   fused_nn_dense_nn_bias_add_nn_relu_1       16,448
+   fused_nn_dense_nn_bias_add                  1,548
+   fused_nn_softmax                              160
+   fused_nn_batch_flatten                          0
+   ----------
+   Sum                                       218,988
+   Total                                     218,988
+
+You can find a list of available metrics by running the ``papi_avail`` and
+``papi_native_avail`` commands.
diff --git a/include/tvm/runtime/contrib/papi.h b/include/tvm/runtime/contrib/papi.h
index a2aeed5bf969..b9a2a194448b 100644
--- a/include/tvm/runtime/contrib/papi.h
+++ b/include/tvm/runtime/contrib/papi.h
@@ -40,13 +40,14 @@ namespace profiling {
  * variety of platforms including cpu, cuda and rocm.
  *
  * PAPI is avaliable at https://bitbucket.org/icl/papi/src/master/.
- *
- * Users can change the metrics collected for by setting the environment
- * variable `TVM_PAPI_${device_name}_METRICS` with a semicolon seperated list
- * of metrics. Use the `papi_native_avail` tool to find the name of all
- * available metrics.
  */
 struct PAPIMetricCollectorNode final : public MetricCollectorNode {
+  /*! \brief Construct a metric collector that collects a specific set of metrics.
+   *
+   * \param metrics A mapping from a device type to the metrics that should be
+   * collected on that device. You can find the names of available metrics by
+   * running `papi_native_avail`.
+   */
   explicit PAPIMetricCollectorNode(Map<DeviceWrapper, Array<String>> metrics);
   explicit PAPIMetricCollectorNode() {}
 
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index aea07243fb63..22a9b922d77f 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -26,7 +26,7 @@
 class Report(Object):
     """A container for information gathered during a profiling run.
 
-    Fields
+    Attributes
     ----------
     calls : Array[Dict[str, Object]]
         Per-call profiling metrics (function name, runtime, device, ...).
@@ -61,6 +61,7 @@ def __init__(self, dev: Device):
         self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)
 
 
+# We only enable this class when TVM is built with PAPI support
 if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is not None:
 
     @_ffi.register_object("runtime.profiling.PAPIMetricCollector")
@@ -69,7 +70,7 @@ class PAPIMetricCollector(MetricCollector):
         Application Programming Interface (PAPI).
         """
 
-        def __init__(self, metric_names: Dict[Device, Sequence[str]]):
+        def __init__(self, metric_names: Dict[Device, Sequence[str]] = {}):
             """
             Parameters
             ----------
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
index 824afcf237f7..7420e32b5416 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -95,7 +95,6 @@ def test_papi(target, dev):
     exe = relay.vm.compile(mod, target, params=params)
     vm = profiler_vm.VirtualMachineProfiler(exe, dev)
 
-    print(dev.device_type)
     data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
     report = vm.profile(
         [data],

From 6a70fcb5f44b8db8e9b1ec8b0d7330914b0ebeda Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Tue, 29 Jun 2021 10:39:37 -0700
Subject: [PATCH 18/23] formatting

---
 python/tvm/runtime/profiling/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index 22a9b922d77f..d75142a7dcc6 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -70,7 +70,9 @@ class PAPIMetricCollector(MetricCollector):
         Application Programming Interface (PAPI).
         """
 
-        def __init__(self, metric_names: Dict[Device, Sequence[str]] = {}):
+        def __init__(
+            self, metric_names: Dict[Device, Sequence[str]] = {}
+        ):  # pylint: disable=dangerous-default-value
             """
             Parameters
             ----------

From 573231acb7f670cd6b5fb99535d9bd403dbd2552 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Tue, 29 Jun 2021 11:55:09 -0700
Subject: [PATCH 19/23] addressing andrews comments

---
 python/tvm/contrib/debugger/debug_executor.py | 5 +++--
 python/tvm/runtime/profiling/__init__.py      | 7 +++----
 src/runtime/profiling.cc                      | 7 ++++---
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index cd6eaf073089..622f27c358b6 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -268,13 +268,13 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []
 
-    def profile(self, collectors=[], **input_dict):  # pylint: disable=dangerous-default-value
+    def profile(self, collectors=None, **input_dict):
         """Run forward execution of the graph and collect overall and per-op
         performance metrics.
 
         Parameters
         ----------
-        collectors : Sequence[MetricCollector]
+        collectors : Optional[Sequence[MetricCollector]]
             Extra metrics to collect.
 
         input_dict : dict of str to NDArray
@@ -285,6 +285,7 @@ def profile(self, collectors=[], **input_dict):  # pylint: disable=dangerous-def
         timing_results : str
             Per-operator and whole graph timing results in a table format.
         """
+        collectors = [] if collectors is None else collectors
         if input_dict:
             self.set_input(**input_dict)
 
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index d75142a7dcc6..83370b1e520d 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -70,16 +70,15 @@ class PAPIMetricCollector(MetricCollector):
         Application Programming Interface (PAPI).
         """
 
-        def __init__(
-            self, metric_names: Dict[Device, Sequence[str]] = {}
-        ):  # pylint: disable=dangerous-default-value
+        def __init__(self, metric_names: Optional[Dict[Device, Sequence[str]]] = None):
             """
             Parameters
             ----------
-            metric_names : Dict[Device, Sequence[str]]
+            metric_names : Optional[Dict[Device, Sequence[str]]]
                 List of per-device metrics to collect. You can find a list of valid
                 metrics by runing `papi_native_avail` from the command line.
             """
+            metric_names = {} if metric_names is None else metric_names
             wrapped = dict()
             for dev, names in metric_names.items():
                 wrapped[DeviceWrapper(dev)] = names
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 84679ed579a2..c149298f11a5 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -442,19 +442,20 @@ Report Profiler::Report(bool aggregate, bool sort) {
   }
 
   // the last couple of call frames are the overall times
-  double overall_time = 0;
+  double overall_time_us = 0;
   std::unordered_map<String, Map<String, ObjectRef>> device_metrics;
   for (size_t i = 0; i < devs_.size(); i++) {
     auto row = rows[rows.size() - 1];
     rows.pop_back();
     device_metrics[Downcast<String>(row["Device"])] = row;
-    overall_time = std::max(overall_time, row["Duration (us)"].as<DurationNode>()->microseconds);
+    overall_time_us =
+        std::max(overall_time_us, row["Duration (us)"].as<DurationNode>()->microseconds);
   }
 
   // Calculate percentages
   for (auto& row : rows) {
     row["Percent"] = ObjectRef(make_object<PercentNode>(
-        row["Duration (us)"].as<DurationNode>()->microseconds / overall_time * 100));
+        row["Duration (us)"].as<DurationNode>()->microseconds / overall_time_us * 100));
   }
 
   // convert to map

From 19ad0560107a4088ed5cb3e7fbb9877e81d58bbc Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 1 Jul 2021 10:31:08 -0700
Subject: [PATCH 20/23] fix docs

---
 docs/profiling/index.rst                 | 2 --
 python/tvm/runtime/profiling/__init__.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/profiling/index.rst b/docs/profiling/index.rst
index 19883ee76aae..9443fef25ea6 100644
--- a/docs/profiling/index.rst
+++ b/docs/profiling/index.rst
@@ -15,8 +15,6 @@
     specific language governing permissions and limitations
     under the License.
 
-.. _vta-index:
-
 Profiling Deep Learning Models
 ====================================
 
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index 83370b1e520d..8857cf419602 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -16,7 +16,7 @@
 # under the License.
 """Registration of profiling objects in python."""
 
-from typing import Dict, Sequence
+from typing import Dict, Sequence, Optional
 from ... import _ffi
 from . import _ffi_api
 from .. import Object, Device

From 34e657b136305ae8626a3ded4501aef5dca94a85 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 1 Jul 2021 16:09:52 -0700
Subject: [PATCH 21/23] address comments

---
 include/tvm/runtime/profiling.h   |  4 ++++
 python/tvm/runtime/profiler_vm.py |  5 +++--
 src/runtime/thread_pool.cc        | 29 ++++++++++++++---------------
 3 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 3af3b62cb9f7..b1e8d98c1441 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -313,6 +313,10 @@ class Profiler {
    *
    * The profiler should be constructed before you do any warmup iterations.
    *
+   * \note
+   * Calling this constructor will reset the TVM threadpool. It is necessary in
+   * order to install thread handlers required by certain collectors.
+   *
    * \param devs The list of devices the profiler will be running on. Should
    *             include all devices used by profiled operators.
    * \param metric_collectors Additional `MetricCollector`s to use with this profiler.
diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py
index 313a66c119f9..b3043d8b8760 100644
--- a/python/tvm/runtime/profiler_vm.py
+++ b/python/tvm/runtime/profiler_vm.py
@@ -50,7 +50,7 @@ def get_stat(self, sort_by_time=True):  # pylint: disable=unused-argument
         warnings.warn("get_stat has been removed, use profile instead")
         return ""
 
-    def profile(self, *args, func_name="main", collectors=[], **kwargs):
+    def profile(self, *args, func_name="main", collectors=None, **kwargs):
         """Profile a function call.
 
         Parameters
@@ -58,7 +58,7 @@ def profile(self, *args, func_name="main", collectors=[], **kwargs):
         func_name : str
             The name of the function.
 
-        collectors : Sequence[MetricCollector]
+        collectors : Optional[Sequence[MetricCollector]]
             Extra metrics to collect.
 
         args : list[tvm.runtime.NDArray] or list[np.ndarray]
@@ -72,6 +72,7 @@ def profile(self, *args, func_name="main", collectors=[], **kwargs):
         timing_results : str
             Overall and per-op timing results formatted in a table.
         """
+        collectors = [] if collectors is None else collectors
         if args or kwargs:
             self.set_input(func_name, *args, **kwargs)
         return self._profile(func_name, collectors)
diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc
index 4daf6b0688fe..838c3dbf68a8 100644
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -258,32 +258,21 @@ class SpscTaskQueue {
 class ThreadPool {
  public:
   ThreadPool() : num_workers_(tvm::runtime::threading::MaxConcurrency()) {
-    for (int i = 0; i < num_workers_; ++i) {
-      // The SpscTaskQueue only hosts ONE item at a time
-      queues_.emplace_back(std::unique_ptr<SpscTaskQueue>(new SpscTaskQueue()));
-    }
     const char* exclude_worker0 = getenv("TVM_EXCLUDE_WORKER0");
     if (exclude_worker0 && atoi(exclude_worker0) == 0) {
       exclude_worker0_ = false;
     }
-    threads_ = std::unique_ptr<tvm::runtime::threading::ThreadGroup>(
-        new tvm::runtime::threading::ThreadGroup(
-            num_workers_, [this](int worker_id) { this->RunWorker(worker_id); },
-            exclude_worker0_ /* include_main_thread */));
-    num_workers_used_ = threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_);
+    Init();
   }
+
   ~ThreadPool() {
     for (std::unique_ptr<SpscTaskQueue>& q : queues_) {
       q->SignalForKill();
     }
     threads_.reset();
   }
-  void Reset() {
-    for (std::unique_ptr<SpscTaskQueue>& q : queues_) {
-      q->SignalForKill();
-    }
-    queues_.clear();
-    threads_.reset();
+
+  void Init() {
     for (int i = 0; i < num_workers_; ++i) {
       // The SpscTaskQueue only hosts ONE item at a time
       queues_.emplace_back(std::unique_ptr<SpscTaskQueue>(new SpscTaskQueue()));
@@ -294,6 +283,16 @@ class ThreadPool {
             exclude_worker0_ /* include_main_thread */));
     num_workers_used_ = threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_);
   }
+
+  void Reset() {
+    for (std::unique_ptr<SpscTaskQueue>& q : queues_) {
+      q->SignalForKill();
+    }
+    queues_.clear();
+    threads_.reset();
+    Init();
+  }
+
   int Launch(FTVMParallelLambda flambda, void* cdata, int num_task, int need_sync) {
     ParallelLauncher* launcher = ParallelLauncher::ThreadLocal();
     ICHECK(!launcher->is_worker)

From 3600171b36ba76101a4b4c80268d95f21e9ba4c0 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Tue, 6 Jul 2021 13:55:14 -0700
Subject: [PATCH 22/23] move shared initialization code into private function

---
 src/runtime/thread_pool.cc | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc
index 838c3dbf68a8..c11e9f7ac084 100644
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -272,18 +272,6 @@ class ThreadPool {
     threads_.reset();
   }
 
-  void Init() {
-    for (int i = 0; i < num_workers_; ++i) {
-      // The SpscTaskQueue only hosts ONE item at a time
-      queues_.emplace_back(std::unique_ptr<SpscTaskQueue>(new SpscTaskQueue()));
-    }
-    threads_ = std::unique_ptr<tvm::runtime::threading::ThreadGroup>(
-        new tvm::runtime::threading::ThreadGroup(
-            num_workers_, [this](int worker_id) { this->RunWorker(worker_id); },
-            exclude_worker0_ /* include_main_thread */));
-    num_workers_used_ = threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_);
-  }
-
   void Reset() {
     for (std::unique_ptr<SpscTaskQueue>& q : queues_) {
       q->SignalForKill();
@@ -338,6 +326,19 @@ class ThreadPool {
   }
 
  private:
+  // Initialization code shared by the constructor and Reset().
+  void Init() {
+    for (int i = 0; i < num_workers_; ++i) {
+      // The SpscTaskQueue only hosts ONE item at a time
+      queues_.emplace_back(std::unique_ptr<SpscTaskQueue>(new SpscTaskQueue()));
+    }
+    threads_ = std::unique_ptr<tvm::runtime::threading::ThreadGroup>(
+        new tvm::runtime::threading::ThreadGroup(
+            num_workers_, [this](int worker_id) { this->RunWorker(worker_id); },
+            exclude_worker0_ /* include_main_thread */));
+    num_workers_used_ = threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_);
+  }
+
   // Internal worker function.
   void RunWorker(int worker_id) {
     SpscTaskQueue* queue = queues_[worker_id].get();

From 45bfc17a50fe783570e5c344954eb36b462ec3ce Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 12 Jul 2021 16:40:28 -0700
Subject: [PATCH 23/23] move most definitions from papi header to
 implementation file

---
 include/tvm/runtime/contrib/papi.h |  67 +------
 src/runtime/contrib/papi/papi.cc   | 301 +++++++++++++++++------------
 2 files changed, 184 insertions(+), 184 deletions(-)

diff --git a/include/tvm/runtime/contrib/papi.h b/include/tvm/runtime/contrib/papi.h
index b9a2a194448b..ff2d75c483eb 100644
--- a/include/tvm/runtime/contrib/papi.h
+++ b/include/tvm/runtime/contrib/papi.h
@@ -26,72 +26,19 @@
 #include <tvm/runtime/container/map.h>
 #include <tvm/runtime/profiling.h>
 
-#include <string>
-#include <unordered_map>
-#include <vector>
-
 namespace tvm {
 namespace runtime {
 namespace profiling {
 
-/*! \brief MetricCollectorNode for PAPI metrics.
- *
- * PAPI (Performance Application Programming Interface) collects metrics on a
- * variety of platforms including cpu, cuda and rocm.
+/*! \brief Construct a metric collector that collects data from hardware
+ * performance counters using the Performance Application Programming Interface
+ * (PAPI).
  *
- * PAPI is avaliable at https://bitbucket.org/icl/papi/src/master/.
+ * \param metrics A mapping from a device to the metrics that should be
+ * collected on that device. You can find the names of available metrics by
+ * running `papi_native_avail`.
  */
-struct PAPIMetricCollectorNode final : public MetricCollectorNode {
-  /*! \brief Construct a metric collector that collects a specific set of metrics.
-   *
-   * \param metrics A mapping from a device type to the metrics that should be
-   * collected on that device. You can find the names of available metrics by
-   * running `papi_native_avail`.
-   */
-  explicit PAPIMetricCollectorNode(Map<DeviceWrapper, Array<String>> metrics);
-  explicit PAPIMetricCollectorNode() {}
-
-  /*! \brief Initialization call.
-   * \param devices The devices this collector will be running on
-   */
-  void Init(Array<DeviceWrapper> devices);
-  /*! \brief Called right before a function call. Reads starting values of the
-   * measured metrics.
-   *
-   * \param dev The device the function will be run on.
-   * \returns A `PAPIEventSetNode` containing values for the counters at the
-   * start of the call. Passed to a corresponding `Stop` call.
-   */
-  ObjectRef Start(Device dev) final;
-  /*! \brief Called right after a function call. Reads ending values of the
-   * measured metrics. Computes the change in each metric from the
-   * corresponding `Start` call.
-   *
-   * \param obj `PAPIEventSetNode` created by a call to `Start`.
-   * \returns A mapping from metric name to value.
-   */
-  Map<String, ObjectRef> Stop(ObjectRef obj) final;
-
-  ~PAPIMetricCollectorNode() final;
-
-  /*! \brief Device-specific event sets. Contains the running counters (the int values) for that
-   * device. */
-  std::unordered_map<Device, int> event_sets;
-  /*! \brief Device-specific metric names. Order of names matches the order in the corresponding
-   * `event_set`. */
-  std::unordered_map<Device, std::vector<std::string>> papi_metric_names;
-
-  static constexpr const char* _type_key = "runtime.profiling.PAPIMetricCollector";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode);
-};
-
-/*! \brief Wrapper for `PAPIMetricCollectorNode`. */
-class PAPIMetricCollector : public MetricCollector {
- public:
-  explicit PAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics);
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PAPIMetricCollector, MetricCollector,
-                                        PAPIMetricCollectorNode);
-};
+TVM_DLL MetricCollector CreatePAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics);
 }  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc
index 503d817d58b7..b9ba8f9984e9 100644
--- a/src/runtime/contrib/papi/papi.cc
+++ b/src/runtime/contrib/papi/papi.cc
@@ -80,157 +80,210 @@ int component_for_device(Device dev) {
   if (cidx < 0) {
     LOG(FATAL) << "Cannot find PAPI component \"" << component_name
                << "\". Maybe you need to build PAPI with support for this component (use "
-                  "`./configure --components="
+                  "`./configure --with-components="
                << component_name << "`).";
   }
   return cidx;
 }
-PAPIMetricCollectorNode::PAPIMetricCollectorNode(Map<DeviceWrapper, Array<String>> metrics) {
-  for (auto& p : metrics) {
-    papi_metric_names[p.first->device] = {};
-    for (auto& metric : p.second) {
-      papi_metric_names[p.first->device].push_back(metric);
+
+/*! \brief MetricCollectorNode for PAPI metrics.
+ *
+ * PAPI (Performance Application Programming Interface) collects metrics on a
+ * variety of platforms including cpu, cuda and rocm.
+ *
+ * PAPI is available at https://bitbucket.org/icl/papi/src/master/.
+ */
+struct PAPIMetricCollectorNode final : public MetricCollectorNode {
+  /*! \brief Construct a metric collector that collects a specific set of metrics.
+   *
+   * \param metrics A mapping from a device to the metrics that should be
+   * collected on that device. You can find the names of available metrics by
+   * running `papi_native_avail`.
+   */
+  explicit PAPIMetricCollectorNode(Map<DeviceWrapper, Array<String>> metrics) {
+    for (auto& p : metrics) {
+      papi_metric_names[p.first->device] = {};
+      for (auto& metric : p.second) {
+        papi_metric_names[p.first->device].push_back(metric);
+      }
     }
   }
-}
+  explicit PAPIMetricCollectorNode() {}
 
-PAPIMetricCollectorNode::~PAPIMetricCollectorNode() {
-  for (auto p : event_sets) {
-    PAPI_CALL(PAPI_stop(p.second, NULL));
-    PAPI_CALL(PAPI_cleanup_eventset(p.second));
-    PAPI_CALL(PAPI_destroy_eventset(&p.second));
-  }
-}
+  /*! \brief Initialization call.
+   * \param devices The devices this collector will be running on
+   */
+  void Init(Array<DeviceWrapper> devices) {
+    if (!PAPI_is_initialized()) {
+      if (sizeof(long_long) > sizeof(int64_t)) {
+        LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when "
+                        "reporting metrics.";
+      }
+      CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT)
+          << "Error while initializing PAPI";
+    }
 
-void PAPIMetricCollectorNode::Init(Array<DeviceWrapper> devices) {
-  if (!PAPI_is_initialized()) {
-    if (sizeof(long_long) > sizeof(int64_t)) {
-      LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when "
-                      "reporting metrics.";
+    // If no metrics were provided we use the default set. The names were not
+    // initialized in the constructor because we did not know which devices we
+    // were running on.
+    if (papi_metric_names.size() == 0) {
+      for (auto wrapped_device : devices) {
+        Device device = wrapped_device->device;
+        auto it = default_metric_names.find(device.device_type);
+        if (it != default_metric_names.end()) {
+          papi_metric_names[device] = it->second;
+        }
+      }
     }
-    CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT)
-        << "Error while initializing PAPI";
-  }
 
-  // If no metrics were provided we use the default set. The names were not
-  // initialized in the constructor because we did not know which devices we
-  // were running on.
-  if (papi_metric_names.size() == 0) {
+    // create event sets for each device
     for (auto wrapped_device : devices) {
       Device device = wrapped_device->device;
-      auto it = default_metric_names.find(device.device_type);
-      if (it != default_metric_names.end()) {
-        papi_metric_names[device] = it->second;
+      int cidx = component_for_device(device);
+      // unknown device, skipping
+      if (cidx < 0) {
+        continue;
       }
-    }
-  }
 
-  // create event sets for each device
-  for (auto wrapped_device : devices) {
-    Device device = wrapped_device->device;
-    int cidx = component_for_device(device);
-    // unknown device, skipping
-    if (cidx < 0) {
-      continue;
-    }
+      auto it = papi_metric_names.find(device);
+      // skip devices with no metrics defined
+      if (it == papi_metric_names.end() || it->second.size() == 0) {
+        continue;
+      }
+      auto& metric_names = it->second;
 
-    auto it = papi_metric_names.find(device);
-    // skip devices with no metrics defined
-    if (it == papi_metric_names.end() || it->second.size() == 0) {
-      continue;
-    }
-    auto& metric_names = it->second;
-
-    const PAPI_component_info_t* component = PAPI_get_component_info(cidx);
-    if (component->disabled) {
-      std::string help_message = "";
-      switch (device.device_type) {
-        case kDLCPU:
-          help_message = "Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`";
-          break;
-        case kDLCUDA:
-          help_message =
-              "Try enabling gpu profiling with `modprobe nvidia "
-              "NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding  "
-              "`options nvidia \"NVreg_RestrictProfilingToAdminUsers=0\"` to "
-              "`/etc/modprobe.d/nvidia-kernel-common.conf`.";
-          break;
-        default:
-          break;
+      const PAPI_component_info_t* component = PAPI_get_component_info(cidx);
+      if (component->disabled) {
+        std::string help_message = "";
+        switch (device.device_type) {
+          case kDLCPU:
+            help_message =
+                "Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`";
+            break;
+          case kDLCUDA:
+            help_message =
+                "Try enabling gpu profiling with `modprobe nvidia "
+                "NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding  "
+                "`options nvidia \"NVreg_RestrictProfilingToAdminUsers=0\"` to "
+                "`/etc/modprobe.d/nvidia-kernel-common.conf`.";
+            break;
+          default:
+            break;
+        }
+        LOG(WARNING) << "PAPI could not initialize counters for " << DeviceName(device.device_type)
+                     << ": " << component->disabled_reason << "\n"
+                     << help_message;
+        continue;
       }
-      LOG(WARNING) << "PAPI could not initialize counters for " << DeviceName(device.device_type)
-                   << ": " << component->disabled_reason << "\n"
-                   << help_message;
-      continue;
-    }
 
-    int event_set = PAPI_NULL;
-    PAPI_CALL(PAPI_create_eventset(&event_set));
-    PAPI_CALL(PAPI_assign_eventset_component(event_set, cidx));
-    if (device.device_type == kDLCPU) {
-      // we set PAPI_INHERIT to make it so threads created after this inherit the event_set.
-      PAPI_option_t opt;
-      memset(&opt, 0x0, sizeof(PAPI_option_t));
-      opt.inherit.inherit = PAPI_INHERIT_ALL;
-      opt.inherit.eventset = event_set;
-      PAPI_CALL(PAPI_set_opt(PAPI_INHERIT, &opt));
-    }
+      int event_set = PAPI_NULL;
+      PAPI_CALL(PAPI_create_eventset(&event_set));
+      PAPI_CALL(PAPI_assign_eventset_component(event_set, cidx));
+      if (device.device_type == kDLCPU) {
+        // we set PAPI_INHERIT to make it so threads created after this inherit the event_set.
+        PAPI_option_t opt;
+        memset(&opt, 0x0, sizeof(PAPI_option_t));
+        opt.inherit.inherit = PAPI_INHERIT_ALL;
+        opt.inherit.eventset = event_set;
+        PAPI_CALL(PAPI_set_opt(PAPI_INHERIT, &opt));
+      }
 
-    if (static_cast<int>(metric_names.size()) > PAPI_num_cmp_hwctrs(cidx)) {
-      PAPI_CALL(PAPI_set_multiplex(event_set));
-    }
+      if (static_cast<int>(metric_names.size()) > PAPI_num_cmp_hwctrs(cidx)) {
+        PAPI_CALL(PAPI_set_multiplex(event_set));
+      }
 
-    // add all the metrics
-    for (auto metric : metric_names) {
-      int e = PAPI_add_named_event(event_set, metric.c_str());
-      if (e != PAPI_OK) {
-        LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric
-                   << ".";
+      // add all the metrics
+      for (auto metric : metric_names) {
+        int e = PAPI_add_named_event(event_set, metric.c_str());
+        if (e != PAPI_OK) {
+          LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric
+                     << ".";
+        }
       }
+      // Because we may have multiple calls in flight at the same time, we
+      // start all the counters when we initialize. Then we calculate the metric
+      // counts for a call by comparing counter values at the start vs end of
+      // the call.
+      PAPI_CALL(PAPI_start(event_set));
+      event_sets[device] = event_set;
     }
-    // Because we may have multiple calls in flight at the same time, we
-    // start all the timers when we initialize. Then we calculate the metrics
-    // counts for a call by comparing counter values at the start vs end of
-    // the call.
-    PAPI_CALL(PAPI_start(event_set));
-    event_sets[device] = event_set;
   }
-}
-
-ObjectRef PAPIMetricCollectorNode::Start(Device dev) {
-  // Record counter values at the start of the call, so we can calculate the
-  // metrics for the call by comparing the values at the end of the call.
-  auto it = event_sets.find(dev);
-  if (it != event_sets.end()) {
-    int event_set = it->second;
-    std::vector<long_long> values(papi_metric_names[dev].size());
-    PAPI_CALL(PAPI_read(event_set, values.data()));
-    return ObjectRef(make_object<PAPIEventSetNode>(values, dev));
-  } else {
-    return ObjectRef(nullptr);
+  /*! \brief Called right before a function call. Reads starting values of the
+   * measured metrics.
+   *
+   * \param dev The device the function will be run on.
+   * \returns A `PAPIEventSetNode` containing values for the counters at the
+   * start of the call. Passed to a corresponding `Stop` call.
+   */
+  ObjectRef Start(Device dev) final {
+    // Record counter values at the start of the call, so we can calculate the
+    // metrics for the call by comparing the values at the end of the call.
+    auto it = event_sets.find(dev);
+    if (it != event_sets.end()) {
+      int event_set = it->second;
+      std::vector<long_long> values(papi_metric_names[dev].size());
+      PAPI_CALL(PAPI_read(event_set, values.data()));
+      return ObjectRef(make_object<PAPIEventSetNode>(values, dev));
+    } else {
+      return ObjectRef(nullptr);
+    }
+  }
+  /*! \brief Called right after a function call. Reads ending values of the
+   * measured metrics. Computes the change in each metric from the
+   * corresponding `Start` call.
+   *
+   * \param obj `PAPIEventSetNode` created by a call to `Start`.
+   * \returns A mapping from metric name to value.
+   */
+  Map<String, ObjectRef> Stop(ObjectRef obj) final {
+    const PAPIEventSetNode* event_set_node = obj.as<PAPIEventSetNode>();
+    std::vector<long_long> end_values(papi_metric_names[event_set_node->dev].size());
+    PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data()));
+    std::unordered_map<String, ObjectRef> reported_metrics;
+    for (size_t i = 0; i < end_values.size(); i++) {
+      if (end_values[i] < event_set_node->start_values[i]) {
+        LOG(WARNING) << "Detected overflow when reading performance counter, setting value to -1.";
+        reported_metrics[papi_metric_names[event_set_node->dev][i]] =
+            ObjectRef(make_object<CountNode>(-1));
+      } else {
+        reported_metrics[papi_metric_names[event_set_node->dev][i]] =
+            ObjectRef(make_object<CountNode>(end_values[i] - event_set_node->start_values[i]));
+      }
+    }
+    return reported_metrics;
   }
-}
 
-Map<String, ObjectRef> PAPIMetricCollectorNode::Stop(ObjectRef obj) {
-  const PAPIEventSetNode* event_set_node = obj.as<PAPIEventSetNode>();
-  std::vector<long_long> end_values(papi_metric_names[event_set_node->dev].size());
-  PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data()));
-  std::unordered_map<String, ObjectRef> reported_metrics;
-  for (size_t i = 0; i < end_values.size(); i++) {
-    if (end_values[i] < event_set_node->start_values[i]) {
-      LOG(WARNING) << "Detected overflow when reading performance counter, setting value to -1.";
-      reported_metrics[papi_metric_names[event_set_node->dev][i]] =
-          ObjectRef(make_object<CountNode>(-1));
-    } else {
-      reported_metrics[papi_metric_names[event_set_node->dev][i]] =
-          ObjectRef(make_object<CountNode>(end_values[i] - event_set_node->start_values[i]));
+  ~PAPIMetricCollectorNode() final {
+    for (auto p : event_sets) {
+      PAPI_CALL(PAPI_stop(p.second, NULL));
+      PAPI_CALL(PAPI_cleanup_eventset(p.second));
+      PAPI_CALL(PAPI_destroy_eventset(&p.second));
     }
   }
-  return reported_metrics;
-}
 
-PAPIMetricCollector::PAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics) {
-  data_ = make_object<PAPIMetricCollectorNode>(metrics);
+  /*! \brief Device-specific event sets. Contains the running counters (the int values) for that
+   * device. */
+  std::unordered_map<Device, int> event_sets;
+  /*! \brief Device-specific metric names. Order of names matches the order in the corresponding
+   * `event_set`. */
+  std::unordered_map<Device, std::vector<std::string>> papi_metric_names;
+
+  static constexpr const char* _type_key = "runtime.profiling.PAPIMetricCollector";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode);
+};
+
+/*! \brief Wrapper for `PAPIMetricCollectorNode`. */
+class PAPIMetricCollector : public MetricCollector {
+ public:
+  explicit PAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics) {
+    data_ = make_object<PAPIMetricCollectorNode>(metrics);
+  }
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PAPIMetricCollector, MetricCollector,
+                                        PAPIMetricCollectorNode);
+};
+
+MetricCollector CreatePAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics) {
+  return PAPIMetricCollector(metrics);
 }
 
 TVM_REGISTER_OBJECT_TYPE(PAPIEventSetNode);