diff --git a/CMakeLists.txt b/CMakeLists.txt index c56a929e276d..9e35b8e4bbad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF) tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON) tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO) tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF) +tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to read performance counters" OFF) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -407,6 +408,7 @@ include(cmake/modules/contrib/ArmComputeLib.cmake) include(cmake/modules/contrib/TensorRT.cmake) include(cmake/modules/contrib/VitisAI.cmake) include(cmake/modules/contrib/Verilator.cmake) +include(cmake/modules/contrib/PAPI.cmake) include(cmake/modules/Git.cmake) include(cmake/modules/LibInfo.cmake) include(cmake/modules/RustExt.cmake) diff --git a/cmake/modules/contrib/PAPI.cmake b/cmake/modules/contrib/PAPI.cmake new file mode 100644 index 000000000000..257591451ca8 --- /dev/null +++ b/cmake/modules/contrib/PAPI.cmake @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_PAPI) + find_package(PkgConfig REQUIRED) + + set(ENV{PKG_CONFIG_PATH} "${USE_PAPI}:$ENV{PKG_CONFIG_PATH}") + pkg_check_modules(PAPI REQUIRED IMPORTED_TARGET papi>=6.0) + list(APPEND TVM_RUNTIME_LINKER_LIBS PkgConfig::PAPI) + list(APPEND RUNTIME_SRCS src/runtime/contrib/papi/papi.cc) +endif() diff --git a/docs/index.rst b/docs/index.rst index a7ae68c87b01..491c42712e9a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -78,6 +78,7 @@ For Developers :caption: MISC vta/index + profiling/index Index diff --git a/docs/profiling/index.rst b/docs/profiling/index.rst new file mode 100644 index 000000000000..9443fef25ea6 --- /dev/null +++ b/docs/profiling/index.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +Profiling Deep Learning Models +==================================== + +.. toctree:: + :maxdepth: 1 + + papi diff --git a/docs/profiling/papi.rst b/docs/profiling/papi.rst new file mode 100644 index 000000000000..b7c23b2c0c73 --- /dev/null +++ b/docs/profiling/papi.rst @@ -0,0 +1,114 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +Getting Started With PAPI +========================= + +The Performance Application Programming Interface (PAPI) is a library that +provides performance counters on a variety of platforms. Performance counters +provide accurate low-level information about processor behavior during a given +execution run. This information can contain simple metrics like total cycle +count, cache misses, and instructions executed as well as higher-level +information like total FLOPS and warp occupancy. PAPI makes these metrics +available while profiling. + +Installing PAPI +--------------- + +PAPI can either be installed using your package manager (``apt-get install libpapi-dev`` +on Ubuntu), or built from source here: +https://bitbucket.org/icl/papi/src/master/. + + +Building TVM With PAPI +---------------------- + +To include PAPI in your build of TVM, set the following line in your ``config.cmake``: + +.. code:: + + set(USE_PAPI ON) + +If PAPI is installed in a non-standard place, you can specify where it is like so: + +.. code:: + + set(USE_PAPI path/to/papi.pc) + + +Using PAPI While Profiling +-------------------------- + +If TVM has been built with PAPI (see above), then you can pass a +:py:class:`tvm.runtime.profiling.PAPIMetricCollector` to +:py:meth:`tvm.runtime.GraphModule.profile` to collect performance metrics. Here +is an example using the VM profiler: + +.. code:: python + + target = "llvm" + dev = tvm.cpu() + mod, params = mlp.get_workload(1) + + exe = relay.vm.compile(mod, target, params=params) + vm = profiler_vm.VirtualMachineProfiler(exe, dev) + + data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev) + report = vm.profile( + [data], + func_name="main", + collectors=[tvm.runtime.profiling.PAPIMetricCollector()], + ) + print(report) + +.. code:: + + Name perf::CACHE-MISSES perf::CYCLES perf::STALLED-CYCLES-BACKEND perf::INSTRUCTIONS perf::STALLED-CYCLES-FRONTEND + fused_nn_dense_nn_bias_add_nn_relu 2,494 1,570,698 85,608 675,564 39,583 + fused_nn_dense_nn_bias_add_nn_relu_1 1,149 655,101 13,278 202,297 21,380 + fused_nn_dense_nn_bias_add 288 600,184 8,321 163,446 19,513 + fused_nn_batch_flatten 301 587,049 4,636 158,636 18,565 + fused_nn_softmax 154 575,143 8,018 160,738 18,995 + ---------- + Sum 4,386 3,988,175 119,861 1,360,681 118,036 + Total 10,644 8,327,360 179,310 2,660,569 270,044 + +You can also change which metrics are collected: + +.. 
code:: python + + report = vm.profile( + [data], + func_name="main", + collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"]})], + ) + +.. code:: + + Name PAPI_FP_OPS + fused_nn_dense_nn_bias_add_nn_relu 200,832 + fused_nn_dense_nn_bias_add_nn_relu_1 16,448 + fused_nn_dense_nn_bias_add 1,548 + fused_nn_softmax 160 + fused_nn_batch_flatten 0 + ---------- + Sum 218,988 + Total 218,988 + +You can find a list of available metrics by running the ``papi_avail`` and +``papi_native_avail`` commands. diff --git a/include/tvm/runtime/contrib/papi.h b/include/tvm/runtime/contrib/papi.h new file mode 100644 index 000000000000..ff2d75c483eb --- /dev/null +++ b/include/tvm/runtime/contrib/papi.h @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * \brief Performance counters for profiling via the PAPI library. + */ +#ifndef TVM_RUNTIME_CONTRIB_PAPI_H_ +#define TVM_RUNTIME_CONTRIB_PAPI_H_ + +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace profiling { + +/*! \brief Construct a metric collector that collects data from hardware + * performance counters using the Performance Application Programming Interface + * (PAPI). + * + * \param metrics A mapping from a device type to the metrics that should be + * collected on that device. You can find the names of available metrics by + * running `papi_native_avail`. + */ +TVM_DLL MetricCollector CreatePAPIMetricCollector(Map> metrics); } // namespace profiling } // namespace runtime } // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_PAPI_H_ diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 5b44e020f4e4..b1e8d98c1441 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -37,6 +37,7 @@ #include namespace tvm { + namespace runtime { /*! \brief Base class for all implementations. @@ -150,6 +151,26 @@ class Timer : public ObjectRef { Timer DefaultTimer(Device dev); namespace profiling { +/*! \brief Wrapper for `Device` because `Device` is not passable across the + * PackedFunc interface. + */ +struct DeviceWrapperNode : public Object { + /*! The device */ + Device device; + + /*! Constructor */ + explicit DeviceWrapperNode(Device device) : device(device) {} + + static constexpr const char* _type_key = "runtime.profiling.DeviceWrapper"; + TVM_DECLARE_BASE_OBJECT_INFO(DeviceWrapperNode, Object); +}; + +/*! \brief Wrapper for `Device`. */ +class DeviceWrapper : public ObjectRef { + public: + explicit DeviceWrapper(Device dev) { data_ = make_object(dev); } + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(DeviceWrapper, ObjectRef, DeviceWrapperNode); +}; /*! \brief Data collected from a profiling run. Includes per-call metrics and per-device metrics. 
*/ @@ -200,6 +221,57 @@ class Report : public ObjectRef { TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Report, ObjectRef, ReportNode); }; +/*! \brief Interface for user defined profiling metric collection. + * + * Users can register their own collector by registering a packed function with + * the name "runtime.profiling.metrics.my_collector_name" where + * "my_collector_name" is the name of their collector. This function should + * take an Array of Device as input which contains the devices the collector + * will be run on. + * + * `MetricCollectorNode`s will be called in the following fashion. + * \code + * MetricCollector mc; + * for (auto op : model) { + * auto o = mc.Start(); + * op(); + * auto metrics = mc.Stop(o); // metrics are added to the profiling report + * } + * \endcode + */ +class MetricCollectorNode : public Object { + public: + /*! \brief Initialization call. Called before profiling has started. Any + * expensive precomputation should happen here. + * \param devs The list of devices this collector will be run on. + */ + virtual void Init(Array devs) = 0; + /*! \brief Start collecting metrics for a function call. + * \param dev The device the call will be run on. + * \returns An object used to maintain state of the metric collection. This + * object will be passed to the corresponding `Stop` call. If the device is + * not supported, this function will return a nullptr ObjectRef. + */ + virtual ObjectRef Start(Device dev) = 0; + /*! \brief Stop collecting metrics. + * \param obj The object created by the corresponding `Start` call. + * \returns A set of metric names and the associated values. Values must be + * one of DurationNode, PercentNode, CountNode, or StringObj. + */ + virtual Map Stop(ObjectRef obj) = 0; + + virtual ~MetricCollectorNode() {} + + static constexpr const char* _type_key = "runtime.profiling.MetricCollector"; + TVM_DECLARE_BASE_OBJECT_INFO(MetricCollectorNode, Object); +}; + +/*! \brief Wrapper for `MetricCollectorNode`. */ +class MetricCollector : public ObjectRef { + public: + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(MetricCollector, ObjectRef, MetricCollectorNode); +}; + /*! Information about a single function or operator call. */ struct CallFrame { /*! Device on which the call was made */ @@ -210,6 +282,10 @@ struct CallFrame { Timer timer; /*! Extra performance metrics */ std::unordered_map extra_metrics; + /*! User defined metric collectors. Each pair is the MetricCollector and its + * associated data (returned from MetricCollector.Start). + */ + std::vector> extra_collectors; }; /*! Runtime profiler for function and/or operator calls. Used in the graph @@ -217,9 +293,10 @@ struct CallFrame { * * Example usage: * \code{.cpp} - * Profiler prof; * Device cpu, gpu; - * prof.Start({cpu, gpu}); + * Profiler prof({cpu, gpu}); + * my_gpu_kernel(); // do a warmup iteration + * prof.Start(); * prof.StartCall("my_gpu_kernel", gpu); * my_gpu_kernel(); * prof.StopCall(); @@ -232,13 +309,24 @@ */ class Profiler { public: - /*! \brief Start the profiler. + /*! Constructor. + * + * The profiler should be constructed before you do any warmup iterations. + * + * \note + * Calling this constructor will reset the TVM threadpool. It is necessary in + * order to install thread handlers required by certain collectors. + * * \param devs The list of devices the profiler will be running on. Should * include all devices used by profiled operators. + * \param metric_collectors Additional `MetricCollector`s to use with this profiler. 
+ */ + explicit Profiler(std::vector devs, std::vector metric_collectors); + /*! \brief Start the profiler. * * This function should only be called once per object. */ - void Start(const std::vector& devs); + void Start(); /*! \brief Stop the profiler. * * This function should only be called once per object after start has been called. @@ -270,12 +358,14 @@ class Profiler { /*! \brief Check if the profiler is currently running. * \returns Whether or not the profiler is running. */ - bool IsRunning() const { return !global_timers_.empty(); } + bool IsRunning() const { return is_running_; } private: - std::vector> global_timers_; + std::vector devs_; + bool is_running_{false}; std::vector calls_; std::stack in_flight_; + std::vector collectors_; }; /* \brief A duration in time. */ diff --git a/include/tvm/runtime/threading_backend.h b/include/tvm/runtime/threading_backend.h index 95a64049fd45..43636ddbdb1f 100644 --- a/include/tvm/runtime/threading_backend.h +++ b/include/tvm/runtime/threading_backend.h @@ -94,6 +94,14 @@ void Yield(); */ int MaxConcurrency(); +/*! + * \brief Reset the threads in the pool. All current threads are destroyed and + * new ones are created. + * + * Note that this does nothing when openmp is used. + */ +void ResetThreadPool(); + } // namespace threading } // namespace runtime } // namespace tvm diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py index dc043353c475..622f27c358b6 100644 --- a/python/tvm/contrib/debugger/debug_executor.py +++ b/python/tvm/contrib/debugger/debug_executor.py @@ -268,23 +268,28 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0): ret = self._run_individual(number, repeat, min_repeat_ms) return ret.strip(",").split(",") if ret else [] - def profile(self, **input_dict): + def profile(self, collectors=None, **input_dict): """Run forward execution of the graph and collect overall and per-op performance metrics. Parameters ---------- + collectors : Optional[Sequence[MetricCollector]] + Extra metrics to collect. + input_dict : dict of str to NDArray List of input values to be feed to + Return ------ timing_results : str Per-operator and whole graph timing results in a table format. """ + collectors = [] if collectors is None else collectors if input_dict: self.set_input(**input_dict) - return self._profile() + return self._profile(collectors) def exit(self): """Exits the dump folder and all its contents""" diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py index e1c3dc66a360..b3043d8b8760 100644 --- a/python/tvm/runtime/profiler_vm.py +++ b/python/tvm/runtime/profiler_vm.py @@ -50,7 +50,7 @@ def get_stat(self, sort_by_time=True): # pylint: disable=unused-argument warnings.warn("get_stat has been removed, use profile instead") return "" - def profile(self, *args, func_name="main", **kwargs): + def profile(self, *args, func_name="main", collectors=None, **kwargs): """Profile a function call. Parameters @@ -58,6 +58,9 @@ def profile(self, *args, func_name="main", **kwargs): func_name : str The name of the function. + collectors : Optional[Sequence[MetricCollector]] + Extra metrics to collect. + args : list[tvm.runtime.NDArray] or list[np.ndarray] The arguments to the function. @@ -69,6 +72,7 @@ def profile(self, *args, func_name="main", **kwargs): timing_results : str Overall and per-op timing results formatted in a table. 
""" + collectors = [] if collectors is None else collectors if args or kwargs: self.set_input(func_name, *args, **kwargs) - return self._profile(func_name) + return self._profile(func_name, collectors) diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py new file mode 100644 index 000000000000..8857cf419602 --- /dev/null +++ b/python/tvm/runtime/profiling/__init__.py @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Registration of profiling objects in python.""" + +from typing import Dict, Sequence, Optional +from ... import _ffi +from . import _ffi_api +from .. import Object, Device + + +@_ffi.register_object("runtime.profiling.Report") +class Report(Object): + """A container for information gathered during a profiling run. + + Attributes + ---------- + calls : Array[Dict[str, Object]] + Per-call profiling metrics (function name, runtime, device, ...). + + device_metrics : Dict[Device, Dict[str, Object]] + Per-device metrics collected over the entire run. + """ + + def csv(self): + """Convert this profiling report into CSV format. + + This only includes calls and not overall metrics. + + Returns + ------- + csv : str + `calls` in CSV format. + """ + return _ffi_api.AsCSV(self) + + +@_ffi.register_object("runtime.profiling.MetricCollector") +class MetricCollector(Object): + """Interface for user defined profiling metric collection.""" + + +@_ffi.register_object("runtime.profiling.DeviceWrapper") +class DeviceWrapper(Object): + """Wraps a tvm.runtime.Device""" + + def __init__(self, dev: Device): + self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev) + + +# We only enable this class when TVM is build with PAPI support +if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is not None: + + @_ffi.register_object("runtime.profiling.PAPIMetricCollector") + class PAPIMetricCollector(MetricCollector): + """Collects performance counter information using the Performance + Application Programming Interface (PAPI). + """ + + def __init__(self, metric_names: Optional[Dict[Device, Sequence[str]]] = None): + """ + Parameters + ---------- + metric_names : Optional[Dict[Device, Sequence[str]]] + List of per-device metrics to collect. You can find a list of valid + metrics by runing `papi_native_avail` from the command line. 
+ """ + metric_names = {} if metric_names is None else metric_names + wrapped = dict() + for dev, names in metric_names.items(): + wrapped[DeviceWrapper(dev)] = names + self.__init_handle_by_constructor__(_ffi_api.PAPIMetricCollector, wrapped) diff --git a/python/tvm/runtime/profiling.py b/python/tvm/runtime/profiling/_ffi_api.py similarity index 52% rename from python/tvm/runtime/profiling.py rename to python/tvm/runtime/profiling/_ffi_api.py index 5a1cd6796b64..d26b847a699f 100644 --- a/python/tvm/runtime/profiling.py +++ b/python/tvm/runtime/profiling/_ffi_api.py @@ -14,35 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Registration of profiling objects in python.""" - -from .. import _ffi -from . import Object +"""FFI for profiling""" +from ... import _ffi _ffi._init_api("runtime.profiling", __name__) - - -@_ffi.register_object("runtime.profiling.Report") -class Report(Object): - """A container for information gathered during a profiling run. - - Attributes - ---------- - calls : Array[Dict[str, Object]] - Per-call profiling metrics (function name, runtime, device, ...). - - device_metrics : Dict[Device, Dict[str, Object]] - Per-device metrics collected over the entire run. - """ - - def csv(self): - """Convert this profiling report into CSV format. - - This only includes calls and not overall metrics. - - Returns - ------- - csv : str - `calls` in CSV format. - """ - return AsCSV(self) diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc new file mode 100644 index 000000000000..b9ba8f9984e9 --- /dev/null +++ b/src/runtime/contrib/papi/papi.cc @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include + +#include +#include + +namespace tvm { +namespace runtime { +namespace profiling { + +#define PAPI_CALL(func) \ + { \ + int e = (func); \ + if (e != PAPI_OK) { \ + LOG(FATAL) << "PAPIError: in function " #func " " << e << " " \ + << std::string(PAPI_strerror(e)); \ + } \ + } + +static const std::unordered_map> default_metric_names = { + {kDLCPU, + {"perf::CYCLES", "perf::STALLED-CYCLES-FRONTEND", "perf::STALLED-CYCLES-BACKEND", + "perf::INSTRUCTIONS", "perf::CACHE-MISSES"}}, + {kDLCUDA, {"cuda:::event:elapsed_cycles_sm:device=0"}}}; + +/*! \brief Object that holds the values of counters at the start of a function call. */ +struct PAPIEventSetNode : public Object { + /*! \brief The starting values of counters for all metrics of a specific device. */ + std::vector start_values; + /*! \brief The device these counters are for. 
*/ + Device dev; + + explicit PAPIEventSetNode(std::vector start_values, Device dev) + : start_values(start_values), dev(dev) {} + + static constexpr const char* _type_key = "PAPIEventSetNode"; + TVM_DECLARE_FINAL_OBJECT_INFO(PAPIEventSetNode, Object); +}; + +/* Get the PAPI component id for the given device. + * \param dev The device to get the component for. + * \returns PAPI component id for the device. Returns -1 if the device is not + * supported by PAPI. + */ +int component_for_device(Device dev) { + std::string component_name; + switch (dev.device_type) { + case kDLCPU: + component_name = "perf_event"; + break; + case kDLCUDA: + component_name = "cuda"; + break; + case kDLROCM: + component_name = "rocm"; + break; + default: + LOG(WARNING) << "PAPI does not support device " << DeviceName(dev.device_type); + return -1; + } + int cidx = PAPI_get_component_index(component_name.c_str()); + if (cidx < 0) { + LOG(FATAL) << "Cannot find PAPI component \"" << component_name + << "\". Maybe you need to build PAPI with support for this component (use " + "`./configure --with-components=" + << component_name << "`)."; + } + return cidx; +} + +/*! \brief MetricCollectorNode for PAPI metrics. + * + * PAPI (Performance Application Programming Interface) collects metrics on a + * variety of platforms including CPU, CUDA, and ROCm. + * + * PAPI is available at https://bitbucket.org/icl/papi/src/master/. + */ +struct PAPIMetricCollectorNode final : public MetricCollectorNode { + /*! \brief Construct a metric collector that collects a specific set of metrics. + * + * \param metrics A mapping from a device type to the metrics that should be + * collected on that device. You can find the names of available metrics by + * running `papi_native_avail`. + */ + explicit PAPIMetricCollectorNode(Map> metrics) { + for (auto& p : metrics) { + papi_metric_names[p.first->device] = {}; + for (auto& metric : p.second) { + papi_metric_names[p.first->device].push_back(metric); + } + } + } + explicit PAPIMetricCollectorNode() {} + + /*! \brief Initialization call. + * \param devices The devices this collector will be running on + */ + void Init(Array devices) { + if (!PAPI_is_initialized()) { + if (sizeof(long_long) > sizeof(int64_t)) { + LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when " + "reporting metrics."; + } + CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT) + << "Error while initializing PAPI"; + } + + // If no metrics were provided, we use the default set. The names were not + // initialized in the constructor because we did not know which devices we + // were running on. 
+ if (papi_metric_names.size() == 0) { + for (auto wrapped_device : devices) { + Device device = wrapped_device->device; + auto it = default_metric_names.find(device.device_type); + if (it != default_metric_names.end()) { + papi_metric_names[device] = it->second; + } + } + } + + // create event sets for each device + for (auto wrapped_device : devices) { + Device device = wrapped_device->device; + int cidx = component_for_device(device); + // unknown device, skipping + if (cidx < 0) { + continue; + } + + auto it = papi_metric_names.find(device); + // skip devices with no metrics defined + if (it == papi_metric_names.end() || it->second.size() == 0) { + continue; + } + auto& metric_names = it->second; + + const PAPI_component_info_t* component = PAPI_get_component_info(cidx); + if (component->disabled) { + std::string help_message = ""; + switch (device.device_type) { + case kDLCPU: + help_message = + "Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`"; + break; + case kDLCUDA: + help_message = + "Try enabling gpu profiling with `modprobe nvidia " + "NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding " + "`options nvidia \"NVreg_RestrictProfilingToAdminUsers=0\"` to " + "`/etc/modprobe.d/nvidia-kernel-common.conf`."; + break; + default: + break; + } + LOG(WARNING) << "PAPI could not initialize counters for " << DeviceName(device.device_type) + << ": " << component->disabled_reason << "\n" + << help_message; + continue; + } + + int event_set = PAPI_NULL; + PAPI_CALL(PAPI_create_eventset(&event_set)); + PAPI_CALL(PAPI_assign_eventset_component(event_set, cidx)); + if (device.device_type == kDLCPU) { + // we set PAPI_INHERIT to make it so threads created after this inherit the event_set. + PAPI_option_t opt; + memset(&opt, 0x0, sizeof(PAPI_option_t)); + opt.inherit.inherit = PAPI_INHERIT_ALL; + opt.inherit.eventset = event_set; + PAPI_CALL(PAPI_set_opt(PAPI_INHERIT, &opt)); + } + + if (static_cast(metric_names.size()) > PAPI_num_cmp_hwctrs(cidx)) { + PAPI_CALL(PAPI_set_multiplex(event_set)); + } + + // add all the metrics + for (auto metric : metric_names) { + int e = PAPI_add_named_event(event_set, metric.c_str()); + if (e != PAPI_OK) { + LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric + << "."; + } + } + // Because we may have multiple calls in flight at the same time, we + // start all the timers when we initialize. Then we calculate the metrics + // counts for a call by comparing counter values at the start vs end of + // the call. + PAPI_CALL(PAPI_start(event_set)); + event_sets[device] = event_set; + } + } + /*! \brief Called right before a function call. Reads starting values of the + * measured metrics. + * + * \param dev The device the function will be run on. + * \returns A `PAPIEventSetNode` containing values for the counters at the + * start of the call. Passed to a corresponding `Stop` call. + */ + ObjectRef Start(Device dev) final { + // Record counter values at the start of the call, so we can calculate the + // metrics for the call by comparing the values at the end of the call. + auto it = event_sets.find(dev); + if (it != event_sets.end()) { + int event_set = it->second; + std::vector values(papi_metric_names[dev].size()); + PAPI_CALL(PAPI_read(event_set, values.data())); + return ObjectRef(make_object(values, dev)); + } else { + return ObjectRef(nullptr); + } + } + /*! \brief Called right after a function call. Reads ending values of the + * measured metrics. 
Computes the change in each metric from the + * corresponding `Start` call. + * + * \param obj `PAPIEventSetNode` created by a call to `Start`. + * \returns A mapping from metric name to value. + */ + Map Stop(ObjectRef obj) final { + const PAPIEventSetNode* event_set_node = obj.as(); + std::vector end_values(papi_metric_names[event_set_node->dev].size()); + PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data())); + std::unordered_map reported_metrics; + for (size_t i = 0; i < end_values.size(); i++) { + if (end_values[i] < event_set_node->start_values[i]) { + LOG(WARNING) << "Detected overflow when reading performance counter, setting value to -1."; + reported_metrics[papi_metric_names[event_set_node->dev][i]] = + ObjectRef(make_object(-1)); + } else { + reported_metrics[papi_metric_names[event_set_node->dev][i]] = + ObjectRef(make_object(end_values[i] - event_set_node->start_values[i])); + } + } + return reported_metrics; + } + + ~PAPIMetricCollectorNode() final { + for (auto p : event_sets) { + PAPI_CALL(PAPI_stop(p.second, NULL)); + PAPI_CALL(PAPI_cleanup_eventset(p.second)); + PAPI_CALL(PAPI_destroy_eventset(&p.second)); + } + } + + /*! \brief Device-specific event sets. Contains the running counters (the int values) for that + * device. */ + std::unordered_map event_sets; + /*! \brief Device-specific metric names. Order of names matches the order in the corresponding + * `event_set`. */ + std::unordered_map> papi_metric_names; + + static constexpr const char* _type_key = "runtime.profiling.PAPIMetricCollector"; + TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode); +}; + +/*! \brief Wrapper for `PAPIMetricCollectorNode`. */ +class PAPIMetricCollector : public MetricCollector { + public: + explicit PAPIMetricCollector(Map> metrics) { + data_ = make_object(metrics); + } + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PAPIMetricCollector, MetricCollector, + PAPIMetricCollectorNode); +}; + +MetricCollector CreatePAPIMetricCollector(Map> metrics) { + return PAPIMetricCollector(metrics); +} + +TVM_REGISTER_OBJECT_TYPE(PAPIEventSetNode); +TVM_REGISTER_OBJECT_TYPE(PAPIMetricCollectorNode); + +TVM_REGISTER_GLOBAL("runtime.profiling.PAPIMetricCollector") + .set_body_typed([](Map> metrics) { + return PAPIMetricCollector(metrics); + }); + +} // namespace profiling +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 1ea01b19e8aa..2fa73971d000 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -276,16 +276,20 @@ class GraphExecutorDebug : public GraphExecutor { * the module compared to GraphRuntimeDebug::RunIndividual as it runs the * entire graph in order. * + * \param collectors Optional user defined `MetricCollector`s to use with this profiling run. + * * \returns A table of per-op runtimes and total times. */ - profiling::Report Profile() { + profiling::Report Profile(Array collectors) { + std::vector cs(collectors.begin(), collectors.end()); + profiling::Profiler prof(devices_, cs); + // warm up. 1 iteration does not seem enough. 
for (int i = 0; i < 3; i++) { GraphExecutor::Run(); } - profiling::Profiler prof; - prof.Start(devices_); + prof.Start(); for (size_t i = 0; i < op_execs_.size(); ++i) { if (op_execs_[i]) { // get argument shapes @@ -359,7 +363,10 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, *rv = this->RunIndividual(number, repeat, min_repeat_ms); }); } else if (name == "profile") { - return TypedPackedFunc([sptr_to_self, this]() { return this->Profile(); }); + return TypedPackedFunc)>( + [sptr_to_self, this](Array collectors) { + return this->Profile(collectors); + }); } else { return GraphExecutor::GetFunction(name, sptr_to_self); } diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index ab9d674fad50..c149298f11a5 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -23,8 +23,10 @@ */ #include +#include #include #include +#include #include #include @@ -100,16 +102,37 @@ TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start); namespace profiling { -void Profiler::Start(const std::vector& devs) { - CHECK(global_timers_.empty()) << "You can only call Start once per Profiler."; +Profiler::Profiler(std::vector devs, std::vector metric_collectors) + : devs_(devs), collectors_(metric_collectors) { + is_running_ = false; + std::vector wrapped_devs; for (auto dev : devs) { - global_timers_.emplace_back(dev, Timer::Start(dev)); + wrapped_devs.push_back(DeviceWrapper(make_object(dev))); + } + for (auto& x : collectors_) { + x->Init(wrapped_devs); + } + // reset the thread pool so that PAPI eventset hooks are set in all threads. + threading::ResetThreadPool(); +} + +void Profiler::Start() { + is_running_ = true; + for (auto dev : devs_) { + StartCall("Total", dev, {}); } } void Profiler::StartCall(String name, Device dev, std::unordered_map extra_metrics) { - in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics}); + std::vector> objs; + for (auto& collector : collectors_) { + ObjectRef obj = collector->Start(dev); + if (obj.defined()) { + objs.emplace_back(collector, obj); + } + } + in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics, objs}); } void Profiler::StopCall(std::unordered_map extra_metrics) { @@ -118,14 +141,21 @@ void Profiler::StopCall(std::unordered_map extra_metrics for (auto& p : extra_metrics) { cf.extra_metrics[p.first] = p.second; } + // collect the extra metrics from user defined collectors + for (const auto& obj : cf.extra_collectors) { + auto collector_metrics = obj.first->Stop(obj.second); + for (auto& p : collector_metrics) { + cf.extra_metrics[p.first] = p.second; + } + } in_flight_.pop(); calls_.push_back(cf); } void Profiler::Stop() { - // Stop all global timers. We wait to synchronize until we are making the report. 
- for (auto p : global_timers_) { - p.second->Stop(); + is_running_ = false; + for (size_t i = 0; i < devs_.size(); i++) { + StopCall(); } } @@ -396,31 +426,11 @@ std::string DeviceString(Device dev) { } Report Profiler::Report(bool aggregate, bool sort) { - std::vector> global_times; - for (auto p : global_timers_) { - global_times.emplace_back(p.first, p.second->SyncAndGetElapsedNanos() / 1e3); - } - - double overall_time = 0; - for (auto p : global_times) { - overall_time = std::max(overall_time, p.second); - } - - std::unordered_map> device_metrics; - for (auto p : global_times) { - std::unordered_map row; - row["Name"] = String("Total"); - row["Duration (us)"] = ObjectRef(make_object(p.second)); - row["Percent"] = ObjectRef(make_object(p.second / overall_time * 100)); - row["Device"] = String(DeviceString(p.first)); - device_metrics[DeviceString(p.first)] = row; - } - - std::vector> rows; + // sync all timers and normalize rows + std::vector> rows; for (auto& cf : calls_) { std::unordered_map row; double us = cf.timer->SyncAndGetElapsedNanos() / 1e3; - row["Percent"] = ObjectRef(make_object(us / overall_time * 100)); row["Duration (us)"] = ObjectRef(make_object(us)); row["Count"] = ObjectRef(make_object(1)); row["Name"] = cf.name; @@ -431,7 +441,30 @@ Report Profiler::Report(bool aggregate, bool sort) { rows.push_back(row); } - return profiling::Report(rows, device_metrics); + // the last couple of call frames are the overall times + double overall_time_us = 0; + std::unordered_map> device_metrics; + for (size_t i = 0; i < devs_.size(); i++) { + auto row = rows[rows.size() - 1]; + rows.pop_back(); + device_metrics[Downcast(row["Device"])] = row; + overall_time_us = + std::max(overall_time_us, row["Duration (us)"].as()->microseconds); + } + + // Calculate percentages + for (auto& row : rows) { + row["Percent"] = ObjectRef(make_object( + row["Duration (us)"].as()->microseconds / overall_time_us * 100)); + } + + // convert to map + std::vector> converted_rows; + for (const auto& row : rows) { + converted_rows.push_back(row); + } + + return profiling::Report(converted_rows, device_metrics); } Report::Report(Array> calls, @@ -446,8 +479,13 @@ TVM_REGISTER_OBJECT_TYPE(DurationNode); TVM_REGISTER_OBJECT_TYPE(PercentNode); TVM_REGISTER_OBJECT_TYPE(CountNode); TVM_REGISTER_OBJECT_TYPE(ReportNode); +TVM_REGISTER_OBJECT_TYPE(DeviceWrapperNode); +TVM_REGISTER_OBJECT_TYPE(MetricCollectorNode); TVM_REGISTER_GLOBAL("runtime.profiling.AsCSV").set_body_typed([](Report n) { return n->AsCSV(); }); +TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device dev) { + return DeviceWrapper(dev); +}); } // namespace profiling } // namespace runtime } // namespace tvm diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index cab04ec0db4a..c11e9f7ac084 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -258,26 +258,29 @@ class SpscTaskQueue { class ThreadPool { public: ThreadPool() : num_workers_(tvm::runtime::threading::MaxConcurrency()) { - for (int i = 0; i < num_workers_; ++i) { - // The SpscTaskQueue only hosts ONE item at a time - queues_.emplace_back(std::unique_ptr(new SpscTaskQueue())); - } const char* exclude_worker0 = getenv("TVM_EXCLUDE_WORKER0"); if (exclude_worker0 && atoi(exclude_worker0) == 0) { exclude_worker0_ = false; } - threads_ = std::unique_ptr( - new tvm::runtime::threading::ThreadGroup( - num_workers_, [this](int worker_id) { this->RunWorker(worker_id); }, - exclude_worker0_ /* include_main_thread */)); - num_workers_used_ 
= threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_); + Init(); } + ~ThreadPool() { for (std::unique_ptr& q : queues_) { q->SignalForKill(); } threads_.reset(); } + + void Reset() { + for (std::unique_ptr& q : queues_) { + q->SignalForKill(); + } + queues_.clear(); + threads_.reset(); + Init(); + } + int Launch(FTVMParallelLambda flambda, void* cdata, int num_task, int need_sync) { ParallelLauncher* launcher = ParallelLauncher::ThreadLocal(); ICHECK(!launcher->is_worker) @@ -323,6 +326,19 @@ class ThreadPool { } private: + // Shared initialization code + void Init() { + for (int i = 0; i < num_workers_; ++i) { + // The SpscTaskQueue only hosts ONE item at a time + queues_.emplace_back(std::unique_ptr(new SpscTaskQueue())); + } + threads_ = std::unique_ptr( + new tvm::runtime::threading::ThreadGroup( + num_workers_, [this](int worker_id) { this->RunWorker(worker_id); }, + exclude_worker0_ /* include_main_thread */)); + num_workers_used_ = threads_->Configure(threading::ThreadGroup::kBig, 0, exclude_worker0_); + } + // Internal worker function. void RunWorker(int worker_id) { SpscTaskQueue* queue = queues_[worker_id].get(); @@ -359,6 +375,10 @@ TVM_REGISTER_GLOBAL("runtime.config_threadpool").set_body([](TVMArgs args, TVMRe ThreadPool::ThreadLocal()->UpdateWorkerConfiguration(mode, nthreads); }); +namespace threading { +void ResetThreadPool() { tvm::runtime::ThreadPool::ThreadLocal()->Reset(); } +} // namespace threading + } // namespace runtime } // namespace tvm diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index e8b948d3d2ae..ca9b22a9099d 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -273,6 +273,13 @@ void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) { primitive_names[packed_index] = it.first; } strm->Write(primitive_names); + // TODO(tkonolige): cannot serialize ObjectRefs with dmlc's serializer. + // std::vector>> primitive_attrs; + // for (const auto& it : this->op_attrs) { + // auto packed_index = static_cast(it.first); + // primitive_attrs.push_back({packed_index, it.second}); + // } + // strm->Write(primitive_attrs); } // Serialize a virtual machine instruction. It creates a list that contains the @@ -569,6 +576,12 @@ void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) { for (size_t i = 0; i < primitive_names.size(); i++) { this->primitive_map.insert({primitive_names[i], i}); } + // TODO(tkonolige): cannot serialize ObjectRefs with dmlc's serializer. 
+ // std::vector>> primitive_attrs; + // STREAM_CHECK(strm->Read(&primitive_attrs), "primitive attrs"); + // for (auto p : primitive_attrs) { + // this->op_attrs.insert(p); + // } } // Extract the `cnt` number of fields started at `start` from the list diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index a7d65944d581..6d893114d623 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -43,26 +43,31 @@ namespace vm { PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "profile") { - return TypedPackedFunc([sptr_to_self, this](String arg_name) { - std::vector devices; - for (auto dev : devices_) { - if (dev.device_type > 0) { - devices.push_back(dev); - } - } - - auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self); - // warmup - for (int i = 0; i < 3; i++) { - invoke(arg_name); - } - - prof_ = profiling::Profiler(); // reset profiler - prof_.Start(devices); - invoke(arg_name); - prof_.Stop(); - return prof_.Report(); - }); + return TypedPackedFunc)>( + [sptr_to_self, this](String arg_name, Array collectors) { + std::vector devices; + for (auto dev : devices_) { + if (dev.device_type > 0) { + devices.push_back(dev); + } + } + + std::vector cs(collectors.begin(), collectors.end()); + prof_ = profiling::Profiler(devices, cs); + + auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self); + // warmup + for (int i = 0; i < 3; i++) { + invoke(arg_name); + } + + prof_.operator*().Start(); + invoke(arg_name); + prof_.operator*().Stop(); + auto report = prof_.operator*().Report(); + prof_ = dmlc::optional(); // releases hardware counters + return report; + }); } else { return VirtualMachine::GetFunction(name, sptr_to_self); } @@ -80,7 +85,7 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun Index output_size, const std::vector& args) { ICHECK(exec_); ICHECK(!devices_.empty()) << "Device has not been initialized yet."; - if (prof_.IsRunning()) { + if (prof_ && prof_.operator*().IsRunning()) { // The device of any input of the operator is used for synchronization. 
ICHECK_GT(arg_count, 0U); ObjectRef arg = args[0]; @@ -122,11 +127,11 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun } metrics["Argument Shapes"] = profiling::ShapeString(shapes); - prof_.StartCall(packed_index_map_[packed_index], dev, metrics); + prof_.operator*().StartCall(packed_index_map_[packed_index], dev, metrics); } VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args); - if (prof_.IsRunning()) { - prof_.StopCall(); + if (prof_ && prof_.operator*().IsRunning()) { + prof_.operator*().StopCall(); } } diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h index 521a9bd454e7..1efefda52b97 100644 --- a/src/runtime/vm/profiler/vm.h +++ b/src/runtime/vm/profiler/vm.h @@ -25,6 +25,7 @@ #ifndef TVM_RUNTIME_VM_PROFILER_VM_H_ #define TVM_RUNTIME_VM_PROFILER_VM_H_ +#include #include #include @@ -39,7 +40,7 @@ namespace vm { class VirtualMachineDebug : public VirtualMachine { public: - VirtualMachineDebug() : VirtualMachine() {} + VirtualMachineDebug() : VirtualMachine(), prof_({}) {} PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final; @@ -52,7 +53,7 @@ class VirtualMachineDebug : public VirtualMachine { const std::vector& args) final; std::unordered_map packed_index_map_; - profiling::Profiler prof_; + dmlc::optional prof_; }; } // namespace vm diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py index ee8032550b39..7420e32b5416 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -18,6 +18,7 @@ import pytest from io import StringIO import csv +import os import tvm.testing from tvm.runtime import profiler_vm @@ -26,6 +27,24 @@ from tvm.contrib.debugger import debug_executor +def read_csv(report): + f = StringIO(report.csv()) + headers = [] + rows = [] + reader = csv.reader(f, delimiter=",") + # force parsing + in_header = True + for row in reader: + if in_header: + headers = row + in_header = False + rows = [[] for x in headers] + else: + for i in range(len(row)): + rows[i].append(row[i]) + return dict(zip(headers, rows)) + + @pytest.mark.skipif(not profiler_vm.enabled(), reason="VM Profiler not enabled") @tvm.testing.parametrize_targets def test_vm(target, dev): @@ -39,14 +58,9 @@ def test_vm(target, dev): assert "fused_nn_softmax" in str(report) assert "Total" in str(report) - f = StringIO(report.csv()) - reader = csv.reader(f, delimiter=",") - # force parsing - in_header = True - for row in reader: - if in_header: - assert "Hash" in row - in_header = False + csv = read_csv(report) + assert "Hash" in csv.keys() + assert all([float(x) > 0 for x in csv["Duration (us)"]]) @tvm.testing.parametrize_targets @@ -61,3 +75,39 @@ def test_graph_executor(target, dev): assert "fused_nn_softmax" in str(report) assert "Total" in str(report) assert "Hash" in str(report) + + +@tvm.testing.parametrize_targets("cuda", "llvm") +@pytest.mark.skipif( + tvm.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is None, + reason="PAPI profiling not enabled", +) +def test_papi(target, dev): + target = tvm.target.Target(target) + if str(target.kind) == "llvm": + metric = "PAPI_FP_OPS" + elif str(target.kind) == "cuda": + metric = "cuda:::event:shared_load:device=0" + else: + pytest.skip(f"Target {target.kind} not supported by this test") + mod, params = mlp.get_workload(1) + + exe = relay.vm.compile(mod, target, params=params) + vm = 
profiler_vm.VirtualMachineProfiler(exe, dev) + + data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev) + report = vm.profile( + [data], + func_name="main", + collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})], + ) + print(report) + assert metric in str(report) + + csv = read_csv(report) + assert metric in csv.keys() + assert any([float(x) > 0 for x in csv[metric]]) + + +if __name__ == "__main__": + test_papi("llvm", tvm.cpu())
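
The documentation above only demonstrates the VM profiler. For completeness, the following is a minimal, illustrative sketch (not part of the patch itself) of the same ``collectors`` argument on the debug graph executor path that this change also touches. It assumes TVM was built with ``USE_PAPI`` and with the debug graph executor enabled; the ``mlp`` workload, the ``"data"`` input name, and the ``PAPI_FP_OPS`` metric are borrowed from the unit test above, and everything else uses ordinary TVM APIs.

.. code:: python

    # Illustrative sketch only (not part of this patch). Assumes a PAPI-enabled
    # build of TVM with the debug graph executor; the mlp workload and the
    # "data" input name mirror the unit test above.
    import numpy as np
    import tvm
    import tvm.runtime.profiling
    from tvm import relay
    from tvm.contrib.debugger import debug_executor
    from tvm.relay.testing import mlp

    target, dev = "llvm", tvm.cpu()
    mod, params = mlp.get_workload(1)

    # Build the model and create the debug ("profiling") graph executor.
    lib = relay.build(mod, target=target, params=params)
    gr = debug_executor.create(lib.get_graph_json(), lib.get_lib(), dev)
    gr.set_input(**lib.get_params())

    # Pass a MetricCollector through the new `collectors` keyword argument.
    data = np.random.rand(1, 1, 28, 28).astype("float32")
    report = gr.profile(
        data=data,
        collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"]})],
    )
    print(report)

The collector is configured the same way here as in the VM profiler example in ``docs/profiling/papi.rst`` (a mapping from ``Device`` to metric names), so the two entry points can share collector configuration.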