2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -50,6 +50,7 @@ tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF)
tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON)
tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO)
tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF)
tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to read performance counters" OFF)

# 3rdparty libraries
tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include")
@@ -407,6 +408,7 @@ include(cmake/modules/contrib/ArmComputeLib.cmake)
include(cmake/modules/contrib/TensorRT.cmake)
include(cmake/modules/contrib/VitisAI.cmake)
include(cmake/modules/contrib/Verilator.cmake)
include(cmake/modules/contrib/PAPI.cmake)
include(cmake/modules/Git.cmake)
include(cmake/modules/LibInfo.cmake)
include(cmake/modules/RustExt.cmake)
25 changes: 25 additions & 0 deletions cmake/modules/contrib/PAPI.cmake
@@ -0,0 +1,25 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

if(USE_PAPI)
find_package(PkgConfig REQUIRED)

set(ENV{PKG_CONFIG_PATH} "${USE_PAPI}:$ENV{PKG_CONFIG_PATH}")
pkg_check_modules(PAPI REQUIRED IMPORTED_TARGET papi>=6.0)
list(APPEND TVM_RUNTIME_LINKER_LIBS PkgConfig::PAPI)
list(APPEND RUNTIME_SRCS src/runtime/contrib/papi/papi.cc)

Contributor:
Do you know whether this works as expected for cross-compilation?


Contributor Author:
I'm not sure if this works with cross compiling. PAPI does support cross compilation though. I think you'd have to set PKG_CONFIG_PATH or USE_PAPI to point to the cross compiled library.

endif()
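
A hypothetical illustration of the reply above (the sysroot path is made up): in config.cmake, point USE_PAPI at the pkg-config file of the cross-compiled PAPI installation, or export PKG_CONFIG_PATH to that directory before configuring.

# Hypothetical sketch only, not part of this diff: use a cross-compiled PAPI.
set(USE_PAPI /opt/cross-sysroot/usr/lib/pkgconfig/papi.pc)
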
1 change: 1 addition & 0 deletions docs/index.rst
@@ -78,6 +78,7 @@ For Developers
:caption: MISC

vta/index
profiling/index


Index
24 changes: 24 additions & 0 deletions docs/profiling/index.rst
@@ -0,0 +1,24 @@
.. Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.

Profiling Deep Learning Models
====================================

.. toctree::
:maxdepth: 1

papi
114 changes: 114 additions & 0 deletions docs/profiling/papi.rst
@@ -0,0 +1,114 @@
.. Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.


Getting Started With PAPI
=========================

The Performance Application Programming Interface (PAPI) is a library that
provides performance counters on a variety of platforms. Performance counters
provide accurate, low-level information about processor behavior during a given
execution run. This information can range from simple metrics like total cycle
count, cache misses, and instructions executed to higher-level information like
total FLOPS and warp occupancy. PAPI makes these metrics available while
profiling.

Installing PAPI
---------------

PAPI can be installed either through your package manager (``apt-get install libpapi-dev``
on Ubuntu) or from source:
https://bitbucket.org/icl/papi/src/master/.


Building TVM With PAPI
----------------------

To include PAPI in your build of TVM, add the following line to your ``config.cmake``:

.. code::

set(USE_PAPI ON)

If PAPI is installed in a non-standard location, you can tell the build where to find it like so:

.. code::

set(USE_PAPI path/to/papi.pc)


Using PAPI While Profiling
--------------------------

If TVM has been built with PAPI (see above), you can pass a
:py:class:`tvm.runtime.profiling.PAPIMetricCollector` to
:py:meth:`tvm.runtime.GraphModule.profile` to collect performance metrics. The
example below does the same with the VM profiler:

.. code:: python

import numpy as np

import tvm
from tvm import relay
from tvm.relay.testing import mlp
from tvm.runtime import profiler_vm

target = "llvm"
dev = tvm.cpu()
mod, params = mlp.get_workload(1)

# Compile the model for the VM and wrap the executable in the profiling VM.
exe = relay.vm.compile(mod, target, params=params)
vm = profiler_vm.VirtualMachineProfiler(exe, dev)

data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
report = vm.profile(
    [data],
    func_name="main",
    collectors=[tvm.runtime.profiling.PAPIMetricCollector()],
)
print(report)

.. code::

Name perf::CACHE-MISSES perf::CYCLES perf::STALLED-CYCLES-BACKEND perf::INSTRUCTIONS perf::STALLED-CYCLES-FRONTEND
fused_nn_dense_nn_bias_add_nn_relu 2,494 1,570,698 85,608 675,564 39,583
fused_nn_dense_nn_bias_add_nn_relu_1 1,149 655,101 13,278 202,297 21,380
fused_nn_dense_nn_bias_add 288 600,184 8,321 163,446 19,513
fused_nn_batch_flatten 301 587,049 4,636 158,636 18,565
fused_nn_softmax 154 575,143 8,018 160,738 18,995
----------
Sum 4,386 3,988,175 119,861 1,360,681 118,036
Total 10,644 8,327,360 179,310 2,660,569 270,044

You can also change which metrics are collected:

.. code:: python

report = vm.profile(
[data],
func_name="main",
collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"]})],
)

.. code::

Name PAPI_FP_OPS
fused_nn_dense_nn_bias_add_nn_relu 200,832
fused_nn_dense_nn_bias_add_nn_relu_1 16,448
fused_nn_dense_nn_bias_add 1,548
fused_nn_softmax 160
fused_nn_batch_flatten 0
----------
Sum 218,988
Total 218,988

You can find a list of available metrics by running the ``papi_avail`` and
``papi_native_avail`` commands.
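
As a further illustration (reusing ``vm`` and ``data`` from the example above), here is a
minimal sketch that requests a few common CPU preset metrics; whether these presets exist
depends on your hardware, so confirm them with ``papi_avail`` first:

.. code:: python

# Sketch only: these PAPI presets may not be available on every machine.
cpu = tvm.cpu()
collector = tvm.runtime.profiling.PAPIMetricCollector(
    {cpu: ["PAPI_TOT_CYC", "PAPI_TOT_INS", "PAPI_L1_DCM"]}
)
report = vm.profile([data], func_name="main", collectors=[collector])
print(report)
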
46 changes: 46 additions & 0 deletions include/tvm/runtime/contrib/papi.h
@@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \brief Performance counters for profiling via the PAPI library.
*/
#ifndef TVM_RUNTIME_CONTRIB_PAPI_H_
#define TVM_RUNTIME_CONTRIB_PAPI_H_

#include <tvm/runtime/container/array.h>
#include <tvm/runtime/container/map.h>
#include <tvm/runtime/profiling.h>

namespace tvm {
namespace runtime {
namespace profiling {

/*! \brief Construct a metric collector that collects data from hardware
* performance counters using the Performance Application Programming Interface
* (PAPI).
*
* \param metrics A mapping from a device type to the metrics that should be
* collected on that device. You can find the names of available metrics by
* running `papi_native_avail`.
*/
TVM_DLL MetricCollector CreatePAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics);
} // namespace profiling
} // namespace runtime
} // namespace tvm

#endif // TVM_RUNTIME_CONTRIB_PAPI_H_
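
For illustration only (not part of this diff), constructing the collector directly from C++ could look like the sketch below; the helper function is hypothetical, and the metric name must be valid on the target machine (check papi_native_avail).

// Illustrative sketch, not part of this diff.
#include <tvm/runtime/contrib/papi.h>

tvm::runtime::profiling::MetricCollector MakeCpuCacheMissCollector() {
  using namespace tvm::runtime;
  using namespace tvm::runtime::profiling;
  Map<DeviceWrapper, Array<String>> metrics;
  // Request cache-miss counts on CPU 0.
  metrics.Set(DeviceWrapper(DLDevice{kDLCPU, 0}), Array<String>({"perf::CACHE-MISSES"}));
  return CreatePAPIMetricCollector(metrics);
}
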
102 changes: 96 additions & 6 deletions include/tvm/runtime/profiling.h
@@ -37,6 +37,7 @@
#include <vector>

namespace tvm {

namespace runtime {

/*! \brief Base class for all implementations.
@@ -150,6 +151,26 @@ class Timer : public ObjectRef {
Timer DefaultTimer(Device dev);

namespace profiling {
/*! \brief Wrapper for `Device` because `Device` is not passable across the

Member:
Device cannot be put as part of object container. How about DeviceObject?


Contributor Author:
DeviceObjectNode sounds a little weird. I think wrapper clearly explains what this is doing.


Member:
I am not strongly attached to the choice; given it is in the profiling namespace, I will let you decide.

* PackedFunc interface.
*/
struct DeviceWrapperNode : public Object {
/*! The device */
Device device;

/*! Constructor */
explicit DeviceWrapperNode(Device device) : device(device) {}

static constexpr const char* _type_key = "runtime.profiling.DeviceWrapper";
TVM_DECLARE_BASE_OBJECT_INFO(DeviceWrapperNode, Object);
};

/*! \brief Wrapper for `Device`. */
class DeviceWrapper : public ObjectRef {
public:
explicit DeviceWrapper(Device dev) { data_ = make_object<DeviceWrapperNode>(dev); }
TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(DeviceWrapper, ObjectRef, DeviceWrapperNode);
};

/*! \brief Data collected from a profiling run. Includes per-call metrics and per-device metrics.
*/
@@ -200,6 +221,57 @@ class Report : public ObjectRef {
TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Report, ObjectRef, ReportNode);
};

/*! \brief Interface for user defined profiling metric collection.
*
* Users can register their own collector by registering a packed function with
* the name "runtime.profiling.metrics.my_collector_name" where
* "my_collector_name" is the name of their collector. This function should
* take an Array of Device as input which contains the devices the collector
* will be run on.
*
* `MetricCollectorNode`s will be called in the following fashion.
* \code
* MetricCollector mc;
* for (auto op : model) {
* auto o = mc.Start();
* op();
* auto metrics = mc.Stop(o); // metrics are added to the profiling report
* }
* \endcode
*/
class MetricCollectorNode : public Object {
public:
/*! \brief Initialization call. Called before profiling has started. Any
* expensive precomputation should happen here.
* \param devs The list of devices this collector will be run on.
*/
virtual void Init(Array<DeviceWrapper> devs) = 0;
/*! \brief Start collecting metrics for a function call.
* \param dev The device the call will be run on.
* \returns An object used to maintain state of the metric collection. This
* object will be passed to the corresponding `Stop` call. If the device is
* not supported, this function will return a nullptr ObjectRef.
*/
virtual ObjectRef Start(Device dev) = 0;
/*! \brief Stop collecting metrics.
* \param obj The object created by the corresponding `Start` call.
* \returns A set of metric names and the associated values. Values must be
* one of DurationNode, PercentNode, CountNode, or StringObj.
*/
virtual Map<String, ObjectRef> Stop(ObjectRef obj) = 0;

virtual ~MetricCollectorNode() {}

static constexpr const char* _type_key = "runtime.profiling.MetricCollector";
TVM_DECLARE_BASE_OBJECT_INFO(MetricCollectorNode, Object);
};

/*! \brief Wrapper for `MetricCollectorNode`. */
class MetricCollector : public ObjectRef {
public:
TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(MetricCollector, ObjectRef, MetricCollectorNode);
};
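
For illustration only (not part of this diff), a minimal custom collector against this interface could look like the sketch below; the class name, type key, and metric name are hypothetical, and a real implementation would also need the usual object-type registration plus the creation PackedFunc described above.

/* Illustrative sketch, not part of this diff. */
class ExampleCollectorNode : public MetricCollectorNode {
 public:
  void Init(Array<DeviceWrapper> devs) final {}  // no expensive setup needed
  ObjectRef Start(Device dev) final {
    // Use the wrapped device as the per-call state; returning a nullptr
    // ObjectRef here would instead mark the device as unsupported.
    return DeviceWrapper(dev);
  }
  Map<String, ObjectRef> Stop(ObjectRef obj) final {
    Map<String, ObjectRef> metrics;
    metrics.Set("Example.note", String("collected"));  // StringObj-valued metric
    return metrics;
  }
  static constexpr const char* _type_key = "runtime.profiling.ExampleCollector";
  TVM_DECLARE_FINAL_OBJECT_INFO(ExampleCollectorNode, MetricCollectorNode);
};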

/*! Information about a single function or operator call. */
struct CallFrame {
/*! Device on which the call was made */
@@ -210,16 +282,21 @@ struct CallFrame {
Timer timer;
/*! Extra performance metrics */
std::unordered_map<std::string, ObjectRef> extra_metrics;
/*! User defined metric collectors. Each pair is the MetricCollector and its
* associated data (returned from MetricCollector.Start).
*/

Member:
Not related to this PR, but just in general about hiding: in this case we could use https://en.cppreference.com/w/cpp/language/pimpl to hide the CallFrame definition. I recommend a follow-up refactor.

std::vector<std::pair<MetricCollector, ObjectRef>> extra_collectors;
};

/*! Runtime profiler for function and/or operator calls. Used in the graph
* runtime and VM to provide profiling information for all operators.
*
* Example usage:
* \code{.cpp}
* Profiler prof;
* Device cpu, gpu;
* prof.Start({cpu, gpu});
* Profiler prof({cpu, gpu});
* my_gpu_kernel(); // do a warmup iteration
* prof.Start();
* prof.StartCall("my_gpu_kernel", gpu);
* my_gpu_kernel();
* prof.StopCall();
@@ -232,13 +309,24 @@ struct CallFrame {
*/
class Profiler {
public:
/*! \brief Start the profiler.
/*! Constructor.
*
* The profiler should be constructed before you do any warmup iterations.
*
* \note
* Calling this constructor will reset the TVM threadpool. It is necessary in
* order to install thread handlers required by certain collectors.
*
* \param devs The list of devices the profiler will be running on. Should
* include all devices used by profiled operators.
* \param metric_collectors Additional `MetricCollector`s to use with this profiler.
*/
explicit Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors);
/*! \brief Start the profiler.
*
* This function should only be called once per object.
*/
void Start(const std::vector<Device>& devs);
void Start();
/*! \brief Stop the profiler.
*
* This function should only be called once per object after start has been called.
@@ -270,12 +358,14 @@ class Profiler {
/*! \brief Check if the profiler is currently running.
* \returns Whether or not the profiler is running.
*/
bool IsRunning() const { return !global_timers_.empty(); }
bool IsRunning() const { return is_running_; }

private:
std::vector<std::pair<Device, Timer>> global_timers_;
std::vector<Device> devs_;
bool is_running_{false};
std::vector<CallFrame> calls_;
std::stack<CallFrame> in_flight_;
std::vector<MetricCollector> collectors_;
};

/* \brief A duration in time. */