From 69a0e82f26df1d60bf87724a16ceac0d04e72c61 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Wed, 24 Jun 2020 19:41:21 +0800 Subject: [PATCH 1/8] [clflush] Enable x86 cpu cache flush --- src/runtime/rpc/rpc_module.cc | 42 +++++++++++++++++++++++++++++ tutorials/autotvm/tune_relay_x86.py | 10 +++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 89f3e7c6c7f8..20357a14b2f7 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -24,8 +24,12 @@ #include #include +#include #include #include +#if defined(_M_X64) || defined(__x86_64__) +#include +#endif #include "rpc_endpoint.h" #include "rpc_session.h" @@ -300,6 +304,35 @@ std::shared_ptr RPCModuleGetSession(Module mod) { return rmod->sess(); } +/*! + * \brief Flush the cache. + * \param addr The address of data we want to flush + * \param len The length of data + */ +/* + * When we are in the tuning of TVM, we will make TVM occupy + * the cache fully and doesn't flush it during iteration. + * This has problems then in e2e testing, since arrays that + * we assume exist in cache (ie. weights) are evicted during e2e runs, + * which leads to lower performance. + */ +inline void CacheFlush(const char* addr, unsigned int len) { +// TODO(FrozenGene): Support ARM. 
+#if (defined(_M_X64) || defined(__x86_64__)) + const size_t cache_line = 64; + + if (addr == nullptr || len <= 0) { + return; + } + + for (uintptr_t uptr = (uintptr_t)addr & ~(cache_line - 1); uptr < (uintptr_t)addr + len; + uptr += cache_line) { + _mm_clflush(reinterpret_cast(uptr)); + } + +#endif +} + PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, int min_repeat_ms) { CHECK(pf != nullptr); @@ -313,12 +346,21 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; + const char* cache_flush = std::getenv("TVM_AUTO_CACHE_FLUSH"); // skip first time call, to activate lazy compilation components. pf.CallPacked(args, &temp); DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); for (int i = 0; i < repeat; ++i) { + if (cache_flush && std::atoi(cache_flush) != 0) { + CHECK_EQ(number, 1); + // we want to keep input data + for (int j = 1; j < args.size(); j++) { + CacheFlush(static_cast((args[j].operator DLTensor*()->data)), + GetDataSize(*(args[j].operator DLTensor*()))); + } + } std::chrono::time_point tbegin, tend; double duration_ms = 0.0; diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index dcc5b25c8288..c3d5cac82daa 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -122,8 +122,8 @@ def get_network(name, batch_size): 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(), - runner=autotvm.LocalRunner(number=10, repeat=1, - min_repeat_ms=1000), + runner=autotvm.LocalRunner(number=1, repeat=10, + min_repeat_ms=0), ), } @@ -183,12 +183,18 @@ def tune_and_evaluate(tuning_opt): ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks + # set TVM_AUTO_CACHE_FLUSH environment value be 1 to enable flush + # the cache during tuning kernel so that we could get more accurate + # performance 
when we run e2e testing. + os.environ["TVM_AUTO_CACHE_FLUSH"] = "1" tune_kernels(tasks, **tuning_opt) tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file) # compile kernels with graph-level best records with autotvm.apply_graph_best(graph_opt_sch_file): print("Compile...") + # when we run e2e testing, we could enable the cache back. + os.environ["TVM_AUTO_CACHE_FLUSH"] = "0" with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build_module.build( mod, target=target, params=params) From 015d7795a1622b6ae446e01066bf6105232f5d03 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Wed, 24 Jun 2020 20:11:35 +0800 Subject: [PATCH 2/8] solve windows build --- src/runtime/rpc/rpc_module.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 20357a14b2f7..8698b67dc690 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -28,7 +28,7 @@ #include #include #if defined(_M_X64) || defined(__x86_64__) -#include +#include #endif #include "rpc_endpoint.h" From 5b4fc161b7f936104dc44066dc0c0d3bf1a5cfea Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Mon, 6 Jul 2020 12:54:19 +0800 Subject: [PATCH 3/8] clflush packed func --- python/tvm/autotvm/measure/measure_methods.py | 4 +- python/tvm/runtime/module.py | 4 +- src/runtime/rpc/rpc_module.cc | 60 ++++++++++++------- src/runtime/rpc/rpc_session.h | 2 +- tests/python/unittest/test_autotvm_measure.py | 3 +- tutorials/autotvm/tune_relay_x86.py | 14 +++-- 6 files changed, 57 insertions(+), 30 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index b8969f55c00a..c7a616f44593 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -473,8 +473,10 @@ def run_through_rpc(measure_input, build_result, remote.upload(build_result.filename) func = 
remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) + cache_flush_packed = remote.get_function("runtime.cpu_cache_flush")(1) time_f = func.time_evaluator( - func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) + func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, + f_prepare=cache_flush_packed) # set input if ref_input: diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 3cdb28f8c496..e4ebfa5e9db5 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -163,7 +163,7 @@ def save(self, file_name, fmt=""): """ _ffi_api.ModuleSaveToFile(self, file_name, fmt) - def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0): + def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f_prepare=None): """Get an evaluator that measures time cost of running function. Parameters @@ -207,7 +207,7 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0): try: feval = _ffi_api.RPCTimeEvaluator( self, func_name, ctx.device_type, ctx.device_id, - number, repeat, min_repeat_ms) + number, repeat, min_repeat_ms, f_prepare) def evaluator(*args): """Internal wrapped evaluator.""" diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 8698b67dc690..2f88a0b0dbc5 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -187,7 +187,7 @@ class RPCModuleNode final : public ModuleNode { } PackedFunc GetTimeEvaluator(const std::string& name, TVMContext ctx, int number, int repeat, - int min_repeat_ms) { + int min_repeat_ms, PackedFunc f_prepare) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass ctx by parts. 
int dev_type = ctx.device_type; @@ -198,11 +198,11 @@ class RPCModuleNode final : public ModuleNode { if (module_handle_ != nullptr) { return remote_get_time_evaluator_(GetRef(this), name, static_cast(ctx.device_type), ctx.device_id, number, - repeat, min_repeat_ms); + repeat, min_repeat_ms, f_prepare); } else { return remote_get_time_evaluator_(Optional(nullptr), name, static_cast(ctx.device_type), ctx.device_id, number, - repeat, min_repeat_ms); + repeat, min_repeat_ms, f_prepare); } } @@ -240,7 +240,7 @@ class RPCModuleNode final : public ModuleNode { // The local channel std::shared_ptr sess_; // remote function to get time evaluator - TypedPackedFunc, std::string, int, int, int, int, int)> + TypedPackedFunc, std::string, int, int, int, int, int, PackedFunc)> remote_get_time_evaluator_; // remote function getter for modules. TypedPackedFunc remote_mod_get_function_; @@ -259,7 +259,12 @@ void* RPCWrappedFunc::UnwrapRemoteValueToHandle(const TVMArgValue& arg) const { CHECK(rmod->sess() == sess_) << "ValueError: Cannot pass in module into a different remote session"; return rmod->module_handle(); - } else { + } +// else if (arg.type_code() == kTVMPackedFuncHandle) { +// PackedFunc pf = arg; +// pf.body() +// } + else { LOG(FATAL) << "ValueError: Cannot pass type " << runtime::ArgTypeCode2Str(arg.type_code()) << " as an argument to the remote"; return nullptr; @@ -316,9 +321,10 @@ std::shared_ptr RPCModuleGetSession(Module mod) { * we assume exist in cache (ie. weights) are evicted during e2e runs, * which leads to lower performance. */ -inline void CacheFlush(const char* addr, unsigned int len) { +inline void CacheFlushImpl(const char* addr, unsigned int len) { // TODO(FrozenGene): Support ARM. 
#if (defined(_M_X64) || defined(__x86_64__)) + std::cout << "len " << len << "\n"; const size_t cache_line = 64; if (addr == nullptr || len <= 0) { @@ -333,8 +339,18 @@ inline void CacheFlush(const char* addr, unsigned int len) { #endif } +PackedFunc CPUCacheFlush(int begin_index) { + auto f_cache_flush = [begin_index](TVMArgs args, TVMRetValue* rv) { + for (int i = begin_index; i < args.size(); i++) { + CacheFlushImpl(static_cast((args[i].operator DLTensor*()->data)), + GetDataSize(*(args[i].operator DLTensor*()))); + } + }; + return PackedFunc(f_cache_flush); +} + PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, - int min_repeat_ms) { + int min_repeat_ms, PackedFunc f_prepare) { CHECK(pf != nullptr); if (static_cast(ctx.device_type) == static_cast(kDLMicroDev)) { @@ -343,23 +359,17 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe return (*get_micro_time_evaluator)(pf, ctx, number, repeat); } - auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) mutable { + auto ftimer = [pf, ctx, number, repeat, min_repeat_ms, f_prepare](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; - const char* cache_flush = std::getenv("TVM_AUTO_CACHE_FLUSH"); // skip first time call, to activate lazy compilation components. 
pf.CallPacked(args, &temp); DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); for (int i = 0; i < repeat; ++i) { - if (cache_flush && std::atoi(cache_flush) != 0) { - CHECK_EQ(number, 1); - // we want to keep input data - for (int j = 1; j < args.size(); j++) { - CacheFlush(static_cast((args[j].operator DLTensor*()->data)), - GetDataSize(*(args[j].operator DLTensor*()))); - } + if (f_prepare != nullptr) { + f_prepare.CallPacked(args, &temp); } std::chrono::time_point tbegin, tend; @@ -400,7 +410,7 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, - int number, int repeat, int min_repeat_ms) { + int number, int repeat, int min_repeat_ms, PackedFunc f_prepare) { TVMContext ctx; ctx.device_type = static_cast(device_type); ctx.device_id = device_id; @@ -408,18 +418,28 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") Module m = opt_mod.value(); std::string tkey = m->type_key(); if (tkey == "rpc") { + // Wrong return static_cast(m.operator->()) - ->GetTimeEvaluator(name, ctx, number, repeat, min_repeat_ms); + ->GetTimeEvaluator(name, ctx, number, repeat, min_repeat_ms, f_prepare); + // Pass +// ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); +// return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms, f_prepare); } else { - return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms); + std::cout << "First LOCAL " << tkey << " \n"; + return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms, f_prepare); } } else { auto* pf = runtime::Registry::Get(name); CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; - return WrapTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms); + return WrapTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms, f_prepare); } }); 
+TVM_REGISTER_GLOBAL("runtime.cpu_cache_flush") +.set_body_typed([](int begin_index) { + return CPUCacheFlush(begin_index); +}); + // server function registration. TVM_REGISTER_GLOBAL("tvm.rpc.server.ImportModule").set_body_typed([](Module parent, Module child) { parent->Import(child); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 6a7e6d6e41c1..b75a785186b6 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -327,7 +327,7 @@ struct RemoteSpace { * \return f_timer A timer function. */ PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat, - int min_repeat_ms); + int min_repeat_ms, PackedFunc f_prepare); /*! * \brief Create a Global RPC module that refers to the session. diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index f96d333ddbc3..ab90c4f4d56d 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -56,6 +56,7 @@ def test_check_correctness(): def _callback_correct(tuner, measure_inputs, measure_results): for _, res in zip(measure_inputs, measure_results): + print(res) assert res.error_no == 0 tuner = autotvm.tuner.RandomTuner(task) @@ -79,5 +80,5 @@ def _callback_wrong(tuner, measure_inputs, measure_results): if __name__ == '__main__': logging.basicConfig(level=logging.INFO) - test_task_tuner_without_measurement() + # test_task_tuner_without_measurement() test_check_correctness() diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index c3d5cac82daa..91d47ff4ef79 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -35,6 +35,7 @@ from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner import tvm.contrib.graph_runtime as runtime +# from tvm.contrib.debugger import debug_runtime as runtime 
################################################################# # Define network @@ -84,12 +85,12 @@ def get_network(name, batch_size): # Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512". # For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be # "llvm -mcpu=core-avx2". -target = "llvm" +target = "llvm -mcpu=skylake-avx512" batch_size = 1 dtype = "float32" model_name = "resnet-18" -log_file = "%s.log" % model_name +log_file = "%s.dense.log" % model_name graph_opt_sch_file = "%s_graph_opt.log" % model_name # Set the input name of the graph @@ -186,15 +187,17 @@ def tune_and_evaluate(tuning_opt): # set TVM_AUTO_CACHE_FLUSH environment value be 1 to enable flush # the cache during tuning kernel so that we could get more accurate # performance when we run e2e testing. - os.environ["TVM_AUTO_CACHE_FLUSH"] = "1" + # os.environ["TVM_AUTO_CACHE_FLUSH"] = "1" tune_kernels(tasks, **tuning_opt) tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file) # compile kernels with graph-level best records + # graph_opt_sch_file = 'resnet-18_graph_opt_clflush.log' with autotvm.apply_graph_best(graph_opt_sch_file): print("Compile...") + # os.environ["TVM_BIND_MASTER_CORE_0"] = "1" # when we run e2e testing, we could enable the cache back. - os.environ["TVM_AUTO_CACHE_FLUSH"] = "0" + # os.environ["TVM_AUTO_CACHE_FLUSH"] = "0" with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build_module.build( mod, target=target, params=params) @@ -212,11 +215,12 @@ def tune_and_evaluate(tuning_opt): prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) + # module.run() # We do not run the tuning in our webpage server since it takes too long. # Uncomment the following line to run it by yourself. 
-# tune_and_evaluate(tuning_option) +tune_and_evaluate(tuning_option) ###################################################################### # Sample Output From 2fd31589c5322e90caae59a8a1c98d40ad8b034e Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Mon, 13 Jul 2020 20:23:18 +0800 Subject: [PATCH 4/8] refactor --- python/tvm/autotvm/measure/measure_methods.py | 25 ++++--- python/tvm/runtime/module.py | 4 +- src/runtime/rpc/rpc_module.cc | 67 +++++++++---------- tutorials/autotvm/tune_relay_x86.py | 19 ++---- 4 files changed, 58 insertions(+), 57 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index c7a616f44593..59d2780afdb6 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -180,12 +180,14 @@ class RPCRunner(Runner): Whether check correctness after measurement. This will use llvm cpu target to call your template and get the reference output. This can work for TOPI templates, but may not work for your custom template. + enable_cpu_cache_flush: bool + Whether to enable cpu cache flush """ def __init__(self, key, host, port, priority=1, timeout=10, n_parallel=None, number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1, - check_correctness=False): + check_correctness=False, enable_cpu_cache_flush=False): super(RPCRunner, self).__init__(timeout, n_parallel) self.key = key @@ -200,6 +202,7 @@ def __init__(self, self.ref_input = None self.ref_output = None + self.enable_cpu_cache_flush = enable_cpu_cache_flush self.check_correctness = check_correctness self.cooldown_interval = cooldown_interval @@ -267,7 +270,8 @@ def run(self, measure_inputs, build_results): self.cooldown_interval, remote_args, self.ref_input, - self.ref_output) + self.ref_output, + self.enable_cpu_cache_flush) futures.append(ret) for future in futures: @@ -309,7 +313,8 @@ class LocalRunner(RPCRunner): Whether check correctness after measurement. 
This will use llvm cpu target to call your template and get the reference output. This can work for TOPI templates, but may not work for your custom template. - + enable_cpu_cache_flush: bool + Whether to enable cpu cache flush Note ---- This is a "fake" local mode. We start a silent rpc tracker and rpc server @@ -318,13 +323,14 @@ class LocalRunner(RPCRunner): def __init__(self, timeout=10, number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1, - check_correctness=False): + check_correctness=False, enable_cpu_cache_flush=False): super(LocalRunner, self).__init__('', None, None, 0, timeout=timeout, n_parallel=1, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, cooldown_interval=cooldown_interval, - check_correctness=check_correctness) + check_correctness=check_correctness, + enable_cpu_cache_flush=enable_cpu_cache_flush) self.tracker = None self.server = None @@ -421,7 +427,8 @@ def _wrapped(measure_input, tmp_dir, **kwargs): def run_through_rpc(measure_input, build_result, number, repeat, min_repeat_ms, cooldown_interval, - remote_args, ref_input=None, ref_output=None): + remote_args, ref_input=None, ref_output=None, + enable_cpu_cache_flush=False): """Run a generated library through rpc Parameters @@ -454,6 +461,8 @@ def run_through_rpc(measure_input, build_result, The reference input used for checking correctness ref_output: List of np.ndarray The reference output used for checking correctness + enable_cpu_cache_flush: bool + Whether to enable cpu cache flush """ if isinstance(build_result, MeasureResult): return build_result @@ -473,10 +482,10 @@ def run_through_rpc(measure_input, build_result, remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) - cache_flush_packed = remote.get_function("runtime.cpu_cache_flush")(1) + f_prepare = 'cache_flush_cpu_non_first_arg' if enable_cpu_cache_flush else '' time_f = func.time_evaluator( func.entry_name, 
ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, - f_prepare=cache_flush_packed) + f_prepare=f_prepare) # set input if ref_input: diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index e4ebfa5e9db5..b60898baf9a1 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -163,7 +163,7 @@ def save(self, file_name, fmt=""): """ _ffi_api.ModuleSaveToFile(self, file_name, fmt) - def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f_prepare=None): + def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f_prepare=''): """Get an evaluator that measures time cost of running function. Parameters @@ -192,6 +192,8 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f minimum duration requirement of one `repeat`. i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + f_prepare: str, optional + The prepared function name we want to execute before executing the time evaluator. Note ---- diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 2f88a0b0dbc5..0b1b3fcf0c2f 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -24,7 +24,6 @@ #include #include -#include #include #include #if defined(_M_X64) || defined(__x86_64__) @@ -187,7 +186,7 @@ class RPCModuleNode final : public ModuleNode { } PackedFunc GetTimeEvaluator(const std::string& name, TVMContext ctx, int number, int repeat, - int min_repeat_ms, PackedFunc f_prepare) { + int min_repeat_ms, const std::string& f_prepare_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass ctx by parts. 
int dev_type = ctx.device_type; @@ -198,11 +197,11 @@ class RPCModuleNode final : public ModuleNode { if (module_handle_ != nullptr) { return remote_get_time_evaluator_(GetRef(this), name, static_cast(ctx.device_type), ctx.device_id, number, - repeat, min_repeat_ms, f_prepare); + repeat, min_repeat_ms, f_prepare_name); } else { return remote_get_time_evaluator_(Optional(nullptr), name, static_cast(ctx.device_type), ctx.device_id, number, - repeat, min_repeat_ms, f_prepare); + repeat, min_repeat_ms, f_prepare_name); } } @@ -240,7 +239,7 @@ class RPCModuleNode final : public ModuleNode { // The local channel std::shared_ptr sess_; // remote function to get time evaluator - TypedPackedFunc, std::string, int, int, int, int, int, PackedFunc)> + TypedPackedFunc, std::string, int, int, int, int, int, std::string)> remote_get_time_evaluator_; // remote function getter for modules. TypedPackedFunc remote_mod_get_function_; @@ -259,12 +258,7 @@ void* RPCWrappedFunc::UnwrapRemoteValueToHandle(const TVMArgValue& arg) const { CHECK(rmod->sess() == sess_) << "ValueError: Cannot pass in module into a different remote session"; return rmod->module_handle(); - } -// else if (arg.type_code() == kTVMPackedFuncHandle) { -// PackedFunc pf = arg; -// pf.body() -// } - else { + } else { LOG(FATAL) << "ValueError: Cannot pass type " << runtime::ArgTypeCode2Str(arg.type_code()) << " as an argument to the remote"; return nullptr; @@ -321,12 +315,10 @@ std::shared_ptr RPCModuleGetSession(Module mod) { * we assume exist in cache (ie. weights) are evicted during e2e runs, * which leads to lower performance. */ -inline void CacheFlushImpl(const char* addr, unsigned int len) { +inline void CPUCacheFlushImpl(const char* addr, unsigned int len) { // TODO(FrozenGene): Support ARM. 
#if (defined(_M_X64) || defined(__x86_64__)) - std::cout << "len " << len << "\n"; const size_t cache_line = 64; - if (addr == nullptr || len <= 0) { return; } @@ -339,14 +331,11 @@ inline void CacheFlushImpl(const char* addr, unsigned int len) { #endif } -PackedFunc CPUCacheFlush(int begin_index) { - auto f_cache_flush = [begin_index](TVMArgs args, TVMRetValue* rv) { - for (int i = begin_index; i < args.size(); i++) { - CacheFlushImpl(static_cast((args[i].operator DLTensor*()->data)), - GetDataSize(*(args[i].operator DLTensor*()))); - } - }; - return PackedFunc(f_cache_flush); +inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { + for (int i = begin_index; i < args.size(); i++) { + CPUCacheFlushImpl(static_cast((args[i].operator DLTensor*()->data)), + GetDataSize(*(args[i].operator DLTensor*()))); + } } PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, @@ -359,7 +348,8 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe return (*get_micro_time_evaluator)(pf, ctx, number, repeat); } - auto ftimer = [pf, ctx, number, repeat, min_repeat_ms, f_prepare](TVMArgs args, TVMRetValue* rv) mutable { + auto ftimer = [pf, ctx, number, repeat, min_repeat_ms, f_prepare](TVMArgs args, + TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. 
@@ -410,7 +400,7 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, - int number, int repeat, int min_repeat_ms, PackedFunc f_prepare) { + int number, int repeat, int min_repeat_ms, std::string f_prepare_name) { TVMContext ctx; ctx.device_type = static_cast(device_type); ctx.device_id = device_id; @@ -418,26 +408,35 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") Module m = opt_mod.value(); std::string tkey = m->type_key(); if (tkey == "rpc") { - // Wrong return static_cast(m.operator->()) - ->GetTimeEvaluator(name, ctx, number, repeat, min_repeat_ms, f_prepare); - // Pass -// ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); -// return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms, f_prepare); + ->GetTimeEvaluator(name, ctx, number, repeat, min_repeat_ms, f_prepare_name); } else { - std::cout << "First LOCAL " << tkey << " \n"; - return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms, f_prepare); + PackedFunc f_prepare; + if (!f_prepare_name.empty()) { + auto* pf_prepare = runtime::Registry::Get(f_prepare_name); + CHECK(pf_prepare != nullptr) + << "Cannot find " << f_prepare_name << " in the global function"; + f_prepare = *pf_prepare; + } + return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms, + f_prepare); } } else { auto* pf = runtime::Registry::Get(name); CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; + PackedFunc f_prepare; + if (!f_prepare_name.empty()) { + auto* pf_prepare = runtime::Registry::Get(f_prepare_name); + CHECK(pf_prepare != nullptr) + << "Cannot find " << f_prepare_name << " in the global function"; + f_prepare = *pf_prepare; + } return WrapTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms, f_prepare); } }); 
-TVM_REGISTER_GLOBAL("runtime.cpu_cache_flush") -.set_body_typed([](int begin_index) { - return CPUCacheFlush(begin_index); +TVM_REGISTER_GLOBAL("cache_flush_cpu_non_first_arg").set_body([](TVMArgs args, TVMRetValue* rv) { + CPUCacheFlush(1, args); }); // server function registration. diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 91d47ff4ef79..bee9b7d3fb4a 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -35,7 +35,6 @@ from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner import tvm.contrib.graph_runtime as runtime -# from tvm.contrib.debugger import debug_runtime as runtime ################################################################# # Define network @@ -85,12 +84,12 @@ def get_network(name, batch_size): # Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512". # For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be # "llvm -mcpu=core-avx2". -target = "llvm -mcpu=skylake-avx512" +target = "llvm" batch_size = 1 dtype = "float32" model_name = "resnet-18" -log_file = "%s.dense.log" % model_name +log_file = "%s.log" % model_name graph_opt_sch_file = "%s_graph_opt.log" % model_name # Set the input name of the graph @@ -124,7 +123,8 @@ def get_network(name, batch_size): 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(number=1, repeat=10, - min_repeat_ms=0), + min_repeat_ms=0, + enable_cpu_cache_flush=True), ), } @@ -184,20 +184,12 @@ def tune_and_evaluate(tuning_opt): ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks - # set TVM_AUTO_CACHE_FLUSH environment value be 1 to enable flush - # the cache during tuning kernel so that we could get more accurate - # performance when we run e2e testing. 
- # os.environ["TVM_AUTO_CACHE_FLUSH"] = "1" tune_kernels(tasks, **tuning_opt) tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file) # compile kernels with graph-level best records - # graph_opt_sch_file = 'resnet-18_graph_opt_clflush.log' with autotvm.apply_graph_best(graph_opt_sch_file): print("Compile...") - # os.environ["TVM_BIND_MASTER_CORE_0"] = "1" - # when we run e2e testing, we could enable the cache back. - # os.environ["TVM_AUTO_CACHE_FLUSH"] = "0" with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build_module.build( mod, target=target, params=params) @@ -215,12 +207,11 @@ def tune_and_evaluate(tuning_opt): prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) - # module.run() # We do not run the tuning in our webpage server since it takes too long. # Uncomment the following line to run it by yourself. -tune_and_evaluate(tuning_option) +# tune_and_evaluate(tuning_option) ###################################################################### # Sample Output From 1f20abfe2a8dd9cd6f3f5601a6b00b2cd3540ea8 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Mon, 13 Jul 2020 20:27:53 +0800 Subject: [PATCH 5/8] add doc of function params --- src/runtime/rpc/rpc_session.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index b75a785186b6..45854702084e 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -324,10 +324,11 @@ struct RemoteSpace { * minimum duration requirement of one `repeat`. * i.e., When the run time of one `repeat` falls below this time, * the `number` parameter will be automatically increased. + * \param f_prepare The function to be executed before we excetute time evaluator. * \return f_timer A timer function. 
*/ PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat, - int min_repeat_ms, PackedFunc f_prepare); + int min_repeat_ms, PackedFunc f_prepare = nullptr); /*! * \brief Create a Global RPC module that refers to the session. From 05d090e09099f69d0c534ef9554710bdbb0f2910 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Mon, 13 Jul 2020 20:29:50 +0800 Subject: [PATCH 6/8] restore git master testing --- tests/python/unittest/test_autotvm_measure.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index ab90c4f4d56d..f96d333ddbc3 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -56,7 +56,6 @@ def test_check_correctness(): def _callback_correct(tuner, measure_inputs, measure_results): for _, res in zip(measure_inputs, measure_results): - print(res) assert res.error_no == 0 tuner = autotvm.tuner.RandomTuner(task) @@ -80,5 +79,5 @@ def _callback_wrong(tuner, measure_inputs, measure_results): if __name__ == '__main__': logging.basicConfig(level=logging.INFO) - # test_task_tuner_without_measurement() + test_task_tuner_without_measurement() test_check_correctness() From cd3ed60a4c1a14d4b108a9940872b3698de8c8ad Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Tue, 14 Jul 2020 13:30:22 +0800 Subject: [PATCH 7/8] update --- python/tvm/autotvm/measure/measure_methods.py | 14 ++++-- python/tvm/runtime/module.py | 8 ++-- src/runtime/rpc/rpc_module.cc | 46 +++++++++---------- src/runtime/rpc/rpc_session.h | 4 +- tutorials/autotvm/tune_relay_x86.py | 11 +++++ 5 files changed, 50 insertions(+), 33 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 59d2780afdb6..5eb4e9958c9f 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -181,7 +181,7 @@ class 
RPCRunner(Runner): call your template and get the reference output. This can work for TOPI templates, but may not work for your custom template. enable_cpu_cache_flush: bool - Whether to enable cpu cache flush + Whether to enable cpu cache flush, which only has effect on CPU task. """ def __init__(self, key, host, port, priority=1, @@ -314,7 +314,7 @@ class LocalRunner(RPCRunner): call your template and get the reference output. This can work for TOPI templates, but may not work for your custom template. enable_cpu_cache_flush: bool - Whether to enable cpu cache flush + Whether to enable cpu cache flush, which only has effect on CPU task. Note ---- This is a "fake" local mode. We start a silent rpc tracker and rpc server @@ -462,7 +462,7 @@ def run_through_rpc(measure_input, build_result, ref_output: List of np.ndarray The reference output used for checking correctness enable_cpu_cache_flush: bool - Whether to enable cpu cache flush + Whether to enable cpu cache flush, which only has effect on CPU task. """ if isinstance(build_result, MeasureResult): return build_result @@ -482,10 +482,16 @@ remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) + + # Limitation: + # We can not get PackedFunc directly in the remote mode as it is wrapped + # under the std::function. We could lift the restriction later once we fold + # the PackedFunc as an object. Currently, we pass function name to work + # around it.
f_prepare = 'cache_flush_cpu_non_first_arg' if enable_cpu_cache_flush else '' time_f = func.time_evaluator( func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, - f_prepare=f_prepare) + f_preproc=f_prepare) # set input if ref_input: diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index b60898baf9a1..754bb6f7dcef 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -163,7 +163,7 @@ def save(self, file_name, fmt=""): """ _ffi_api.ModuleSaveToFile(self, file_name, fmt) - def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f_prepare=''): + def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f_preproc=''): """Get an evaluator that measures time cost of running function. Parameters @@ -192,8 +192,8 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f minimum duration requirement of one `repeat`. i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. - f_prepare: str, optional - The prepared function name we want to execute before executing the time evaluator. + f_preproc: str, optional + The preprocess function name we want to execute before executing the time evaluator. 
Note ---- @@ -209,7 +209,7 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f try: feval = _ffi_api.RPCTimeEvaluator( self, func_name, ctx.device_type, ctx.device_id, - number, repeat, min_repeat_ms, f_prepare) + number, repeat, min_repeat_ms, f_preproc) def evaluator(*args): """Internal wrapped evaluator.""" diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 0b1b3fcf0c2f..d1eb89164fb7 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -186,7 +186,7 @@ class RPCModuleNode final : public ModuleNode { } PackedFunc GetTimeEvaluator(const std::string& name, TVMContext ctx, int number, int repeat, - int min_repeat_ms, const std::string& f_prepare_name) { + int min_repeat_ms, const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass ctx by parts. int dev_type = ctx.device_type; @@ -197,11 +197,11 @@ class RPCModuleNode final : public ModuleNode { if (module_handle_ != nullptr) { return remote_get_time_evaluator_(GetRef(this), name, static_cast(ctx.device_type), ctx.device_id, number, - repeat, min_repeat_ms, f_prepare_name); + repeat, min_repeat_ms, f_preproc_name); } else { return remote_get_time_evaluator_(Optional(nullptr), name, static_cast(ctx.device_type), ctx.device_id, number, - repeat, min_repeat_ms, f_prepare_name); + repeat, min_repeat_ms, f_preproc_name); } } @@ -339,7 +339,7 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { } PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, - int min_repeat_ms, PackedFunc f_prepare) { + int min_repeat_ms, PackedFunc f_preproc) { CHECK(pf != nullptr); if (static_cast(ctx.device_type) == static_cast(kDLMicroDev)) { @@ -348,7 +348,7 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe return (*get_micro_time_evaluator)(pf, ctx, number, repeat); } - auto ftimer = 
[pf, ctx, number, repeat, min_repeat_ms, f_prepare](TVMArgs args, + auto ftimer = [pf, ctx, number, repeat, min_repeat_ms, f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; @@ -358,8 +358,8 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); for (int i = 0; i < repeat; ++i) { - if (f_prepare != nullptr) { - f_prepare.CallPacked(args, &temp); + if (f_preproc != nullptr) { + f_preproc.CallPacked(args, &temp); } std::chrono::time_point tbegin, tend; @@ -400,7 +400,7 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, - int number, int repeat, int min_repeat_ms, std::string f_prepare_name) { + int number, int repeat, int min_repeat_ms, std::string f_preproc_name) { TVMContext ctx; ctx.device_type = static_cast(device_type); ctx.device_id = device_id; @@ -409,29 +409,29 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") std::string tkey = m->type_key(); if (tkey == "rpc") { return static_cast(m.operator->()) - ->GetTimeEvaluator(name, ctx, number, repeat, min_repeat_ms, f_prepare_name); + ->GetTimeEvaluator(name, ctx, number, repeat, min_repeat_ms, f_preproc_name); } else { - PackedFunc f_prepare; - if (!f_prepare_name.empty()) { - auto* pf_prepare = runtime::Registry::Get(f_prepare_name); - CHECK(pf_prepare != nullptr) - << "Cannot find " << f_prepare_name << " in the global function"; - f_prepare = *pf_prepare; + PackedFunc f_preproc; + if (!f_preproc_name.empty()) { + auto* pf_preproc = runtime::Registry::Get(f_preproc_name); + CHECK(pf_preproc != nullptr) + << "Cannot find " << f_preproc_name << " in the global function"; + f_preproc = *pf_preproc; } return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms, - f_prepare); + f_preproc); } } else { auto* 
pf = runtime::Registry::Get(name); CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; - PackedFunc f_prepare; - if (!f_prepare_name.empty()) { - auto* pf_prepare = runtime::Registry::Get(f_prepare_name); - CHECK(pf_prepare != nullptr) - << "Cannot find " << f_prepare_name << " in the global function"; - f_prepare = *pf_prepare; + PackedFunc f_preproc; + if (!f_preproc_name.empty()) { + auto* pf_preproc = runtime::Registry::Get(f_preproc_name); + CHECK(pf_preproc != nullptr) + << "Cannot find " << f_preproc_name << " in the global function"; + f_preproc = *pf_preproc; } - return WrapTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms, f_prepare); + return WrapTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms, f_preproc); } }); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 45854702084e..954c5b4ead22 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -324,11 +324,11 @@ struct RemoteSpace { * minimum duration requirement of one `repeat`. * i.e., When the run time of one `repeat` falls below this time, * the `number` parameter will be automatically increased. - * \param f_prepare The function to be executed before we excetute time evaluator. + * \param f_preproc The function to be executed before we execute the time evaluator. * \return f_timer A timer function. */ PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat, - int min_repeat_ms, PackedFunc f_prepare = nullptr); + int min_repeat_ms, PackedFunc f_preproc = nullptr); /*! * \brief Create a Global RPC module that refers to the session. diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index bee9b7d3fb4a..b083a799e662 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -114,6 +114,17 @@ def get_network(name, batch_size): # We will use local mode for tuning configuration.
RPC tracker # mode can be setup similarly to the approach in # :ref:`tune_relay_arm` tutorial. +# +# In the measure option, we turn on enable_cpu_cache_flush to +# get more precise measurement. When we turn it on, we don't +# need to set min_repeat_ms to dynamically adjust to run op +# many times so that we get precise measurement as when we +# have cache flush, we could get precise measurement even we +# run serveral times. So, we could just set number be 1 and +# repeat be 10 to run only 10 times. The reason we set number be 1 +# is we will turn on cache flush before every repeat run in +# internal implementation. So if number is greater than 1, the +# cache flush effect will be probably invalid. tuning_option = { 'log_filename': log_file, From 8b0de2b271a38961657a5b6b85ce56b12d3cd144 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Wed, 15 Jul 2020 10:56:35 +0800 Subject: [PATCH 8/8] update --- python/tvm/autotvm/measure/measure_methods.py | 18 +++++++++++++++--- tutorials/autotvm/tune_relay_x86.py | 14 ++++---------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 5eb4e9958c9f..03fc09a2538e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -181,7 +181,11 @@ class RPCRunner(Runner): call your template and get the reference output. This can work for TOPI templates, but may not work for your custom template. enable_cpu_cache_flush: bool - Whether to enable cpu cache flush, which only has effect on CPU task. + Whether to flush cache on CPU between repeated measurements. + Flushing cache can make the measured latency of one operator closer to + its actual latency during end-to-end inference. + To make this option effective, the argument `number` should also be set to 1. + This only has effect on CPU tasks.
""" def __init__(self, key, host, port, priority=1, @@ -314,7 +318,11 @@ class LocalRunner(RPCRunner): call your template and get the reference output. This can work for TOPI templates, but may not work for your custom template. enable_cpu_cache_flush: bool - Whether to enable cpu cache flush, which only has effect on CPU task. + Whether to flush cache on CPU between repeated measurements. + Flushing cache can make the measured latency of one operator closer to + its actual latency during end-to-end inference. + To make this option effective, the argument `number` should also be set to 1. + This is only has effect on CPU task. Note ---- This is a "fake" local mode. We start a silent rpc tracker and rpc server @@ -462,7 +470,11 @@ def run_through_rpc(measure_input, build_result, ref_output: List of np.ndarray The reference output used for checking correctness enable_cpu_cache_flush: bool - Whether to enable cpu cache flush, which only has effect on CPU task. + Whether to flush cache on CPU between repeated measurements. + Flushing cache can make the measured latency of one operator closer to + its actual latency during end-to-end inference. + To make this option effective, the argument `number` should also be set to 1. + This is only has effect on CPU task. """ if isinstance(build_result, MeasureResult): return build_result diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index b083a799e662..92fdafb056d1 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -115,16 +115,10 @@ def get_network(name, batch_size): # mode can be setup similarly to the approach in # :ref:`tune_relay_arm` tutorial. # -# In the measure option, we turn on enable_cpu_cache_flush to -# get more precise measurement. 
When we turn it on, we don't -# need to set min_repeat_ms to dynamically adjust to run op -# many times so that we get precise measurement as when we -# have cache flush, we could get precise measurement even we -# run serveral times. So, we could just set number be 1 and -# repeat be 10 to run only 10 times. The reason we set number be 1 -# is we will turn on cache flush before every repeat run in -# internal implementation. So if number is greater than 1, the -# cache flush effect will be probably invalid. +# To perform a precise measurement, we should repeat the measurement several +# times and use the average of results. In addition, we need to flush the cache +# for the weight tensors between repeated measurements. This can make the measured +# latency of one operator closer to its actual latency during end-to-end inference. tuning_option = { 'log_filename': log_file,