From 79131a1f20f43d1d05e56142ced584f2700c6648 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Tue, 27 Oct 2020 16:29:37 -0700 Subject: [PATCH] [FIX][AUTOTVM] Make autotvm work with spawn Like #6671, this PR fixes autotvm when using the spawn start method for multiprocessing. I've added tests to make sure that things work with spawn in CI. --- python/tvm/autotvm/task/task.py | 14 ++++-- .../tvm/autotvm/tuner/xgboost_cost_model.py | 50 +++++++++++++------ .../unittest/test_autotvm_index_tuner.py | 14 ++++++ tests/python/unittest/test_autotvm_measure.py | 15 ++++++ .../unittest/test_autotvm_xgboost_model.py | 15 ++++++ .../auto_scheduler/tune_conv2d_layer_cuda.py | 4 ++ tutorials/auto_scheduler/tune_matmul_x86.py | 4 ++ tutorials/autotvm/tune_conv2d_cuda.py | 4 ++ tutorials/autotvm/tune_relay_arm.py | 4 ++ tutorials/autotvm/tune_relay_cuda.py | 4 ++ tutorials/autotvm/tune_relay_mobile_gpu.py | 4 ++ tutorials/autotvm/tune_relay_x86.py | 4 ++ tutorials/autotvm/tune_simple_template.py | 4 ++ 13 files changed, 121 insertions(+), 19 deletions(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index a7cb9a095765..8822ba971e4c 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -23,15 +23,14 @@ """ import numpy as np -from tvm.target import Target from tvm import runtime from tvm.ir import container +from tvm.target import Target +from tvm.te import placeholder, tensor from tvm.tir import expr -from tvm.te import tensor, placeholder - from ..util import get_const_int, get_const_tuple -from .dispatcher import DispatchContext, ApplyConfig +from .dispatcher import ApplyConfig, DispatchContext from .space import ConfigSpace @@ -173,6 +172,8 @@ def __getstate__(self): # some unpickable local task functions. # So we only pickle the name of the function # and restore the function by name when unpickling it. + import cloudpickle # pylint: disable=import-outside-toplevel + return { "name": self.name, "args": self.args, @@ -181,14 +182,17 @@ def __getstate__(self): "flop": self.flop, "target": self.target, "target_host": self.target_host, + "func": cloudpickle.dumps(self.func), } def __setstate__(self, state): + import cloudpickle # pylint: disable=import-outside-toplevel + self.name = state["name"] self.args = state["args"] self.kwargs = state["kwargs"] self.config_space = state["config_space"] - self.func = _lookup_task(state["name"]) + self.func = cloudpickle.loads(state["func"]) self.flop = state["flop"] self.target = state["target"] self.target_host = state["target_host"] diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index 7b9df1c99373..f66764c42520 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -153,7 +153,10 @@ def _reset_pool(self, space, target, task): self._close_pool() - # use global variable to pass common arguments + # Use global variables to pass common arguments. These are only used when + # new processes are started with fork. We have to set the globals + # before we create the pool, so that processes in the pool get the + # correct globals.
global _extract_space, _extract_target, _extract_task _extract_space = space _extract_target = target @@ -324,7 +327,12 @@ def _get_feature(self, indexes): if need_extract: pool = self._get_pool() - feas = pool.map(self.feature_extract_func, need_extract) + # If we are forking, we can pass arguments in globals for better performance + if multiprocessing.get_start_method(False) == "fork": + feas = pool.map(self.feature_extract_func, need_extract) + else: + args = [(self.space.get(x), self.target, self.task) for x in need_extract] + feas = pool.map(self.feature_extract_func, args) for i, fea in zip(need_extract, feas): fea_cache[i] = fea @@ -344,18 +352,24 @@ def __del__(self): self._close_pool() +# Global variables for passing arguments to extract functions. _extract_space = None _extract_target = None _extract_task = None -def _extract_itervar_feature_index(index): +def _extract_itervar_feature_index(args): """extract iteration var feature for an index in extract_space""" try: - config = _extract_space.get(index) - with _extract_target: - sch, args = _extract_task.instantiate(config) - fea = feature.get_itervar_feature_flatten(sch, args, take_log=True) + if multiprocessing.get_start_method(False) == "fork": + config = _extract_space.get(args) + with _extract_target: + sch, fargs = _extract_task.instantiate(config) + else: + config, target, task = args + with target: + sch, fargs = task.instantiate(config) + fea = feature.get_itervar_feature_flatten(sch, fargs, take_log=True) fea = np.concatenate((fea, list(config.get_other_option().values()))) return fea except Exception: # pylint: disable=broad-except @@ -381,10 +395,13 @@ def _extract_itervar_feature_log(arg): return None -def _extract_knob_feature_index(index): +def _extract_knob_feature_index(args): """extract knob feature for an index in extract_space""" try: - config = _extract_space.get(index) + if multiprocessing.get_start_method(False) == "fork": + config = _extract_space.get(args) + else: + config = args[0] return config.get_flatten_feature() except Exception: # pylint: disable=broad-except return None @@ -408,13 +425,18 @@ def _extract_knob_feature_log(arg): return None -def _extract_curve_feature_index(index): +def _extract_curve_feature_index(args): """extract sampled curve feature for an index in extract_space""" try: - config = _extract_space.get(index) - with _extract_target: - sch, args = _extract_task.instantiate(config) - fea = feature.get_buffer_curve_sample_flatten(sch, args, sample_n=20) + if multiprocessing.get_start_method(False) == "fork": + config = _extract_space.get(args) + with _extract_target: + sch, fargs = _extract_task.instantiate(config) + else: + config, target, task = args + with target: + sch, fargs = task.instantiate(config) + fea = feature.get_buffer_curve_sample_flatten(sch, fargs, sample_n=20) fea = np.concatenate((fea, list(config.get_other_option().values()))) return np.array(fea) except Exception: # pylint: disable=broad-except diff --git a/tests/python/unittest/test_autotvm_index_tuner.py b/tests/python/unittest/test_autotvm_index_tuner.py index 05f12118e6af..c433d8fb7297 100644 --- a/tests/python/unittest/test_autotvm_index_tuner.py +++ b/tests/python/unittest/test_autotvm_index_tuner.py @@ -16,6 +16,7 @@ # under the License. 
"""Test index based tuners""" +import multiprocessing from test_autotvm_common import DummyRunner, get_sample_task from tvm import autotvm from tvm.autotvm.tuner import GridSearchTuner, RandomTuner @@ -43,6 +44,18 @@ def test_gridsearch_tuner(): assert not tuner.has_next() +def grid_search_spawn(): + assert multiprocessing.get_spawn_method(False) == "spawn" + test_gridsearch_tuner() + + +def test_grid_search_tuner_spawn(): + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=test_gridsearch_tuner) + p.start() + p.join() + + def test_random_tuner(): """Test RandomTuner""" @@ -65,4 +78,5 @@ def test_random_tuner(): if __name__ == "__main__": test_gridsearch_tuner() + test_gridsearch_tuner_spawn() test_random_tuner() diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index c8760d2be1b4..1a18d6122bf0 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -16,6 +16,7 @@ # under the License. """Test builder and runner""" import logging +import multiprocessing import time import numpy as np @@ -46,6 +47,19 @@ def test_task_tuner_without_measurement(): assert tuner.best_flops > 1 +def task_tuner_spawn(): + assert multiprocessing.get_start_method(False) == "spawn" + test_task_tuner_without_measurement() + + +def test_task_tuner_without_measurement_spawn(): + # Subprocesses inherit the spawn method of their parents + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=task_tuner_spawn) + p.start() + p.join() + + def test_check_correctness(): task, target = get_sample_task() @@ -77,4 +91,5 @@ def _callback_wrong(tuner, measure_inputs, measure_results): logging.basicConfig(level=logging.INFO) test_task_tuner_without_measurement() + test_task_tuner_without_measurement_spawn() test_check_correctness() diff --git a/tests/python/unittest/test_autotvm_xgboost_model.py b/tests/python/unittest/test_autotvm_xgboost_model.py index 5789a9fad4d5..58b2a4d66344 100644 --- a/tests/python/unittest/test_autotvm_xgboost_model.py +++ b/tests/python/unittest/test_autotvm_xgboost_model.py @@ -16,6 +16,7 @@ # under the License. import time +import multiprocessing import numpy as np import tvm @@ -43,6 +44,19 @@ def test_fit(): upper_model.fit(xs, ys, plan_size=32) +def fit_spawn(): + assert multiprocessing.get_start_method(False) == "spawn" + test_fit() + + +def test_fit_spawn(): + # Subprocesses inherit the spawn method of their parents + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=test_fit) + p.start() + p.join() + + def test_tuner(): task, target = get_sample_task() records = get_sample_records(n=100) @@ -53,4 +67,5 @@ def test_tuner(): if __name__ == "__main__": test_fit() + test_fit_spawn() test_tuner() diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index 10a2d1b44144..42273bf72891 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -30,6 +30,10 @@ find a good schedule in the space. We use a convolution layer as an example in this tutorial. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. 
""" import os diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 81f2e71ff8f7..0f2ebe0e09a4 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -27,6 +27,10 @@ find a good schedule in the space. We use matrix multiplication as an example in this tutorial. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ import os diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index ce9c19860ff4..b307077905d3 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -22,6 +22,10 @@ This is an advanced tutorial for writing high performance tunable template for NVIDIA GPU. By running auto-tuner on this template, we can outperform the vendor provided library CuDNN in many cases. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index f024ba4f201a..31fda54a9a7e 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -35,6 +35,10 @@ We also released pre-tuned parameters for some arm devices. You can go to `ARM CPU Benchmark `_ to see the results. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 4636103a22e2..e86430767b31 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -33,6 +33,10 @@ We also released pre-tuned parameters for some NVIDIA GPUs. You can go to `NVIDIA GPU Benchmark `_ to see the results. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 61254662c463..9a112e134f4f 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -33,6 +33,10 @@ We also released pre-tuned parameters for some arm devices. You can go to `Mobile GPU Benchmark `_ to see the results. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 1dd947fefd25..b1b7ca29e46a 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -23,6 +23,10 @@ This is a tutorial about how to tune convolution neural network for x86 CPU. 
+ +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in an :code:`if +__name__ == "__main__":` block. """ import os import numpy as np diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index 357abf19a09c..b5167b3c72ab 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -26,6 +26,10 @@ The second step is running a search algorithm to explore through this space. In this tutorial, you can learn how to perform these two steps in TVM. The whole workflow is illustrated by a matrix multiplication example. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in an :code:`if +__name__ == "__main__":` block. """ ######################################################################
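
Reviewer note: below is a minimal, self-contained sketch of the two behaviors this patch works around; it is not part of the patch, and `make_task`/`run` are illustrative names, not TVM APIs. First, the stdlib pickle cannot serialize local functions or closures, which is why Task.__getstate__/__setstate__ now round-trip self.func through cloudpickle. Second, under the spawn start method each worker re-imports __main__, which is why the tutorials now advise wrapping their bodies in an `if __name__ == "__main__":` block.

import multiprocessing

import cloudpickle


def make_task():
    # A closure like this is what the stdlib pickle rejects; it is the
    # reason Task.__getstate__ serializes self.func with cloudpickle.
    n = 3
    return lambda x: x * n


def run(payload_and_arg):
    payload, arg = payload_and_arg
    func = cloudpickle.loads(payload)  # restore the function in the worker
    return func(arg)


if __name__ == "__main__":
    # Under spawn, each worker re-imports __main__, so any unguarded
    # top-level code would re-execute in every worker process.
    multiprocessing.set_start_method("spawn")
    payload = cloudpickle.dumps(make_task())
    with multiprocessing.Pool(2) as pool:
        print(pool.map(run, [(payload, i) for i in range(4)]))  # [0, 3, 6, 9]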