Merged
5 changes: 4 additions & 1 deletion topi/python/topi/arm_cpu/bitserial_conv2d.py
@@ -327,14 +327,16 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec,
def schedule_bitserial_conv2d_nhwc(outs):
"""Raspverry pi schedule for bitserial conv2d"""
s = tvm.create_schedule([x.op for x in outs])
scheduled_ops = []

def traverse(op):
"""Traverse operators from computation graph"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(op.tag):
if op not in s.outputs:
s[op].compute_inline()
for tensor in op.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)

if 'spatial_bitserial_conv_nhwc' in op.tag:
@@ -360,6 +362,7 @@ def traverse(op):

_schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec,
kernel, kernel_q, kernel_vec, conv_out, output, outs[0])
scheduled_ops.append(op)

traverse(outs[0].op)
return s
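
The hunks in this PR all apply the same pattern: the schedule function keeps a scheduled_ops list, skips producers that were already visited, and records each op after it has been handled, so an op shared by several consumers is scheduled only once. Below is a minimal, TVM-free sketch of why that matters, under illustrative names (the Op class, the graph, and the printed output are made up for this example, not taken from the PR):

# Minimal illustration (not TVM code): in a diamond-shaped graph an op is
# reachable through several paths, so a naive recursive traversal visits --
# and would schedule -- it once per path. The scheduled_ops guard avoids that.
class Op:
    def __init__(self, name, inputs=()):
        self.name = name
        self.input_ops = list(inputs)

conv = Op("conv2d")
relu = Op("relu", [conv])
sigm = Op("sigmoid", [conv])
out = Op("add", [relu, sigm])          # conv2d is shared by relu and sigmoid

def traverse_naive(op, visits):
    visits.append(op.name)
    for parent in op.input_ops:
        traverse_naive(parent, visits)

def traverse_guarded(op, visits, scheduled_ops):
    visits.append(op.name)
    for parent in op.input_ops:
        if parent not in scheduled_ops:     # the guard added in this PR
            traverse_guarded(parent, visits, scheduled_ops)
    scheduled_ops.append(op)                # record the op once it is handled

naive, guarded = [], []
traverse_naive(out, naive)
traverse_guarded(out, guarded, [])
print(naive)    # ['add', 'relu', 'conv2d', 'sigmoid', 'conv2d'] -- conv2d visited twice
print(guarded)  # ['add', 'relu', 'conv2d', 'sigmoid']           -- conv2d visited once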
5 changes: 4 additions & 1 deletion topi/python/topi/arm_cpu/conv2d.py
@@ -39,10 +39,11 @@ def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
def schedule_conv2d_nchw_arm_cpu(cfg, outs):
"""TOPI schedule callback"""
s = tvm.create_schedule([x.op for x in outs])
scheduled_ops = []
Member:

You should handle this logic in the utility function traverse_inline.
Currently the traverse/inline logic is duplicated as redundant code across TOPI. I recommend that developers switch to this utility function.

@masahi (Member Author), Aug 6, 2018:

Yeah, it was a pain to add all that scheduled_ops stuff to all the backends.

But it is not a bug, right?

I was not in the mood to refactor all the traverse logic to use traverse_inline.

@masahi (Member Author):

When I looked at your traverse_inline function, I thought I would need to pass scheduled_ops around to it. I didn't want to make that change, so I chose a more straightforward approach.

Member:

Here is my local version. Possibly you can update it for arm_cpu only in your NNVM fusion PR.

def traverse_inline(s, final_op, callback):
    """Traverse computation graph and do auto inline

    Parameters
    ----------
    s: schedule
        The schedule
    final_op: Operation
        The final output operator.
    callback: callable
        The callback function on each op
    """
    visited = set()

    def _traverse(op):
        if op in visited:
            return
        visited.add(op)
        if tag.is_injective(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse_inline(s, tensor.op, callback)
        callback(op)

    _traverse(final_op)

@masahi (Member Author), Aug 6, 2018:

It should be

for tensor in op.input_tensors:
    if tensor.op.input_tensors:
        _traverse(tensor.op)

no?

@masahi (Member Author):

I updated #1548 according to your comment.

Member:

Aha, yes..
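
For reference, here is the helper with the recursion fix the thread converges on, i.e. recursing through the inner _traverse instead of calling the outer function again. This is a sketch assembled from the reviewer's snippet and the correction above, not necessarily the exact code that landed in #1548; the import of tag is an assumption mirroring how the files in this diff use TOPI's tag module.

from topi import tag   # assumed import; the TOPI sources use a relative "from .. import tag"

def traverse_inline(s, final_op, callback):
    """Traverse computation graph and do auto inline

    Parameters
    ----------
    s: schedule
        The schedule
    final_op: Operation
        The final output operator.
    callback: callable
        The callback function on each op
    """
    visited = set()

    def _traverse(op):
        if op in visited:          # each op is processed at most once
            return
        visited.add(op)
        if tag.is_injective(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    _traverse(tensor.op)   # fixed: recurse via _traverse
        callback(op)

    _traverse(final_op)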


def _callback(op):
# schedule conv2d
if 'spatial_conv_output' in op.tag:
if 'spatial_conv_output' in op.tag and op not in scheduled_ops:
output = op.output(0)
conv = op.input_tensors[0]

@@ -64,6 +65,8 @@ def _callback(op):
output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0])

scheduled_ops.append(op)

traverse_inline(s, outs[0].op, _callback)
return s

6 changes: 5 additions & 1 deletion topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -79,8 +79,10 @@ def _schedule(cfg, s, data, data_pad, kernel, output):

return s

scheduled_ops = []

def _callback(op):
if op.tag == 'depthwise_conv2d_nchw':
if op.tag == 'depthwise_conv2d_nchw' and op not in scheduled_ops:
output = op.output(0)
kernel = op.input_tensors[1]
data = op.input_tensors[0]
@@ -90,5 +92,7 @@ def _callback(op):
data = data_pad.op.input_tensors[0]
_schedule(cfg, s, data, data_pad, kernel, output)

scheduled_ops.append(op)

traverse_inline(s, outs[0].op, _callback)
return s
6 changes: 5 additions & 1 deletion topi/python/topi/cuda/conv2d_hwcn.py
@@ -99,13 +99,15 @@ def schedule(Apad, W, B):
sch[WW].bind(tx, thread_x)
sch[WW].vectorize(fi)

scheduled_ops = []

def traverse(operator):
"""Traverse operators from computation graph"""
if tag.is_broadcast(operator.tag):
if operator not in sch.outputs:
sch[operator].compute_inline()
for tensor in operator.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
elif operator.tag == 'conv2d_hwcn':
Apad = operator.input_tensors[0]
@@ -117,5 +119,7 @@ def traverse(operator):
else:
raise RuntimeError("Unsupported operator: %s" % operator.tag)

scheduled_ops.append(operator)

traverse(outs[0].op)
return sch
6 changes: 5 additions & 1 deletion topi/python/topi/cuda/conv2d_nchw.py
@@ -492,14 +492,16 @@ def schedule(temp, Filter, Output):
else:
conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L)

scheduled_ops = []

def traverse(OP):
"""Traverse operators from computation graph"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule conv2d
if 'conv2d_nchw' in OP.tag:
@@ -510,6 +512,8 @@ def traverse(OP):
Output = OP.output(0)
schedule(temp, Filter, Output)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s

6 changes: 5 additions & 1 deletion topi/python/topi/cuda/conv2d_transpose_nchw.py
@@ -73,14 +73,16 @@ def schedule(temp, Filter, Output):
else:
conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L)

scheduled_ops = []

def traverse(OP):
"""Internal travserse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_injective(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule conv2d_transpose_nchw
if 'conv2d_transpose_nchw' in OP.tag:
@@ -91,6 +93,8 @@ def traverse(OP):
Output = OP.output(0)
schedule(temp, Filter, Output)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s

6 changes: 5 additions & 1 deletion topi/python/topi/cuda/dense.py
@@ -86,14 +86,16 @@ def _schedule(Dense):
s[Dense].set_store_predicate(thread_x.var.equal(0))
s[Out].set_store_predicate(thread_x.var.equal(0))

scheduled_ops = []

def traverse(OP):
"""Internal travserse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule dense
elif OP.tag == 'dense':
@@ -102,5 +104,7 @@ def traverse(OP):
else:
raise RuntimeError("Unsupported operator: %s" % OP.tag)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s
12 changes: 10 additions & 2 deletions topi/python/topi/cuda/depthwise_conv2d.py
@@ -101,14 +101,16 @@ def _schedule(PaddedInput, Filter, DepthwiseConv2d):
s[FS].bind(ty, thread_y)
s[FS].bind(tx, thread_x)

scheduled_ops = []

def traverse(OP):
"""Internal travserse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule depthwise_conv2d
if OP.tag == 'depthwise_conv2d_nchw':
@@ -119,6 +121,8 @@ def traverse(OP):
DepthwiseConv2d = OP.output(0)
_schedule(PaddedInput, Filter, DepthwiseConv2d)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s

@@ -180,14 +184,16 @@ def _schedule(temp, Filter, DepthwiseConv2d):
fused = s[FS].fuse(fi, ci)
s[FS].bind(fused, thread_x)

scheduled_ops = []

def traverse(OP):
"""Internal travserse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule depthwise_conv2d
if OP.tag == 'depthwise_conv2d_nhwc':
@@ -198,6 +204,8 @@ def traverse(OP):
DepthwiseConv2d = OP.output(0)
_schedule(PaddedInput, Filter, DepthwiseConv2d)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s

12 changes: 10 additions & 2 deletions topi/python/topi/cuda/pooling.py
@@ -45,14 +45,16 @@ def _schedule(Pool):
else:
s[Pool].compute_at(s[Out], tx)

scheduled_ops = []

def traverse(OP):
"""Internal travserse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule global_pool
elif OP.tag.startswith('global_pool'):
@@ -61,6 +63,8 @@ def traverse(OP):
else:
raise RuntimeError("Unsupported operator: %s" % OP.tag)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s

@@ -101,14 +105,16 @@ def _schedule(PaddedInput, Pool):
else:
s[Pool].compute_at(s[Out], tx)

scheduled_ops = []

def traverse(OP):
"""Internal travserse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule pool
elif OP.tag.startswith('pool'):
@@ -118,5 +124,7 @@ def traverse(OP):
else:
raise RuntimeError("Unsupported operator: %s" % OP.tag)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s
17 changes: 13 additions & 4 deletions topi/python/topi/cuda/reduction.py
@@ -88,6 +88,7 @@ def schedule_reduce(outs):
"""
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
sch = tvm.create_schedule([x.op for x in outs])
scheduled_ops = []

def traverse_before_reduce(operator):
"""Internal travserse function"""
@@ -96,24 +97,32 @@ def traverse_before_reduce(operator):
elif tag.is_injective(operator.tag):
sch[operator].compute_inline()
for tensor in operator.input_tensors:
traverse_before_reduce(tensor.op)
if tensor.op not in scheduled_ops:
traverse_before_reduce(tensor.op)
else:
raise RuntimeError("Unsupported operator: %s" % operator.tag)

scheduled_ops.append(operator)

def traverse_after_reduce(operator):
"""Internal travserse function"""
if tag.is_broadcast(operator.tag):
raise RuntimeError("Not yet support ewise after reduce")
elif operator.tag == 'comm_reduce':
_schedule_reduce(operator, sch, is_idx_reduce=False)
for tensor in operator.input_tensors:
traverse_before_reduce(tensor.op)
if tensor.op not in scheduled_ops:
traverse_before_reduce(tensor.op)
elif operator.tag == 'comm_reduce_idx':
_schedule_reduce(operator, sch, is_idx_reduce=True)
for tensor in operator.input_tensors[0].op.input_tensors:
traverse_before_reduce(tensor.op)
input_tensors = operator.input_tensors[0].op.input_tensors
for tensor in input_tensors:
if tensor.op not in scheduled_ops:
traverse_before_reduce(tensor.op)
else:
raise RuntimeError("Unsupported operator: %s" % operator.tag)

scheduled_ops.append(operator)

traverse_after_reduce(outs[0].op)
return sch
6 changes: 5 additions & 1 deletion topi/python/topi/cuda/vision.py
@@ -11,6 +11,8 @@ def _default_schedule(outs):
target = tvm.target.current_target()
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
s = tvm.create_schedule([x.op for x in outs])
scheduled_ops = []

def traverse(op):
"""inline all one-to-one-mapping operators except the last stage (output)"""
if "nms" in op.tag:
@@ -32,9 +34,11 @@ def traverse(op):
s[x].bind(bx, tvm.thread_axis("blockIdx.x"))
s[x].bind(tx, tvm.thread_axis("threadIdx.x"))
for tensor in op.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)

scheduled_ops.append(op)

traverse(outs[0].op)
return s

10 changes: 8 additions & 2 deletions topi/python/topi/intel_graphics/conv2d.py
@@ -113,19 +113,22 @@ def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, layout, out_
"""
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
s = tvm.create_schedule([x.op for x in outs])
scheduled_ops = []

def traverse(op):
"""inline all one-to-one-mapping operators except the last stage (output)"""
if tag.is_broadcast(op.tag):
if op not in s.outputs:
s[op].compute_inline()
for tensor in op.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
or "1_16" in op.tag:
_schedule_cl_spatialpack_NCHWc(s, op)

scheduled_ops.append(op)

traverse(outs[0].op)

return s
@@ -360,19 +363,22 @@ def schedule_conv2d_nchw(outs):
"""
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
s = tvm.create_schedule([x.op for x in outs])
scheduled_ops = []

def traverse(op):
"""inline all one-to-one-mapping operators except the last stage (output)"""
if tag.is_broadcast(op.tag):
if op not in s.outputs:
s[op].compute_inline()
for tensor in op.input_tensors:
if tensor.op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op)
if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
or "1_16" in op.tag:
_schedule_cl_spatialpack(s, op)

scheduled_ops.append(op)

traverse(outs[0].op)
return s
