From 639db175c8de8983f286966adc22e93ea3ce0d90 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Mon, 17 Feb 2020 08:07:47 +0000 Subject: [PATCH 01/53] passed args down to acceptSubgraph --- .../extensions/lib_subgraph/subgraph_lib.cc | 12 +++++- include/mxnet/lib_api.h | 31 ++++++++++++-- src/c_api/c_api_symbolic.cc | 8 ++++ .../partitioner/custom_subgraph_property.h | 41 ++++++++++++++++--- 4 files changed, 82 insertions(+), 10 deletions(-) diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index 0727eb786ad8..e3fd3dd9bd89 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -219,10 +219,20 @@ MXReturnValue mySupportedOps(std::string json, MXReturnValue myAcceptSubgraph(std::string json, int subraph_id, bool* accept, std::unordered_map& options, - std::unordered_map& attrs) { + std::unordered_map& attrs, + std::map& args) { for (auto kv : options) { std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; } + for (auto kv : args) { + std::cout << "arg: " << kv.first << " ==> ("; + for (auto s : kv.second.shape) + std::cout << s << ","; + std::cout << ") ["; + for (int i=0; i()[i] << ", "; + std::cout << "]" << std::endl; + } if(options.find("reject") != options.end() && options["reject"].compare("True") == 0) { *accept = false; diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index aeb5f79e2f70..b47e9aa52ea3 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -717,7 +717,8 @@ typedef MXReturnValue (*supportedOps_t)(std::string, int, int*, std::unordered_map&); typedef MXReturnValue (*acceptSubgraph_t)(std::string, int, bool*, std::unordered_map&, - std::unordered_map&); + std::unordered_map&, + std::map&); /*! 
* \brief An abstract class for subgraph property @@ -920,7 +921,11 @@ typedef int (*partCallSupportedOps_t)(supportedOps_t supportedOps, const char *j typedef int (*partCallAcceptSubgraph_t)(acceptSubgraph_t acceptSubgraph, const char *json, int subgraph_id, int *accept, const char* const* opt_keys, const char* const* opt_vals, int num_opts, - char*** attr_keys, char*** attr_vals, int *num_attrs); + char*** attr_keys, char*** attr_vals, int *num_attrs, + const char* const* arg_names, int num_args, + void** arg_data, const int64_t** arg_shapes, int* arg_dims, + int* arg_types, size_t* arg_IDs, const char** arg_dev_type, + int* arg_dev_id); #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int version); @@ -1283,7 +1288,11 @@ extern "C" { _partCallAcceptSubgraph(acceptSubgraph_t acceptSubgraph, const char *json, int subgraph_id, int *accept, const char* const* opt_keys, const char* const* opt_vals, int num_opts, - char*** attr_keys, char*** attr_vals, int *num_attrs) { + char*** attr_keys, char*** attr_vals, int *num_attrs, + const char* const* arg_names, int num_args, + void** arg_data, const int64_t** arg_shapes, int* arg_dims, + int* arg_types, size_t* arg_IDs, const char** arg_dev_type, + int* arg_dev_id) { std::string subgraph_json(json); bool accept_bool = false; // create map of attributes from list @@ -1292,10 +1301,24 @@ extern "C" { opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); } + // create a map of named tensors for args + std::map args; + for (int i = 0; i < num_args; i++) { + std::vector shapes; + for (int j = 0; j < arg_dims[i]; j++) + shapes.push_back(arg_shapes[i][j]); + + // void *data_ptr, const std::vector &shape, MXDType dtype, size_t vID, MXContext mx_ctx + MXTensor tensor(arg_data[i], shapes, (MXDType)arg_types[i], + arg_IDs[i], {arg_dev_type[i], arg_dev_id[i]}); + args[arg_names[i]] = tensor; + } + + // attributes to set on subgraph node std::unordered_map attrs; - MXReturnValue retval = acceptSubgraph(subgraph_json, subgraph_id, &accept_bool, opts, attrs); + MXReturnValue retval = acceptSubgraph(subgraph_json, subgraph_id, &accept_bool, opts, attrs, args); *accept = accept_bool; if (attrs.size() > 0) { diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 8f78fc110d49..ffbbd0ab548a 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -1383,6 +1383,14 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, common::HandleInferStorageTypeError(num_forward_inputs, indexed_graph, g.GetAttr("storage_type")); } + std::vector arg_names = sym->ListInputNames(nnvm::Symbol::kReadOnlyArgs); + g.attrs["in_args"] = std::make_shared(in_args_ptr); + g.attrs["in_arg_names"] = std::make_shared(arg_names); + } else { + NDArray **in_args_ptr = static_cast(nullptr); + std::vector arg_names; + g.attrs["in_args"] = std::make_shared(in_args_ptr); + g.attrs["in_arg_names"] = std::make_shared(arg_names); } std::vector> options_map; for (mx_uint i = 0; i < num_options; ++i) { diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 5d0629c25190..eea8715180a5 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -99,7 +99,10 @@ class CustomSubgraphProperty: public SubgraphProperty { const std::vector>& options_map) { // clear supported_nodes to remove state from previous calls supported_nodes.clear(); - + // get input args and arg names + 
in_arg_names = g.GetAttr>("in_arg_names"); + in_args_ptr = g.GetAttr("in_args"); + // remove all graph attrs, some cannot be saved to json nnvm::Graph graph = std::move(g); graph.attrs.clear(); @@ -162,7 +165,7 @@ class CustomSubgraphProperty: public SubgraphProperty { } // override CreateSubgraphNode virtual nnvm::ObjectPtr CreateSubgraphNode(const nnvm::Symbol &sym, - const int subgraph_id = 0) const { + const int subgraph_id = 0) const { int accept = 1; int num_attr = 0; char** attr_keys = nullptr; @@ -187,11 +190,37 @@ class CustomSubgraphProperty: public SubgraphProperty { } } + // convert input args + std::vector arg_names; + std::vector arg_data; + std::vector arg_shapes; + std::vector arg_dims; + std::vector arg_types; + std::vector arg_verIDs; + std::vector arg_dev_type; + std::vector arg_dev_id; + for (int i=0; i> options_map_; std::vector opt_keys_, opt_vals_; + std::vector in_arg_names; + NDArray **in_args_ptr; }; } // namespace op } // namespace mxnet From 5898d5357ed9495fabec7ea4bab59da55c78d220 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Tue, 18 Feb 2020 10:23:06 +0000 Subject: [PATCH 02/53] added example and set param names on inputs to subgraph to map --- .../extensions/lib_subgraph/subgraph_lib.cc | 8 ++- .../extensions/lib_subgraph/test_subgraph.py | 30 ++++++++++ include/mxnet/lib_api.h | 14 ++--- src/c_api/c_api.cc | 6 +- src/operator/subgraph/build_subgraph.cc | 6 +- .../partitioner/custom_subgraph_property.h | 55 +++++++++++-------- 6 files changed, 83 insertions(+), 36 deletions(-) diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index e3fd3dd9bd89..df95888cd29b 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -206,7 +206,7 @@ MXReturnValue mySupportedOps(std::string json, } //check if op dtype is float - if(dtype == kFloat32) { + if((dtype == kFloat32 && options.count("reqFloat") > 0) || options.count("reqFloat") == 0) { //check if op is in whitelist if(std::find(op_names.begin(),op_names.end(),op.str.c_str()) != op_names.end()) { // found op in whitelist, set value to 1 to include op in subgraph @@ -233,6 +233,12 @@ MXReturnValue myAcceptSubgraph(std::string json, int subraph_id, bool* accept, std::cout << kv.second.data()[i] << ", "; std::cout << "]" << std::endl; } + if(options.count("reqArgs") > 0 && args.size() == 0) { + *accept = false; + std::cout << "rejecting subgraph since args were not provided" << std::endl; + return MX_SUCCESS; + } + if(options.find("reject") != options.end() && options["reject"].compare("True") == 0) { *accept = false; diff --git a/example/extensions/lib_subgraph/test_subgraph.py b/example/extensions/lib_subgraph/test_subgraph.py index 8169261d4d42..6d030913a8db 100644 --- a/example/extensions/lib_subgraph/test_subgraph.py +++ b/example/extensions/lib_subgraph/test_subgraph.py @@ -35,12 +35,17 @@ path = os.path.abspath('libsubgraph_lib.dll') mx.library.load(path) +# example model, ops to be partitioned do not have args (use outputs from other ops as inputs) a = mx.sym.var('a') b = mx.sym.var('b') c = a + b d = mx.sym.exp(c) sym = mx.sym.log(d) +# example model, ops to be partitioned have args +d2 = mx.sym.exp(a) +sym2 = mx.sym.log(d2) + #execute in MXNet print('-------------------------------') print('Testing regular MXNet execution') @@ -74,3 +79,28 @@ exe3 = mysym3.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))}) out3 = exe3.forward() print(out3) + +#execute in MXNet 
+print('-------------------------------') +print('Testing regular MXNet execution') +exe4 = sym2.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2))}) +out4 = exe4.forward() +print(out4) + +# with propogating shapes/types +print('-------------------------------') +print('Testing partitioning with shapes/types') +arg_array = [mx.nd.ones((3,2),dtype='float32')] +mysym5 = sym2.optimize_for("myProp", arg_array, reqArgs=True) +print(mysym5.tojson()) +exe5 = mysym5.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2))}) +out5 = exe5.forward() +print(out5) + +# without propogating shapes/types +print('-------------------------------') +print('Testing partitioning without shapes/types') +mysym6 = sym2.optimize_for("myProp", reqArgs=True) +exe6 = mysym6.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2))}) +out6 = exe6.forward() +print(out6) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index b47e9aa52ea3..d3c922384bb7 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -923,9 +923,9 @@ typedef int (*partCallAcceptSubgraph_t)(acceptSubgraph_t acceptSubgraph, const c const char* const* opt_vals, int num_opts, char*** attr_keys, char*** attr_vals, int *num_attrs, const char* const* arg_names, int num_args, - void** arg_data, const int64_t** arg_shapes, int* arg_dims, - int* arg_types, size_t* arg_IDs, const char** arg_dev_type, - int* arg_dev_id); + void* const* arg_data, const int64_t* const* arg_shapes, + const int* arg_dims, const int* arg_types, const size_t* arg_IDs, + const char* const* arg_dev_type, const int* arg_dev_id); #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int version); @@ -1290,9 +1290,9 @@ extern "C" { const char* const* opt_vals, int num_opts, char*** attr_keys, char*** attr_vals, int *num_attrs, const char* const* arg_names, int num_args, - void** arg_data, const int64_t** arg_shapes, int* arg_dims, - int* arg_types, size_t* arg_IDs, const char** arg_dev_type, - int* arg_dev_id) { + void* const* arg_data, const int64_t* const* arg_shapes, const int* arg_dims, + const int* arg_types, const size_t* arg_IDs, const char* const* arg_dev_type, + const int* arg_dev_id) { std::string subgraph_json(json); bool accept_bool = false; // create map of attributes from list @@ -1307,7 +1307,7 @@ extern "C" { std::vector shapes; for (int j = 0; j < arg_dims[i]; j++) shapes.push_back(arg_shapes[i][j]); - + // void *data_ptr, const std::vector &shape, MXDType dtype, size_t vID, MXContext mx_ctx MXTensor tensor(arg_data[i], shapes, (MXDType)arg_types[i], arg_IDs[i], {arg_dev_type[i], arg_dev_id[i]}); diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 962bb3b6c06e..5cab37a66a1a 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -823,9 +823,9 @@ int MXLoadLib(const char *path) { // MXNET_REGISTER_SUBGRAPH_PROPERTY(customBackend, CustomSubgraphProperty); mxnet::op::SubgraphBackendRegistry::Get()->__REGISTER_CUSTOM_PROPERTY__(name_str, - std::make_shared( - strategy_str, callSupportedOps, supportedOps_fp, - callAcceptSubgraph, acceptSubgraph_fp, callFree, op_name_str)); + std::make_shared( + strategy_str, callSupportedOps, supportedOps_fp, + callAcceptSubgraph, acceptSubgraph_fp, callFree, op_name_str)); } } API_END(); diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index a66e8a158c14..dfbc0b5f7df8 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -561,10 +561,12 @@ void CutGraphInputs(const std::vector &input_entries, nnvm::ObjectPtr n = 
nnvm::CreateVariableNode( var_name + std::to_string(name_count_map[var_name])); // set attribute for subgraph input to indicate if it is from an arg/param to model - if (e->node->is_variable()) + if (e->node->is_variable()) { n->attrs.dict["isArg"] = "True"; - else + n->attrs.dict["argName"] = var_name; + } else { n->attrs.dict["isArg"] = "False"; + } *e = nnvm::NodeEntry{n, 0, 0}; } } diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index eea8715180a5..6050f16416ff 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -102,6 +102,28 @@ class CustomSubgraphProperty: public SubgraphProperty { // get input args and arg names in_arg_names = g.GetAttr>("in_arg_names"); in_args_ptr = g.GetAttr("in_args"); + + // convert input args + arg_names.clear(); + arg_data.clear(); + arg_shapes.clear(); + arg_dims.clear(); + arg_types.clear(); + arg_verIDs.clear(); + arg_dev_type.clear(); + arg_dev_id.clear(); + for (size_t i=0; i arg_names; - std::vector arg_data; - std::vector arg_shapes; - std::vector arg_dims; - std::vector arg_types; - std::vector arg_verIDs; - std::vector arg_dev_type; - std::vector arg_dev_id; - for (int i=0; i(supported_nodes); } - + std::string subgraph_prop; partCallSupportedOps_t call_supported_ops_; supportedOps_t supported_ops_; @@ -259,7 +260,15 @@ class CustomSubgraphProperty: public SubgraphProperty { std::vector opt_keys_, opt_vals_; std::vector in_arg_names; NDArray **in_args_ptr; -}; + std::vector arg_names; + std::vector arg_data; + std::vector arg_shapes; + std::vector arg_dims; + std::vector arg_types; + std::vector arg_verIDs; + std::vector arg_dev_type; + std::vector arg_dev_id; + }; } // namespace op } // namespace mxnet From 2294584fd06946fc25e12c2f4ce146c8f876e71c Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 01:38:56 +0000 Subject: [PATCH 03/53] increased lib_api version number --- include/mxnet/lib_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index d3c922384bb7..822b9388379b 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -39,7 +39,7 @@ #include #include -#define MX_LIBRARY_VERSION 3 +#define MX_LIBRARY_VERSION 4 /*! 
* \brief For loading multiple custom op libraries in Linux, exporting same symbol multiple From fad8e74c784704befdb022e32550d931b73cf4c0 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 01:51:09 +0000 Subject: [PATCH 04/53] fixed whitespace --- include/mxnet/lib_api.h | 14 ++++++++------ .../partitioner/custom_subgraph_property.h | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 822b9388379b..c57c038c0319 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -924,8 +924,9 @@ typedef int (*partCallAcceptSubgraph_t)(acceptSubgraph_t acceptSubgraph, const c char*** attr_keys, char*** attr_vals, int *num_attrs, const char* const* arg_names, int num_args, void* const* arg_data, const int64_t* const* arg_shapes, - const int* arg_dims, const int* arg_types, const size_t* arg_IDs, - const char* const* arg_dev_type, const int* arg_dev_id); + const int* arg_dims, const int* arg_types, + const size_t* arg_IDs, const char* const* arg_dev_type, + const int* arg_dev_id); #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int version); @@ -1290,8 +1291,9 @@ extern "C" { const char* const* opt_vals, int num_opts, char*** attr_keys, char*** attr_vals, int *num_attrs, const char* const* arg_names, int num_args, - void* const* arg_data, const int64_t* const* arg_shapes, const int* arg_dims, - const int* arg_types, const size_t* arg_IDs, const char* const* arg_dev_type, + void* const* arg_data, const int64_t* const* arg_shapes, + const int* arg_dims, const int* arg_types, + const size_t* arg_IDs, const char* const* arg_dev_type, const int* arg_dev_id) { std::string subgraph_json(json); bool accept_bool = false; @@ -1308,7 +1310,6 @@ extern "C" { for (int j = 0; j < arg_dims[i]; j++) shapes.push_back(arg_shapes[i][j]); - // void *data_ptr, const std::vector &shape, MXDType dtype, size_t vID, MXContext mx_ctx MXTensor tensor(arg_data[i], shapes, (MXDType)arg_types[i], arg_IDs[i], {arg_dev_type[i], arg_dev_id[i]}); args[arg_names[i]] = tensor; @@ -1318,7 +1319,8 @@ extern "C" { // attributes to set on subgraph node std::unordered_map attrs; - MXReturnValue retval = acceptSubgraph(subgraph_json, subgraph_id, &accept_bool, opts, attrs, args); + MXReturnValue retval = acceptSubgraph(subgraph_json, subgraph_id, &accept_bool, + opts, attrs, args); *accept = accept_bool; if (attrs.size() > 0) { diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 6050f16416ff..baf862036bf8 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -112,7 +112,7 @@ class CustomSubgraphProperty: public SubgraphProperty { arg_verIDs.clear(); arg_dev_type.clear(); arg_dev_id.clear(); - for (size_t i=0; i(supported_nodes); } - + std::string subgraph_prop; partCallSupportedOps_t call_supported_ops_; supportedOps_t supported_ops_; @@ -268,7 +268,7 @@ class CustomSubgraphProperty: public SubgraphProperty { std::vector arg_verIDs; std::vector arg_dev_type; std::vector arg_dev_id; - }; +}; } // namespace op } // namespace mxnet From 734f1c45cf58b17f582ad793467c44fe49a8ee2a Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 18:43:14 +0000 Subject: [PATCH 05/53] fixed spacing --- src/c_api/c_api.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 
5cab37a66a1a..2b0e1206f98e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -821,11 +821,10 @@ int MXLoadLib(const char *path) { LOG(INFO) << "\t\tStrategy[" << j << "] " << strategy_str << " subgraphOp: '" << op_name_str << "'"; - // MXNET_REGISTER_SUBGRAPH_PROPERTY(customBackend, CustomSubgraphProperty); - mxnet::op::SubgraphBackendRegistry::Get()->__REGISTER_CUSTOM_PROPERTY__(name_str, - std::make_shared( - strategy_str, callSupportedOps, supportedOps_fp, - callAcceptSubgraph, acceptSubgraph_fp, callFree, op_name_str)); + mxnet::op::SubgraphBackendRegistry::Get()->__REGISTER_CUSTOM_PROPERTY__ + (name_str, std::make_shared + (strategy_str, callSupportedOps, supportedOps_fp, + callAcceptSubgraph, acceptSubgraph_fp, callFree, op_name_str)); } } API_END(); From ceed9be676f0ab1c265925299c7237e14877c646 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Thu, 20 Feb 2020 22:26:35 +0000 Subject: [PATCH 06/53] added info about lib_api.h to README --- example/extensions/lib_subgraph/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index b113be267fd3..d32cf6afb2e2 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -53,9 +53,11 @@ You can start getting familiar with custom partitioners by running an example pr * **lib_subgraph/test_subgraph.py**: This file calls `mx.library.load(‘libsubgraph_lib.so’)` to load the library containing the custom components, partitions the model using the `optimize_for` API, and prints outputs of the forward passes. The outputs should be the same as the regular MXNet forward pass without partitioning. +* **include/mxnet/lib_api.h**: This file from MXNet source code is the single header file needed to include all necessary data types and function prototypes for writing a custom operator library. You can either specify the include path in the `Makefile`, or copy the header file over to `example/extensions/lib_subgraph` folder. Note that apart from this header, the custom operator library is independent of MXNet source. + ## Writing Custom Partitioner Library -For building a library containing your own custom partitioner, compose a C++ source file like `mypart_lib.cc`, include `lib_api.h` header file, and write your custom partitioner with these essential functions: +To build your own library containing a custom partitioner, compose a C++ source file like `mypart_lib.cc`, include `lib_api.h` header file, and write your custom partitioner with these essential functions: - `initialize` - Library Initialization Function - `REGISTER_PARTITIONER ` - Partitioner Registration Macro - `mySupportedOps ` - Operator Support From 098db855b9cbf6d2726f5c717c585b87f12f3df6 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Thu, 20 Feb 2020 22:34:19 +0000 Subject: [PATCH 07/53] updated readme for new args argument to reviewSubgraph --- example/extensions/lib_subgraph/README.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index d32cf6afb2e2..a5a72ca6f046 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -122,11 +122,8 @@ There are several essential building blocks for making a custom partitioner: * This macro registers the custom partitioner and its properties to MXNet by its name. 
Notice that a partitioner can have multiple partitioning strategies. This enables multiple *passes* to be run in a single partitioning call from the user. The first argument to `addStrategy` is a user-specified name. The second argument is the `supportedOps` function. The third argument is the name of the subgraph operator to create for each subgraph created during partitioning (see below for more info about subgraph operators). The `setReviewSubgraph` API registers a callback function that is called for each subgraph created during partitioning (more on this below). Notice that the first argument to this function is the strategy to associate with and the second argument is the `reviewSubgraph` function. REGISTER_PARTITIONER(my_part_name) - .addStrategy("strategy1", - supportedOps, - "_custom_subgraph_op") - .setReviewSubgraph("strategy1", - reviewSubgraph); + .addStrategy("strategy1", supportedOps, "_custom_subgraph_op") + .setReviewSubgraph("strategy1", reviewSubgraph); Also there are some optional functions you can specify: @@ -138,16 +135,15 @@ Also there are some optional functions you can specify: std::string json, int subraph_id, bool* accept, - std::unordered_map& options, - std::unordered_map& attrs) + std::unordered_map& options, + std::unordered_map& attrs, + std::map& args) Let’s take a closer look at those registry functions: * **supportedOps**: This function takes four arguments. The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. The 2nd argument is an array of booleans, one for each operator in the model. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `true` for the node ID in the `ids` array. The last argument is the map of options specified by the user. Users can pass custom options to the partitioner and they are passed to this function in the `options` map. -* **reviewSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. The 2nd argument is the subgraph ID, this is just a number MXNet uses to identify this particular subgraph (it starts at zero and increments). The 3rd argument is an output to be set in this function to tell MXNet whether to accept (value: `true`) or reject (value: `false`) the subgraph. The 4th argument is the map of options specified by the user. The last argument is a map of attributes that should be set on the created subgraph. These attributes will be available later at runtime, and provides a mechanisn to pass info from partition-time to runtime. You might want to reject a subgraph if it doesnt include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. +* **reviewSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. The 2nd argument is the subgraph ID, this is just a number MXNet uses to identify this particular subgraph (it starts at zero and increments). The 3rd argument is an output to be set in this function to tell MXNet whether to accept (value: `true`) or reject (value: `false`) the subgraph. You might want to reject a subgraph if it doesnt include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. The 4th argument is the map of options specified by the user. 
The 5th argument is a map of attributes that should be set on the created subgraph. These attributes will be available later at runtime, and provides a mechanisn to pass info from partition-time to runtime. The last argument is the map of params/weights/args to the model and the associated names. For inputs the the subgraph that come directly from the params/weights of the model, you can look up the name of the input in this map to get the actual tensor values. ### Writing A Custom Subgraph Operator From cfcc0a61e9aa11232bad35fbc3342a7232083c62 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Thu, 20 Feb 2020 22:58:30 +0000 Subject: [PATCH 08/53] added more tests --- .../extensions/lib_subgraph/test_subgraph.py | 61 ++++++++++++------- include/mxnet/lib_api.h | 4 +- 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/example/extensions/lib_subgraph/test_subgraph.py b/example/extensions/lib_subgraph/test_subgraph.py index 4127d4b55ee4..a7577bc4b720 100644 --- a/example/extensions/lib_subgraph/test_subgraph.py +++ b/example/extensions/lib_subgraph/test_subgraph.py @@ -37,6 +37,9 @@ path = os.path.abspath('libsubgraph_lib.dll') mx.library.load(path) +############################################### +# Test with subgraph not consuming params +############################################### # example model, ops to be partitioned do not have args (use outputs from other ops as inputs) a = mx.sym.var('a') b = mx.sym.var('b') @@ -44,10 +47,6 @@ d = mx.sym.exp(c) sym = mx.sym.log(d) -# example model, ops to be partitioned have args -d2 = mx.sym.exp(a) -sym2 = mx.sym.log(d2) - #execute in MXNet print('-------------------------------') print('Testing regular MXNet execution') @@ -82,37 +81,55 @@ out3 = exe3.forward() print(out3) +# Gluon Hybridize partitioning with shapes/types +print('-------------------------------') +print('Testing Gluon Hybridize partitioning with shapes/types') +inputs = [a,b] +sym_block = nn.SymbolBlock(sym, inputs) +sym_block.initialize() +sym_block.hybridize(backend='myProp') +out4 = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) +print(out4) + + +############################################### +# Test with subgraph directly consuming params +############################################### +# example model, ops to be partitioned have args +d2 = mx.sym.exp(a) +sym2 = mx.sym.log(d2) + #execute in MXNet print('-------------------------------') print('Testing regular MXNet execution') -exe4 = sym2.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2))}) -out4 = exe4.forward() -print(out4) +exe5 = sym2.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2))}) +out5 = exe5.forward() +print(out5) # with propogating shapes/types print('-------------------------------') print('Testing partitioning with shapes/types') arg_array = [mx.nd.ones((3,2),dtype='float32')] -mysym5 = sym2.optimize_for("myProp", arg_array, reqArgs=True) -print(mysym5.tojson()) -exe5 = mysym5.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2))}) -out5 = exe5.forward() -print(out5) +mysym6 = sym2.optimize_for("myProp", arg_array, reqArgs=True) +print(mysym6.tojson()) +exe6 = mysym6.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2))}) +out6 = exe6.forward() +print(out6) # without propogating shapes/types print('-------------------------------') print('Testing partitioning without shapes/types') -mysym6 = sym2.optimize_for("myProp", reqArgs=True) -exe6 = mysym6.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2))}) -out6 = exe6.forward() -print(out6) +mysym7 = sym2.optimize_for("myProp", reqArgs=True) +exe7 = mysym7.bind(ctx=mx.cpu(), 
args={'a':mx.nd.ones((3,2))}) +out7 = exe7.forward() +print(out7) # Gluon Hybridize partitioning with shapes/types print('-------------------------------') print('Testing Gluon Hybridize partitioning with shapes/types') -inputs = [a,b] -sym_block = nn.SymbolBlock(sym, inputs) -sym_block.initialize() -sym_block.hybridize(backend='myProp') -out4 = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) -print(out4) +inputs = [a] +sym2_block = nn.SymbolBlock(sym2, inputs) +sym2_block.initialize() +sym2_block.hybridize(backend='myProp') +out8 = sym2_block(mx.nd.ones((3,2))) +print(out8) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 8d15f402fa09..be8815760c8a 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -1329,8 +1329,8 @@ extern "C" { // attributes to set on subgraph node std::unordered_map attrs; - MXReturnValue retval = reviewSubgraph(subgraph_json, subgraph_id, &accept_bool, - opts, attrs, args); + MXReturnValue retval = reviewSubgraph(subgraph_json, subgraph_id, &accept_bool, + opts, attrs, args); if (!retval) return retval; *accept = accept_bool; From 1fa7f1d3a4924fa7fc3fc4c6d4648f7363a69d81 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Fri, 21 Feb 2020 01:41:09 +0000 Subject: [PATCH 09/53] added example for partitioning HybridBlock in-place without forward pass --- include/mxnet/lib_api.h | 2 +- python/mxnet/gluon/block.py | 56 +++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index be8815760c8a..924cc35b04fa 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -1332,7 +1332,7 @@ extern "C" { MXReturnValue retval = reviewSubgraph(subgraph_json, subgraph_id, &accept_bool, opts, attrs, args); if (!retval) return retval; - + *accept = accept_bool; if (attrs.size() > 0) { diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index e925b31a280f..da7bf26315c9 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1026,6 +1026,62 @@ def _call_cached_op(self, *args): out = [out] return _regroup(out, self._out_format) + def optimize_for(self, x, *args, backend=None, **kwargs): + """Partitions the Symbol generated by hybrizing the Block. Arguments must be + :py:class:`NDArray`. + + Parameters + ---------- + x : NDArray + first input to model + *args : NDArray + other inputs to model + backend : str + The name of backend, as registered in `SubgraphBackendRegistry`, default None + **kwargs : dict of user-specified options to pass to the backend for partitioning, optional + Passed on to `PrePartition` and `PostPartition` functions of `SubgraphProperty` + Equivalent to 'backend_opts' in hybridize API + """ + + self._backend = backend + if backend_opts is not None: + assert isinstance(backend_opts, dict), \ + "HybridBlock hybridize requires backend_opts to be a dictionary." + self._backend_opts = backend_opts + + # do part of hybrize API call + self._active = active + self._clear_cached_op() + if active and self._forward_hooks or self._forward_pre_hooks: + warnings.warn('"{block}" is being hybridized while still having forward hook/pre-hook. ' + 'If "{block}" is a child of HybridBlock, the hooks will not take effect.' 
+ .format(block=self)) + super(HybridBlock, self).hybridize(active, **kwargs) + + # do part of forward API call + has_symbol, has_ndarray, ctx_set, first_ctx = _gather_type_ctx_info([x] + list(args)) + if has_symbol: + raise ValueError('Inputs must be NDArrays for the optimize_for API' + ' Please check the type of the args.\n') + if not has_symbol and not has_ndarray: + raise ValueError('In HybridBlock, there must be one NDArray or one Symbol in the input.' + ' Please check the type of the args.\n') + + ctx = first_ctx + if len(ctx_set) > 1: + raise ValueError('Find multiple contexts in the input, ' + 'After hybridized, the HybridBlock only supports one input ' + 'context. You can print the ele.ctx in the ' + 'input arguments to inspect their contexts. ' + 'Find all contexts = {}'.format(ctx_set)) + + self._build_cache(x, *args) + assert self._cached_op, "Gluon failed to build the cache. " \ + "This should never happen. " \ + "Please submit an issue on Github" \ + " https://github.com/apache/incubator-mxnet." + # do not actually call the cached_op + def _clear_cached_op(self): self._cached_graph = () self._cached_op = None From 8f37c48d9f0f559b8ee6b652087920b1165e4a54 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Fri, 21 Feb 2020 02:19:43 +0000 Subject: [PATCH 10/53] added example for partitioning --- .../extensions/lib_subgraph/test_subgraph.py | 9 ++++++ python/mxnet/gluon/block.py | 30 ++++++++++++------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/example/extensions/lib_subgraph/test_subgraph.py b/example/extensions/lib_subgraph/test_subgraph.py index a7577bc4b720..55a40514f105 100644 --- a/example/extensions/lib_subgraph/test_subgraph.py +++ b/example/extensions/lib_subgraph/test_subgraph.py @@ -91,6 +91,15 @@ out4 = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) print(out4) +# Gluon Hybridize partitioning with shapes/types without inference +print('-------------------------------') +print('Testing Gluon Hybridize partitioning with shapes/types without inference') +inputs = [a,b] +sym_block2 = nn.SymbolBlock(sym, inputs) +sym_block2.initialize() +sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend='myProp') +sym_block2.export('partitioned') + ############################################### # Test with subgraph directly consuming params diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index da7bf26315c9..6ab17b872d0d 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -975,9 +975,11 @@ def _build_cache(self, *args): for name in out.list_arguments()] # Partition the graph. out = out.optimize_for(self._backend, arg_array, ctx, **self._backend_opts) - + #update cached graph with partitioned graph + self._cached_graph = data, out self._cached_op = ndarray.CachedOp(out, flags) + def _deferred_infer_shape(self, *args): try: self.infer_shape(*args) @@ -1026,9 +1028,9 @@ def _call_cached_op(self, *args): out = [out] return _regroup(out, self._out_format) - def optimize_for(self, x, *args, backend=None, **kwargs): - """Partitions the Symbol generated by hybrizing the Block. Arguments must be - :py:class:`NDArray`. + def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): + """Activates or deactivates :py:class:`HybridBlock` s recursively. Has no effect on + non-hybrid children. 
Parameters ---------- @@ -1038,25 +1040,31 @@ def optimize_for(self, x, *args, backend=None, **kwargs): other inputs to model backend : str The name of backend, as registered in `SubgraphBackendRegistry`, default None - **kwargs : dict of user-specified options to pass to the backend for partitioning, optional + backend_opts : dict of user-specified options to pass to the backend for partitioning, optional Passed on to `PrePartition` and `PostPartition` functions of `SubgraphProperty` - Equivalent to 'backend_opts' in hybridize API + static_alloc : bool, default False + Statically allocate memory to improve speed. Memory usage may increase. + static_shape : bool, default False + Optimize for invariant input shapes between iterations. Must also + set static_alloc to True. Change of input shapes is still allowed + but slower. """ self._backend = backend if backend_opts is not None: assert isinstance(backend_opts, dict), \ "HybridBlock hybridize requires backend_opts to be a dictionary." - self._backend_opts = backend_opts + self._backend_opts = kwargs # do part of hybrize API call - self._active = active + self._active = True + self._flags = list(kwargs.items()) self._clear_cached_op() - if active and self._forward_hooks or self._forward_pre_hooks: + if self._forward_hooks or self._forward_pre_hooks: warnings.warn('"{block}" is being hybridized while still having forward hook/pre-hook. ' 'If "{block}" is a child of HybridBlock, the hooks will not take effect.' .format(block=self)) - super(HybridBlock, self).hybridize(active, **kwargs) + super(HybridBlock, self).hybridize(True, **kwargs) # do part of forward API call has_symbol, has_ndarray, ctx_set, first_ctx = _gather_type_ctx_info([x] + list(args)) @@ -1064,7 +1072,7 @@ def optimize_for(self, x, *args, backend=None, **kwargs): raise ValueError('Inputs must be NDArrays for the optimize_for API' ' Please check the type of the args.\n') if not has_symbol and not has_ndarray: - raise ValueError('In HybridBlock, there must be one NDArray or one Symbol in the input.' + raise ValueError('In HybridBlock, there must be one NDArray as input.' ' Please check the type of the args.\n') ctx = first_ctx From 729173f2f15f3227e73f52a8897fe9d1db594d25 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Fri, 21 Feb 2020 03:44:02 +0000 Subject: [PATCH 11/53] fixed whitespace --- python/mxnet/gluon/block.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 6ab17b872d0d..567b8c09c972 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1075,7 +1075,6 @@ def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): raise ValueError('In HybridBlock, there must be one NDArray as input.' ' Please check the type of the args.\n') - ctx = first_ctx if len(ctx_set) > 1: raise ValueError('Find multiple contexts in the input, ' 'After hybridized, the HybridBlock only supports one input ' @@ -1089,7 +1088,7 @@ def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): "Please submit an issue on Github" \ " https://github.com/apache/incubator-mxnet." 
# do not actually call the cached_op - + def _clear_cached_op(self): self._cached_graph = () self._cached_op = None From bb90d7069dfdcd3962669449949026815bfbc639 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Fri, 21 Feb 2020 04:11:56 +0000 Subject: [PATCH 12/53] fixed sanity --- python/mxnet/gluon/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 567b8c09c972..bab3f303852e 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1067,7 +1067,7 @@ def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): super(HybridBlock, self).hybridize(True, **kwargs) # do part of forward API call - has_symbol, has_ndarray, ctx_set, first_ctx = _gather_type_ctx_info([x] + list(args)) + has_symbol, has_ndarray, ctx_set,_ = _gather_type_ctx_info([x] + list(args)) if has_symbol: raise ValueError('Inputs must be NDArrays for the optimize_for API' ' Please check the type of the args.\n') From 06c3841e8bab5b79a6d7adf585fb307151c6d041 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Fri, 21 Feb 2020 04:41:24 +0000 Subject: [PATCH 13/53] fixed lint --- python/mxnet/gluon/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index bab3f303852e..1249cc134e66 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1067,7 +1067,7 @@ def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): super(HybridBlock, self).hybridize(True, **kwargs) # do part of forward API call - has_symbol, has_ndarray, ctx_set,_ = _gather_type_ctx_info([x] + list(args)) + has_symbol, has_ndarray, ctx_set, _ = _gather_type_ctx_info([x] + list(args)) if has_symbol: raise ValueError('Inputs must be NDArrays for the optimize_for API' ' Please check the type of the args.\n') From f8f6191c36ad31a31c42611253d3704821e5ff5e Mon Sep 17 00:00:00 2001 From: samskalicky Date: Sat, 22 Feb 2020 19:07:46 +0000 Subject: [PATCH 14/53] added support for passing aux --- .../extensions/lib_subgraph/subgraph_lib.cc | 3 +- include/mxnet/c_api.h | 4 +- include/mxnet/lib_api.h | 28 +++++++- python/mxnet/gluon/block.py | 4 +- python/mxnet/symbol/symbol.py | 20 +++++- src/c_api/c_api_symbolic.cc | 27 ++++++-- .../partitioner/custom_subgraph_property.h | 69 ++++++++++++------- 7 files changed, 118 insertions(+), 37 deletions(-) diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index 6783676b3437..36110d69a4cb 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -219,7 +219,8 @@ MXReturnValue mySupportedOps(std::string json, MXReturnValue myReviewSubgraph(std::string json, int subraph_id, bool* accept, std::unordered_map& options, std::unordered_map& attrs, - std::map& args) { + std::map& args, + std::map& aux) { for (auto kv : options) { std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; } diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index bb2a5686c3b5..e3d9062cec79 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -2170,8 +2170,10 @@ MXNET_DLL int MXOptimizeForBackend(SymbolHandle sym_handle, const char* backend_name, const int dev_type, SymbolHandle* ret_sym_handle, - const mx_uint len, + const mx_uint args_len, NDArrayHandle* in_args_handle, + const mx_uint aux_len, + NDArrayHandle* in_aux_handle, const mx_uint num_options, const char** 
keys, const char** vals); diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 924cc35b04fa..9173df120b8f 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -718,6 +718,7 @@ typedef MXReturnValue (*supportedOps_t)(std::string, std::vector&, typedef MXReturnValue (*reviewSubgraph_t)(std::string, int, bool*, std::unordered_map&, std::unordered_map&, + std::map&, std::map&); /*! @@ -926,7 +927,12 @@ typedef int (*partCallReviewSubgraph_t)(reviewSubgraph_t reviewSubgraph, const c void* const* arg_data, const int64_t* const* arg_shapes, const int* arg_dims, const int* arg_types, const size_t* arg_IDs, const char* const* arg_dev_type, - const int* arg_dev_id); + const int* arg_dev_id, + const char* const* aux_names, int num_aux, + void* const* aux_data, const int64_t* const* aux_shapes, + const int* aux_dims, const int* aux_types, + const size_t* aux_IDs, const char* const* aux_dev_type, + const int* aux_dev_id); #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int version); @@ -1304,7 +1310,12 @@ extern "C" { void* const* arg_data, const int64_t* const* arg_shapes, const int* arg_dims, const int* arg_types, const size_t* arg_IDs, const char* const* arg_dev_type, - const int* arg_dev_id) { + const int* arg_dev_id, + const char* const* aux_names, int num_aux, + void* const* aux_data, const int64_t* const* aux_shapes, + const int* aux_dims, const int* aux_types, + const size_t* aux_IDs, const char* const* aux_dev_type, + const int* aux_dev_id) { std::string subgraph_json(json); bool accept_bool = false; // create map of attributes from list @@ -1324,13 +1335,24 @@ extern "C" { arg_IDs[i], {arg_dev_type[i], arg_dev_id[i]}); args[arg_names[i]] = tensor; } + // create a map of named tensors for aux + std::map aux; + for (int i = 0; i < num_aux; i++) { + std::vector shapes; + for (int j = 0; j < aux_dims[i]; j++) + shapes.push_back(aux_shapes[i][j]); + + MXTensor tensor(aux_data[i], shapes, (MXDType)aux_types[i], + aux_IDs[i], {aux_dev_type[i], aux_dev_id[i]}); + aux[aux_names[i]] = tensor; + } // attributes to set on subgraph node std::unordered_map attrs; MXReturnValue retval = reviewSubgraph(subgraph_json, subgraph_id, &accept_bool, - opts, attrs, args); + opts, attrs, args, aux); if (!retval) return retval; *accept = accept_bool; diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 1249cc134e66..9d0fd0aa269a 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -973,8 +973,10 @@ def _build_cache(self, *args): # get list of params in the order of out.list_arguments arg_array = [args[data_names[name]] if name in data_names.keys() else params[name].data() for name in out.list_arguments()] + aux_array = [args[data_names[name]] if name in data_names.keys() else params[name].data() + for name in out.list_auxiliary_states()] # Partition the graph. 
- out = out.optimize_for(self._backend, arg_array, ctx, **self._backend_opts) + out = out.optimize_for(self._backend, arg_array, aux_array, ctx, **self._backend_opts) #update cached graph with partitioned graph self._cached_graph = data, out self._cached_op = ndarray.CachedOp(out, flags) diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index a4599c8435fa..7ed86f64effc 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -1445,7 +1445,7 @@ def _gen_atomic_symbol(self): return Symbol(handle) - def optimize_for(self, backend, args=None, ctx=None, **kwargs): + def optimize_for(self, backend, args=None, aux=None, ctx=None, **kwargs): """Partitions current symbol and optimizes it for a given backend, returns new partitioned symbol. @@ -1461,6 +1461,13 @@ def optimize_for(self, backend, args=None, ctx=None, **kwargs): - If type is a dict of str to `NDArray`, then it maps the name of arguments to the corresponding `NDArray`. + aux : list of NDArray or dict of str to NDArray, optional + Input auxiliary arguments to the symbol + + - If type is a list of `NDArray`, the order is the same as that of `list_arguments()`. + - If type is a dict of str to `NDArray`, then it maps the name of arguments + to the corresponding `NDArray`. + ctx : Context, optional Device context, used to infer stypes @@ -1475,13 +1482,20 @@ def optimize_for(self, backend, args=None, ctx=None, **kwargs): out = SymbolHandle() assert isinstance(backend, str) - if args is None: + if args is None or len(args) == 0: args = [] args_handle = c_array(NDArrayHandle, []) else: listed_arguments = self.list_arguments() args_handle, args = self._get_ndarray_inputs('args', args, listed_arguments, False) + if aux is None or len(aux) == 0: + aux = [] + aux_handle = c_array(NDArrayHandle, []) + else: + listed_aux = self.list_arguments() + aux_handle, aux = self._get_ndarray_inputs('aux_states', aux, + self.list_auxiliary_states(), False) if ctx is None: ctx = current_context() assert isinstance(ctx, Context) @@ -1497,6 +1511,8 @@ def optimize_for(self, backend, args=None, ctx=None, **kwargs): ctypes.byref(out), mx_uint(len(args)), args_handle, + mx_uint(len(aux)), + aux_handle, mx_uint(len(key_list)), c_str_array(key_list), c_str_array(val_list))) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index ffbbd0ab548a..577fdae679f2 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -1343,8 +1343,10 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, const char* backend_name, const int dev_type, SymbolHandle* ret_sym_handle, - const mx_uint len, + const mx_uint args_len, NDArrayHandle* in_args_handle, + const mx_uint aux_len, + NDArrayHandle* in_aux_handle, const mx_uint num_options, const char** keys, const char** vals) { @@ -1353,13 +1355,13 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, nnvm::Symbol *sym = static_cast(sym_handle); *s = sym->Copy(); nnvm::Graph g = Symbol2Graph(*s); - if (len) { + if (args_len) { NDArray **in_args_ptr = reinterpret_cast(in_args_handle); Context default_ctx = Context::Create(static_cast(dev_type), 0); - mxnet::ShapeVector arg_shapes(len); - nnvm::DTypeVector arg_dtypes(len); - StorageTypeVector arg_stypes(len); - for (mx_uint i = 0; i < len; i++) { + mxnet::ShapeVector arg_shapes(args_len); + nnvm::DTypeVector arg_dtypes(args_len); + StorageTypeVector arg_stypes(args_len); + for (mx_uint i = 0; i < args_len; i++) { const auto &in_arg = *(in_args_ptr[i]); arg_shapes[i] = in_arg.shape(); arg_dtypes[i] = 
in_arg.dtype(); @@ -1392,6 +1394,19 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, g.attrs["in_args"] = std::make_shared(in_args_ptr); g.attrs["in_arg_names"] = std::make_shared(arg_names); } + + if (aux_len) { + std::vector aux_names = sym->ListInputNames(nnvm::Symbol::kAuxiliaryStates); + NDArray **in_aux_ptr = reinterpret_cast(in_aux_handle); + g.attrs["in_aux"] = std::make_shared(in_aux_ptr); + g.attrs["in_aux_names"] = std::make_shared(aux_names); + } else { + NDArray **in_aux_ptr = static_cast(nullptr); + std::vector aux_names; + g.attrs["in_aux"] = std::make_shared(in_aux_ptr); + g.attrs["in_aux_names"] = std::make_shared(aux_names); + } + std::vector> options_map; for (mx_uint i = 0; i < num_options; ++i) { options_map.emplace_back(keys[i], vals[i]); diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 6f45397d32d2..96d35118c35e 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -102,6 +102,8 @@ class CustomSubgraphProperty: public SubgraphProperty { // get input args and arg names in_arg_names = g.GetAttr>("in_arg_names"); in_args_ptr = g.GetAttr("in_args"); + in_aux_names = g.GetAttr>("in_aux_names"); + in_aux_ptr = g.GetAttr("in_aux"); // convert input args arg_names.clear(); @@ -120,11 +122,33 @@ class CustomSubgraphProperty: public SubgraphProperty { arg_dims.push_back(in_arg.shape().ndim()); arg_types.push_back(in_arg.dtype()); arg_verIDs.push_back(in_arg.version()); - const char* ctx_str = in_arg.ctx().dev_mask() == Context::kCPU ? "cpu" : "gpu"; - arg_dev_type.push_back(ctx_str); + const char* arg_ctx_str = in_arg.ctx().dev_mask() == Context::kCPU ? "cpu" : "gpu"; + arg_dev_type.push_back(arg_ctx_str); arg_dev_id.push_back(in_arg.ctx().real_dev_id()); } + // convert input aux + aux_names.clear(); + aux_data.clear(); + aux_shapes.clear(); + aux_dims.clear(); + aux_types.clear(); + aux_verIDs.clear(); + aux_dev_type.clear(); + aux_dev_id.clear(); + for (size_t i=0; i < in_aux_names.size(); i++) { + aux_names.push_back(in_aux_names[i].c_str()); + const auto &in_aux = *(in_aux_ptr[i]); + aux_data.push_back(in_aux.data().dptr_); + aux_shapes.push_back(in_aux.shape().data()); + aux_dims.push_back(in_aux.shape().ndim()); + aux_types.push_back(in_aux.dtype()); + aux_verIDs.push_back(in_aux.version()); + const char* aux_ctx_str = in_aux.ctx().dev_mask() == Context::kCPU ? 
"cpu" : "gpu"; + aux_dev_type.push_back(aux_ctx_str); + aux_dev_id.push_back(in_aux.ctx().real_dev_id()); + } + // remove all graph attrs, some cannot be saved to json nnvm::Graph graph = std::move(g); graph.attrs.clear(); @@ -198,8 +222,8 @@ class CustomSubgraphProperty: public SubgraphProperty { const auto& idx = g.indexed_graph(); // set isArg/isAux for each null op/param in the graph - const std::vector aux_names = sym.ListInputNames(nnvm::Symbol::kAuxiliaryStates); - std::unordered_set aux_set(aux_names.begin(), aux_names.end()); + const std::vector aux_state_names = sym.ListInputNames(nnvm::Symbol::kAuxiliaryStates); + std::unordered_set aux_set(aux_state_names.begin(), aux_state_names.end()); for (unsigned i = 0; i < idx.num_nodes(); i++) { nnvm::Node* node = const_cast(idx[i].source); // check if this node is input to subgraph @@ -213,16 +237,14 @@ class CustomSubgraphProperty: public SubgraphProperty { } std::string subgraph_json = nnvm::pass::SaveJSON(g); - - CHECK(call_review_subgraph_(review_subgraph_, subgraph_json.c_str(), - subgraph_id, &accept, opt_keys_.data(), - opt_vals_.data(), opt_keys_.size(), - &attr_keys, &attr_vals, &num_attr, - arg_names.data(), arg_names.size(), - arg_data.data(), arg_shapes.data(), - arg_dims.data(), arg_types.data(), - arg_verIDs.data(), arg_dev_type.data(), - arg_dev_id.data())) + CHECK(call_review_subgraph_(review_subgraph_, subgraph_json.c_str(), subgraph_id, &accept, + opt_keys_.data(), opt_vals_.data(), opt_keys_.size(), &attr_keys, + &attr_vals, &num_attr, arg_names.data(), arg_names.size(), + arg_data.data(), arg_shapes.data(), arg_dims.data(), arg_types.data(), + arg_verIDs.data(), arg_dev_type.data(), arg_dev_id.data(), + aux_names.data(), aux_names.size(), aux_data.data(), aux_shapes.data(), + aux_dims.data(), aux_types.data(), aux_verIDs.data(), + aux_dev_type.data(), aux_dev_id.data())) << "Error calling review_subgraph for '" << subgraph_prop << "'"; } @@ -260,16 +282,17 @@ class CustomSubgraphProperty: public SubgraphProperty { std::string subgraph_op_name; std::vector> options_map_; std::vector opt_keys_, opt_vals_; - std::vector in_arg_names; + std::vector in_arg_names, in_aux_names; NDArray **in_args_ptr; - std::vector arg_names; - std::vector arg_data; - std::vector arg_shapes; - std::vector arg_dims; - std::vector arg_types; - std::vector arg_verIDs; - std::vector arg_dev_type; - std::vector arg_dev_id; + NDArray **in_aux_ptr; + std::vector arg_names, aux_names; + std::vector arg_data, aux_data; + std::vector arg_shapes, aux_shapes; + std::vector arg_dims, aux_dims; + std::vector arg_types, aux_types; + std::vector arg_verIDs, aux_verIDs; + std::vector arg_dev_type, aux_dev_type; + std::vector arg_dev_id, aux_dev_id; }; } // namespace op } // namespace mxnet From dc17e3f1dbe0f3aedd7a2a97ca8ac607f4311d9f Mon Sep 17 00:00:00 2001 From: samskalicky Date: Sat, 22 Feb 2020 19:24:06 +0000 Subject: [PATCH 15/53] fixed lint --- .../partitioner/custom_subgraph_property.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 96d35118c35e..53426b714f8a 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -222,7 +222,8 @@ class CustomSubgraphProperty: public SubgraphProperty { const auto& idx = g.indexed_graph(); // set isArg/isAux for each null op/param in the graph - const std::vector 
aux_state_names = sym.ListInputNames(nnvm::Symbol::kAuxiliaryStates); + const std::vector aux_state_names = + sym.ListInputNames(nnvm::Symbol::kAuxiliaryStates); std::unordered_set aux_set(aux_state_names.begin(), aux_state_names.end()); for (unsigned i = 0; i < idx.num_nodes(); i++) { nnvm::Node* node = const_cast(idx[i].source); @@ -237,13 +238,15 @@ class CustomSubgraphProperty: public SubgraphProperty { } std::string subgraph_json = nnvm::pass::SaveJSON(g); - CHECK(call_review_subgraph_(review_subgraph_, subgraph_json.c_str(), subgraph_id, &accept, - opt_keys_.data(), opt_vals_.data(), opt_keys_.size(), &attr_keys, - &attr_vals, &num_attr, arg_names.data(), arg_names.size(), - arg_data.data(), arg_shapes.data(), arg_dims.data(), arg_types.data(), - arg_verIDs.data(), arg_dev_type.data(), arg_dev_id.data(), - aux_names.data(), aux_names.size(), aux_data.data(), aux_shapes.data(), - aux_dims.data(), aux_types.data(), aux_verIDs.data(), + CHECK(call_review_subgraph_(review_subgraph_, subgraph_json.c_str(), subgraph_id, + &accept, opt_keys_.data(), opt_vals_.data(), + opt_keys_.size(), &attr_keys, &attr_vals, &num_attr, + arg_names.data(), arg_names.size(), arg_data.data(), + arg_shapes.data(), arg_dims.data(), arg_types.data(), + arg_verIDs.data(), arg_dev_type.data(), + arg_dev_id.data(), aux_names.data(), aux_names.size(), + aux_data.data(), aux_shapes.data(), aux_dims.data(), + aux_types.data(), aux_verIDs.data(), aux_dev_type.data(), aux_dev_id.data())) << "Error calling review_subgraph for '" << subgraph_prop << "'"; } From 56bbb01e80530ddc167222a39679677f842f852c Mon Sep 17 00:00:00 2001 From: samskalicky Date: Sat, 22 Feb 2020 19:34:05 +0000 Subject: [PATCH 16/53] sanity --- python/mxnet/symbol/symbol.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 7ed86f64effc..0a19018b6e62 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -1486,14 +1486,13 @@ def optimize_for(self, backend, args=None, aux=None, ctx=None, **kwargs): args = [] args_handle = c_array(NDArrayHandle, []) else: - listed_arguments = self.list_arguments() - args_handle, args = self._get_ndarray_inputs('args', args, listed_arguments, False) + args_handle, args = self._get_ndarray_inputs('args', args, + self.list_arguments(), False) if aux is None or len(aux) == 0: aux = [] aux_handle = c_array(NDArrayHandle, []) else: - listed_aux = self.list_arguments() aux_handle, aux = self._get_ndarray_inputs('aux_states', aux, self.list_auxiliary_states(), False) if ctx is None: From a12517dee7d2587ecb653754c56eeaad2a4b342e Mon Sep 17 00:00:00 2001 From: samskalicky Date: Sat, 22 Feb 2020 23:06:51 +0000 Subject: [PATCH 17/53] perl changes --- perl-package/AI-MXNetCAPI/mxnet.i | 2 ++ 1 file changed, 2 insertions(+) diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index 3bc53d6442c1..846b28ff0e34 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -1633,6 +1633,8 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, const mx_uint in, NDArrayHandle* in, const mx_uint in, + NDArrayHandle* in, + const mx_uint in, const char** keys, const char** vals); From c5d322e4a7b5e0419f55fabc52a527302d029454 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Mon, 24 Feb 2020 05:23:43 +0000 Subject: [PATCH 18/53] replaced code with hybridize call --- python/mxnet/gluon/block.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff 
--git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 9d0fd0aa269a..eb6035c15136 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1052,22 +1052,9 @@ def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): but slower. """ - self._backend = backend - if backend_opts is not None: - assert isinstance(backend_opts, dict), \ - "HybridBlock hybridize requires backend_opts to be a dictionary." - self._backend_opts = kwargs - - # do part of hybrize API call - self._active = True - self._flags = list(kwargs.items()) - self._clear_cached_op() - if self._forward_hooks or self._forward_pre_hooks: - warnings.warn('"{block}" is being hybridized while still having forward hook/pre-hook. ' - 'If "{block}" is a child of HybridBlock, the hooks will not take effect.' - .format(block=self)) - super(HybridBlock, self).hybridize(True, **kwargs) - + # do hybridize API call + self.hybridize(True, backend, backend_opts, **kwargs) + + # do part of forward API call has_symbol, has_ndarray, ctx_set, _ = _gather_type_ctx_info([x] + list(args)) if has_symbol: @@ -1076,7 +1063,6 @@ def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): if not has_symbol and not has_ndarray: raise ValueError('In HybridBlock, there must be one NDArray as input.' ' Please check the type of the args.\n') - if len(ctx_set) > 1: raise ValueError('Find multiple contexts in the input, ' 'After hybridized, the HybridBlock only supports one input ' From 4333260a4a6b6ae6c1c4001ea26bd0aa8c799607 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Mon, 24 Feb 2020 05:31:00 +0000 Subject: [PATCH 19/53] added unittest for gluon optimize_for --- tests/python/unittest/test_extensions.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/python/unittest/test_extensions.py b/tests/python/unittest/test_extensions.py index 726cb6a998a3..9098775e3ac5 100644 --- a/tests/python/unittest/test_extensions.py +++ b/tests/python/unittest/test_extensions.py @@ -166,3 +166,15 @@ def test_subgraph(): out4 = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) # check that result matches one executed by MXNet assert_almost_equal(out[0].asnumpy(), out4[0].asnumpy(), rtol=1e-3, atol=1e-3) + + # Gluon Hybridize partitioning with shapes/types + sym_block2 = nn.SymbolBlock(sym, [a,b]) + sym_block2.initialize() + sym_block2.optimize_for(backend='myProp') + sym_block2.export('optimized') + sym_block3 = nn.SymbolBlock.imports('optimized-symbol.json',['a','b'], + 'optimized-0000.params') + + out5 = sym_block3(mx.nd.ones((3,2)),mx.nd.ones((3,2))) + # check that result matches one executed by MXNet + assert_almost_equal(out[0].asnumpy(), out5[0].asnumpy(), rtol=1e-3, atol=1e-3) From adde4568560c05d38152d8276c80888d2c071829 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Mon, 24 Feb 2020 05:49:46 +0000 Subject: [PATCH 20/53] fixed whitespace --- python/mxnet/gluon/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index eb6035c15136..d5a502ab8bfc 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1054,7 +1054,7 @@ def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): # do hybridize API call self.hybridize(True, backend, backend_opts, **kwargs) - + # do part of forward API call has_symbol, has_ndarray, ctx_set, _ = _gather_type_ctx_info([x] + list(args)) if has_symbol: From 8f58f33d3f0a378ebaaa5b14a41059880e765d4d Mon Sep 17 00:00:00 2001 From: samskalicky
Date: Mon, 24 Feb 2020 07:08:40 +0000 Subject: [PATCH 21/53] fixed test --- tests/python/unittest/test_extensions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_extensions.py b/tests/python/unittest/test_extensions.py index 9098775e3ac5..65cdb8b82198 100644 --- a/tests/python/unittest/test_extensions.py +++ b/tests/python/unittest/test_extensions.py @@ -170,11 +170,13 @@ def test_subgraph(): # Gluon Hybridize partitioning with shapes/types sym_block2 = nn.SymbolBlock(sym, [a,b]) sym_block2.initialize() - sym_block2.optimize_for(backend='myProp') + a_data = mx.nd.ones((3,2)) + b_data = mx.nd.ones((3,2)) + sym_block2.optimize_for(a_data, b_data, backend='myProp') sym_block2.export('optimized') sym_block3 = nn.SymbolBlock.imports('optimized-symbol.json',['a','b'], 'optimized-0000.params') - out5 = sym_block3(mx.nd.ones((3,2)),mx.nd.ones((3,2))) + out5 = sym_block3(a_data, b_data) # check that result matches one executed by MXNet assert_almost_equal(out[0].asnumpy(), out5[0].asnumpy(), rtol=1e-3, atol=1e-3) From 4daefa782ff040e83d02d562fcbe9d004af799c6 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 26 Feb 2020 07:04:52 +0000 Subject: [PATCH 22/53] addressed comments --- example/extensions/lib_subgraph/README.md | 38 ++++++++++++++++--- .../extensions/lib_subgraph/subgraph_lib.cc | 11 ++++-- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index a5a72ca6f046..b90a4a4fa54a 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -78,28 +78,54 @@ sym, _, _ = mx.model.load_checkpoint('mymodel', 0) # Symbol/Module flow sym2 = sym.optimize_for("myPart") -# Gluon flow +# Gluon flow 1 sym_block = nn.SymbolBlock(sym, inputs) sym_block.hybridize(backend='myPart') + +# Gluon flow 2 +sym_block = nn.SymbolBlock(sym, inputs) +sym_block.optimize_for(x, backend='myPart') ``` +In the Gluon hybridize flow, the model is actually hybridized during the first inference, rather than immediately when calling hybridize. This hybridize-based flow is useful if a user expects to run inference immediately after hybridizing. But for users than just want to partition but not run a whole forward pass, the `optimize_for` API combines the hybrdize/forward APIs but does not run a forward pass. After calling `optimize_for` users can `export` thier model immediately without running a forward pass. + ### Using a Custom Partitioner Library Partitioning APIs in MXNet are available in both Symbol and Gluon APIs. For the Symbol API, the `optimize_for` API can be called on Symbol objects to return a partitioned Symbol. ``` -optimize_for(backend, args=None, ctx=None, **kwargs) +optimize_for(backend, args=None, aux=None, ctx=None, **kwargs) ``` -The `optimize_for` API takes at least 1 argument, `backend` which is a string that identifies which backend to partition the model for. The `args` argument is optional and takes a list of NDArray or dict of str to NDArray. It is used to infer shapes and types and before partitioning. The `ctx` argument is optional and takes a device context to infer storage types. It also take any other user-specified options that will be passed to the backend partitioning APIs. +The `optimize_for` API takes at least 1 argument, `backend` which is a string that identifies which backend to partition the model for. 
The `args` and `aux` arguments are optional and takes a list of NDArray or dict of str to NDArray. They are used to infer shapes and types and before partitioning, and passed to the backend to use during compilation. The `ctx` argument is optional and takes a device context to infer storage types. It also take any other user-specified options that will be passed to the backend partitioning APIs. For the Gluon API, the `hybridize` API can be called on HybridBlocks to partition the internal CachedOp Symbol. ``` -hybridize(backend=None, backend_opts=None) +hybridize(backend=None, backend_opts=None, **kwargs) +``` + +The `hybridize` function prepares the HybridBlock to be converted into a backend symbol. The `backend` argument is a string that identifies which backend that will partition the model. The `backend_opts` takes other user-specified options that will be passed to the backend partitioning APIs. The actual partitioning takes place during the forward pass. + +If you just want to partition the HybridBlock but not run a complete forward pass, you can use `optimize_for` API that combines the work done in the `hybridize` API with part of the work done in the forward pass. + +``` +optimize_for(x, backend=None, backend_opts=None, **kwargs) ``` -When the `hybridize` function is called, Gluon will convert the program’s execution into the style used in symbolic programming. The `backend` argument is a string that identifies which backend to partition the model for. The `backend_opts` takes other user-specified options that will be passed to the backend partitioning APIs. +When the `optimize_for` API is called on a HybridBlock it partitions immediately. This lets users export the partitioned model without running a complete forward pass. + +``` +block.optimize_for(x, backend='myPart') +block.export('partitioned') +``` + +But you can also use `optimize_for` in place of `hybridize` and run inference immediately after too. + +``` +block.optimize_for(x, backend='myPart') +block(x) +``` ### Writing A Custom Partitioner @@ -141,7 +167,7 @@ Also there are some optional functions you can specify: Let’s take a closer look at those registry functions: -* **supportedOps**: This function takes four arguments. The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. The 2nd argument is an array of booleans, one for each operator in the model. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `true` for the node ID in the `ids` array. The last argument is the map of options specified by the user. Users can pass custom options to the partitioner and they are passed to this function in the `options` map. +* **supportedOps**: This function takes four arguments. The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. The 2nd argument is an array of booleans, one for each operator in the model. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `true` for the index in the `ids` array corresponding to the node ID. The last argument is the map of options specified by the user. Users can pass custom options to the partitioner and they are passed to this function in the `options` map. * **reviewSubgraph**: This function takes five arguments. 
The 1st argument is a JSON string of the newly partitioned subgraph. The 2nd argument is the subgraph ID, this is just a number MXNet uses to identify this particular subgraph (it starts at zero and increments). The 3rd argument is an output to be set in this function to tell MXNet whether to accept (value: `true`) or reject (value: `false`) the subgraph. You might want to reject a subgraph if it doesnt include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. The 4th argument is the map of options specified by the user. The 5th argument is a map of attributes that should be set on the created subgraph. These attributes will be available later at runtime, and provides a mechanisn to pass info from partition-time to runtime. The last argument is the map of params/weights/args to the model and the associated names. For inputs the the subgraph that come directly from the params/weights of the model, you can look up the name of the input in this map to get the actual tensor values. diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index 36110d69a4cb..2120890f3bac 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -204,7 +204,7 @@ MXReturnValue mySupportedOps(std::string json, dtype = std::stoi(attrs.map[JsonVal("dtype")].str); } - //check if op dtype is float + //check if op dtype is float, and if option was specified to require float types if((dtype == kFloat32 && options.count("reqFloat") > 0) || options.count("reqFloat") == 0) { //check if op is in whitelist if(std::find(op_names.begin(),op_names.end(),op.str.c_str()) != op_names.end()) { @@ -233,14 +233,17 @@ MXReturnValue myReviewSubgraph(std::string json, int subraph_id, bool* accept, std::cout << kv.second.data()[i] << ", "; std::cout << "]" << std::endl; } + + // check if option `reqArgs` was specified, and if so check if args were provided if(options.count("reqArgs") > 0 && args.size() == 0) { *accept = false; std::cout << "rejecting subgraph since args were not provided" << std::endl; return MX_SUCCESS; } - - if(options.find("reject") != options.end() && - options["reject"].compare("True") == 0) { + + // check if option `reject` was specified, and if so check if value is 'True' + if(options.count("reject") > 0 && options["reject"].compare("True") == 0) { + // if specified, reject the subgraph. this is only used for testing *accept = false; std::cout << "rejecting subgraph" << std::endl; } else { From 68f3de0723fbb5dc7fafb6ba5a825bd6808ed2dd Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 26 Feb 2020 07:15:40 +0000 Subject: [PATCH 23/53] fixed grammar --- example/extensions/lib_subgraph/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index b90a4a4fa54a..1518548745c6 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -87,7 +87,7 @@ sym_block = nn.SymbolBlock(sym, inputs) sym_block.optimize_for(x, backend='myPart') ``` -In the Gluon hybridize flow, the model is actually hybridized during the first inference, rather than immediately when calling hybridize. This hybridize-based flow is useful if a user expects to run inference immediately after hybridizing. 
But for users that just want to partition but not run a whole forward pass, the `optimize_for` API combines the hybridize/forward APIs but does not run a forward pass. After calling `optimize_for` users can `export` their model immediately without running a forward pass. ### Using a Custom Partitioner Library @@ -97,7 +97,7 @@ optimize_for(backend, args=None, aux=None, ctx=None, **kwargs) ``` -The `optimize_for` API takes at least 1 argument, `backend` which is a string that identifies which backend to partition the model for. The `args` and `aux` arguments are optional and takes a list of NDArray or dict of str to NDArray. They are used to infer shapes and types and before partitioning, and passed to the backend to use during compilation. The `ctx` argument is optional and takes a device context to infer storage types. It also take any other user-specified options that will be passed to the backend partitioning APIs. +The `optimize_for` API takes at least 1 argument, `backend` which is a string that identifies which backend to partition the model for. The `args` and `aux` arguments are optional and take a list of NDArray or dict of str to NDArray. They are used to infer shapes and types before partitioning, and passed to the backend to use during compilation. The `ctx` argument is optional and takes a device context to infer storage types. It also takes any other user-specified options that will be passed to the backend partitioning APIs. For the Gluon API, the `hybridize` API can be called on HybridBlocks to partition the internal CachedOp Symbol. @@ -107,7 +107,7 @@ hybridize(backend=None, backend_opts=None, **kwargs) ``` The `hybridize` function prepares the HybridBlock to be converted into a backend symbol. The `backend` argument is a string that identifies which backend that will partition the model. The `backend_opts` takes other user-specified options that will be passed to the backend partitioning APIs. The actual partitioning takes place during the forward pass. -If you just want to partition the HybridBlock but not run a complete forward pass, you can use `optimize_for` API that combines the work done in the `hybridize` API with part of the work done in the forward pass. +If you just want to partition the HybridBlock but not run a complete forward pass, you can use the `optimize_for` API that combines the work done in the `hybridize` API with part of the work done in the forward pass. ``` optimize_for(x, backend=None, backend_opts=None, **kwargs) @@ -131,7 +131,7 @@ There are several essential building blocks for making a custom partitioner: -* [initialize](./subgraph_lib.cc#L242): +* [initialize](./subgraph_lib.cc#L261): * This function is the library initialization function necessary for any dynamic libraries. It lets you check if the user is using a compatible version of MXNet. Note that this `version` parameter is passed from MXNet when the library is loaded.
MXReturnValue initialize(int version) @@ -144,7 +144,7 @@ There are several essential building blocks for making a custom partitioner: std::vector& ids, std::unordered_map& options) -* [REGISTER_PARTITIONER(my_part_name)](./subgraph_lib.cc#L238): +* [REGISTER_PARTITIONER(my_part_name)](./subgraph_lib.cc#L257): * This macro registers the custom partitioner and its properties to MXNet by its name. Notice that a partitioner can have multiple partitioning strategies. This enables multiple *passes* to be run in a single partitioning call from the user. The first argument to `addStrategy` is a user-specified name. The second argument is the `supportedOps` function. The third argument is the name of the subgraph operator to create for each subgraph created during partitioning (see below for more info about subgraph operators). The `setReviewSubgraph` API registers a callback function that is called for each subgraph created during partitioning (more on this below). Notice that the first argument to this function is the strategy to associate with and the second argument is the `reviewSubgraph` function. REGISTER_PARTITIONER(my_part_name) @@ -154,7 +154,7 @@ There are several essential building blocks for making a custom partitioner: Also there are some optional functions you can specify: -* [reviewSubgraph](./subgraph_lib.cc#L220): +* [reviewSubgraph](./subgraph_lib.cc#L219): * This function provides an opportunity to accept/reject a subgraph after MXNet partitions it. It also allows specifying custom attributes on the subgraph (ie. user-generated IDs). If you do not register this function, subgraphs will be accepted by default. MXReturnValue reviewSubgraph( From 520edccc753b0147b45ec9fb263683dff1909897 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 26 Feb 2020 21:39:39 +0000 Subject: [PATCH 24/53] fixed spelling --- example/extensions/lib_subgraph/README.md | 4 ++-- example/extensions/lib_subgraph/subgraph_lib.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index 1518548745c6..29b1c3301cf3 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -159,7 +159,7 @@ Also there are some optional functions you can specify: MXReturnValue reviewSubgraph( std::string json, - int subraph_id, + int subgraph_id, bool* accept, std::unordered_map& options, std::unordered_map& attrs, @@ -169,7 +169,7 @@ Let’s take a closer look at those registry functions: * **supportedOps**: This function takes four arguments. The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. The 2nd argument is an array of booleans, one for each operator in the model. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `true` for the index in the `ids` array corresponding to the node ID. The last argument is the map of options specified by the user. Users can pass custom options to the partitioner and they are passed to this function in the `options` map. -* **reviewSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. The 2nd argument is the subgraph ID, this is just a number MXNet uses to identify this particular subgraph (it starts at zero and increments). 
The 3rd argument is an output to be set in this function to tell MXNet whether to accept (value: `true`) or reject (value: `false`) the subgraph. You might want to reject a subgraph if it doesnt include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. The 4th argument is the map of options specified by the user. The 5th argument is a map of attributes that should be set on the created subgraph. These attributes will be available later at runtime, and provides a mechanisn to pass info from partition-time to runtime. The last argument is the map of params/weights/args to the model and the associated names. For inputs the the subgraph that come directly from the params/weights of the model, you can look up the name of the input in this map to get the actual tensor values. +* **reviewSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. The 2nd argument is the subgraph ID, this is just a number MXNet uses to identify this particular subgraph (it starts at zero and increments, unique for each subgraph in the model). The 3rd argument is an output to be set in this function to tell MXNet whether to accept (value: `true`) or reject (value: `false`) the subgraph. You might want to reject a subgraph if it doesn't include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. The 4th argument is the map of options specified by the user. The 5th argument is a map of attributes that should be set on the created subgraph. These attributes will be available later at runtime, and provide a mechanism to pass info from partition-time to runtime. The last argument is the map of params/weights/args to the model and the associated names. For inputs to the subgraph that come directly from the params/weights of the model, you can look up the name of the input in this map to get the actual tensor values.
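To make that flow concrete, here is a minimal sketch of a `reviewSubgraph` callback against the signature above. It accepts every subgraph, records the subgraph ID as a custom attribute, and inspects the params that feed the subgraph; the attribute key `user_id` is made up purely for illustration:

```
MXReturnValue myReviewSubgraph(std::string json, int subgraph_id, bool* accept,
                               std::unordered_map<std::string, std::string>& options,
                               std::unordered_map<std::string, std::string>& attrs,
                               std::map<std::string, MXTensor>& args) {
  *accept = true;  // tell MXNet to keep this subgraph
  // any attribute set here is visible on the subgraph op later at runtime
  attrs["user_id"] = std::to_string(subgraph_id);
  // args maps param/weight names to tensors for subgraph inputs fed by model params
  for (auto kv : args)
    std::cout << "arg: " << kv.first << " has " << kv.second.shape.size()
              << " dims" << std::endl;
  return MX_SUCCESS;
}
```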
### Writing A Custom Subgraph Operator diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index 2120890f3bac..dbacf3fff059 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -216,7 +216,7 @@ MXReturnValue mySupportedOps(std::string json, return MX_SUCCESS; } -MXReturnValue myReviewSubgraph(std::string json, int subraph_id, bool* accept, +MXReturnValue myReviewSubgraph(std::string json, int subgraph_id, bool* accept, std::unordered_map& options, std::unordered_map& attrs, std::map& args, From 55d575bd5335c46904589a3dc4daa7e6765369a1 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 26 Feb 2020 21:52:51 +0000 Subject: [PATCH 25/53] added aux argument to the reviewSubgraph API in README --- example/extensions/lib_subgraph/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index 29b1c3301cf3..a4b692f22fb2 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -163,7 +163,8 @@ Also there are some optional functions you can specify: bool* accept, std::unordered_map& options, std::unordered_map& attrs, - std::map& args) + std::map& args, + std::map& aux) Let’s take a closer look at those registry functions: From 005b53c09d7d52367e992c06529482cedf401b5e Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 27 Feb 2020 02:15:09 +0000 Subject: [PATCH 26/53] updated infer shape to use aux for optimize_for --- src/c_api/c_api_symbolic.cc | 49 +++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 577fdae679f2..417e1792d6f9 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -1355,22 +1355,37 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, nnvm::Symbol *sym = static_cast(sym_handle); *s = sym->Copy(); nnvm::Graph g = Symbol2Graph(*s); - if (args_len) { + const auto& indexed_graph = g.indexed_graph(); + const auto& mutable_nodes = indexed_graph.mutable_input_nodes(); + size_t num_forward_inputs = sym->ListInputs(nnvm::Symbol::kAll).size(); + if (args_len || aux_len) { NDArray **in_args_ptr = reinterpret_cast(in_args_handle); + NDArray **in_aux_ptr = reinterpret_cast(in_aux_handle); Context default_ctx = Context::Create(static_cast(dev_type), 0); - mxnet::ShapeVector arg_shapes(args_len); - nnvm::DTypeVector arg_dtypes(args_len); - StorageTypeVector arg_stypes(args_len); - for (mx_uint i = 0; i < args_len; i++) { - const auto &in_arg = *(in_args_ptr[i]); - arg_shapes[i] = in_arg.shape(); - arg_dtypes[i] = in_arg.dtype(); - arg_stypes[i] = in_arg.storage_type(); + mxnet::ShapeVector arg_shapes(args_len + aux_len); + nnvm::DTypeVector arg_dtypes(args_len + aux_len); + StorageTypeVector arg_stypes(args_len + aux_len); + size_t args_top = 0, aux_top = 0; + for (size_t i = 0; i < num_forward_inputs; ++i) { + const uint32_t nid = indexed_graph.input_nodes().at(i); + if (mutable_nodes.count(nid)) { + CHECK_LT(aux_top, aux_len); + const auto &in_arg = *(in_aux_ptr[aux_top++]); + arg_shapes[i] = in_arg.shape(); + arg_dtypes[i] = in_arg.dtype(); + arg_stypes[i] = in_arg.storage_type(); + } else { + CHECK_LT(args_top, args_len); + const auto &in_arg = *(in_args_ptr[args_top++]); + arg_shapes[i] = in_arg.shape(); + arg_dtypes[i] = in_arg.dtype(); + arg_stypes[i] = in_arg.storage_type(); 
+ } } - const auto& indexed_graph = g.indexed_graph(); - const auto num_forward_inputs = indexed_graph.input_nodes().size(); + g.attrs["context"] = std::make_shared( exec::ContextVector(indexed_graph.num_nodes(), default_ctx)); + std::cout << "inferring shapes in optimize_for: " << arg_shapes.size() << std::endl; // infer shapes g = exec::InferShape(std::move(g), std::move(arg_shapes), "__shape__"); // infer dtypes @@ -1388,25 +1403,23 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, std::vector arg_names = sym->ListInputNames(nnvm::Symbol::kReadOnlyArgs); g.attrs["in_args"] = std::make_shared(in_args_ptr); g.attrs["in_arg_names"] = std::make_shared(arg_names); + + std::vector aux_names = sym->ListInputNames(nnvm::Symbol::kAuxiliaryStates); + g.attrs["in_aux"] = std::make_shared(in_aux_ptr); + g.attrs["in_aux_names"] = std::make_shared(aux_names); } else { NDArray **in_args_ptr = static_cast(nullptr); std::vector arg_names; g.attrs["in_args"] = std::make_shared(in_args_ptr); g.attrs["in_arg_names"] = std::make_shared(arg_names); - } - if (aux_len) { - std::vector aux_names = sym->ListInputNames(nnvm::Symbol::kAuxiliaryStates); - NDArray **in_aux_ptr = reinterpret_cast(in_aux_handle); - g.attrs["in_aux"] = std::make_shared(in_aux_ptr); - g.attrs["in_aux_names"] = std::make_shared(aux_names); - } else { NDArray **in_aux_ptr = static_cast(nullptr); std::vector aux_names; g.attrs["in_aux"] = std::make_shared(in_aux_ptr); g.attrs["in_aux_names"] = std::make_shared(aux_names); } + std::vector> options_map; for (mx_uint i = 0; i < num_options; ++i) { options_map.emplace_back(keys[i], vals[i]); From 668e315d64dbad039fbbd6a7056129703f97a513 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 27 Feb 2020 02:23:09 +0000 Subject: [PATCH 27/53] fixed spacing --- src/c_api/c_api_symbolic.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 417e1792d6f9..2d04c9ef4bca 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -1369,17 +1369,17 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, for (size_t i = 0; i < num_forward_inputs; ++i) { const uint32_t nid = indexed_graph.input_nodes().at(i); if (mutable_nodes.count(nid)) { - CHECK_LT(aux_top, aux_len); - const auto &in_arg = *(in_aux_ptr[aux_top++]); - arg_shapes[i] = in_arg.shape(); - arg_dtypes[i] = in_arg.dtype(); - arg_stypes[i] = in_arg.storage_type(); + CHECK_LT(aux_top, aux_len); + const auto &in_arg = *(in_aux_ptr[aux_top++]); + arg_shapes[i] = in_arg.shape(); + arg_dtypes[i] = in_arg.dtype(); + arg_stypes[i] = in_arg.storage_type(); } else { - CHECK_LT(args_top, args_len); - const auto &in_arg = *(in_args_ptr[args_top++]); - arg_shapes[i] = in_arg.shape(); - arg_dtypes[i] = in_arg.dtype(); - arg_stypes[i] = in_arg.storage_type(); + CHECK_LT(args_top, args_len); + const auto &in_arg = *(in_args_ptr[args_top++]); + arg_shapes[i] = in_arg.shape(); + arg_dtypes[i] = in_arg.dtype(); + arg_stypes[i] = in_arg.storage_type(); } } From bb7e52daa68e3d79d570f6c36ab001bccd70f08c Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 27 Feb 2020 19:04:01 +0000 Subject: [PATCH 28/53] changed shape/dtype keys so they dont conflict with MXNet operator attrs --- include/mxnet/lib_api.h | 5 ++++- src/c_api/c_api_symbolic.cc | 2 +- src/operator/subgraph/partitioner/custom_subgraph_property.h | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 
9173df120b8f..ee81909696a2 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -234,7 +234,8 @@ enum MXReturnValue { */ struct MXTensor { MXTensor() : data_ptr(nullptr), dtype(kUNSET), verID(0) {} - + MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), shape(oth.shape), + dtype(oth.dtype), verID(oth.verID), ctx(oth.ctx) {} MXTensor(void *data_ptr, const std::vector &shape, MXDType dtype, size_t vID, MXContext mx_ctx) : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(mx_ctx) {} @@ -407,6 +408,8 @@ class OpResource { */ /*! \brief Macro to help passing serialized subgraph through attribute dict */ #define SUBGRAPH_SYM_JSON "subgraph_sym_json" +#define MX_DTYPE "__dtype__" +#define MX_SHAPE "__shape__" /*! \brief Types of JSON objects */ enum JsonType {ERR, STR, NUM, LIST, MAP}; diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 2d04c9ef4bca..3025bd60f753 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -1385,7 +1385,7 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, g.attrs["context"] = std::make_shared( exec::ContextVector(indexed_graph.num_nodes(), default_ctx)); - std::cout << "inferring shapes in optimize_for: " << arg_shapes.size() << std::endl; + // infer shapes g = exec::InferShape(std::move(g), std::move(arg_shapes), "__shape__"); // infer dtypes diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 53426b714f8a..3349e296ec74 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -162,7 +162,7 @@ class CustomSubgraphProperty: public SubgraphProperty { mxnet::TShape shape = shapes[i]; std::stringstream ss; ss << shape; - node->attrs.dict["shape"] = ss.str(); + node->attrs.dict[MX_SHAPE] = ss.str(); } } // set dtype attrs for each node in the graph @@ -173,7 +173,7 @@ class CustomSubgraphProperty: public SubgraphProperty { int dtype = dtypes[i]; std::stringstream ss; ss << dtype; - node->attrs.dict["dtype"] = ss.str(); + node->attrs.dict[MX_DTYPE] = ss.str(); } } From 20382ae678523195cf2767bcad5492dadae44272 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 27 Feb 2020 19:11:12 +0000 Subject: [PATCH 29/53] added error message to show missing arg/aux --- src/c_api/c_api_symbolic.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 3025bd60f753..83b09fb2db4d 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -1357,7 +1357,8 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, nnvm::Graph g = Symbol2Graph(*s); const auto& indexed_graph = g.indexed_graph(); const auto& mutable_nodes = indexed_graph.mutable_input_nodes(); - size_t num_forward_inputs = sym->ListInputs(nnvm::Symbol::kAll).size(); + std::vector input_names = sym->ListInputNames(nnvm::Symbol::kAll); + size_t num_forward_inputs = input_names.size(); if (args_len || aux_len) { NDArray **in_args_ptr = reinterpret_cast(in_args_handle); NDArray **in_aux_ptr = reinterpret_cast(in_aux_handle); @@ -1369,13 +1370,15 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, for (size_t i = 0; i < num_forward_inputs; ++i) { const uint32_t nid = indexed_graph.input_nodes().at(i); if (mutable_nodes.count(nid)) { - CHECK_LT(aux_top, aux_len); + CHECK_LT(aux_top, aux_len) + << "Cannot find aux '" << input_names[i] << "' in provided aux to optimize_for"; const 
auto &in_arg = *(in_aux_ptr[aux_top++]); arg_shapes[i] = in_arg.shape(); arg_dtypes[i] = in_arg.dtype(); arg_stypes[i] = in_arg.storage_type(); } else { - CHECK_LT(args_top, args_len); + CHECK_LT(args_top, args_len) + << "Cannot find arg '" << input_names[i] << "' in provided args to optimize_for"; const auto &in_arg = *(in_args_ptr[args_top++]); arg_shapes[i] = in_arg.shape(); arg_dtypes[i] = in_arg.dtype(); From a2a9df17cda1d0d70cce9934af21d7a97f766c1f Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 27 Feb 2020 20:17:12 +0000 Subject: [PATCH 30/53] added calls to setDLtensor for MXTensor constructors --- include/mxnet/lib_api.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index ee81909696a2..284e53506ca0 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -235,10 +235,14 @@ enum MXReturnValue { struct MXTensor { MXTensor() : data_ptr(nullptr), dtype(kUNSET), verID(0) {} MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), shape(oth.shape), - dtype(oth.dtype), verID(oth.verID), ctx(oth.ctx) {} + dtype(oth.dtype), verID(oth.verID), ctx(oth.ctx) { + setDLTensor(); + } MXTensor(void *data_ptr, const std::vector &shape, MXDType dtype, size_t vID, MXContext mx_ctx) - : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(mx_ctx) {} + : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(mx_ctx) { + setDLTensor(); + } /*! \brief populate internal tensor fields */ void setTensor(void *dptr, MXDType type, const int64_t* dims, int ndims, From 23958da3302bc60ddef277a2e2f43e29a39243af Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 27 Feb 2020 20:24:12 +0000 Subject: [PATCH 31/53] changed tests to pass aux in addition to args --- tests/python/unittest/test_subgraph_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_subgraph_op.py b/tests/python/unittest/test_subgraph_op.py index f1572e71f128..e414a9836ccb 100644 --- a/tests/python/unittest/test_subgraph_op.py +++ b/tests/python/unittest/test_subgraph_op.py @@ -282,7 +282,7 @@ def check_subgraph_exe6(sym, subgraph_backend, op_names): # infer shape/type before partition before simple_bind check_call(_LIB.MXSetSubgraphPropertyOpNamesV2(c_str(subgraph_backend), mx_uint(len(op_names)), c_str_array(op_names))) - part_sym = sym.optimize_for(subgraph_backend, exe1.arg_dict) + part_sym = sym.optimize_for(subgraph_backend, exe1.arg_dict, exe1.aux_dict) check_call(_LIB.MXRemoveSubgraphPropertyOpNamesV2(c_str(subgraph_backend))) exe2 = part_sym.simple_bind(ctx=mx.current_context(), grad_req='null') @@ -335,7 +335,7 @@ def check_subgraph_exe8(sym, subgraph_backend, op_names): # infer shape/type before partition before bind check_call(_LIB.MXSetSubgraphPropertyOpNamesV2(c_str(subgraph_backend), mx_uint(len(op_names)), c_str_array(op_names))) - part_sym = sym.optimize_for(subgraph_backend, arg_array) + part_sym = sym.optimize_for(subgraph_backend, arg_array, aux_array) check_call(_LIB.MXRemoveSubgraphPropertyOpNamesV2(c_str(subgraph_backend))) exe2 = part_sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null') From 4f1d0d2d4bc3587d79bab59cba0a4dd5765ed1f6 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sat, 29 Feb 2020 18:03:20 +0000 Subject: [PATCH 32/53] fixed bug passing attributes --- include/mxnet/lib_api.h | 3 +-- src/c_api/c_api.cc | 18 +++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git 
a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 284e53506ca0..f99d116b4280 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -1327,9 +1327,8 @@ extern "C" { bool accept_bool = false; // create map of attributes from list std::unordered_map opts; - for (int i = 0; i < num_opts; i++) { + for (int i = 0; i < num_opts; i++) opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - } // create a map of named tensors for args std::map args; diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index bea9277e78c6..3ca44d4d4223 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -192,7 +192,7 @@ void CustomFComputeDispatcher(const std::string op_name, if (fcomp_fp != nullptr) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for (auto kv : attrs->dict) { + for (auto &kv : attrs->dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -360,7 +360,7 @@ int MXLoadLib(const char *path) { auto attr_parser = [=](const NodeAttrs* attrs) { // convert attributes to vector of char std::vector attr_keys, attr_vals; - for (auto kv : attrs->dict) { + for (auto &kv : attrs->dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -387,7 +387,7 @@ int MXLoadLib(const char *path) { auto num_inputs = [=](const NodeAttrs& attrs) { // convert attributes to vector of char std::vector attr_keys, attr_vals; - for (auto kv : attrs.dict) { + for (auto &kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -405,7 +405,7 @@ int MXLoadLib(const char *path) { auto num_outputs = [=](const NodeAttrs& attrs) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for (auto kv : attrs.dict) { + for (auto &kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -424,7 +424,7 @@ int MXLoadLib(const char *path) { auto num_inouts = [=](const NodeAttrs& attrs) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for (auto kv : attrs.dict) { + for (auto &kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -444,7 +444,7 @@ int MXLoadLib(const char *path) { mxnet::ShapeVector *out_shape) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for (auto kv : attrs.dict) { + for (auto &kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -515,7 +515,7 @@ int MXLoadLib(const char *path) { std::vector *out_type) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for (auto kv : attrs.dict) { + for (auto &kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -543,7 +543,7 @@ int MXLoadLib(const char *path) { auto mutate_inputs = [=](const nnvm::NodeAttrs& attrs) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for (auto kv : attrs.dict) { + for (auto &kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -628,7 +628,7 @@ int MXLoadLib(const char *path) { const std::vector& in_types) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for (auto kv : attrs.dict) { + for (auto &kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } From 127bcbfa75a59fa4114913ad1d210845af8188f7 Mon Sep 17 00:00:00 2001 From: Sam Skalicky 
Date: Sat, 29 Feb 2020 18:03:58 +0000 Subject: [PATCH 33/53] fixed memory leak where user attribute strings were not freed --- .../partitioner/custom_subgraph_property.h | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 3349e296ec74..2cfe5ca7e562 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -214,6 +214,7 @@ class CustomSubgraphProperty: public SubgraphProperty { const int subgraph_id = 0) const { int accept = 1; int num_attr = 0; + std::map user_attrs; char** attr_keys = nullptr; char** attr_vals = nullptr; if (review_subgraph_) { @@ -249,6 +250,18 @@ class CustomSubgraphProperty: public SubgraphProperty { aux_types.data(), aux_verIDs.data(), aux_dev_type.data(), aux_dev_id.data())) << "Error calling review_subgraph for '" << subgraph_prop << "'"; + + if(num_attr > 0) { + // set user specified attributes + for (int i=0; i < num_attr; i++) { + user_attrs[attr_keys[i]] = attr_vals[i]; + call_free_(attr_vals[i]); + call_free_(attr_keys[i]); + } + // free memory used by custom op to allocate attributes + call_free_(attr_vals); + call_free_(attr_keys); + } } if (accept) { @@ -256,15 +269,11 @@ class CustomSubgraphProperty: public SubgraphProperty { n->attrs.op = Op::Get(subgraph_op_name); n->attrs.name = "_op" + std::to_string(subgraph_id); n->attrs.subgraphs.push_back(std::make_shared(sym)); + // set user specified attributes - for (int i=0; i < num_attr; i++) { - n->attrs.dict[attr_keys[i]] = attr_vals[i]; - call_free_(attr_vals[i]); - call_free_(attr_keys[i]); - } - // free memory used by custom op to allocate attributes - call_free_(attr_vals); - call_free_(attr_keys); + for(auto attr : user_attrs) + n->attrs.dict[attr.first] = attr.second; + return n; } else { return nullptr; From 3f57b9b0adbe60daf2b79c51bbdbb36b1f37ef5c Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sat, 29 Feb 2020 18:05:15 +0000 Subject: [PATCH 34/53] added passing down shapes/dtypes to subgraph inputs --- src/operator/subgraph/build_subgraph.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index dfbc0b5f7df8..ed5d63c1ba1e 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -567,6 +567,12 @@ void CutGraphInputs(const std::vector &input_entries, } else { n->attrs.dict["isArg"] = "False"; } + // pass down other attributes if available + if(e->node->attrs.dict.count("__dtype__") > 0) + n->attrs.dict["__dtype__"] = e->node->attrs.dict["__dtype__"]; + if(e->node->attrs.dict.count("__shape__") > 0) + n->attrs.dict["__shape__"] = e->node->attrs.dict["__shape__"]; + *e = nnvm::NodeEntry{n, 0, 0}; } } From c971fdcfebfe9b3c30e9027580aba364267f9fba Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sat, 29 Feb 2020 18:44:50 +0000 Subject: [PATCH 35/53] fixed style --- src/operator/subgraph/build_subgraph.cc | 4 ++-- src/operator/subgraph/partitioner/custom_subgraph_property.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index ed5d63c1ba1e..64cd3d76c2e5 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -568,9 +568,9 @@ void CutGraphInputs(const std::vector &input_entries, 
n->attrs.dict["isArg"] = "False"; } // pass down other attributes if available - if(e->node->attrs.dict.count("__dtype__") > 0) + if (e->node->attrs.dict.count("__dtype__") > 0) n->attrs.dict["__dtype__"] = e->node->attrs.dict["__dtype__"]; - if(e->node->attrs.dict.count("__shape__") > 0) + if (e->node->attrs.dict.count("__shape__") > 0) n->attrs.dict["__shape__"] = e->node->attrs.dict["__shape__"]; *e = nnvm::NodeEntry{n, 0, 0}; } } diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 2cfe5ca7e562..a9df1f0bb118 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -33,6 +33,7 @@ #include #include #include +#include #include "../common.h" #include "../subgraph_property.h" #include "../../include/mxnet/lib_api.h" @@ -251,7 +252,7 @@ class CustomSubgraphProperty: public SubgraphProperty { aux_dev_type.data(), aux_dev_id.data())) << "Error calling review_subgraph for '" << subgraph_prop << "'"; - if(num_attr > 0) { + if (num_attr > 0) { // set user specified attributes for (int i=0; i < num_attr; i++) { user_attrs[attr_keys[i]] = attr_vals[i]; @@ -271,7 +272,7 @@ class CustomSubgraphProperty: public SubgraphProperty { n->attrs.subgraphs.push_back(std::make_shared(sym)); // set user specified attributes - for(auto attr : user_attrs) + for (auto attr : user_attrs) n->attrs.dict[attr.first] = attr.second; return n; From 2d9995ae44d8f942bef830d5261da57fb1a22fcc Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 5 Mar 2020 22:58:36 +0000 Subject: [PATCH 36/53] fixed docstring --- python/mxnet/gluon/block.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index d5a502ab8bfc..f5307d7294c4 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1031,8 +1031,24 @@ def _call_cached_op(self, *args): return _regroup(out, self._out_format) def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): - """Activates or deactivates :py:class:`HybridBlock` s recursively. Has no effect on - non-hybrid children. + """Partitions the current HybridBlock and optimizes it for a given backend + without executing a forward pass. Modifies the HybridBlock in-place. + + Immediately partitions a HybridBlock using the specified backend. Combines + the work done in the hybridize API with part of the work done in the forward + pass without calling the CachedOp. Can be used in place of hybridize; + afterwards, `export` can be called or inference can be run. See + example/extensions/lib_subgraph/README.md for more details.
+ + Examples + -------- + # partition and then export to file + block.optimize_for(x, backend='myPart') + block.export('partitioned') + + # partition and then run inference + block.optimize_for(x, backend='myPart') + block(x) Parameters ---------- From ade7d48fc7b977e28213ec24b628c61dc1c0c6f0 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 5 Mar 2020 23:10:29 +0000 Subject: [PATCH 37/53] removed space --- python/mxnet/gluon/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index f5307d7294c4..bed6679be2e6 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1032,7 +1032,7 @@ def _call_cached_op(self, *args): def optimize_for(self, x, *args, backend=None, backend_opts=None, **kwargs): """Partitions the current HybridBlock and optimizes it for a given backend - without executing a forward pass. Modifies the HybridBlock in-place. + without executing a forward pass. Modifies the HybridBlock in-place. Immediately partitions a HybridBlock using the specified backend. Combines the work done in the hybridize API with part of the work done in the forward From 736cd8fd217ef879ab8a0cad0054dcc92c994f9b Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 6 Mar 2020 00:23:44 +0000 Subject: [PATCH 38/53] changed defines --- example/extensions/lib_subgraph/README.md | 2 +- example/extensions/lib_subgraph/subgraph_lib.cc | 6 +++--- include/mxnet/lib_api.h | 6 +++--- src/c_api/c_api.cc | 4 ++-- src/c_api/c_api_symbolic.cc | 1 - .../subgraph/partitioner/custom_subgraph_property.h | 4 ++-- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index a4b692f22fb2..83c823676f18 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -174,7 +174,7 @@ Let’s take a closer look at those registry functions: ### Writing A Custom Subgraph Operator -A partitioning strategy specifies how to partition a model and isolate operators into subgraphs. In MXNet, subgraphs are just a [stateful operator](../lib_custom_op#writing-stateful-custom-operator). Subgraph operators have an extra attribute called `SUBGRAPH_SYM_JSON` that maps to a JSON string of the subgraph. The expectation is that when a subgraph operator executes a forward/backward call, it executes all of the operators in the subgraph. +A partitioning strategy specifies how to partition a model and isolate operators into subgraphs. In MXNet, subgraphs are just a [stateful operator](../lib_custom_op#writing-stateful-custom-operator). Subgraph operators have an extra attribute called `MX_STR_SUBGRAPH_SYM_JSON` that maps to a JSON string of the subgraph. The expectation is that when a subgraph operator executes a forward/backward call, it executes all of the operators in the subgraph. 
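For reference, a stateful subgraph operator can pull the serialized subgraph out of its attribute map when its state is created; the sketch below condenses the `createOpState` pattern from `subgraph_lib.cc` (the same code the diff that follows updates):

```
MXReturnValue createOpState(std::map<std::string, std::string> attrs,
                            CustomStatefulOp** op_inst) {
  std::string serialized_subgraph = "[empty]";
  // MXNet passes the subgraph down as a JSON string in the attribute map
  if (attrs.count(MX_STR_SUBGRAPH_SYM_JSON))
    serialized_subgraph = attrs[MX_STR_SUBGRAPH_SYM_JSON];
  // MyStatefulOp is the example's stateful operator that executes the subgraph
  *op_inst = new MyStatefulOp(serialized_subgraph, attrs);
  return MX_SUCCESS;
}
```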
When registering a custom subgraph operator, all that's needed is to register a `createOpState` function and to set that the operator is a subgraph operator by calling the `setIsSubgraphOp` API like: diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index dbacf3fff059..8c24dd880f72 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -160,11 +160,11 @@ MXReturnValue createOpState(std::map attrs, std::string serialized_subgraph = "[empty]"; // MXNet subgraph is stored as Symbol in operator node attrs subgraphs field // custom subgraph is stored as json string in custom operator attrs map entry - if (attrs.count(SUBGRAPH_SYM_JSON)) { + if (attrs.count(MX_STR_SUBGRAPH_SYM_JSON)) { // user can now parse json and run other custom ops inside subgraph - serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; + serialized_subgraph = attrs[MX_STR_SUBGRAPH_SYM_JSON]; } - attrs.erase(SUBGRAPH_SYM_JSON); + attrs.erase(MX_STR_SUBGRAPH_SYM_JSON); *op_inst = new MyStatefulOp(serialized_subgraph, attrs); std::cout << "Info: stateful operator created" << std::endl; return MX_SUCCESS; diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index f99d116b4280..a3ebea5b1c65 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -411,9 +411,9 @@ class OpResource { * \brief Json utility to parse serialized subgraph symbol */ /*! \brief Macro to help passing serialized subgraph through attribute dict */ -#define SUBGRAPH_SYM_JSON "subgraph_sym_json" -#define MX_DTYPE "__dtype__" -#define MX_SHAPE "__shape__" +#define MX_STR_SUBGRAPH_SYM_JSON "subgraph_sym_json" +#define MX_STR_DTYPE "__dtype__" +#define MX_STR_SHAPE "__shape__" /*! \brief Types of JSON objects */ enum JsonType {ERR, STR, NUM, LIST, MAP}; diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 3ca44d4d4223..4639751ddd9a 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -370,7 +370,7 @@ int MXLoadLib(const char *path) { nnvm::Graph g; g.outputs = attrs->subgraphs[0].get()->outputs; subgraph_json = nnvm::pass::SaveJSON(g); - attr_keys.push_back(SUBGRAPH_SYM_JSON); + attr_keys.push_back(MX_STR_SUBGRAPH_SYM_JSON); attr_vals.push_back(subgraph_json.c_str()); } @@ -639,7 +639,7 @@ int MXLoadLib(const char *path) { nnvm::Graph g; g.outputs = attrs.subgraphs[0].get()->outputs; subgraph_json = nnvm::pass::SaveJSON(g); - attr_keys.push_back(SUBGRAPH_SYM_JSON); + attr_keys.push_back(MX_STR_SUBGRAPH_SYM_JSON); attr_vals.push_back(subgraph_json.c_str()); } diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 83b09fb2db4d..4d464e768a01 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -1422,7 +1422,6 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, g.attrs["in_aux_names"] = std::make_shared(aux_names); } - std::vector> options_map; for (mx_uint i = 0; i < num_options; ++i) { options_map.emplace_back(keys[i], vals[i]); diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index a9df1f0bb118..4003dbe86a5e 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -163,7 +163,7 @@ class CustomSubgraphProperty: public SubgraphProperty { mxnet::TShape shape = shapes[i]; std::stringstream ss; ss << shape; - node->attrs.dict[MX_SHAPE] = ss.str(); + node->attrs.dict[MX_STR_SHAPE] = ss.str(); } } //
set dtype attrs for each node in the graph @@ -174,7 +174,7 @@ class CustomSubgraphProperty: public SubgraphProperty { int dtype = dtypes[i]; std::stringstream ss; ss << dtype; - node->attrs.dict[MX_DTYPE] = ss.str(); + node->attrs.dict[MX_STR_DTYPE] = ss.str(); } } From 7fb9ea35d04b9880dd6ef8b56dbfaf29051a82b2 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 13 Mar 2020 23:36:19 +0000 Subject: [PATCH 39/53] fixed bug in indexing into map with shapes/types when annotating the graph --- .../partitioner/custom_subgraph_property.h | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 4003dbe86a5e..e249f0ce9649 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -117,7 +117,15 @@ class CustomSubgraphProperty: public SubgraphProperty { arg_dev_id.clear(); for (size_t i=0; i < in_arg_names.size(); i++) { arg_names.push_back(in_arg_names[i].c_str()); - const auto &in_arg = *(in_args_ptr[i]); + const NDArray &in_arg = *(in_args_ptr[i]); + + // reorder data if in MKLDNN format + if (in_arg.IsMKLDNNData()) { + in_arg.Reorder2DefaultAsync(); + in_arg.WaitToRead(); + } + + // pull out parts of NDArray to send to backend arg_data.push_back(in_arg.data().dptr_); arg_shapes.push_back(in_arg.shape().data()); arg_dims.push_back(in_arg.shape().ndim()); @@ -140,6 +148,14 @@ class CustomSubgraphProperty: public SubgraphProperty { for (size_t i=0; i < in_aux_names.size(); i++) { aux_names.push_back(in_aux_names[i].c_str()); const auto &in_aux = *(in_aux_ptr[i]); + + // reorder data if in MKLDNN format + if (in_aux.IsMKLDNNData()) { + in_aux.Reorder2DefaultAsync(); + in_aux.WaitToRead(); + } + + // pull out parts of NDArray to send to backend aux_data.push_back(in_aux.data().dptr_); aux_shapes.push_back(in_aux.shape().data()); aux_dims.push_back(in_aux.shape().ndim()); @@ -158,9 +174,11 @@ class CustomSubgraphProperty: public SubgraphProperty { // set shape attrs for each node in the graph if (g.HasAttr("shape")) { mxnet::ShapeVector shapes = g.GetAttr("shape"); - for (unsigned i = 0; i < indexed_graph.num_nodes(); i++) { - nnvm::Node* node = const_cast(indexed_graph[i].source); - mxnet::TShape shape = shapes[i]; + for (unsigned nid = 0; nid < indexed_graph.num_nodes(); nid++) { + nnvm::Node* node = const_cast(indexed_graph[nid].source); + // get the output entry ID for this node + const uint32_t out_entry_id = indexed_graph.entry_id(nid, 0); + mxnet::TShape shape = shapes[out_entry_id]; std::stringstream ss; ss << shape; node->attrs.dict[MX_STR_SHAPE] = ss.str(); @@ -169,9 +187,11 @@ class CustomSubgraphProperty: public SubgraphProperty { // set dtype attrs for each node in the graph if (g.HasAttr("dtype")) { std::vector dtypes = g.GetAttr >("dtype"); - for (unsigned i = 0; i < indexed_graph.num_nodes(); i++) { - nnvm::Node* node = const_cast(indexed_graph[i].source); - int dtype = dtypes[i]; + for (unsigned nid = 0; nid < indexed_graph.num_nodes(); nid++) { + nnvm::Node* node = const_cast(indexed_graph[nid].source); + // get the output entry ID for this node + const uint32_t out_entry_id = indexed_graph.entry_id(nid, 0); + int dtype = dtypes[out_entry_id]; std::stringstream ss; ss << dtype; node->attrs.dict[MX_STR_DTYPE] = ss.str(); @@ -192,12 +212,16 @@ class CustomSubgraphProperty: public SubgraphProperty { opt_keys_.clear(); opt_vals_.clear(); 
options_map_.clear(); - for (auto kv : options_map) { + // store options in map in subgraph property to re-use later for reviewSubgraph + for (auto& kv : options_map) { options_map_.push_back(kv); - opt_keys_.push_back(options_map_.back().first.c_str()); - opt_vals_.push_back(options_map_.back().second.c_str()); } - + // convert options_map_ to char* to pass to backend library + for (auto& kv : options_map_) { + opt_keys_.push_back(kv.first.c_str()); + opt_vals_.push_back(kv.second.c_str()); + } + CHECK(call_supported_ops_(supported_ops_, json, supported_node_IDs.size(), ids, opt_keys_.data(), opt_vals_.data(), opt_keys_.size())) << "Error calling supported_ops for '" << subgraph_prop << "'"; From b0a79e573b25cfeb1e91ee4fbd30a9a0cd7429e6 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 13 Mar 2020 23:37:19 +0000 Subject: [PATCH 40/53] added support for MKLDNN tensor format conversion in case user does preprocessing --- src/c_api/c_api.cc | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 4639751ddd9a..5053bb8a55d9 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -122,14 +122,28 @@ void CustomFComputeDispatcher(const std::string op_name, // convert inputs/outpus NDArray to C types to be passed to lib_api.h for (size_t i = 0; i < inputs.size(); i++) { - in_data.push_back(inputs[i].data().dptr_); - in_shapes.push_back(inputs[i].shape().data()); - in_dims.push_back(inputs[i].shape().ndim()); - in_types.push_back(inputs[i].dtype()); - in_verIDs.push_back(inputs[i].version()); - const char* ctx_str = inputs[i].ctx().dev_mask() == Context::kCPU ? "cpu" : "gpu"; - in_dev_type.push_back(ctx_str); - in_dev_id.push_back(inputs[i].ctx().real_dev_id()); + const NDArray& in_nd = inputs[i]; + // reorder data if in MKLDNN format + if(in_nd.IsMKLDNNData()) { + const NDArray& tmp_nd = in_nd.Reorder2Default(); + in_data.push_back(tmp_nd.data().dptr_); + in_shapes.push_back(tmp_nd.shape().data()); + in_dims.push_back(tmp_nd.shape().ndim()); + in_types.push_back(tmp_nd.dtype()); + in_verIDs.push_back(tmp_nd.version()); + const char* ctx_str = tmp_nd.ctx().dev_mask() == Context::kCPU ? "cpu" : "gpu"; + in_dev_type.push_back(ctx_str); + in_dev_id.push_back(tmp_nd.ctx().real_dev_id()); + } else { + in_data.push_back(in_nd.data().dptr_); + in_shapes.push_back(in_nd.shape().data()); + in_dims.push_back(in_nd.shape().ndim()); + in_types.push_back(in_nd.dtype()); + in_verIDs.push_back(in_nd.version()); + const char* ctx_str = in_nd.ctx().dev_mask() == Context::kCPU ? 
"cpu" : "gpu"; + in_dev_type.push_back(ctx_str); + in_dev_id.push_back(in_nd.ctx().real_dev_id()); + } } for (size_t i = 0; i < outputs.size(); i++) { From 488740aedfa1213229e6e60e079ccfb1436dd920 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 13 Mar 2020 23:38:06 +0000 Subject: [PATCH 41/53] cleaned up code and added comments --- include/mxnet/lib_api.h | 6 +++--- src/c_api/c_api_symbolic.cc | 10 +++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index a3ebea5b1c65..c3e5ed3276e3 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -1285,11 +1285,11 @@ extern "C" { int num_ids, int *ids, const char* const* opt_keys, const char* const* opt_vals, int num_opts) { std::string subgraph_json(json); - // create map of attributes from list + // create map of options from list std::unordered_map opts; - for (int i = 0; i < num_opts; i++) { + for (int i = 0; i < num_opts; i++) opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - } + // create array of bools for operator support std::vector _ids(num_ids, false); // call user's supportedOps function diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 4d464e768a01..d2b17a920c9c 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -1350,6 +1350,7 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, const mx_uint num_options, const char** keys, const char** vals) { + // create copy of input symbol nnvm::Symbol *s = new nnvm::Symbol(); API_BEGIN(); nnvm::Symbol *sym = static_cast(sym_handle); @@ -1367,6 +1368,7 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, nnvm::DTypeVector arg_dtypes(args_len + aux_len); StorageTypeVector arg_stypes(args_len + aux_len); size_t args_top = 0, aux_top = 0; + // loop over inputs to symbol in order and add to args/aux if mutable for (size_t i = 0; i < num_forward_inputs; ++i) { const uint32_t nid = indexed_graph.input_nodes().at(i); if (mutable_nodes.count(nid)) { @@ -1403,6 +1405,7 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, common::HandleInferStorageTypeError(num_forward_inputs, indexed_graph, g.GetAttr("storage_type")); } + // set args/aux as attributes on graph so that subgraph property can use them std::vector arg_names = sym->ListInputNames(nnvm::Symbol::kReadOnlyArgs); g.attrs["in_args"] = std::make_shared(in_args_ptr); g.attrs["in_arg_names"] = std::make_shared(arg_names); @@ -1411,6 +1414,7 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, g.attrs["in_aux"] = std::make_shared(in_aux_ptr); g.attrs["in_aux_names"] = std::make_shared(aux_names); } else { + // args/aux were not specified, so set nullptr/empty-lists NDArray **in_args_ptr = static_cast(nullptr); std::vector arg_names; g.attrs["in_args"] = std::make_shared(in_args_ptr); @@ -1421,11 +1425,11 @@ int MXOptimizeForBackend(SymbolHandle sym_handle, g.attrs["in_aux"] = std::make_shared(in_aux_ptr); g.attrs["in_aux_names"] = std::make_shared(aux_names); } - + // create a data structure from pointer array std::vector> options_map; - for (mx_uint i = 0; i < num_options; ++i) { + for (mx_uint i = 0; i < num_options; ++i) options_map.emplace_back(keys[i], vals[i]); - } + const auto backend = mxnet::op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); const auto& subgraph_prop_list = backend->GetSubgraphProperties(); for (auto property : subgraph_prop_list) { From 3b278bec2c2f6dd8980cb563ba9e0c2276d4afc1 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 13 Mar 2020 23:45:52 
+0000 Subject: [PATCH 42/53] fixed whitespace --- src/c_api/c_api.cc | 2 +- src/operator/subgraph/partitioner/custom_subgraph_property.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 5053bb8a55d9..5e5f70e196b6 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -124,7 +124,7 @@ void CustomFComputeDispatcher(const std::string op_name, for (size_t i = 0; i < inputs.size(); i++) { const NDArray& in_nd = inputs[i]; // reorder data if in MKLDNN format - if(in_nd.IsMKLDNNData()) { + if (in_nd.IsMKLDNNData()) { const NDArray& tmp_nd = in_nd.Reorder2Default(); in_data.push_back(tmp_nd.data().dptr_); in_shapes.push_back(tmp_nd.shape().data()); diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index e249f0ce9649..fa0819e1495e 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -221,7 +221,7 @@ class CustomSubgraphProperty: public SubgraphProperty { opt_keys_.push_back(kv.first.c_str()); opt_vals_.push_back(kv.second.c_str()); } - + CHECK(call_supported_ops_(supported_ops_, json, supported_node_IDs.size(), ids, opt_keys_.data(), opt_vals_.data(), opt_keys_.size())) << "Error calling supported_ops for '" << subgraph_prop << "'"; From 26734feb7f79ca00c9831190379bc71e12aeb32a Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 13 Mar 2020 23:59:37 +0000 Subject: [PATCH 43/53] added guards around MKLDNN checks for non-MKLDNN builds --- src/operator/subgraph/partitioner/custom_subgraph_property.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index fa0819e1495e..013f9b2575fd 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -119,11 +119,13 @@ class CustomSubgraphProperty: public SubgraphProperty { arg_names.push_back(in_arg_names[i].c_str()); const NDArray &in_arg = *(in_args_ptr[i]); +#if MXNET_USE_MKLDNN == 1 // reorder data if in MKLDNN format if (in_arg.IsMKLDNNData()) { in_arg.Reorder2DefaultAsync(); in_arg.WaitToRead(); } +#endif // pull out parts of NDArray to send to backend arg_data.push_back(in_arg.data().dptr_); @@ -149,11 +151,13 @@ class CustomSubgraphProperty: public SubgraphProperty { aux_names.push_back(in_aux_names[i].c_str()); const auto &in_aux = *(in_aux_ptr[i]); +#if MXNET_USE_MKLDNN == 1 // reorder data if in MKLDNN format if (in_aux.IsMKLDNNData()) { in_aux.Reorder2DefaultAsync(); in_aux.WaitToRead(); } +#endif // pull out parts of NDArray to send to backend aux_data.push_back(in_aux.data().dptr_); From 277288dc902f91e65223093bebbe80a1ec1514f4 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sat, 14 Mar 2020 05:18:07 +0000 Subject: [PATCH 44/53] refactor to use pointers to reduce code duplication --- src/c_api/c_api.cc | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 5e5f70e196b6..b7022a0da021 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -120,30 +120,25 @@ void CustomFComputeDispatcher(const std::string op_name, std::vector in_dev_type, out_dev_type; std::vector in_dev_id, out_dev_id; + std::vector conv_mkl; //converted NDArrays from MKLDNN format // convert inputs/outpus NDArray to C types to 
be passed to lib_api.h for (size_t i = 0; i < inputs.size(); i++) { - const NDArray& in_nd = inputs[i]; + NDArray const* in_nd = &(inputs[i]); // reorder data if in MKLDNN format - if (in_nd.IsMKLDNNData()) { - const NDArray& tmp_nd = in_nd.Reorder2Default(); - in_data.push_back(tmp_nd.data().dptr_); - in_shapes.push_back(tmp_nd.shape().data()); - in_dims.push_back(tmp_nd.shape().ndim()); - in_types.push_back(tmp_nd.dtype()); - in_verIDs.push_back(tmp_nd.version()); - const char* ctx_str = tmp_nd.ctx().dev_mask() == Context::kCPU ? "cpu" : "gpu"; - in_dev_type.push_back(ctx_str); - in_dev_id.push_back(tmp_nd.ctx().real_dev_id()); - } else { - in_data.push_back(in_nd.data().dptr_); - in_shapes.push_back(in_nd.shape().data()); - in_dims.push_back(in_nd.shape().ndim()); - in_types.push_back(in_nd.dtype()); - in_verIDs.push_back(in_nd.version()); - const char* ctx_str = in_nd.ctx().dev_mask() == Context::kCPU ? "cpu" : "gpu"; - in_dev_type.push_back(ctx_str); - in_dev_id.push_back(in_nd.ctx().real_dev_id()); + if (in_nd->IsMKLDNNData()) { + // convert from MKLDNN + conv_mkl.push_back(in_nd->Reorder2Default()); + in_nd = &(conv_mkl.back()); } + // pull out parts to pass over to library + in_data.push_back(in_nd->data().dptr_); + in_shapes.push_back(in_nd->shape().data()); + in_dims.push_back(in_nd->shape().ndim()); + in_types.push_back(in_nd->dtype()); + in_verIDs.push_back(in_nd->version()); + const char* ctx_str = in_nd->ctx().dev_mask() == Context::kCPU ? "cpu" : "gpu"; + in_dev_type.push_back(ctx_str); + in_dev_id.push_back(in_nd->ctx().real_dev_id()); } for (size_t i = 0; i < outputs.size(); i++) { From 5940450cf1678fc55e4086ba7b4f48faced08554 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sat, 14 Mar 2020 05:23:04 +0000 Subject: [PATCH 45/53] added MKLDNN guards for custom op --- src/c_api/c_api.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index b7022a0da021..957b8f6cf468 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -124,12 +124,14 @@ void CustomFComputeDispatcher(const std::string op_name, // convert inputs/outpus NDArray to C types to be passed to lib_api.h for (size_t i = 0; i < inputs.size(); i++) { NDArray const* in_nd = &(inputs[i]); +#if MXNET_USE_MKLDNN == 1 // reorder data if in MKLDNN format if (in_nd->IsMKLDNNData()) { // convert from MKLDNN conv_mkl.push_back(in_nd->Reorder2Default()); in_nd = &(conv_mkl.back()); } +#endif // pull out parts to pass over to library in_data.push_back(in_nd->data().dptr_); in_shapes.push_back(in_nd->shape().data()); From c1d3f5ee1565883ef97e6136184f6442db428000 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sat, 14 Mar 2020 05:49:32 +0000 Subject: [PATCH 46/53] fixed whitespace --- src/c_api/c_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 957b8f6cf468..c805d1516b96 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -119,8 +119,8 @@ void CustomFComputeDispatcher(const std::string op_name, std::vector in_verIDs, out_verIDs; std::vector in_dev_type, out_dev_type; std::vector in_dev_id, out_dev_id; + std::vector conv_mkl; // converted NDArrays from MKLDNN format - std::vector conv_mkl; //converted NDArrays from MKLDNN format // convert inputs/outpus NDArray to C types to be passed to lib_api.h for (size_t i = 0; i < inputs.size(); i++) { NDArray const* in_nd = &(inputs[i]); From 0b38e5c5d16b15b646a47bf482635263b799d6ed Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Mon, 16 Mar 2020 21:27:08 +0000 
Subject: [PATCH 47/53] added subgraph property API to let subg_prop initialize subgraph inputs --- src/operator/subgraph/subgraph_property.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/operator/subgraph/subgraph_property.h b/src/operator/subgraph/subgraph_property.h index f765aba8a5a4..a710c5e4e668 100644 --- a/src/operator/subgraph/subgraph_property.h +++ b/src/operator/subgraph/subgraph_property.h @@ -357,6 +357,14 @@ class SubgraphProperty { std::vector* orig_input_entries) const { subgraph_node->inputs = *orig_input_entries; } + /*! + * \brief Initialize subgraph internal inputs with external input entries. + * Called before CreateSubgraphNode, optional + * \param input_entries input entries inside subgraph + * \param orig_input_entries input entries outside subgraph + */ + virtual void InitSubgraphInputs(std::vector* input_entries, + std::vector* orig_input_entries) const {} /*! * \brief Set an attr with name in the attr map. */ From d59a4dc415dd54e6011e6268050ddce219818e68 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Mon, 16 Mar 2020 21:28:04 +0000 Subject: [PATCH 48/53] moved custom code to subgraph property API, cleaned up build_subgraph.cc --- src/operator/subgraph/build_subgraph.cc | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index 64cd3d76c2e5..2d5501d26f86 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -560,18 +560,6 @@ void CutGraphInputs(const std::vector &input_entries, } nnvm::ObjectPtr n = nnvm::CreateVariableNode( var_name + std::to_string(name_count_map[var_name])); - // set attribute for subgraph input to indicate if it is from an arg/param to model - if (e->node->is_variable()) { - n->attrs.dict["isArg"] = "True"; - n->attrs.dict["argName"] = var_name; - } else { - n->attrs.dict["isArg"] = "False"; - } - // pass down other attributes if available - if (e->node->attrs.dict.count("__dtype__") > 0) - n->attrs.dict["__dtype__"] = e->node->attrs.dict["__dtype__"]; - if (e->node->attrs.dict.count("__shape__") > 0) - n->attrs.dict["__shape__"] = e->node->attrs.dict["__shape__"]; *e = nnvm::NodeEntry{n, 0, 0}; } @@ -591,7 +579,7 @@ void ReattachGraphInputs(const std::vector &input_entries, } /*! - * \brief Replace a set of nodes belonging to the same subgraph with a subgrpah node + * \brief Replace a set of nodes belonging to the same subgraph with a subgraph node * and keep the subgraph in the subgraph node. 
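 * The subgraph property's InitSubgraphInputs hook (declared in subgraph_property.h)
 * is invoked on the input entries before CreateSubgraphNode, so a property can
 * annotate or rewrite its subgraph inputs first.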
*/ void CreateSubgraphNode(nnvm::Graph* g, @@ -621,6 +609,7 @@ void CreateSubgraphNode(nnvm::Graph* g, sym.outputs[i] = *output_entries[i]; } const SubgraphPropertyPtr& subg_prop = g->GetAttr("subgraph_property"); + subg_prop->InitSubgraphInputs(&input_entries, &orig_input_entries); nnvm::ObjectPtr n = subg_prop->CreateSubgraphNode(sym, subgraph_selector, subgraph_id); // CreateSubgraphNode returns NULL if subgraph property determines that subgraph is sub-optimal // In that case, subgraph node is not created and graph is not modified From 5abc8c393b984d4035d170e084ab24180620eb62 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Mon, 16 Mar 2020 21:44:56 +0000 Subject: [PATCH 49/53] added support for ops with multiple outputs and InitSubgraphInputs --- .../partitioner/custom_subgraph_property.h | 115 ++++++++++++++++-- 1 file changed, 107 insertions(+), 8 deletions(-) diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 013f9b2575fd..e86cbac40d12 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -180,11 +180,16 @@ class CustomSubgraphProperty: public SubgraphProperty { mxnet::ShapeVector shapes = g.GetAttr("shape"); for (unsigned nid = 0; nid < indexed_graph.num_nodes(); nid++) { nnvm::Node* node = const_cast(indexed_graph[nid].source); - // get the output entry ID for this node - const uint32_t out_entry_id = indexed_graph.entry_id(nid, 0); - mxnet::TShape shape = shapes[out_entry_id]; std::stringstream ss; - ss << shape; + ss << "["; + // set the output shapes for this node + for (unsigned oid = 0; oid < node->num_outputs(); oid++) { + const uint32_t out_entry_id = indexed_graph.entry_id(nid, oid); + mxnet::TShape shape = shapes[out_entry_id]; + ss << shape; + if(oid < node->num_outputs()-1) ss << ","; + } + ss << "]"; node->attrs.dict[MX_STR_SHAPE] = ss.str(); } } @@ -193,11 +198,16 @@ class CustomSubgraphProperty: public SubgraphProperty { std::vector dtypes = g.GetAttr >("dtype"); for (unsigned nid = 0; nid < indexed_graph.num_nodes(); nid++) { nnvm::Node* node = const_cast(indexed_graph[nid].source); - // get the output entry ID for this node - const uint32_t out_entry_id = indexed_graph.entry_id(nid, 0); - int dtype = dtypes[out_entry_id]; std::stringstream ss; - ss << dtype; + ss << "["; + // set the output dtypes for this node + for (unsigned oid = 0; oid < node->num_outputs(); oid++) { + const uint32_t out_entry_id = indexed_graph.entry_id(nid, oid); + int dtype = dtypes[out_entry_id]; + ss << dtype; + if(oid < node->num_outputs()-1) ss << ","; + } + ss << "]"; node->attrs.dict[MX_STR_DTYPE] = ss.str(); } } @@ -299,6 +309,38 @@ class CustomSubgraphProperty: public SubgraphProperty { n->attrs.name = "_op" + std::to_string(subgraph_id); n->attrs.subgraphs.push_back(std::make_shared(sym)); + // set shapes + { + std::stringstream ss; + ss << "["; + for (unsigned i=0; i < sym.outputs.size(); i++) { + nnvm::Node* n = sym.outputs[i].node.get(); + if (n->attrs.dict.count("__shape__") > 0) { + std::string& shape = n->attrs.dict["__shape__"]; + ss << shape.substr(1,shape.length()-2); //strip off outer square brackets [] + } + if (i < sym.outputs.size()-1) + ss << ","; + } + ss << "]"; + n->attrs.dict["__shape__"]=ss.str(); + } + // set dtypes + { + std::stringstream ss; + ss << "["; + for (unsigned i=0; i < sym.outputs.size(); i++) { + nnvm::Node* n = sym.outputs[i].node.get(); + if 
(n->attrs.dict.count("__dtype__") > 0) { + std::string& dtype = n->attrs.dict["__dtype__"]; + ss << dtype.substr(1,dtype.length()-2); //strip off outer square brackets [] + } + if (i < sym.outputs.size()-1) + ss << ","; + } + ss << "]"; + n->attrs.dict["__dtype__"]=ss.str(); + } // set user specified attributes for (auto attr : user_attrs) n->attrs.dict[attr.first] = attr.second; @@ -308,6 +350,63 @@ class CustomSubgraphProperty: public SubgraphProperty { return nullptr; } } + + virtual void InitSubgraphInputs(std::vector* input_entries, + std::vector* orig_input_entries) const { + std::cout << "in InitSubgraphInputs" << std::endl; + for (size_t i = 0; i < input_entries->size(); ++i) { + nnvm::NodeEntry *e = input_entries->at(i); + nnvm::NodeEntry& orig = orig_input_entries->at(i); + + // set attribute for subgraph input to indicate if it is from an arg/param to model + if (orig.node->is_variable()) { + // get name of original output entry + nnvm::Symbol sym; + sym.outputs.push_back(orig); + const auto output_names = sym.ListOutputNames(); + CHECK_EQ(output_names.size(), 1U); + const std::string& var_name = output_names[0]; + + e->node->attrs.dict["isArg"] = "True"; + e->node->attrs.dict["argName"] = var_name; + } else { + e->node->attrs.dict["isArg"] = "False"; + } + + // pass down other attributes if available + if (orig.node->attrs.dict.count("__dtype__") > 0) { + // get dtype string from other node + std::string& dtype = orig.node->attrs.dict["__dtype__"]; + int idx = 0; + // find the beginning of the output dtype for the particular output index + for (unsigned x=0; x < orig.index; x++) + idx = dtype.find("[",idx+1); + if (idx == 0) idx++; // if output index is 0, start after first square bracket [ + int stop = dtype.find("]",idx); // find stop index for this output dtype + std::stringstream ss; + // create new dtype string for this node + ss << "[" << dtype.substr(idx,stop-idx+1) << "]"; + e->node->attrs.dict["__dtype__"] = ss.str(); + } + + if (orig.node->attrs.dict.count("__shape__") > 0) { + // get shape string from other node + std::string& shape = orig.node->attrs.dict["__shape__"]; + int idx = 0; + // find the beginning of the output shape for the particular output index + for (unsigned x=0; x < orig.index; x++) + idx = shape.find("[",idx+1); + if (idx == 0) idx++; // if output index is 0, start after first square bracket [ + int stop = shape.find("]",idx); // find stop index for this output shape + std::stringstream ss; + // create new shape string for this node + ss << "[" << shape.substr(idx,stop-idx+1) << "]"; + e->node->attrs.dict["__shape__"] = ss.str(); + } + } + std::cout << "-----------------------------" << std::endl; + } + // override CreateSubgraphSelector virtual SubgraphSelectorPtr CreateSubgraphSelector() const { return std::make_shared(supported_nodes); From 90f6973e1ff66dce7a6dd980d26f6fc47a1dbb38 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Mon, 16 Mar 2020 22:30:27 +0000 Subject: [PATCH 50/53] fixed sanity, removed prints --- .../partitioner/custom_subgraph_property.h | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index e86cbac40d12..afdcefaf7b91 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -187,7 +187,7 @@ class CustomSubgraphProperty: public SubgraphProperty { const uint32_t out_entry_id = 
indexed_graph.entry_id(nid, oid); mxnet::TShape shape = shapes[out_entry_id]; ss << shape; - if(oid < node->num_outputs()-1) ss << ","; + if (oid < node->num_outputs()-1) ss << ","; } ss << "]"; node->attrs.dict[MX_STR_SHAPE] = ss.str(); @@ -205,7 +205,7 @@ class CustomSubgraphProperty: public SubgraphProperty { const uint32_t out_entry_id = indexed_graph.entry_id(nid, oid); int dtype = dtypes[out_entry_id]; ss << dtype; - if(oid < node->num_outputs()-1) ss << ","; + if (oid < node->num_outputs()-1) ss << ","; } ss << "]"; node->attrs.dict[MX_STR_DTYPE] = ss.str(); @@ -317,13 +317,13 @@ class CustomSubgraphProperty: public SubgraphProperty { nnvm::Node* n = sym.outputs[i].node.get(); if (n->attrs.dict.count("__shape__") > 0) { std::string& shape = n->attrs.dict["__shape__"]; - ss << shape.substr(1,shape.length()-2); //strip off outer square brackets [] + ss << shape.substr(1, shape.length()-2); // strip off outer square brackets [] } if (i < sym.outputs.size()-1) ss << ","; } ss << "]"; - n->attrs.dict["__shape__"]=ss.str(); + n->attrs.dict["__shape__"] = ss.str(); } // set dtypes { @@ -333,13 +333,13 @@ class CustomSubgraphProperty: public SubgraphProperty { nnvm::Node* n = sym.outputs[i].node.get(); if (n->attrs.dict.count("__dtype__") > 0) { std::string& dtype = n->attrs.dict["__dtype__"]; - ss << dtype.substr(1,dtype.length()-2); //strip off outer square brackets [] + ss << dtype.substr(1, dtype.length()-2); // strip off outer square brackets [] } if (i < sym.outputs.size()-1) ss << ","; } ss << "]"; - n->attrs.dict["__dtype__"]=ss.str(); + n->attrs.dict["__dtype__"] = ss.str(); } // set user specified attributes for (auto attr : user_attrs) @@ -353,7 +353,6 @@ class CustomSubgraphProperty: public SubgraphProperty { virtual void InitSubgraphInputs(std::vector* input_entries, std::vector* orig_input_entries) const { - std::cout << "in InitSubgraphInputs" << std::endl; for (size_t i = 0; i < input_entries->size(); ++i) { nnvm::NodeEntry *e = input_entries->at(i); nnvm::NodeEntry& orig = orig_input_entries->at(i); @@ -380,12 +379,12 @@ class CustomSubgraphProperty: public SubgraphProperty { int idx = 0; // find the beginning of the output dtype for the particular output index for (unsigned x=0; x < orig.index; x++) - idx = dtype.find("[",idx+1); - if (idx == 0) idx++; // if output index is 0, start after first square bracket [ - int stop = dtype.find("]",idx); // find stop index for this output dtype + idx = dtype.find("[", idx+1); + if (idx == 0) idx++; // if output index is 0, start after first square bracket [ + int stop = dtype.find("]", idx); // find stop index for this output dtype std::stringstream ss; // create new dtype string for this node - ss << "[" << dtype.substr(idx,stop-idx+1) << "]"; + ss << "[" << dtype.substr(idx, stop-idx+1) << "]"; e->node->attrs.dict["__dtype__"] = ss.str(); } @@ -395,16 +394,15 @@ class CustomSubgraphProperty: public SubgraphProperty { int idx = 0; // find the beginning of the output shape for the particular output index for (unsigned x=0; x < orig.index; x++) - idx = shape.find("[",idx+1); - if (idx == 0) idx++; // if output index is 0, start after first square bracket [ - int stop = shape.find("]",idx); // find stop index for this output shape + idx = shape.find("[", idx+1); + if (idx == 0) idx++; // if output index is 0, start after first square bracket [ + int stop = shape.find("]",idx); // find stop index for this output shape std::stringstream ss; // create new shape string for this node - ss << "[" << shape.substr(idx,stop-idx+1) << "]"; + ss 
<< "[" << shape.substr(idx, stop-idx+1) << "]"; e->node->attrs.dict["__shape__"] = ss.str(); } } - std::cout << "-----------------------------" << std::endl; } // override CreateSubgraphSelector From 28b6bef8048aa45bf0e754c15ff78449acecad46 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Mon, 16 Mar 2020 23:43:29 +0000 Subject: [PATCH 51/53] fixed whitespace --- src/operator/subgraph/partitioner/custom_subgraph_property.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index afdcefaf7b91..81da9da78f5c 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -396,7 +396,7 @@ class CustomSubgraphProperty: public SubgraphProperty { for (unsigned x=0; x < orig.index; x++) idx = shape.find("[", idx+1); if (idx == 0) idx++; // if output index is 0, start after first square bracket [ - int stop = shape.find("]",idx); // find stop index for this output shape + int stop = shape.find("]", idx); // find stop index for this output shape std::stringstream ss; // create new shape string for this node ss << "[" << shape.substr(idx, stop-idx+1) << "]"; From 516d149ea1eb47fbb344a871969629bc1b39da27 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 17 Mar 2020 07:30:07 +0000 Subject: [PATCH 52/53] fixed shape/dtype parsing --- include/mxnet/lib_api.h | 26 ++++++++++++ .../partitioner/custom_subgraph_property.h | 40 +++++++------------ 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index c3e5ed3276e3..3ceebb601ee8 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -415,6 +415,32 @@ class OpResource { #define MX_STR_DTYPE "__dtype__" #define MX_STR_SHAPE "__shape__" +/* \brief get shape value from list of shapes string + * format: [[1]] or [[1],[2]] + */ +std::string getShapeAt(std::string& shape, unsigned index) { + int idx = 1; // start at 1 to skip the first square bracket [ + // find the beginning of the output shape for the particular output index + for (unsigned x=0; x < index; x++) + idx = shape.find("[", idx+1); + int stop = shape.find("]", idx); // find stop index for this output shape + // add this shape to the list + return shape.substr(idx, stop-idx+1); +} + +/* \brief get dtype value from list of dtypes string + * format: [1] or [1,2] + */ +std::string getDtypeAt(std::string& dtype, unsigned index) { + // find the beginning of the output dtype for the particular output index + int idx = 0; + for (unsigned x=0; x < index; x++) + idx = dtype.find(",", idx+1); + int stop = dtype.find(",", idx+1); // find stop index for this output dtype + if (stop == -1) stop = dtype.find("]",idx+1); + return dtype.substr(idx+1, stop-idx-1); +} + /*! 
\brief Types of JSON objects */ enum JsonType {ERR, STR, NUM, LIST, MAP}; diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 81da9da78f5c..d550cd776fd9 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -185,7 +185,7 @@ class CustomSubgraphProperty: public SubgraphProperty { // set the output shapes for this node for (unsigned oid = 0; oid < node->num_outputs(); oid++) { const uint32_t out_entry_id = indexed_graph.entry_id(nid, oid); - mxnet::TShape shape = shapes[out_entry_id]; + mxnet::TShape& shape = shapes[out_entry_id]; ss << shape; if (oid < node->num_outputs()-1) ss << ","; } @@ -314,10 +314,11 @@ class CustomSubgraphProperty: public SubgraphProperty { std::stringstream ss; ss << "["; for (unsigned i=0; i < sym.outputs.size(); i++) { - nnvm::Node* n = sym.outputs[i].node.get(); - if (n->attrs.dict.count("__shape__") > 0) { - std::string& shape = n->attrs.dict["__shape__"]; - ss << shape.substr(1, shape.length()-2); // strip off outer square brackets [] + const nnvm::NodeEntry& e = sym.outputs[i]; + if (e.node->attrs.dict.count("__shape__") > 0) { + std::string& shape = e.node->attrs.dict["__shape__"]; + // add this shape to the list + ss << getShapeAt(shape, e.index); } if (i < sym.outputs.size()-1) ss << ","; @@ -330,10 +331,11 @@ class CustomSubgraphProperty: public SubgraphProperty { std::stringstream ss; ss << "["; for (unsigned i=0; i < sym.outputs.size(); i++) { - nnvm::Node* n = sym.outputs[i].node.get(); - if (n->attrs.dict.count("__dtype__") > 0) { - std::string& dtype = n->attrs.dict["__dtype__"]; - ss << dtype.substr(1, dtype.length()-2); // strip off outer square brackets [] + const nnvm::NodeEntry& e = sym.outputs[i]; + if (e.node->attrs.dict.count("__dtype__") > 0) { + std::string& dtype = e.node->attrs.dict["__dtype__"]; // format: [1,2] + // add this dtype to the list + ss << getDtypeAt(dtype, e.index); } if (i < sym.outputs.size()-1) ss << ","; @@ -344,7 +346,6 @@ class CustomSubgraphProperty: public SubgraphProperty { // set user specified attributes for (auto attr : user_attrs) n->attrs.dict[attr.first] = attr.second; - return n; } else { return nullptr; @@ -376,30 +377,17 @@ class CustomSubgraphProperty: public SubgraphProperty { if (orig.node->attrs.dict.count("__dtype__") > 0) { // get dtype string from other node std::string& dtype = orig.node->attrs.dict["__dtype__"]; - int idx = 0; - // find the beginning of the output dtype for the particular output index - for (unsigned x=0; x < orig.index; x++) - idx = dtype.find("[", idx+1); - if (idx == 0) idx++; // if output index is 0, start after first square bracket [ - int stop = dtype.find("]", idx); // find stop index for this output dtype std::stringstream ss; - // create new dtype string for this node - ss << "[" << dtype.substr(idx, stop-idx+1) << "]"; + ss << "[" << getDtypeAt(dtype, orig.index) << "]"; e->node->attrs.dict["__dtype__"] = ss.str(); } if (orig.node->attrs.dict.count("__shape__") > 0) { // get shape string from other node std::string& shape = orig.node->attrs.dict["__shape__"]; - int idx = 0; - // find the beginning of the output shape for the particular output index - for (unsigned x=0; x < orig.index; x++) - idx = shape.find("[", idx+1); - if (idx == 0) idx++; // if output index is 0, start after first square bracket [ - int stop = shape.find("]", idx); // find stop index for this output shape - 
std::stringstream ss; // create new shape string for this node - ss << "[" << shape.substr(idx, stop-idx+1) << "]"; + std::stringstream ss; + ss << "[" << getShapeAt(shape, orig.index) << "]"; e->node->attrs.dict["__shape__"] = ss.str(); } } From 4e2efec545def329730502d737fc0817e98cf710 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 17 Mar 2020 16:31:26 +0000 Subject: [PATCH 53/53] fixed lint --- include/mxnet/lib_api.h | 6 +++--- .../subgraph/partitioner/custom_subgraph_property.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 3ceebb601ee8..9b32122c7d7a 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -418,7 +418,7 @@ class OpResource { /* \brief get shape value from list of shapes string * format: [[1]] or [[1],[2]] */ -std::string getShapeAt(std::string& shape, unsigned index) { +std::string getShapeAt(const std::string& shape, unsigned index) { int idx = 1; // start at 1 to skip the first square bracket [ // find the beginning of the output shape for the particular output index for (unsigned x=0; x < index; x++) @@ -431,13 +431,13 @@ std::string getShapeAt(std::string& shape, unsigned index) { /* \brief get dtype value from list of dtypes string * format: [1] or [1,2] */ -std::string getDtypeAt(std::string& dtype, unsigned index) { +std::string getDtypeAt(const std::string& dtype, unsigned index) { // find the beginning of the output dtype for the particular output index int idx = 0; for (unsigned x=0; x < index; x++) idx = dtype.find(",", idx+1); int stop = dtype.find(",", idx+1); // find stop index for this output dtype - if (stop == -1) stop = dtype.find("]",idx+1); + if (stop == -1) stop = dtype.find("]", idx+1); return dtype.substr(idx+1, stop-idx-1); } diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index d550cd776fd9..b7f2cc2d0fef 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -333,7 +333,7 @@ class CustomSubgraphProperty: public SubgraphProperty { for (unsigned i=0; i < sym.outputs.size(); i++) { const nnvm::NodeEntry& e = sym.outputs[i]; if (e.node->attrs.dict.count("__dtype__") > 0) { - std::string& dtype = e.node->attrs.dict["__dtype__"]; // format: [1,2] + std::string& dtype = e.node->attrs.dict["__dtype__"]; // add this dtype to the list ss << getDtypeAt(dtype, e.index); }
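As a quick sanity check of the string formats introduced in PATCH 52/53, the sketch below exercises the two parsing helpers on hand-written inputs. It is illustrative only and not part of the patch series: the main() driver and the sample shape/dtype strings are invented for demonstration, while the helper bodies are copied from the lib_api.h hunks above.

// Illustrative sketch (not part of the patches): exercises getShapeAt/getDtypeAt
// on the "[[1],[2]]" shape-list and "[1,2]" dtype-list formats written by the
// graph-annotation code in custom_subgraph_property.h.
#include <iostream>
#include <string>

// helper bodies copied from the PATCH 52/53 version of include/mxnet/lib_api.h
std::string getShapeAt(const std::string& shape, unsigned index) {
  int idx = 1;  // start at 1 to skip the first square bracket [
  // find the beginning of the output shape for the particular output index
  for (unsigned x = 0; x < index; x++)
    idx = shape.find("[", idx+1);
  int stop = shape.find("]", idx);  // find stop index for this output shape
  return shape.substr(idx, stop-idx+1);
}

std::string getDtypeAt(const std::string& dtype, unsigned index) {
  // find the beginning of the output dtype for the particular output index
  int idx = 0;
  for (unsigned x = 0; x < index; x++)
    idx = dtype.find(",", idx+1);
  int stop = dtype.find(",", idx+1);  // find stop index for this output dtype
  if (stop == -1) stop = dtype.find("]", idx+1);
  return dtype.substr(idx+1, stop-idx-1);
}

int main() {
  // sample values (invented): a node with two outputs
  std::string shapes = "[[3,224,224],[10]]";
  std::string dtypes = "[0,4]";
  std::cout << getShapeAt(shapes, 0) << std::endl;  // prints [3,224,224]
  std::cout << getShapeAt(shapes, 1) << std::endl;  // prints [10]
  std::cout << getDtypeAt(dtypes, 0) << std::endl;  // prints 0
  std::cout << getDtypeAt(dtypes, 1) << std::endl;  // prints 4
  return 0;
}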