From 4fea36d9952109538cb0423d158c9708df6fa594 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 11 Aug 2020 16:07:29 +0000 Subject: [PATCH 01/43] initial commit --- example/extensions/lib_external_ops/Makefile | 25 +++++++ example/extensions/lib_external_ops/README.md | 1 + .../extensions/lib_external_ops/libtest.cc | 69 +++++++++++++++++++ .../extensions/lib_external_ops/min_ex-inl.h | 40 +++++++++++ example/extensions/lib_external_ops/min_ex.cc | 15 ++++ 5 files changed, 150 insertions(+) create mode 100644 example/extensions/lib_external_ops/Makefile create mode 100644 example/extensions/lib_external_ops/README.md create mode 100644 example/extensions/lib_external_ops/libtest.cc create mode 100644 example/extensions/lib_external_ops/min_ex-inl.h create mode 100644 example/extensions/lib_external_ops/min_ex.cc diff --git a/example/extensions/lib_external_ops/Makefile b/example/extensions/lib_external_ops/Makefile new file mode 100644 index 000000000000..c3e82635ec87 --- /dev/null +++ b/example/extensions/lib_external_ops/Makefile @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +all: + g++ -shared -fPIC -std=c++11 min_ex.cc.o -o libmin_ex.so + +test: + g++ -std=c++11 -O3 -o libtest libtest.cc -ldl + +clean: + rm -rf libmin_ex.so diff --git a/example/extensions/lib_external_ops/README.md b/example/extensions/lib_external_ops/README.md new file mode 100644 index 000000000000..2fd9f9570028 --- /dev/null +++ b/example/extensions/lib_external_ops/README.md @@ -0,0 +1 @@ +TBD \ No newline at end of file diff --git a/example/extensions/lib_external_ops/libtest.cc b/example/extensions/lib_external_ops/libtest.cc new file mode 100644 index 000000000000..3453c2815dd3 --- /dev/null +++ b/example/extensions/lib_external_ops/libtest.cc @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
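+ * A library that passes this check simply needs to compile against
+ * lib_api.h and export an `initialize` symbol, which is what dlsym looks
+ * up below. A minimal sketch of such a library, modeled on
+ * example/extensions/lib_api/init_lib.cc (the version cutoff 10700 is
+ * illustrative only):
+ *
+ *   #include <iostream>
+ *   #include "lib_api.h"
+ *   using namespace mxnet::ext;
+ *
+ *   MXReturnValue initialize(int version) {
+ *     if (version >= 10700) {
+ *       std::cout << "MXNet version " << version << " supported" << std::endl;
+ *       return MX_SUCCESS;
+ *     }
+ *     std::cout << "MXNet version " << version << " not supported" << std::endl;
+ *     return MX_FAIL;
+ *   }
+ */
+
+/*!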
+ * Copyright (c) 2015 by Contributors
+ * \file libtest.cc
+ * \brief This test checks if the library is implemented correctly
+ * and does not involve dynamic loading of library into MXNet
+ * This test is supposed to be run before test.py
+ */
+
+#include <dlfcn.h>
+
+#include <iostream>
+
+
+int main(void) {
+  dlerror();
+  void *mx;
+  mx = dlopen("libmxnet.so", RTLD_LAZY | RTLD_GLOBAL);
+
+  if (!mx) {
+    std::cerr << "Unable to load libmxnet.so" << std::endl;
+    char* err = dlerror();
+    if(err)
+      std::cerr << err << std::endl;
+    return 1;
+  }
+
+  // Get a handle to the library.
+  void *handle;
+  handle = dlopen("libmin_ex.so", RTLD_LAZY);
+
+  if (!handle) {
+    std::cerr << "Unable to load library" << std::endl;
+    char* err = dlerror();
+    if(err)
+      std::cerr << err << std::endl;
+    return 1;
+  }
+
+  // get initialize function address from the library
+  void* init_lib = dlsym(handle, "initialize");
+
+  if (!init_lib) {
+    std::cerr << "Unable to get function 'initialize' from library" << std::endl;
+    return 1;
+  }
+
+  dlclose(handle);
+
+  return 0;
+}
diff --git a/example/extensions/lib_external_ops/min_ex-inl.h b/example/extensions/lib_external_ops/min_ex-inl.h
new file mode 100644
index 000000000000..56784f6dbdd1
--- /dev/null
+++ b/example/extensions/lib_external_ops/min_ex-inl.h
@@ -0,0 +1,40 @@
+#ifndef MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_
+#define MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_
+
+#include <dmlc/parameter.h>
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "mxnet_op.h"
+#include "operator_common.h"
+#include "elemwise_op_common.h"
+
+namespace mxnet {
+namespace op {
+
+void MinExForward(const nnvm::NodeAttrs& attrs,
+                  const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs) {
+  //do nothing
+}
+
+
+inline bool MinExOpShape(const nnvm::NodeAttrs& attrs,
+                         mxnet::ShapeVector* in_attrs,
+                         mxnet::ShapeVector* out_attrs) {
+  //do nothing
+  return true;
+}
+
+inline bool MinExOpType(const nnvm::NodeAttrs& attrs,
+                        std::vector<int> *in_attrs,
+                        std::vector<int> *out_attrs) {
+  //do nothing
+  return true;
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_
diff --git a/example/extensions/lib_external_ops/min_ex.cc b/example/extensions/lib_external_ops/min_ex.cc
new file mode 100644
index 000000000000..930704dcaad2
--- /dev/null
+++ b/example/extensions/lib_external_ops/min_ex.cc
@@ -0,0 +1,15 @@
+#include "min_ex-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(min_ex)
+.describe("some description")
+.set_num_inputs(0)
+.set_num_outputs(0)
+.set_attr<mxnet::FInferShape>("FInferShape", MinExOpShape)
+.set_attr<nnvm::FInferType>("FInferType", MinExOpType)
+.set_attr<FCompute>("FCompute<cpu>", MinExForward);
+
+}  // namespace op
+}  // namespace mxnet
From 3cea397bcc202aadacf25208c8ef226481ff4b35 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 06:05:45 +0000
Subject: [PATCH 02/43] split lib_api.h into lib_api.cc, updated examples for
 2.0/gluon

---
 example/extensions/lib_api/Makefile           |    2 +-
 example/extensions/lib_api/init_lib.cc        |    2 +-
 example/extensions/lib_custom_op/Makefile     |    8 +-
 example/extensions/lib_custom_op/gemm_lib.cc  |    2 +-
 example/extensions/lib_custom_op/relu_lib.cu  |    2 +-
 .../lib_custom_op/test_transposecsr.py        |   24 +-
 .../lib_custom_op/test_transposerowsp.py      |   26 +-
 .../lib_custom_op/transposecsr_lib.cc         |    2 +-
 .../lib_custom_op/transposerowsp_lib.cc       |    2 +-
 example/extensions/lib_pass/Makefile          |    2 +-
 example/extensions/lib_pass/pass_lib.cc       |    2 +-
 example/extensions/lib_subgraph/Makefile      |    2 +-
 .../extensions/lib_subgraph/subgraph_lib.cc   |    2 +-
 include/mxnet/lib_api.h                       | 1535 
+++------------- src/lib_api.cc | 1567 +++++++++++++++++ 15 files changed, 1814 insertions(+), 1366 deletions(-) create mode 100644 src/lib_api.cc diff --git a/example/extensions/lib_api/Makefile b/example/extensions/lib_api/Makefile index cb529390b77f..e71e4d8ac114 100644 --- a/example/extensions/lib_api/Makefile +++ b/example/extensions/lib_api/Makefile @@ -16,7 +16,7 @@ # under the License. all: - g++ -std=c++11 -shared -fPIC init_lib.cc -o libinit_lib.so -I ../../../include/mxnet + g++ -std=c++11 -shared -fPIC init_lib.cc ../../../src/lib_api.cc -o libinit_lib.so -I ../../../include test: g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../../include/mxnet diff --git a/example/extensions/lib_api/init_lib.cc b/example/extensions/lib_api/init_lib.cc index 0ed43761fe53..a21c481bee2f 100644 --- a/example/extensions/lib_api/init_lib.cc +++ b/example/extensions/lib_api/init_lib.cc @@ -24,7 +24,7 @@ */ #include -#include "lib_api.h" +#include "mxnet/lib_api.h" using namespace mxnet::ext; diff --git a/example/extensions/lib_custom_op/Makefile b/example/extensions/lib_custom_op/Makefile index feded2947ca3..c16d7cd0207e 100644 --- a/example/extensions/lib_custom_op/Makefile +++ b/example/extensions/lib_custom_op/Makefile @@ -18,16 +18,16 @@ all: gemm_lib relu_lib transposecsr_lib transposerowsp_lib gemm_lib: - g++ -shared -fPIC -std=c++11 gemm_lib.cc -o libgemm_lib.so -I ../../../include/mxnet + g++ -shared -fPIC -std=c++11 gemm_lib.cc ../../../src/lib_api.cc -o libgemm_lib.so -I ../../../include relu_lib: - nvcc -shared -std=c++11 -Xcompiler -fPIC relu_lib.cu -o librelu_lib.so -I ../../../include/mxnet + nvcc -shared -std=c++11 -Xcompiler -fPIC relu_lib.cu ../../../src/lib_api.cc -o librelu_lib.so -I ../../../include transposecsr_lib: - g++ -shared -fPIC -std=c++11 transposecsr_lib.cc -o libtransposecsr_lib.so -I ../../../include/mxnet + g++ -shared -fPIC -std=c++11 transposecsr_lib.cc ../../../src/lib_api.cc -o libtransposecsr_lib.so -I ../../../include transposerowsp_lib: - g++ -shared -fPIC -std=c++11 transposerowsp_lib.cc -o libtransposerowsp_lib.so -I ../../../include/mxnet + g++ -shared -fPIC -std=c++11 transposerowsp_lib.cc ../../../src/lib_api.cc -o libtransposerowsp_lib.so -I ../../../include clean: rm -rf libgemm_lib.so librelu_lib.so libtransposecsr_lib.so libtransposerowsp_lib.so diff --git a/example/extensions/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc index 6081713cda67..164ac014f922 100644 --- a/example/extensions/lib_custom_op/gemm_lib.cc +++ b/example/extensions/lib_custom_op/gemm_lib.cc @@ -25,7 +25,7 @@ #include #include -#include "lib_api.h" +#include "mxnet/lib_api.h" using namespace mxnet::ext; diff --git a/example/extensions/lib_custom_op/relu_lib.cu b/example/extensions/lib_custom_op/relu_lib.cu index 7022c76e6999..34ce08db6373 100644 --- a/example/extensions/lib_custom_op/relu_lib.cu +++ b/example/extensions/lib_custom_op/relu_lib.cu @@ -24,7 +24,7 @@ */ #include -#include "lib_api.h" +#include "mxnet/lib_api.h" using namespace mxnet::ext; diff --git a/example/extensions/lib_custom_op/test_transposecsr.py b/example/extensions/lib_custom_op/test_transposecsr.py index 37d066a7bec2..5f670aeedd9b 100644 --- a/example/extensions/lib_custom_op/test_transposecsr.py +++ b/example/extensions/lib_custom_op/test_transposecsr.py @@ -54,25 +54,25 @@ print("indices:", c.indices.asnumpy()) print("indptr:", c.indptr.asnumpy()) -print("--------start symbolic compute--------") +print("--------start Gluon compute--------") d = mx.sym.Variable('d') e = 
mx.sym.my_transposecsr(d)
 f = mx.sym.my_state_transposecsr(d, test_kw=200)
-exe = e.bind(ctx=mx.cpu(),args={'d':a})
-exe2 = f.bind(ctx=mx.cpu(),args={'d':a})
-out = exe.forward()
+block = mx.gluon.nn.SymbolBlock(e, [d])
+out = block(a)
 print("Compute Results:")
-print("data:", out[0].data.asnumpy())
-print("indices:", out[0].indices.asnumpy())
-print("indptr:", out[0].indptr.asnumpy())
+print("data:", out.data.asnumpy())
+print("indices:", out.indices.asnumpy())
+print("indptr:", out.indptr.asnumpy())
 
-out2 = exe2.forward()
-out2 = exe2.forward()
+block2 = mx.gluon.nn.SymbolBlock(f,[d])
+out2 = block2(a)
+out2 = block2(a)
 print("Stateful Compute Result:")
-print("data:", out2[0].data.asnumpy())
-print("indices:", out2[0].indices.asnumpy())
-print("indptr:", out2[0].indptr.asnumpy())
+print("data:", out2.data.asnumpy())
+print("indices:", out2.indices.asnumpy())
+print("indptr:", out2.indptr.asnumpy())
 
 print("--------Baseline(dense)--------")
 print(mx.nd.transpose(a.tostype('default')))
diff --git a/example/extensions/lib_custom_op/test_transposerowsp.py b/example/extensions/lib_custom_op/test_transposerowsp.py
index cea62ec6e98c..d2d2c2eeeb32 100644
--- a/example/extensions/lib_custom_op/test_transposerowsp.py
+++ b/example/extensions/lib_custom_op/test_transposerowsp.py
@@ -51,23 +51,23 @@
 print("data:", c.data.asnumpy())
 print("indices:", c.indices.asnumpy())
 
-print("--------start symbolic compute--------")
+print("--------start Gluon compute--------")
 d = mx.sym.Variable('d')
 e = mx.sym.my_transposerowsp(d)
 f = mx.sym.my_state_transposerowsp(d, test_kw=200)
-exe = e.bind(ctx=mx.cpu(),args={'d':a})
-exe2 = f.bind(ctx=mx.cpu(),args={'d':a})
-out = exe.forward()
+block = mx.gluon.nn.SymbolBlock(e,[d])
+out = block(a)
 print("Compute Results:")
-print("data:", out[0].data.asnumpy())
-print("indices:", out[0].indices.asnumpy())
+print("data:", out.data.asnumpy())
+print("indices:", out.indices.asnumpy())
 
-out2 = exe2.forward()
-out2 = exe2.forward()
+block2 = mx.gluon.nn.SymbolBlock(f,[d])
+out2 = block2(a)
+out2 = block2(a)
 print("Stateful Compute Result:")
-print("data:", out2[0].data.asnumpy())
-print("indices:", out2[0].indices.asnumpy())
+print("data:", out2.data.asnumpy())
+print("indices:", out2.indices.asnumpy())
 
 print("--------Baseline(dense)--------")
 print(mx.nd.transpose(a.tostype('default')))
diff --git a/example/extensions/lib_custom_op/transposecsr_lib.cc b/example/extensions/lib_custom_op/transposecsr_lib.cc
index 0a882f4d2517..2ef85c4b46eb 100644
--- a/example/extensions/lib_custom_op/transposecsr_lib.cc
+++ b/example/extensions/lib_custom_op/transposecsr_lib.cc
@@ -25,7 +25,7 @@
 
 #include
 #include
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
diff --git a/example/extensions/lib_custom_op/transposerowsp_lib.cc b/example/extensions/lib_custom_op/transposerowsp_lib.cc
index cb4592239ef9..72b9b312566a 100644
--- a/example/extensions/lib_custom_op/transposerowsp_lib.cc
+++ b/example/extensions/lib_custom_op/transposerowsp_lib.cc
@@ -25,7 +25,7 @@
 
 #include
 #include
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
diff --git a/example/extensions/lib_pass/Makefile b/example/extensions/lib_pass/Makefile
index 759a08c48c89..e555b191ecf5 100644
--- a/example/extensions/lib_pass/Makefile
+++ b/example/extensions/lib_pass/Makefile
@@ -18,7 +18,7 @@
 all: pass_lib
 
 pass_lib:
-	g++ -shared -fPIC -std=c++11 pass_lib.cc -o libpass_lib.so -I ../../../include/mxnet
+	g++ -shared -fPIC -std=c++11 pass_lib.cc ../../../src/lib_api.cc -o libpass_lib.so -I ../../../include
 
 clean:
 	rm -rf libpass_lib.so
diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc
index 825d38290936..fb9a2d42f8d3 100644
--- a/example/extensions/lib_pass/pass_lib.cc
+++ b/example/extensions/lib_pass/pass_lib.cc
@@ -26,7 +26,7 @@
 #include
 #include
 #include
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
diff --git a/example/extensions/lib_subgraph/Makefile b/example/extensions/lib_subgraph/Makefile
index c45100b69ef7..5449e3af9c58 100644
--- a/example/extensions/lib_subgraph/Makefile
+++ b/example/extensions/lib_subgraph/Makefile
@@ -18,7 +18,7 @@
 all: subgraph_lib
 
 subgraph_lib:
-	g++ -shared -fPIC -std=c++11 subgraph_lib.cc -o libsubgraph_lib.so -I ../../../include/mxnet
+	g++ -shared -fPIC -std=c++11 subgraph_lib.cc ../../../src/lib_api.cc -o libsubgraph_lib.so -I ../../../include
 
 clean:
 	rm -rf libsubgraph_lib.so
diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc
index b2b5a74f2d0a..1f39345cc460 100644
--- a/example/extensions/lib_subgraph/subgraph_lib.cc
+++ b/example/extensions/lib_subgraph/subgraph_lib.cc
@@ -27,7 +27,7 @@
 #include
 #include
 #include
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 3367bc661c12..9e17056f4c44 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -216,6 +216,44 @@ extern "C" {
 namespace mxnet {
 namespace ext {
 
+/* \brief Class to store error messages from extensions to pass to MXNet */
+class MXerrorMsgs {
+ public:
+  /*!
+   * \brief get singleton pointer to class
+   * \returns pointer to class
+   */
+  static MXerrorMsgs* get() {
+    static MXerrorMsgs inst;
+    return &inst;
+  }
+  /*!
+   * \brief add a new error message
+   */
+  std::stringstream& add(const char* file, int line) {
+    messages.push_back(std::stringstream());
+    messages.back() << file << "[" << line << "]: ";
+    return messages.back();
+  }
+  int size() {
+    return messages.size();
+  }
+  const std::string* get(int idx) {
+    return new std::string(messages.at(idx).str());
+  }
+
+ private:
+  /*! \brief constructor */
+  MXerrorMsgs() {}
+  /*! \brief destructor */
+  ~MXerrorMsgs() {}
+  /*! \brief list of stored error messages */
+  std::vector<std::stringstream> messages;
+};
+
+// Add a new error message, example: MX_ERROR_MSG << "my error msg";
+#define MX_ERROR_MSG MXerrorMsgs::get()->add(__FILE__, __LINE__)
+
 /*! 
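 * Example (sketch, not part of the patch): an extension can report failures
 * through the MX_ERROR_MSG stream defined above instead of printing to
 * stdout; MXNet drains MXerrorMsgs and surfaces the text to the frontend.
 * The attribute name "scalar" is hypothetical.
 *
 *   MXReturnValue parseAttrs(const std::unordered_map<std::string,
 *                                                     std::string>& attrs,
 *                            int* num_in, int* num_out) {
 *     if (attrs.count("scalar") == 0) {
 *       MX_ERROR_MSG << "Missing required attribute 'scalar'";
 *       return MX_FAIL;
 *     }
 *     *num_in = 1;
 *     *num_out = 1;
 *     return MX_SUCCESS;
 *   }
 */
 /*!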
 * \brief Tensor data type, consistent with mshadow data type
 */
@@ -248,15 +286,13 @@ enum MXStorageType {
  * dev_id is the device index where the tensor locates
  */
 struct MXContext {
-  MXContext() : dev_type("error"), dev_id(-1) {}
-  explicit MXContext(std::string dev_type_, int dev_id_)
-    : dev_type(dev_type_), dev_id(dev_id_) {}
-  explicit MXContext(const char* dev_type_, int dev_id_)
-    : dev_type(dev_type_), dev_id(dev_id_) {}
-  static MXContext CPU() { return MXContext("cpu", 0); }
-  static MXContext GPU() { return MXContext("gpu", 0); }
-  static MXContext CPU(int dev_id) { return MXContext("cpu", dev_id); }
-  static MXContext GPU(int dev_id) { return MXContext("gpu", dev_id); }
+  MXContext();
+  explicit MXContext(std::string dev_type_, int dev_id_);
+  explicit MXContext(const char* dev_type_, int dev_id_);
+  static MXContext CPU();
+  static MXContext GPU();
+  static MXContext CPU(int dev_id);
+  static MXContext GPU(int dev_id);
 
   std::string dev_type;
   int dev_id;
@@ -286,115 +322,25 @@ struct MXSparse {
   int64_t indptr_len;
 
   void set(void *data_ptr, const int64_t* dims, int ndims, void *idx,
-           int64_t num_idx, void *idx_ptr = nullptr, int64_t num_idx_ptr = 0) {
-    data = data_ptr;
-    // If CSR, num of non-zero elemets is num_idx,
-    // If row sparse, num of elements is num_idx * width.
-    data_len = num_idx;
-    if (!idx_ptr) {
-      for (int i = 1; i < ndims; ++i)
-        data_len *= dims[i];
-    }
-
-    indices = reinterpret_cast<int64_t*>(idx);
-    indices_len = num_idx;
-
-    if (idx_ptr) {
-      indptr = reinterpret_cast<int64_t*>(idx_ptr);
-      indptr_len = num_idx_ptr;
-    }
-  }
+           int64_t num_idx, void *idx_ptr = nullptr, int64_t num_idx_ptr = 0);
 };
 
 /*!
 * \brief Tensor data structure used by custom operator
 */
 struct MXTensor {
-  MXTensor() : data_ptr(nullptr), dtype(kUNSET), verID(0), stype(kDefaultStorage) {}
-  MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), shape(oth.shape),
-    dtype(oth.dtype), verID(oth.verID), ctx(oth.ctx), stype(oth.stype) {
-    setDLTensor();
-  }
+  MXTensor();
+  MXTensor(const MXTensor& oth);
   MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype,
-           size_t vID, MXContext mx_ctx, MXStorageType stype = kDefaultStorage)
-    : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(mx_ctx), stype(stype) {
-    setDLTensor();
-  }
+           size_t vID, MXContext mx_ctx, MXStorageType stype = kDefaultStorage);
 
   /*! \brief populate internal tensor fields */
   void setTensor(void *dptr, MXDType type, const int64_t* dims, int ndims,
-                 size_t vID, MXContext mx_ctx, MXStorageType storage_type) {
-    data_ptr = dptr; dtype = type; verID = vID; ctx = mx_ctx; stype = storage_type;
-    shape.clear();
-    for (int j = 0; j < ndims; j++) {
-      shape.push_back(dims[j]);
-    }
-    setDLTensor();
-  }
+                 size_t vID, MXContext mx_ctx, MXStorageType storage_type);
 
  /*! 
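 * Example (sketch): how a custom operator's forward function typically
 * reads an MXTensor, mirroring the patterns in
 * example/extensions/lib_custom_op (gemm_lib.cc for dense,
 * transposecsr_lib.cc for sparse):
 *
 *   MXTensor& in = inputs->at(0);
 *   if (in.stype == kDefaultStorage) {
 *     float* data = in.data<float>();        // dense: flat data pointer
 *     int64_t len = in.size();               // number of elements
 *   } else if (in.stype == kCSRStorage) {
 *     MXSparse* csr = in.data<MXSparse>();   // sparse: header struct
 *     float* val = static_cast<float*>(csr->data);
 *   }
 */
 /*!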
\brief populate DLTensor fields */
-  void setDLTensor() {
-    dltensor.data = data_ptr;
-    dltensor.ndim = shape.size();
-    dltensor.shape = const_cast<int64_t*>(shape.data());
-    dltensor.strides = nullptr;
-    dltensor.byte_offset = 0;
-    dltensor.dtype.lanes = 1;
-    dltensor.ctx.device_id = ctx.dev_id;
-    if (ctx.dev_type == "cpu")
-      dltensor.ctx.device_type = kDLCPU;
-    else if (ctx.dev_type == "gpu")
-      dltensor.ctx.device_type = kDLGPU;
-    else if (ctx.dev_type == "opencl")
-      dltensor.ctx.device_type = kDLOpenCL;
-    else if (ctx.dev_type == "vulcan")
-      dltensor.ctx.device_type = kDLVulkan;
-    else if (ctx.dev_type == "metal")
-      dltensor.ctx.device_type = kDLMetal;
-    else if (ctx.dev_type == "vpi")
-      dltensor.ctx.device_type = kDLVPI;
-    else if (ctx.dev_type == "rocm")
-      dltensor.ctx.device_type = kDLROCM;
-    else
-      dltensor.ctx.device_type = kDLExtDev;
-    switch (dtype) {
-    case kFloat32:
-      dltensor.dtype.code = kDLFloat;
-      dltensor.dtype.bits = 32;
-      break;
-    case kFloat64:
-      dltensor.dtype.code = kDLFloat;
-      dltensor.dtype.bits = 64;
-      break;
-    case kFloat16:
-      dltensor.dtype.code = kDLFloat;
-      dltensor.dtype.bits = 16;
-      break;
-    case kUint8:
-      dltensor.dtype.code = kDLUInt;
-      dltensor.dtype.bits = 8;
-      break;
-    case kInt32:
-      dltensor.dtype.code = kDLInt;
-      dltensor.dtype.bits = 32;
-      break;
-    case kInt8:
-      dltensor.dtype.code = kDLInt;
-      dltensor.dtype.bits = 8;
-      break;
-    case kInt64:
-      dltensor.dtype.code = kDLInt;
-      dltensor.dtype.bits = 64;
-      break;
-    default:
-      dltensor.dtype.code = 0;
-      dltensor.dtype.bits = 0;
-      throw std::runtime_error("Error! Invalid dtype flag: "
-                               + std::to_string(static_cast<int>(dtype))
-                               + " when constructing MXTensor");
-    }
-  }
-
+  void setDLTensor();
+
   /*! \brief helper function to cast data pointer */
   template <typename data_type>
   inline data_type* data() {
    return reinterpret_cast<data_type*>(data_ptr);
  }

  /*! \brief helper function to get data size */
-  inline int64_t size() const {
-    int64_t size = 1;
-    for (unsigned int i = 0; i < shape.size(); i++) {
-      size *= shape[i];
-    }
-    return size;
-  }
+  int64_t size() const;

  /*! 
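 * The populated `dltensor` field lets an extension hand the same memory to
 * any DLPack-aware kernel without copying. A sketch, where myKernel is a
 * hypothetical DLPack consumer:
 *
 *   void myKernel(const DLTensor& dl);   // hypothetical
 *   ...
 *   MXTensor& t = inputs->at(0);
 *   myKernel(t.dltensor);                // filled in by setDLTensor()
 */
 /*!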
\brief helper function to compare two MXTensors */
-  inline bool isSame(const MXTensor &oth) const {
-    return data_ptr == oth.data_ptr &&
-           dtype == oth.dtype &&
-           verID == oth.verID &&
-           ctx.dev_type == oth.ctx.dev_type &&
-           ctx.dev_id == oth.ctx.dev_id &&
-           shape == oth.shape &&
-           stype == oth.stype;
-  }
+  bool isSame(const MXTensor &oth) const;
 
   // For dense, data_ptr points to 1D flattened tensor data
   // For sparse, data_ptr points to MXSparse
@@ -473,28 +405,15 @@ class PassResource {
  public:
   PassResource(std::unordered_map<std::string, MXTensor>* new_args,
                std::unordered_map<std::string, MXTensor>* new_aux,
-               nd_malloc_t nd_malloc, const void* nd_alloc)
-    : new_args_(new_args), new_aux_(new_aux), nd_malloc_(nd_malloc), nd_alloc_(nd_alloc) {}
+               nd_malloc_t nd_malloc, const void* nd_alloc);
+
   // allocate new arg param, adds to args map, returns newly allocated tensor
   MXTensor* alloc_arg(const std::string& name, const std::vector<int64_t>& shapes,
-                      const MXContext &ctx, MXDType dtype) const {
-    void* data;
-    nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id,
-               dtype, name.c_str(), 1, &data);
-    MXTensor tensor(data, shapes, dtype, 0, ctx, kDefaultStorage);
-    (*new_args_)[name] = tensor;
-    return &(new_args_->at(name));
-  }
+                      const MXContext &ctx, MXDType dtype) const;
+
   // allocate new aux param, adds to aux map, returns newly allocated tensor
   MXTensor* alloc_aux(const std::string& name, const std::vector<int64_t>& shapes,
-                      const MXContext &ctx, MXDType dtype) const {
-    void* data;
-    nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id,
-               dtype, name.c_str(), 0, &data);
-    MXTensor tensor(data, shapes, dtype, 0, ctx, kDefaultStorage);
-    (*new_aux_)[name] = tensor;
-    return &(new_aux_->at(name));
-  }
+                      const MXContext &ctx, MXDType dtype) const;
 
  private:
  std::unordered_map<std::string, MXTensor>* new_args_;
  std::unordered_map<std::string, MXTensor>* new_aux_;
@@ -511,45 +430,28 @@ class OpResource {
   OpResource(xpu_malloc_t cpu_malloc_fp, void* cpu_alloc_fp,
              xpu_malloc_t gpu_malloc_fp, void* gpu_alloc_fp, void* stream,
              sparse_malloc_t sparse_malloc_fp, void* sparse_alloc_fp,
-             void* rng_cpu_states, void* rng_gpu_states)
-    : cpu_malloc(cpu_malloc_fp), gpu_malloc(gpu_malloc_fp),
-      cpu_alloc(cpu_alloc_fp), gpu_alloc(gpu_alloc_fp), cuda_stream(stream),
-      sparse_malloc(sparse_malloc_fp), sparse_alloc(sparse_alloc_fp),
-      rand_cpu_states(rng_cpu_states), rand_gpu_states(rng_gpu_states) {}
+             void* rng_cpu_states, void* rng_gpu_states);
+
   /*! \brief allocate cpu memory controlled by MXNet */
-  void* alloc_cpu(int size) const {
-    return cpu_malloc(cpu_alloc, size);
-  }
+  void* alloc_cpu(int size) const;
 
   /*! \brief allocate gpu memory controlled by MXNet */
-  void* alloc_gpu(int size) const {
-    return gpu_malloc(gpu_alloc, size);
-  }
+  void* alloc_gpu(int size) const;
 
   /*! \brief return the cuda stream object with correct type */
-  mx_stream_t get_cuda_stream() const {
-    return static_cast<mx_stream_t>(cuda_stream);
-  }
+  mx_stream_t get_cuda_stream() const;
 
   /*! \brief allocate sparse memory controlled by MXNet */
-  void alloc_sparse(MXSparse* sparse, int index, int indices_len, int indptr_len = 0) const {
-    sparse_malloc(sparse_alloc, index, indices_len, indptr_len,
-                  &(sparse->data), &(sparse->indices), &(sparse->indptr));
-  }
+  void alloc_sparse(MXSparse* sparse, int index, int indices_len, int indptr_len = 0) const;
 
  /*! 
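 * Example (sketch): requesting scratch space from MXNet inside a forward
 * function via the OpResource declared above, rather than allocating it
 * directly; `n` is a hypothetical element count:
 *
 *   MXReturnValue forward(..., const OpResource& res) {
 *     void* workspace = res.alloc_cpu(n * sizeof(float));  // CPU scratch
 *     // on GPU builds: void* wgpu = res.alloc_gpu(bytes);
 *     ...
 *     return MX_SUCCESS;
 *   }
 */
 /*!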
\brief get pointer to initialized and seeded random number states located on CPU */ /* Access each state by states[id], but this id should be <= MX_NUM_CPU_RANDOM_STATES */ - mx_cpu_rand_t* get_cpu_rand_states() const { - return static_cast(rand_cpu_states); - } + mx_cpu_rand_t* get_cpu_rand_states() const; /*! \brief get pointer to initialized and seeded random number states located on GPU */ /* Access each state by states[id], but this id should be <= MX_NUM_GPU_RANDOM_STATES */ /* Note that if you are using cpu build, it will return a nullptr */ - mx_gpu_rand_t* get_gpu_rand_states() const { - return static_cast(rand_gpu_states); - } + mx_gpu_rand_t* get_gpu_rand_states() const; private: /*! \brief allocation lambda function */ @@ -582,15 +484,7 @@ class OpResource { * getShapeAt("[[1]]", 0) returns "[1]" * getShapeAt("[[1],[2,3]]", 1) returns "[2,3]" */ -std::string getShapeAt(const std::string& shape, unsigned index) { - int idx = 1; // start at 1 to skip the first square bracket [ - // find the beginning of the output shape for the particular output index - for (unsigned x=0; x < index; x++) - idx = shape.find("[", idx+1); - int stop = shape.find("]", idx); // find stop index for this output shape - // add this shape to the list - return shape.substr(idx, stop-idx+1); -} +std::string getShapeAt(const std::string& shape, unsigned index); /* \brief get dtype value from list of dtypes string * @@ -599,15 +493,7 @@ std::string getShapeAt(const std::string& shape, unsigned index) { * getDtypeAt("[1]", 0) returns "1" * getDtypeAt("[1,2]", 1) returns "2" */ -std::string getDtypeAt(const std::string& dtype, unsigned index) { - // find the beginning of the output dtype for the particular output index - int idx = 0; - for (unsigned x=0; x < index; x++) - idx = dtype.find(",", idx+1); - int stop = dtype.find(",", idx+1); // find stop index for this output dtype - if (stop == -1) stop = dtype.find("]", idx+1); - return dtype.substr(idx+1, stop-idx-1); -} +std::string getDtypeAt(const std::string& dtype, unsigned index); /*! * \brief Json utility to parse serialized subgraph symbol @@ -617,196 +503,41 @@ enum JsonType {ERR, STR, NUM, LIST, MAP}; /*! 
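 * Example (sketch): round-tripping a serialized graph with the JsonVal
 * utility defined below:
 *
 *   JsonVal v = JsonVal::parse("{\"nodes\":[{\"op\":\"null\"}]}");
 *   JsonVal nodes = v.map[JsonVal("nodes")];  // maps are keyed by JsonVal
 *   std::string round_trip = v.dump();        // back to a JSON string
 */
 /*!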
\brief definition of JSON objects */ struct JsonVal { - JsonVal() : type(ERR), num(-1), str("") {} // default constructor + JsonVal(); // default constructor // construct a JSON object by type - explicit JsonVal(JsonType t) : type(t), num(-1), str("") {} + explicit JsonVal(JsonType t); // construct a string JSON object - explicit JsonVal(std::string s) : type(STR), num(-1), str(s) {} + explicit JsonVal(std::string s); // construct a number JSON object - explicit JsonVal(int n) : type(NUM), num(n), str(std::to_string(n)) {} + explicit JsonVal(int n); // complex constructor - JsonVal(JsonType t, int n, std::string s) : type(t), num(n), str(s) {} - bool operator<(const JsonVal &o) const { - // for string JSON objects compare the string - if (type == STR) return type == o.type && str < o.str; - // for number JSON objects compare the number - if (type == NUM) return type == o.type && num < o.num; - // for list JSON objects, compare the size of list, and then each object in the list - if (type == LIST) { - if (list.size() != o.list.size()) return false; - for (unsigned int i=0; i< list.size(); i++) - if (list[i] < o.list[i]) - return false; // if we find an object that doesnt match return - return true; // all objects in lists matched - } - // for map JSON objects, compare the size of map, and then each key/value in the maps - if (type == MAP) { - if (map.size() != o.map.size()) return false; - for (auto &item : map) { - // if one map is missing a key in another return - if (o.map.find(item.first) == o.map.end()) return false; - if (item.second < o.map.at(item.first)) return false; - } - return true; - } - return type < o.type; - } + JsonVal(JsonType t, int n, std::string s); + bool operator<(const JsonVal &o) const; // convert JSON object back to JSON-compatible string - std::string dump() const { - std::string ret; - switch (type) { - case ERR: - ret = "json(Error)"; - break; - case STR: - ret = "\"" + str + "\""; - break; - case NUM: - ret = str; - break; - case LIST: - ret = "["; - for (unsigned i=0; i < list.size(); i++) { - auto &item = list[i]; - ret += item.dump(); - if (i < list.size()-1) - ret += ","; - } - ret += "]"; - break; - case MAP: - ret = "{"; - unsigned cnt = 0; - for (auto &item : map) { - ret += item.first.dump() + " : " + item.second.dump(); - if (cnt++ < map.size()-1) - ret += ","; - } - ret += "}"; - break; - } - return ret; - } + std::string dump() const; + // convert JSON-compatible string to JSON object - static JsonVal parse(const std::string& json) { - unsigned int idx = 0; - return JsonVal::parse(json, &idx); - } + static JsonVal parse(const std::string& json); + // parse a string JSON object - static JsonVal parse_string(const std::string& json, unsigned int* idx) { - JsonVal ret(STR); - while (*idx < json.size()) { - if (json[*idx] == '"') { - ++(*idx); - return ret; - } else { - ret.str += json[*idx]; - ++(*idx); - } - } - std::cout << "Error! 
Unable to parse string" << std::endl; - return JsonVal(); - } + static JsonVal parse_string(const std::string& json, unsigned int* idx); + // parse a number JSON object - static JsonVal parse_num(const std::string& json, unsigned int* idx) { - JsonVal ret(NUM); - while (*idx < json.size()) { - if (json[*idx] >= '0' && json[*idx] <= '9') { - ret.str += json[*idx]; - ++(*idx); - } else { - break; - } - } - ret.num = std::stoi(ret.str); - return ret; - } + static JsonVal parse_num(const std::string& json, unsigned int* idx); + // parse a list of JSON objects - static JsonVal parse_list(const std::string& json, unsigned int* idx) { - JsonVal ret(LIST); - while (*idx < json.size()) { - if (json[*idx] == ']') { - ++(*idx); - return ret; - } else { - JsonVal item = JsonVal::parse(json, idx); - if (item.type != ERR) - ret.list.push_back(item); - } - } - std::cout << "Error! Unable to parse list" << std::endl; - return JsonVal(); - } + static JsonVal parse_list(const std::string& json, unsigned int* idx); + // parse a map of JSON objects - static JsonVal parse_map(const std::string& json, unsigned int* idx) { - JsonVal ret(MAP), key; - while (*idx < json.size()) { - if (json[*idx] == '}') { - ++(*idx); - return ret; - } else { - JsonVal item = JsonVal::parse(json, idx); - if (key.type == ERR) { - key = item; - } else { - ret.map[key] = item; - key.type = ERR; - } - } - } - std::cout << "Error! Unable to parse map" << std::endl; - return JsonVal(); - } + static JsonVal parse_map(const std::string& json, unsigned int* idx); + // generic parse function - static JsonVal parse(const std::string& json, unsigned int *idx) { - JsonVal ret; - while (*idx < json.size()) { - if (json[*idx] == '"') { - ++(*idx); - ret = JsonVal::parse_string(json, idx); - } else if (json[*idx] >= '0' && json[*idx] <= '9') { - ret = JsonVal::parse_num(json, idx); - } else if (json[*idx] == '[') { - ++(*idx); - ret = JsonVal::parse_list(json, idx); - } else if (json[*idx] == '{') { - ++(*idx); - ret = JsonVal::parse_map(json, idx); - } else if (json[*idx] == ']' || json[*idx] == '}') {return ret;} - if (ret.type != ERR) return ret; - ++(*idx); - } - return ret; - } + static JsonVal parse(const std::string& json, unsigned int *idx); + // debug function to convert data structure to a debugstring - std::string toString() const { - std::string ret; - switch (type) { - case ERR: - ret = "json(Error)"; - break; - case STR: - ret = "json(STR:" + str + ")"; - break; - case NUM: - ret = "json(INT:" + str + ")"; - break; - case LIST: - ret = "json(LIST:["; - for (auto &item : list) - ret += item.toString() + ","; - ret += "])"; - break; - case MAP: - ret = "json(MAP:{"; - for (auto &item : map) - ret += item.first.toString() + " : " + item.second.toString() + ","; - ret += "})"; - break; - } - return ret; - } + std::string toString() const; + JsonType type; int num; std::string str; @@ -829,25 +560,19 @@ struct NodeEntry { // Representation of a node in the graph class Node { public: - Node() {tensor = nullptr;} + Node(); + // internally set passResource to enable tensor allocation for graph passes - void _setPassResource(PassResource* res_) {res = res_;} + void _setPassResource(PassResource* res_); + /* \brief allocate an arg tensor for this node */ void alloc_arg(const std::vector& shapes, - const MXContext &ctx, MXDType dtype) { - if (!res) - throw std::runtime_error( - "Node not initialized. 
Cannot use alloc_arg outside of graph passes."); - tensor = res->alloc_arg(name, shapes, ctx, dtype); - } + const MXContext &ctx, MXDType dtype); + /* \brief allocate an aux tensor for this node */ void alloc_aux(const std::vector& shapes, - const MXContext &ctx, MXDType dtype) { - if (!res) - throw std::runtime_error( - "Node not initialized. Cannot use alloc_aux outside of graph passes."); - tensor = res->alloc_aux(name, shapes, ctx, dtype); - } + const MXContext &ctx, MXDType dtype); + std::string op; // operator name (ie. Convolution) std::string name; // unique node name (ie. conv_0 or conv_1) MXTensor* tensor; // tensor data for input nodes @@ -863,299 +588,58 @@ class Node { // Representation of the graph class Graph { public: - Graph() : res(nullptr) {} + Graph(); + /* \brief deleted nodes when deleting the graph */ - ~Graph() { - for (int i = 0; i < nodes.size(); i++) - delete nodes[i]; - } + ~Graph(); /* \brief create a graph object from an unparsed string */ - static Graph* fromString(const std::string& json) { - JsonVal val = JsonVal::parse(json); - return fromJson(val); - } + static Graph* fromString(const std::string& json); /* \brief create a graph object from a parsed JSON object */ - static Graph* fromJson(JsonVal val) { - // get nodes list - JsonVal nodes = val.map[JsonVal("nodes")]; - Graph *g = new Graph(); - - std::map nodeMap; - // loop over nodes - for (int i = 0; i < nodes.list.size(); i++) { - Node* n = new Node(); - g->nodes.push_back(n); - JsonVal node = nodes.list[i]; - - // set the op info - n->op = node.map[JsonVal("op")].str; - n->name = node.map[JsonVal("name")].str; - - // if op is null it is an input to the graph - if (n->op.compare("null") == 0) - g->inputs.push_back(n); - - // set attrs - JsonVal attributes = node.map[JsonVal("attrs")]; - for (auto& kv : attributes.map) { - n->attrs[kv.first.str] = kv.second.str; - } - - // set subgraphs, parsing each into a graph - if (node.map.count(JsonVal("subgraphs")) > 0) { - JsonVal subgraphs = node.map[JsonVal("subgraphs")]; - for (auto &subgraph : subgraphs.list) { - n->subgraphs.push_back(fromJson(subgraph)); - } - } - - // set node inputs - JsonVal node_inputs = node.map[JsonVal("inputs")]; - n->inputs.resize(node_inputs.list.size()); - for (int j = 0; j < node_inputs.list.size(); j++) { - JsonVal input = node_inputs.list[j]; - NodeEntry& entry = n->inputs[j]; - // get pointer to other node - entry.node = nodeMap[input.list[0].num]; - // get the other node's output index - entry.entry = input.list[1].num; - // set other nodes output as connected to this node - entry.node->outputs.push_back({n, j}); - } - nodeMap[i] = n; - } - - // set graph level outputs - JsonVal& heads = val.map[JsonVal("heads")]; - g->outputs.resize(heads.list.size()); - for (int i = 0; i < heads.list.size(); i++) { - JsonVal head = heads.list[i]; - g->outputs[i].node = nodeMap[head.list[0].num]; - g->outputs[i].entry = head.list[1].num; - } - - // add all attributes to the graph - for (auto& kv : val.map) { - if (kv.first.str.compare("nodes") != 0 && - kv.first.str.compare("heads") != 0 && - kv.first.str.compare("node_row_ptr") != 0 && - kv.first.str.compare("arg_nodes") != 0) { - g->attrs[kv.first.str] = kv.second; - } - } - return g; - } + static Graph* fromJson(JsonVal val); /* \brief convert graph object back to JSON object */ - JsonVal toJson() { - // top level object is a map - JsonVal val(MAP); - - // add attributes - for (auto& kv : attrs) { - val.map[JsonVal(kv.first)] = kv.second; - } - - // sort graph nodes in topological order, 
create mapping of node to index - std::map nodeMap; - std::vector sorted = topological_sort(); - // nodes are in reverse topological order in the vector (back is first) - // so loop from end to front over the vector 'sorted' - for (int i = sorted.size()-1; i >= 0; i--) { - nodeMap[sorted[i]] = sorted.size()-1-i; - } - - // create node_row_ptr entry - val.map[JsonVal("node_row_ptr")] = JsonVal(LIST); - JsonVal& node_row_ptr = val.map[JsonVal("node_row_ptr")]; - for (int i = 0; i < nodes.size(); i++) - node_row_ptr.list.push_back(JsonVal(i)); - - // add all input nodes - val.map[JsonVal("arg_nodes")] = JsonVal(LIST); - JsonVal& arg_nodes = val.map[JsonVal("arg_nodes")]; - for (int i = 0; i < inputs.size(); i++) - arg_nodes.list.push_back(JsonVal(nodeMap[inputs[i]])); - - // add all output nodes - val.map[JsonVal("heads")] = JsonVal(LIST); - JsonVal& heads = val.map[JsonVal("heads")]; - for (int i = 0; i < outputs.size(); i++) { - heads.list.push_back(JsonVal(LIST)); - JsonVal& out = heads.list[i]; - out.list.push_back(JsonVal(nodeMap[outputs[i].node])); - out.list.push_back(JsonVal(outputs[i].entry)); - out.list.push_back(JsonVal(0)); - } - - // add all graph nodes - val.map[JsonVal("nodes")] = JsonVal(LIST); - JsonVal& nodes_ = val.map[JsonVal("nodes")]; - for (int i = sorted.size()-1; i >= 0; i--) { - // each node is a map - nodes_.list.push_back(JsonVal(MAP)); - Node* n = sorted[i]; - JsonVal& n_ = nodes_.list[nodes_.list.size()-1]; - - n_.map[JsonVal("op")] = JsonVal(n->op); - n_.map[JsonVal("name")] = JsonVal(n->name); - n_.map[JsonVal("inputs")] = JsonVal(LIST); - - // add inputs for this node - JsonVal& inputs_ = n_.map[JsonVal("inputs")]; - for (int j = 0; j < n->inputs.size(); j++) { - inputs_.list.push_back(JsonVal(LIST)); - NodeEntry& entry = n->inputs[j]; - JsonVal& in = inputs_.list[j]; - in.list.push_back(JsonVal(nodeMap[entry.node])); - in.list.push_back(JsonVal(entry.entry)); - in.list.push_back(JsonVal(0)); - } - - // add subgraphs for this node, convert each back to JSON - if (n->subgraphs.size() > 0) { - n_.map[JsonVal("subgraphs")] = JsonVal(LIST); - JsonVal &subgraphs_ = n_.map[JsonVal("subgraphs")]; - for (Graph *subgraph : n->subgraphs) { - subgraphs_.list.push_back(subgraph->toJson()); - } - } - - // add attributes for this node - n_.map[JsonVal("attrs")] = JsonVal(MAP); - JsonVal& attrs_ = n_.map[JsonVal("attrs")]; - for (auto& kv : n->attrs) { - attrs_.map[JsonVal(kv.first)] = JsonVal(kv.second); - } - } - return val; - } + JsonVal toJson(); /* \brief convert graph object to JSON string */ - std::string toString() { - return toJson().dump(); - } - + std::string toString(); + /* \brief visits a node "n" */ void _dfs_util(Node* n, std::unordered_set* to_visit, - std::function handler) const { - to_visit->erase(n); // remove node now that we're visiting it - for (NodeEntry& e : n->outputs) { - Node* o = e.node; - if (to_visit->count(o) != 0) { - _dfs_util(o, to_visit, handler); // visit neighbor - } - } - handler(n); // post-order visit this node - } + std::function handler) const; /* \brief post-order DFS graph traversal */ - void DFS(std::function handler) const { - std::unordered_set to_visit; - // put all nodes in set to visit - for (auto& n : nodes) - to_visit.insert(n); - // visit all inputs first - for (auto& i : inputs) - if (to_visit.count(i) != 0) - _dfs_util(i, &to_visit, handler); - // visit any nodes left - while (to_visit.size() > 0) - _dfs_util(*(to_visit.begin()), &to_visit, handler); - } + void DFS(std::function handler) const; /* \brief sort graph 
nodes in topological order */ - std::vector topological_sort() const { - std::vector sorted; - auto handler = [&](Node* n) { - sorted.push_back(n); // when visiting each node, add it in order to the vector - }; - DFS(handler); - return sorted; - } + std::vector topological_sort() const; /* \brief print out graph details */ - void print(int indent = 0) const { - std::string space = ""; - for (int i = 0; i < indent; i++) space+=" "; - - std::cout << space << "########### Graph #############" << std::endl; - std::cout << space << "attributes: " << std::endl; - for (auto &kv : attrs) - std::cout << space << "\t" << kv.first << " : " << kv.second.str << std::endl; - std::cout << space << "inputs: " << inputs.size() << std::endl; - std::cout << space << "outputs: " << outputs.size() << std::endl; - std::cout << space << "nodes: " << nodes.size() << std::endl; - std::vector sorted = topological_sort(); - // loop over each node and print out its inputs/outputs - for (int i = sorted.size()-1; i >= 0; i--) { - std::cout << space << "Node: " << sorted[i]->name << std::endl; - for (int j = 0; j < sorted[i]->inputs.size(); j++) { - std::cout << space << "\tInput: " << sorted[i]->inputs[j].node->name << " " - << sorted[i]->inputs[j].entry << std::endl; - } - for (int j = 0; j < sorted[i]->outputs.size(); j++) { - std::cout << space << "\tOutput: " << sorted[i]->outputs[j].node->name << " " - << sorted[i]->outputs[j].entry << std::endl; - } - if (sorted[i]->subgraphs.size() > 0) { - for (auto &subgraph : sorted[i]->subgraphs) { - std::cout << space << "\tSubgraph:" << std::endl; - subgraph->print(indent+2); - } - } - } - std::cout << space << "###############################" << std::endl; - } + void print(int indent = 0) const; /* \brief add a new node to this graph */ - Node* addNode(const std::string& name, const std::string& op) { - Node* n = new Node(); - n->name = name; - n->op = op; - if (res) - n->_setPassResource(res); - return n; - } + Node* addNode(const std::string& name, const std::string& op); + /* \brief get node at index in graph */ - Node* getNode(size_t idx) { - return nodes[idx]; - } + Node* getNode(size_t idx); + /* \brief get const node at index in const graph */ - const Node* getNode(size_t idx) const { - return nodes.at(idx); - } + const Node* getNode(size_t idx) const; + /* \brief get attribute on graph */ - const JsonVal& getAttr(const std::string& key) const { - return attrs.at(key); - } + const JsonVal& getAttr(const std::string& key) const; + /* \brief get number of nodes in the graph */ - size_t size() const { - return nodes.size(); - } + size_t size() const; + // internally set passResource to enable tensor allocation for graph passes - void _setPassResource(PassResource* res_) {res = res_;} + void _setPassResource(PassResource* res_); + // internally set arg/aux params when available void _setParams(std::unordered_map* args, - std::unordered_map* aux) { - // set params for each input node - for (Node* node : inputs) { - if (args->count(node->name) > 0) - node->tensor = &args->at(node->name); - else if (aux->count(node->name) > 0) - node->tensor = &aux->at(node->name); - } - - if (res) { - // set passResource for each node - for (Node* node : nodes) { - node->_setPassResource(res); - } - } - } - + std::unordered_map* aux); + std::vector inputs; std::vector outputs; std::map attrs; @@ -1214,7 +698,7 @@ class CustomStatefulOp { virtual MXReturnValue Backward(std::vector* inputs, std::vector* outputs, const OpResource& op_res) { - std::cout << "Error! 
Operator does not support backward" << std::endl; + MX_ERROR_MSG << "Error! Operator does not support backward" << std::endl; return MX_FAIL; } }; @@ -1230,30 +714,30 @@ class CustomStatefulOpWrapper { /*! \brief Custom Operator function templates */ typedef MXReturnValue (*fcomp_t)(const std::unordered_map& attributes, + std::string>& attributes, std::vector* inputs, std::vector* outputs, const OpResource& res); typedef MXReturnValue (*parseAttrs_t)(const std::unordered_map& attributes, + std::string>& attributes, int* num_inputs, int* num_outputs); typedef MXReturnValue (*inferType_t)(const std::unordered_map& attributes, + std::string>& attributes, std::vector* in_types, std::vector* out_types); typedef MXReturnValue (*inferSType_t)(const std::unordered_map& attributes, + std::string>& attributes, std::vector* in_storage_types, std::vector* out_storage_types); typedef MXReturnValue (*inferShape_t)(const std::unordered_map& attributes, + std::string>& attributes, std::vector >* in_shapes, std::vector >* out_shapes); typedef MXReturnValue (*mutateInputs_t)(const std::unordered_map& attributes, + std::string>& attributes, std::vector* input_indices); typedef MXReturnValue (*createOpState_t)(const std::unordered_map& attributes, + std::string>& attributes, CustomStatefulOp**); /*! @@ -1261,66 +745,27 @@ typedef MXReturnValue (*createOpState_t)(const std::unordered_map 0) - raiseDuplicateContextError(); - forward_ctx_map[ctx] = fcomp; - return *this; - } - CustomOp& setBackward(fcomp_t fgrad, const char* ctx) { - if (backward_ctx_map.count(ctx) > 0) - raiseDuplicateContextError(); - backward_ctx_map[ctx] = fgrad; - return *this; - } - CustomOp& setParseAttrs(parseAttrs_t func) { - parse_attrs = func; - return *this; - } - CustomOp& setInferType(inferType_t func) { - infer_type = func; - return *this; - } - CustomOp& setInferSType(inferSType_t func) { - infer_storage_type = func; - return *this; - } - CustomOp& setInferShape(inferShape_t func) { - infer_shape = func; - return *this; - } - CustomOp& setMutateInputs(mutateInputs_t func) { - mutate_inputs = func; - return *this; - } - CustomOp& setCreateOpState(createOpState_t func, const char* ctx) { - if (create_op_ctx_map.count(ctx) > 0) - raiseDuplicateContextError(); - create_op_ctx_map[ctx] = func; - return *this; - } - CustomOp& setIsSubgraphOp() { - isSGop = true; - return *this; - } - void mapToVector() { - for (auto kv : forward_ctx_map) { - forward_ctx_cstr.push_back(kv.first); - forward_fp.push_back(kv.second); - } - for (auto kv : backward_ctx_map) { - backward_ctx_cstr.push_back(kv.first); - backward_fp.push_back(kv.second); - } - for (auto kv : create_op_ctx_map) { - create_op_ctx_cstr.push_back(kv.first); - create_op_fp.push_back(kv.second); - } - } - ~CustomOp() {} + explicit CustomOp(const char* op_name); + + CustomOp& setForward(fcomp_t fcomp, const char* ctx); + + CustomOp& setBackward(fcomp_t fgrad, const char* ctx); + + CustomOp& setParseAttrs(parseAttrs_t func); + + CustomOp& setInferType(inferType_t func); + + CustomOp& setInferSType(inferSType_t func); + + CustomOp& setInferShape(inferShape_t func); + + CustomOp& setMutateInputs(mutateInputs_t func); + + CustomOp& setCreateOpState(createOpState_t func, const char* ctx); + + CustomOp& setIsSubgraphOp(); + + void mapToVector(); /*! \brief operator name */ const char* name; @@ -1339,12 +784,7 @@ class CustomOp { std::vector create_op_fp; private: - void raiseDuplicateContextError() { - std::string op_name_str(name); - throw std::runtime_error( - "Error! Error! 
Cannot register multiple functions under same context for operator '" - + op_name_str + "'"); - } + void raiseDuplicateContextError(); /*! \brief dedup context maps - static string ctx to custom function */ std::unordered_map forward_ctx_map, backward_ctx_map; @@ -1360,13 +800,10 @@ typedef MXReturnValue (*graphPass_t)(mxnet::ext::Graph* graph, */ class CustomPass { public: - CustomPass() : name("ERROR") {} - explicit CustomPass(const char* pass_name) - : name(pass_name) {} - CustomPass& setBody(graphPass_t fn) { - pass = fn; - return *this; - } + CustomPass(); + explicit CustomPass(const char* pass_name); + + CustomPass& setBody(graphPass_t fn); /*! \brief pass name */ const char* name; @@ -1392,48 +829,24 @@ typedef MXReturnValue (*reviewSubgraph_t)(const mxnet::ext::Graph *subgraph, int */ class CustomPartitioner { public: - CustomPartitioner() : name("ERROR") {} - explicit CustomPartitioner(const char* backend_name) : - name(backend_name) {} + CustomPartitioner(); + + explicit CustomPartitioner(const char* backend_name); + CustomPartitioner& addStrategy(const char* prop_name, - const char* sg_name) { - strategies.push_back(prop_name); - op_names.push_back(sg_name); - return *this; - } - CustomPartitioner& setSupportedOps(const char* prop_name, supportedOps_t fn) { - supported_map[std::string(prop_name)] = fn; - return *this; - } - CustomPartitioner& setCreateSelector(const char* prop_name, createSelector_t fn) { - selector_map[std::string(prop_name)] = fn; - return *this; - } - CustomPartitioner& setReviewSubgraph(const char* prop_name, reviewSubgraph_t fn) { - review_map[std::string(prop_name)] = fn; - return *this; - } - supportedOps_t getSupportedOps(int stg_id) { - std::string prop(strategies[stg_id]); - if (supported_map.count(prop) > 0) - return supported_map[prop]; - else - return nullptr; - } - createSelector_t getCreateSelector(int stg_id) { - std::string prop(strategies[stg_id]); - if (selector_map.count(prop) > 0) - return selector_map[prop]; - else - return nullptr; - } - reviewSubgraph_t getReviewSubgraph(int stg_id) { - std::string prop(strategies[stg_id]); - if (review_map.count(prop) > 0) - return review_map[prop]; - else - return nullptr; - } + const char* sg_name); + + CustomPartitioner& setSupportedOps(const char* prop_name, supportedOps_t fn); + + CustomPartitioner& setCreateSelector(const char* prop_name, createSelector_t fn); + + CustomPartitioner& setReviewSubgraph(const char* prop_name, reviewSubgraph_t fn); + + supportedOps_t getSupportedOps(int stg_id); + + createSelector_t getCreateSelector(int stg_id); + + reviewSubgraph_t getReviewSubgraph(int stg_id); /*! \brief partitioner name */ const char* name; @@ -1520,44 +933,6 @@ class Registry { MX_STR_CONCAT(MX_REGISTER_PASS_DEF_(Name), __COUNTER__) = \ Registry::get()->add(MX_TOSTRING(Name)) -/* \brief Class to store error messages from extensions to pass to MXNet */ -class MXerrorMsgs { - public: - /*! - * \brief get singleton pointer to class - * \returns pointer to class - */ - static MXerrorMsgs* get() { - static MXerrorMsgs inst; - return &inst; - } - /*! - * \brief add a new error message - */ - std::stringstream& add(const char* file, int line) { - messages.push_back(std::stringstream()); - messages.back() << file << "[" << line << "]: "; - return messages.back(); - } - int size() { - return messages.size(); - } - const std::string* get(int idx) { - return new std::string(messages.at(idx).str()); - } - - private: - /*! \brief constructor */ - MXerrorMsgs() {} - /*! 
\brief destructor */ - ~MXerrorMsgs() {} - /*! \brief map of entries in registry */ - std::vector messages; -}; - -// Add a new error message, example: MX_ERROR_MSG << "my error msg"; -#define MX_ERROR_MSG MXerrorMsgs::get()->add(__FILE__, __LINE__) - /* -------------- BELOW ARE CTYPE FUNCTIONS PROTOTYPES --------------- */ /*! @@ -1756,14 +1131,10 @@ typedef int (*msgGet_t)(int idx, const char** msg); extern "C" { /*! \brief returns MXNet library version */ - MX_INT_RET _opVersion() { - return MX_LIBRARY_VERSION; - } + MX_INT_RET _opVersion(); /*! \brief returns number of ops registered in this library */ - MX_INT_RET _opRegSize() { - return mxnet::ext::Registry::get()->size(); - } + MX_INT_RET _opRegSize(); /*! \brief returns operator registration at specified index */ MX_VOID_RET _opRegGet(int idx, const char** name, int *isSGop, @@ -1773,170 +1144,32 @@ extern "C" { const char*** create_op_ctx, mxnet::ext::createOpState_t** create_op_fp, int* create_op_count, mxnet::ext::parseAttrs_t* parse, mxnet::ext::inferType_t* type, mxnet::ext::inferSType_t* stype, - mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate) { - mxnet::ext::CustomOp &op = mxnet::ext::Registry::get()->get(idx); - *name = op.name; - *parse = op.parse_attrs; - *type = op.infer_type; - *stype = op.infer_storage_type; - *shape = op.infer_shape; - *mutate = op.mutate_inputs; - *isSGop = op.isSGop; - op.mapToVector(); - *forward_ctx = op.forward_ctx_cstr.data(); - *forward_fp = op.forward_fp.data(); - *forward_count = op.forward_fp.size(); - *backward_ctx = op.backward_ctx_cstr.data(); - *backward_fp = op.backward_fp.data(); - *backward_count = op.backward_fp.size(); - *create_op_ctx = op.create_op_ctx_cstr.data(); - *create_op_fp = op.create_op_fp.data(); - *create_op_count = op.create_op_fp.size(); - } + mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate); /*! \brief calls free from the external library for library allocated arrays */ - MX_VOID_RET _opCallFree(void* ptr) { - free(ptr); - } + MX_VOID_RET _opCallFree(void* ptr); /*! \brief returns status of calling parse attributes function for operator from library */ MX_INT_RET _opCallParseAttrs(mxnet::ext::parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, - int* num_in, int* num_out) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } - - return parseAttrs(attrs, num_in, num_out); - } + int* num_in, int* num_out); /*! 
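 * The C entry points below all marshal operator attributes the same way:
 * parallel `keys`/`vals` C-string arrays of length `num` are rebuilt into
 * the std::unordered_map handed to the user callback. A sketch of that
 * convention:
 *
 *   std::unordered_map<std::string, std::string> attrs;
 *   for (int i = 0; i < num; i++)
 *     attrs[std::string(keys[i])] = std::string(vals[i]);
 */
 /*!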
\brief returns status of calling inferShape function for operator from library */ MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, const char* const* keys, const char* const* vals, int num, unsigned int** inshapes, int* indims, int num_in, unsigned int*** mod_inshapes, int** mod_indims, - unsigned int*** outshapes, int** outdims, int num_out) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } - - // create a vector of shapes for inputs - std::vector > in_shapes(num_in); - for (int i = 0; i < num_in; i++) { - for (int j = 0; j < indims[i]; j++) { - in_shapes[i].push_back(inshapes[i][j]); - } - } - - // create a vector of shapes for outputs - std::vector > out_shapes(num_out); - - int retval = inferShape(attrs, &in_shapes, &out_shapes); - if (!retval) return retval; - - // allocate space for modified input dims, shape - *mod_indims = static_cast(malloc (num_in * sizeof(int))); - *mod_inshapes = static_cast(malloc (num_in * sizeof(unsigned*))); - - // copy modified input shapes - for (int i = 0; i < num_in; i++) { - (*mod_indims)[i] = in_shapes[i].size(); - (*mod_inshapes)[i] = static_cast(malloc ((*mod_indims)[i] * sizeof(unsigned))); - for (int j = 0; j < (*mod_indims)[i]; j++) { - (*mod_inshapes)[i][j] = in_shapes[i][j]; - } - } - - // allocate space for output dims, shape - *outdims = static_cast(malloc (num_out * sizeof(int))); - *outshapes = static_cast(malloc (num_out * sizeof(unsigned*))); - - // copy output shapes - for (int i = 0; i < num_out; i++) { - (*outdims)[i] = out_shapes[i].size(); - (*outshapes)[i] = static_cast(malloc ((*outdims)[i] * sizeof(unsigned))); - for (int j = 0; j < (*outdims)[i]; j++) { - (*outshapes)[i][j] = out_shapes[i][j]; - } - } - - return retval; - } + unsigned int*** outshapes, int** outdims, int num_out); /*! \brief returns status of calling inferType function for operator from library */ MX_INT_RET _opCallInferType(mxnet::ext::inferType_t inferType, const char* const* keys, const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } - - // create a vector of types for inputs - std::vector in_types(num_in); - for (int i = 0; i < num_in; i++) { - in_types[i] = intypes[i]; - } - - // create a vector of types for outputs - std::vector out_types(num_out, -1); - - int retval = inferType(attrs, &in_types, &out_types); - if (!retval) - return retval; - - // copy modified input types - for (int i = 0; i < num_in; i++) { - intypes[i] = in_types[i]; - } - // copy output types - for (int i = 0; i < num_out; i++) { - outtypes[i] = out_types[i]; - } - - return retval; - } + int* intypes, int num_in, int* outtypes, int num_out); /*! 
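 * Example (sketch): the user-side inferShape that this hook invokes,
 * modeled on example/extensions/lib_custom_op/gemm_lib.cc (an (n,k)x(k,m)
 * matrix multiply):
 *
 *   MXReturnValue inferShape(const std::unordered_map<std::string,
 *                                                     std::string>& attrs,
 *                            std::vector<std::vector<unsigned int>>* in_shapes,
 *                            std::vector<std::vector<unsigned int>>* out_shapes) {
 *     unsigned n = in_shapes->at(0)[0], m = in_shapes->at(1)[1];
 *     out_shapes->at(0) = {n, m};
 *     return MX_SUCCESS;
 *   }
 */
 /*!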
\brief returns status of calling inferSType function for operator from library */ MX_INT_RET _opCallInferSType(mxnet::ext::inferSType_t inferSType, const char* const* keys, const char* const* vals, int num, - int* instypes, int num_in, int* outstypes, int num_out) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } - - // create a vector of types for inputs - std::vector in_stypes(num_in); - for (int i = 0; i < num_in; i++) { - in_stypes[i] = instypes[i]; - } - - // create a vector of types for outputs - std::vector out_stypes(num_out, -1); - - int retval = inferSType(attrs, &in_stypes, &out_stypes); - - if (!retval) - return retval; - - // copy modified input storage types - for (int i = 0; i < num_in; i++) { - instypes[i] = in_stypes[i]; - } - // copy output storage types - for (int i = 0; i < num_out; i++) { - outstypes[i] = out_stypes[i]; - } - - return retval; - } + int* instypes, int num_in, int* outstypes, int num_out); /*! \brief returns status of calling Forward/Backward function for operator from library */ MX_INT_RET _opCallFCompute(mxnet::ext::fcomp_t fcomp, const char* const* keys, @@ -1954,119 +1187,17 @@ extern "C" { void** in_indptr, void** out_indptr, int64_t* in_indices_shapes, int64_t* out_indices_shapes, int64_t* in_indptr_shapes, int64_t* out_indptr_shapes, - void* rng_cpu_states, void* rng_gpu_states) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } - - // create a vector of tensors for inputs - std::vector inputs(num_in); - // create a vector for sparse inputs - std::vector in_sparse(num_in); - - for (int i = 0; i < num_in; i++) { - // Dense representation. - if (instypes[i] == 0) { - inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i], - inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]), - mxnet::ext::kDefaultStorage); - } else { - // Sparse representation. - mxnet::ext::MXStorageType type; - if (instypes[i] == 1) { - type = mxnet::ext::kRowSparseStorage; - in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]); - } else { - type = mxnet::ext::kCSRStorage; - in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], - in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]); - } - inputs[i].setTensor(reinterpret_cast(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i], - inshapes[i], indims[i], inIDs[i], - mxnet::ext::MXContext(indev_type[i], indev_id[i]), type); - } - } - - // create a vector of tensors for outputs - std::vector outputs(num_out); - std::vector out_sparse(num_out); - - for (int i = 0; i < num_out; i++) { - // Dense representation. - if (outstypes[i] == 0) { - outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i], - outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), - mxnet::ext::kDefaultStorage); - } else { - // Sparse representation. 
- mxnet::ext::MXStorageType type; - if (outstypes[i] == 1) { - type = mxnet::ext::kRowSparseStorage; - out_sparse[i].set(outdata[i], outshapes[i], outdims[i], - out_indices[i], out_indices_shapes[i]); - } else { - type = mxnet::ext::kCSRStorage; - out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], - out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]); - } - outputs[i].setTensor(reinterpret_cast(&out_sparse[i]), - (mxnet::ext::MXDType)outtypes[i], - outshapes[i], outdims[i], outIDs[i], - mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type); - } - } - - mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, - cuda_stream, sparse_malloc, sparse_alloc, - rng_cpu_states, rng_gpu_states); - return fcomp(attrs, &inputs, &outputs, res); - } + void* rng_cpu_states, void* rng_gpu_states); /*! \brief returns status of calling mutateInputs function for operator from library */ MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, const char* const* keys, const char* const* vals, int num, - int** mutate_indices, int* indices_size) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } - - // create a vector of mutate input indices - std::vector mut_ind; - - int retval = mutate(attrs, &mut_ind); - if (!retval) - return retval; - - // output the input indices - *indices_size = mut_ind.size(); - *mutate_indices = static_cast(malloc (*indices_size * sizeof(int))); - for (int i = 0; i < *indices_size; i++) { - (*mutate_indices)[i] = mut_ind[i]; - } - - return retval; - } + int** mutate_indices, int* indices_size); /*! \brief returns status of calling createStatefulOp function for operator from library */ MX_INT_RET _opCallCreateOpState(mxnet::ext::createOpState_t create_op, const char* const* keys, const char* const* vals, int num, - void** state_op) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } - - // void pointer to hold custom state op instance created in custom library - // eventually state_op pointer is populated by instance from custom library - mxnet::ext::CustomStatefulOp** op_ptr = - reinterpret_cast(state_op); - return create_op(attrs, op_ptr); - } + void** state_op); /*! \brief returns status of calling Stateful Forward/Backward for operator from library */ MX_INT_RET _opCallFStatefulCompute(int is_forward, void* state_op, const int64_t** inshapes, @@ -2084,194 +1215,48 @@ extern "C" { void** out_indptr, int64_t* in_indices_shapes, int64_t* out_indices_shapes, int64_t* in_indptr_shapes, int64_t* out_indptr_shapes, - void* rng_cpu_states, void* rng_gpu_states) { - // create a vector of tensors for inputs - std::vector inputs(num_in); - // create a vector for sparse inputs - std::vector in_sparse(num_in); - - for (int i = 0; i < num_in; i++) { - if (instypes[i] == 0) { - // Dense representation. - inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i], - inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]), - mxnet::ext::kDefaultStorage); - } else { - // Sparse representation. 
- mxnet::ext::MXStorageType type; - if (instypes[i] == 1) { - type = mxnet::ext::kRowSparseStorage; - in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]); - } else { - type = mxnet::ext::kCSRStorage; - in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], - in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]); - } - inputs[i].setTensor(reinterpret_cast(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i], - inshapes[i], indims[i], inIDs[i], - mxnet::ext::MXContext(indev_type[i], indev_id[i]), type); - } - } - - // create a vector of tensors for outputs - std::vector outputs(num_out); - // create a vector for sparse outputs - std::vector out_sparse(num_out); - - for (int i = 0; i < num_out; i++) { - if (outstypes[i] == 0) { - // Dense representation. - outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i], - outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), - mxnet::ext::kDefaultStorage); - } else { - // Sparse representation. - mxnet::ext::MXStorageType type; - if (outstypes[i] == 1) { - type = mxnet::ext::kRowSparseStorage; - out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], - out_indices_shapes[i]); - } else { - type = mxnet::ext::kCSRStorage; - out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], - out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]); - } - outputs[i].setTensor(reinterpret_cast(&out_sparse[i]), - (mxnet::ext::MXDType)outtypes[i], - outshapes[i], outdims[i], outIDs[i], - mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type); - } - } - - mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, - stream, sparse_malloc, sparse_alloc, rng_cpu_states, rng_gpu_states); - - mxnet::ext::CustomStatefulOp* op_ptr = - reinterpret_cast(state_op); - if (is_forward) { - return op_ptr->Forward(&inputs, &outputs, res); - } - return op_ptr->Backward(&inputs, &outputs, res); - } + void* rng_cpu_states, void* rng_gpu_states); /*! \brief returns number of partitioners registered in this library */ - MX_INT_RET _partRegSize() { - return mxnet::ext::Registry::get()->size(); - } + MX_INT_RET _partRegSize(); /* returns number of strategies registered for partitioner * at specified index */ - MX_INT_RET _partRegGetCount(int idx, const char** name) { - mxnet::ext::CustomPartitioner part = - mxnet::ext::Registry::get()->get(idx); - *name = part.name; - return part.strategies.size(); - } + MX_INT_RET _partRegGetCount(int idx, const char** name); /*! \brief returns partitioner registration at specified index */ MX_VOID_RET _partRegGet(int part_idx, int stg_idx, const char** strategy, mxnet::ext::supportedOps_t* supportedOps, mxnet::ext::createSelector_t* createSelector, - mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name) { - mxnet::ext::CustomPartitioner part = - mxnet::ext::Registry::get()->get(part_idx); - *strategy = part.strategies[stg_idx]; - *op_name = part.op_names[stg_idx]; - *supportedOps = part.getSupportedOps(stg_idx); - *createSelector = part.getCreateSelector(stg_idx); - *reviewSubgraph = part.getReviewSubgraph(stg_idx); - } + mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name); /*! 
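 * (Editor's note: an illustrative supportedOps callback for the shim declared
 * below; mySupportedOps is a hypothetical name, and the parameters mirror how
 * the shim invokes it with a Graph, per-node subgraph ids, and options.)
 * \code
 * mxnet::ext::MXReturnValue mySupportedOps(
 *     const mxnet::ext::Graph* graph, std::vector<int>* ids,
 *     const std::unordered_map<std::string, std::string>& options) {
 *   for (int i = 0; i < static_cast<int>(graph->size()); i++)
 *     (*ids)[i] = 0;  // assign node i to subgraph 0 (the shim pre-fills -2)
 *   return mxnet::ext::MX_SUCCESS;
 * }
 * \endcode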
\brief returns status of calling supported ops function from library */ MX_INT_RET _partCallSupportedOps(mxnet::ext::supportedOps_t supportedOps, const char *json, int num_ids, int *ids, const char* const* opt_keys, - const char* const* opt_vals, int num_opts) { - mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); - // create map of options from list - std::unordered_map opts; - for (int i = 0; i < num_opts; i++) - opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - - // create array of subgraph IDs for operator support - std::vector _ids(num_ids, -2); - // call user's supportedOps function - mxnet::ext::MXReturnValue retval = supportedOps(graph, &_ids, opts); - if (!retval) return retval; - - // copy bools in ids to ints - for (int i = 0; i < num_ids; i++) - ids[i] = _ids[i]; - - return retval; - } + const char* const* opt_vals, int num_opts); /*! \brief returns status of calling create selector function from library */ MX_INT_RET _partCallCreateSelector(mxnet::ext::createSelector_t createSelector, const char *json, void** selector, const char* const* opt_keys, - const char* const* opt_vals, int num_opts) { - mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); - // create map of options from list - std::unordered_map opts; - for (int i = 0; i < num_opts; i++) - opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - - // void pointer to hold selector instance created in custom library - // eventually pointer is populated by instance from custom library - mxnet::ext::CustomOpSelector** sel_ptr = - reinterpret_cast(selector); - - // call user's createSelector function - return createSelector(graph, sel_ptr, opts); - } + const char* const* opt_vals, int num_opts); /*! \brief returns status of calling select function from library */ - MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - *selected = sel_ptr->Select(nodeID); - } + MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected); /*! \brief returns status of calling select input function from library */ MX_VOID_RET _partCallSelectInput(void* sel_inst, int nodeID, - int input_nodeID, int* selected) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - *selected = sel_ptr->SelectInput(nodeID, input_nodeID); - } + int input_nodeID, int* selected); /*! \brief returns status of calling select output function from library */ MX_VOID_RET _partCallSelectOutput(void* sel_inst, int nodeID, - int output_nodeID, int* selected) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - *selected = sel_ptr->SelectOutput(nodeID, output_nodeID); - } + int output_nodeID, int* selected); /*! \brief returns status of calling filter function from library */ MX_VOID_RET _partCallFilter(void* sel_inst, int* candidates, int num_candidates, - int** keep, int* num_keep) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - std::vector candidates_(num_candidates); - for (int i=0; i < num_candidates; i++) { - candidates_[i] = candidates[i]; - } - std::vector keep_; - - sel_ptr->Filter(candidates_, &keep_); - - *num_keep = keep_.size(); - *keep = static_cast(malloc(keep_.size() * sizeof(int))); - for (unsigned i=0; i < keep_.size(); i++) - (*keep)[i] = keep_[i]; - } + int** keep, int* num_keep); /*! 
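 * (Editor's note: a hedged sketch of the CustomOpSelector subclass whose
 * Select/SelectInput/SelectOutput/Reset methods these shims drive; MySelector
 * and its trivial bodies are illustrative only.)
 * \code
 * class MySelector : public mxnet::ext::CustomOpSelector {
 *  public:
 *   bool Select(int nodeID) override { return true; }
 *   bool SelectInput(int nodeID, int input_nodeID) override { return true; }
 *   bool SelectOutput(int nodeID, int output_nodeID) override { return true; }
 *   void Reset() override {}  // clear any per-partitioning state here
 * };
 * \endcode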
\brief returns status of calling reset selector function from library */ - MX_VOID_RET _partCallReset(void* sel_inst) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - sel_ptr->Reset(); - } + MX_VOID_RET _partCallReset(void* sel_inst); /*! \brief returns status of calling review subgraph function from library */ MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph, const char *json, @@ -2287,79 +1272,14 @@ extern "C" { void* const* aux_data, const int64_t* const* aux_shapes, const int* aux_dims, const int* aux_types, const size_t* aux_IDs, const char* const* aux_dev_type, - const int* aux_dev_id) { - mxnet::ext::Graph *subgraph = mxnet::ext::Graph::fromString(json); - bool accept_bool = false; - // create map of attributes from list - std::unordered_map opts; - for (int i = 0; i < num_opts; i++) - opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - - // create a map of named tensors for args - std::unordered_map args; - for (int i = 0; i < num_args; i++) { - std::vector shapes; - for (int j = 0; j < arg_dims[i]; j++) - shapes.push_back(arg_shapes[i][j]); - - mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i], - arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], arg_dev_id[i])); - args[arg_names[i]] = tensor; - } - // create a map of named tensors for aux - std::unordered_map aux; - for (int i = 0; i < num_aux; i++) { - std::vector shapes; - for (int j = 0; j < aux_dims[i]; j++) - shapes.push_back(aux_shapes[i][j]); - - mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i], - aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i], - aux_dev_id[i])); - aux[aux_names[i]] = tensor; - } - - subgraph->_setParams(&args, &aux); - mxnet::ext::MXReturnValue retval = reviewSubgraph(subgraph, subgraph_id, &accept_bool, - opts); - if (!retval) return retval; - - *accept = accept_bool; - - if (subgraph->attrs.size() > 0) { - *num_attrs = subgraph->attrs.size(); - // allocate space for attributes - *attr_keys = static_cast(malloc (*num_attrs * sizeof(char*))); - *attr_vals = static_cast(malloc (*num_attrs * sizeof(char*))); - - // copy attributes - int i = 0; - for (auto kv : subgraph->attrs) { - (*attr_keys)[i] = static_cast(malloc ((kv.first.size()+1) * sizeof(char))); - std::string val = kv.second.dump(); // convert JsonVal back to string - (*attr_vals)[i] = static_cast(malloc ((val.size()+1) * sizeof(char))); - snprintf((*attr_keys)[i], kv.first.size()+1, "%s", kv.first.c_str()); - snprintf((*attr_vals)[i], val.size()+1, "%s", val.c_str()); - i++; - } - } - - return retval; - } + const int* aux_dev_id); /*! \brief returns number of graph passes registered in this library */ - MX_INT_RET _passRegSize() { - return mxnet::ext::Registry::get()->size(); - } + MX_INT_RET _passRegSize(); /*! \brief returns pass registration at specified index */ MX_VOID_RET _passRegGet(int pass_idx, mxnet::ext::graphPass_t* graphPass, - const char** pass_name) { - mxnet::ext::CustomPass pass = - mxnet::ext::Registry::get()->get(pass_idx); - *graphPass = pass.pass; - *pass_name = pass.name; - } + const char** pass_name); /*! 
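 * (Editor's note: an illustrative graph pass and its registration, following
 * the pattern in example/extensions/lib_pass; myPass is a hypothetical name.)
 * \code
 * mxnet::ext::MXReturnValue myPass(
 *     mxnet::ext::Graph* graph,
 *     const std::unordered_map<std::string, std::string>& options) {
 *   // inspect or rewrite the graph here
 *   return mxnet::ext::MX_SUCCESS;
 * }
 * REGISTER_PASS(myPass)
 * .setBody(myPass);
 * \endcode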
\brief returns status of calling graph pass function from library */ MX_INT_RET _passCallGraphPass(mxnet::ext::graphPass_t graphPass, const char *json, @@ -2374,49 +1294,7 @@ extern "C" { const int* aux_dims, const int* aux_types, const size_t* aux_IDs, const char* const* aux_dev_type, const int* aux_dev_id, mxnet::ext::nd_malloc_t nd_malloc, - const void* nd_alloc) { - mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); - // create map of attributes from list - std::unordered_map opts; - for (int i = 0; i < num_opts; i++) - opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - - // create a map of named tensors for args - std::unordered_map args; - for (int i = 0; i < num_args; i++) { - std::vector shapes; - for (int j = 0; j < arg_dims[i]; j++) - shapes.push_back(arg_shapes[i][j]); - - mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i], - arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], - arg_dev_id[i])); - args[arg_names[i]] = tensor; - } - // create a map of named tensors for aux - std::unordered_map aux; - for (int i = 0; i < num_aux; i++) { - std::vector shapes; - for (int j = 0; j < aux_dims[i]; j++) - shapes.push_back(aux_shapes[i][j]); - - mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i], - aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i], - aux_dev_id[i])); - aux[aux_names[i]] = tensor; - } - - std::unordered_map new_args, new_aux; - mxnet::ext::PassResource res(&new_args, &new_aux, nd_malloc, nd_alloc); - graph->_setParams(&args, &aux); - graph->_setPassResource(&res); - mxnet::ext::MXReturnValue retval = graphPass(graph, opts); - if (!retval) return retval; - - std::string *tmp = new std::string(graph->toString()); - *out_graph = const_cast(tmp->c_str()); - return retval; - } + const void* nd_alloc); /*! * \brief Checks if the MXNet version is supported by the library. @@ -2432,13 +1310,10 @@ extern "C" { #endif initialize(int version); - MX_INT_RET _msgSize() { - return mxnet::ext::MXerrorMsgs::get()->size(); - } + MX_INT_RET _msgSize(); /*! \brief returns operator registration at specified index */ - MX_VOID_RET _msgGet(int idx, const char** msg) { - *msg = mxnet::ext::MXerrorMsgs::get()->get(idx)->c_str(); - } + MX_VOID_RET _msgGet(int idx, const char** msg); } // extern "C" + #endif // MXNET_LIB_API_H_ diff --git a/src/lib_api.cc b/src/lib_api.cc new file mode 100644 index 000000000000..0c404dd31265 --- /dev/null +++ b/src/lib_api.cc @@ -0,0 +1,1567 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
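+ * (Editor's note: with the implementations split out of lib_api.h, an external
+ * operator library now compiles this file alongside its own sources; the
+ * command below is an illustrative sketch with assumed paths, mirroring the
+ * example Makefiles.)
+ * \code
+ * g++ -shared -fPIC -std=c++11 my_lib.cc src/lib_api.cc -o libmy_lib.so -I include
+ * \endcode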
+ * Copyright (c) 2019 by Contributors + * \file lib_api.cc + * \brief APIs to interact with libraries + * This API specifies function prototypes to + * register custom ops, partitioner, and passes + * for library authors + * See example/extension/lib_custom_op/README.md + * See example/extension/lib_subgraph/README.md + * See example/extension/lib_pass/README.md + */ + +#include "mxnet/lib_api.h" + +mxnet::ext::MXContext::MXContext() : dev_type("error"), dev_id(-1) {} +mxnet::ext::MXContext::MXContext(std::string dev_type_, int dev_id_) + : dev_type(dev_type_), dev_id(dev_id_) {} +mxnet::ext::MXContext::MXContext(const char* dev_type_, int dev_id_) + : dev_type(dev_type_), dev_id(dev_id_) {} +mxnet::ext::MXContext mxnet::ext::MXContext::CPU() { return MXContext("cpu", 0); } +mxnet::ext::MXContext mxnet::ext::MXContext::GPU() { return MXContext("gpu", 0); } +mxnet::ext::MXContext mxnet::ext::MXContext::CPU(int dev_id) { return MXContext("cpu", dev_id); } +mxnet::ext::MXContext mxnet::ext::MXContext::GPU(int dev_id) { return MXContext("gpu", dev_id); } + +void mxnet::ext::MXSparse::set(void *data_ptr, const int64_t* dims, int ndims, void *idx, + int64_t num_idx, void *idx_ptr, int64_t num_idx_ptr) { + data = data_ptr; + // If CSR, num of non-zero elemets is num_idx, + // If row sparse, num of elements is num_idx * width. + data_len = num_idx; + if (!idx_ptr) { + for (int i = 1; i < ndims; ++i) + data_len *= dims[i]; + } + + indices = reinterpret_cast(idx); + indices_len = num_idx; + + if (idx_ptr) { + indptr = reinterpret_cast(idx_ptr); + indptr_len = num_idx_ptr; + } +} + +mxnet::ext::MXTensor::MXTensor() : data_ptr(nullptr), dtype(kUNSET), verID(0), stype(kDefaultStorage) {} +mxnet::ext::MXTensor::MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), shape(oth.shape), + dtype(oth.dtype), verID(oth.verID), ctx(oth.ctx), stype(oth.stype) { + setDLTensor(); +} + +mxnet::ext::MXTensor::MXTensor(void *data_ptr, const std::vector &shape, MXDType dtype, + size_t vID, MXContext mx_ctx, MXStorageType stype) + : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(mx_ctx), stype(stype) { + setDLTensor(); +} + +void mxnet::ext::MXTensor::setTensor(void *dptr, MXDType type, const int64_t* dims, int ndims, + size_t vID, MXContext mx_ctx, MXStorageType storage_type) { + data_ptr = dptr; dtype = type; verID = vID; ctx = mx_ctx; stype = storage_type; + shape.clear(); + for (int j = 0; j < ndims; j++) { + shape.push_back(dims[j]); + } + setDLTensor(); +} + +void mxnet::ext::MXTensor::setDLTensor() { + dltensor.data = data_ptr; + dltensor.ndim = shape.size(); + dltensor.shape = const_cast(shape.data()); + dltensor.strides = nullptr; + dltensor.byte_offset = 0; + dltensor.dtype.lanes = 1; + dltensor.ctx.device_id = ctx.dev_id; + if (ctx.dev_type == "cpu") + dltensor.ctx.device_type = kDLCPU; + else if (ctx.dev_type == "gpu") + dltensor.ctx.device_type = kDLGPU; + else if (ctx.dev_type == "opencl") + dltensor.ctx.device_type = kDLOpenCL; + else if (ctx.dev_type == "vulcan") + dltensor.ctx.device_type = kDLVulkan; + else if (ctx.dev_type == "metal") + dltensor.ctx.device_type = kDLMetal; + else if (ctx.dev_type == "vpi") + dltensor.ctx.device_type = kDLVPI; + else if (ctx.dev_type == "rocm") + dltensor.ctx.device_type = kDLROCM; + else + dltensor.ctx.device_type = kDLExtDev; + switch (dtype) { + case kFloat32: + dltensor.dtype.code = kDLFloat; + dltensor.dtype.bits = 32; + break; + case kFloat64: + dltensor.dtype.code = kDLFloat; + dltensor.dtype.bits = 64; + break; + case kFloat16: + 
dltensor.dtype.code = kDLFloat; + dltensor.dtype.bits = 16; + break; + case kUint8: + dltensor.dtype.code = kDLUInt; + dltensor.dtype.bits = 8; + break; + case kInt32: + dltensor.dtype.code = kDLInt; + dltensor.dtype.bits = 32; + break; + case kInt8: + dltensor.dtype.code = kDLInt; + dltensor.dtype.bits = 8; + break; + case kInt64: + dltensor.dtype.code = kDLInt; + dltensor.dtype.bits = 64; + break; + default: + dltensor.dtype.code = 0; + dltensor.dtype.bits = 0; + throw std::runtime_error("Error! Invalid dtype flag: " + + std::to_string(static_cast(dtype)) + + " when constructing MXTensor"); + } +} + +int64_t mxnet::ext::MXTensor::size() const { + int64_t size = 1; + for (unsigned int i = 0; i < shape.size(); i++) { + size *= shape[i]; + } + return size; +} + +bool mxnet::ext::MXTensor::isSame(const MXTensor &oth) const { + return data_ptr == oth.data_ptr && + dtype == oth.dtype && + verID == oth.verID && + ctx.dev_type == oth.ctx.dev_type && + ctx.dev_id == oth.ctx.dev_id && + shape == oth.shape && + stype == oth.stype; +} + +mxnet::ext::PassResource::PassResource(std::unordered_map* new_args, + std::unordered_map* new_aux, + nd_malloc_t nd_malloc, const void* nd_alloc) + : new_args_(new_args), new_aux_(new_aux), nd_malloc_(nd_malloc), nd_alloc_(nd_alloc) {} + +mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_arg(const std::string& name, const std::vector& shapes, + const mxnet::ext::MXContext &ctx, mxnet::ext::MXDType dtype) const { + void* data; + nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id, + dtype, name.c_str(), 1, &data); + MXTensor tensor(data, shapes, dtype, 0, ctx, kDefaultStorage); + (*new_args_)[name] = tensor; + return &(new_args_->at(name)); +} + +mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_aux(const std::string& name, const std::vector& shapes, + const mxnet::ext::MXContext &ctx, mxnet::ext::MXDType dtype) const { + void* data; + nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id, + dtype, name.c_str(), 0, &data); + MXTensor tensor(data, shapes, dtype, 0, ctx, kDefaultStorage); + (*new_aux_)[name] = tensor; + return &(new_aux_->at(name)); +} + +mxnet::ext::OpResource::OpResource(xpu_malloc_t cpu_malloc_fp, void* cpu_alloc_fp, + xpu_malloc_t gpu_malloc_fp, void* gpu_alloc_fp, void* stream, + sparse_malloc_t sparse_malloc_fp, void* sparse_alloc_fp, + void* rng_cpu_states, void* rng_gpu_states) + : cpu_malloc(cpu_malloc_fp), gpu_malloc(gpu_malloc_fp), + cpu_alloc(cpu_alloc_fp), gpu_alloc(gpu_alloc_fp), cuda_stream(stream), + sparse_malloc(sparse_malloc_fp), sparse_alloc(sparse_alloc_fp), + rand_cpu_states(rng_cpu_states), rand_gpu_states(rng_gpu_states) {} + +void* mxnet::ext::OpResource::alloc_cpu(int size) const { + return cpu_malloc(cpu_alloc, size); +} + +void* mxnet::ext::OpResource::alloc_gpu(int size) const { + return gpu_malloc(gpu_alloc, size); +} + +mxnet::ext::mx_stream_t mxnet::ext::OpResource::get_cuda_stream() const { + return static_cast(cuda_stream); +} + +void mxnet::ext::OpResource::alloc_sparse(mxnet::ext::MXSparse* sparse, int index, int indices_len, int indptr_len) const { + sparse_malloc(sparse_alloc, index, indices_len, indptr_len, + &(sparse->data), &(sparse->indices), &(sparse->indptr)); +} + +mxnet::ext::mx_cpu_rand_t* mxnet::ext::OpResource::get_cpu_rand_states() const { + return static_cast(rand_cpu_states); +} + +mxnet::ext::mx_gpu_rand_t* mxnet::ext::OpResource::get_gpu_rand_states() const { + return static_cast(rand_gpu_states); +} + +std::string 
mxnet::ext::getShapeAt(const std::string& shape, unsigned index) { + int idx = 1; // start at 1 to skip the first square bracket [ + // find the beginning of the output shape for the particular output index + for (unsigned x=0; x < index; x++) + idx = shape.find("[", idx+1); + int stop = shape.find("]", idx); // find stop index for this output shape + // add this shape to the list + return shape.substr(idx, stop-idx+1); +} + +std::string mxnet::ext::getDtypeAt(const std::string& dtype, unsigned index) { + // find the beginning of the output dtype for the particular output index + int idx = 0; + for (unsigned x=0; x < index; x++) + idx = dtype.find(",", idx+1); + int stop = dtype.find(",", idx+1); // find stop index for this output dtype + if (stop == -1) stop = dtype.find("]", idx+1); + return dtype.substr(idx+1, stop-idx-1); +} + +mxnet::ext::JsonVal::JsonVal() : type(ERR), num(-1), str("") {} +mxnet::ext::JsonVal::JsonVal(mxnet::ext::JsonType t) : type(t), num(-1), str("") {} +mxnet::ext::JsonVal::JsonVal(std::string s) : type(STR), num(-1), str(s) {} +mxnet::ext::JsonVal::JsonVal(int n) : type(NUM), num(n), str(std::to_string(n)) {} +mxnet::ext::JsonVal::JsonVal(JsonType t, int n, std::string s) : type(t), num(n), str(s) {} + +bool mxnet::ext::JsonVal::operator<(const mxnet::ext::JsonVal &o) const { + // for string JSON objects compare the string + if (type == STR) return type == o.type && str < o.str; + // for number JSON objects compare the number + if (type == NUM) return type == o.type && num < o.num; + // for list JSON objects, compare the size of list, and then each object in the list + if (type == LIST) { + if (list.size() != o.list.size()) return false; + for (unsigned int i=0; i< list.size(); i++) + if (list[i] < o.list[i]) + return false; // if we find an object that doesnt match return + return true; // all objects in lists matched + } + // for map JSON objects, compare the size of map, and then each key/value in the maps + if (type == MAP) { + if (map.size() != o.map.size()) return false; + for (auto &item : map) { + // if one map is missing a key in another return + if (o.map.find(item.first) == o.map.end()) return false; + if (item.second < o.map.at(item.first)) return false; + } + return true; + } + return type < o.type; +} + +std::string mxnet::ext::JsonVal::dump() const { + std::string ret; + switch (type) { + case ERR: + ret = "json(Error)"; + break; + case STR: + ret = "\"" + str + "\""; + break; + case NUM: + ret = str; + break; + case LIST: + ret = "["; + for (unsigned i=0; i < list.size(); i++) { + auto &item = list[i]; + ret += item.dump(); + if (i < list.size()-1) + ret += ","; + } + ret += "]"; + break; + case MAP: + ret = "{"; + unsigned cnt = 0; + for (auto &item : map) { + ret += item.first.dump() + " : " + item.second.dump(); + if (cnt++ < map.size()-1) + ret += ","; + } + ret += "}"; + break; + } + return ret; +} + +mxnet::ext::JsonVal mxnet::ext::JsonVal::parse(const std::string& json) { + unsigned int idx = 0; + return JsonVal::parse(json, &idx); +} + +mxnet::ext::JsonVal mxnet::ext::JsonVal::parse_string(const std::string& json, unsigned int* idx) { + JsonVal ret(STR); + while (*idx < json.size()) { + if (json[*idx] == '"') { + ++(*idx); + return ret; + } else { + ret.str += json[*idx]; + ++(*idx); + } + } + MX_ERROR_MSG << "Error! 
Unable to parse string: '" << json.substr(*idx) << "'" << std::endl; + return JsonVal(); +} + +mxnet::ext::JsonVal mxnet::ext::JsonVal::parse_num(const std::string& json, unsigned int* idx) { + JsonVal ret(NUM); + while (*idx < json.size()) { + if (json[*idx] >= '0' && json[*idx] <= '9') { + ret.str += json[*idx]; + ++(*idx); + } else { + break; + } + } + ret.num = std::stoi(ret.str); + return ret; +} + +mxnet::ext::JsonVal mxnet::ext::JsonVal::parse_list(const std::string& json, unsigned int* idx) { + JsonVal ret(LIST); + while (*idx < json.size()) { + if (json[*idx] == ']') { + ++(*idx); + return ret; + } else { + JsonVal item = JsonVal::parse(json, idx); + if (item.type != ERR) + ret.list.push_back(item); + } + } + MX_ERROR_MSG << "Error! Unable to parse list: '" << json.substr(*idx) << "'" << std::endl; + return JsonVal(); +} + +mxnet::ext::JsonVal mxnet::ext::JsonVal::parse_map(const std::string& json, unsigned int* idx) { + JsonVal ret(MAP), key; + while (*idx < json.size()) { + if (json[*idx] == '}') { + ++(*idx); + return ret; + } else { + JsonVal item = JsonVal::parse(json, idx); + if (key.type == ERR) { + key = item; + } else { + ret.map[key] = item; + key.type = ERR; + } + } + } + MX_ERROR_MSG << "Error! Unable to parse map: '" << json.substr(*idx) << "'" << std::endl; + return mxnet::ext::JsonVal(); +} + +mxnet::ext::JsonVal mxnet::ext::JsonVal::parse(const std::string& json, unsigned int *idx) { + JsonVal ret; + while (*idx < json.size()) { + if (json[*idx] == '"') { + ++(*idx); + ret = JsonVal::parse_string(json, idx); + } else if (json[*idx] >= '0' && json[*idx] <= '9') { + ret = JsonVal::parse_num(json, idx); + } else if (json[*idx] == '[') { + ++(*idx); + ret = JsonVal::parse_list(json, idx); + } else if (json[*idx] == '{') { + ++(*idx); + ret = JsonVal::parse_map(json, idx); + } else if (json[*idx] == ']' || json[*idx] == '}') {return ret;} + if (ret.type != ERR) return ret; + ++(*idx); + } + return ret; +} + +std::string mxnet::ext::JsonVal::toString() const { + std::string ret; + switch (type) { + case ERR: + ret = "json(Error)"; + break; + case STR: + ret = "json(STR:" + str + ")"; + break; + case NUM: + ret = "json(INT:" + str + ")"; + break; + case LIST: + ret = "json(LIST:["; + for (auto &item : list) + ret += item.toString() + ","; + ret += "])"; + break; + case MAP: + ret = "json(MAP:{"; + for (auto &item : map) + ret += item.first.toString() + " : " + item.second.toString() + ","; + ret += "})"; + break; + } + return ret; +} + +mxnet::ext::Node::Node() {tensor = nullptr;} + +void mxnet::ext::Node::_setPassResource(mxnet::ext::PassResource* res_) {res = res_;} + +void mxnet::ext::Node::alloc_arg(const std::vector& shapes, + const mxnet::ext::MXContext &ctx, mxnet::ext::MXDType dtype) { + if (!res) + throw std::runtime_error("Node not initialized. Cannot use alloc_arg outside of graph passes."); + tensor = res->alloc_arg(name, shapes, ctx, dtype); +} + +void mxnet::ext::Node::alloc_aux(const std::vector& shapes, + const mxnet::ext::MXContext &ctx, mxnet::ext::MXDType dtype) { + if (!res) + throw std::runtime_error("Node not initialized. 
Cannot use alloc_aux outside of graph passes."); + tensor = res->alloc_aux(name, shapes, ctx, dtype); +} + +mxnet::ext::Graph::Graph() : res(nullptr) {} + +mxnet::ext::Graph::~Graph() { + for (int i = 0; i < nodes.size(); i++) + delete nodes[i]; +} + +mxnet::ext::Graph* mxnet::ext::Graph::fromString(const std::string& json) { + JsonVal val = JsonVal::parse(json); + return fromJson(val); +} + +mxnet::ext::Graph* mxnet::ext::Graph::fromJson(mxnet::ext::JsonVal val) { + // get nodes list + JsonVal nodes = val.map[JsonVal("nodes")]; + Graph *g = new Graph(); + + std::map nodeMap; + // loop over nodes + for (int i = 0; i < nodes.list.size(); i++) { + Node* n = new Node(); + g->nodes.push_back(n); + JsonVal node = nodes.list[i]; + + // set the op info + n->op = node.map[JsonVal("op")].str; + n->name = node.map[JsonVal("name")].str; + + // if op is null it is an input to the graph + if (n->op.compare("null") == 0) + g->inputs.push_back(n); + + // set attrs + JsonVal attributes = node.map[JsonVal("attrs")]; + for (auto& kv : attributes.map) { + n->attrs[kv.first.str] = kv.second.str; + } + + // set subgraphs, parsing each into a graph + if (node.map.count(JsonVal("subgraphs")) > 0) { + JsonVal subgraphs = node.map[JsonVal("subgraphs")]; + for (auto &subgraph : subgraphs.list) { + n->subgraphs.push_back(fromJson(subgraph)); + } + } + + // set node inputs + JsonVal node_inputs = node.map[JsonVal("inputs")]; + n->inputs.resize(node_inputs.list.size()); + for (int j = 0; j < node_inputs.list.size(); j++) { + JsonVal input = node_inputs.list[j]; + NodeEntry& entry = n->inputs[j]; + // get pointer to other node + entry.node = nodeMap[input.list[0].num]; + // get the other node's output index + entry.entry = input.list[1].num; + // set other nodes output as connected to this node + entry.node->outputs.push_back({n, j}); + } + nodeMap[i] = n; + } + + // set graph level outputs + JsonVal& heads = val.map[JsonVal("heads")]; + g->outputs.resize(heads.list.size()); + for (int i = 0; i < heads.list.size(); i++) { + JsonVal head = heads.list[i]; + g->outputs[i].node = nodeMap[head.list[0].num]; + g->outputs[i].entry = head.list[1].num; + } + + // add all attributes to the graph + for (auto& kv : val.map) { + if (kv.first.str.compare("nodes") != 0 && + kv.first.str.compare("heads") != 0 && + kv.first.str.compare("node_row_ptr") != 0 && + kv.first.str.compare("arg_nodes") != 0) { + g->attrs[kv.first.str] = kv.second; + } + } + return g; +} + +/* \brief convert graph object back to JSON object */ +mxnet::ext::JsonVal mxnet::ext::Graph::toJson() { + // top level object is a map + JsonVal val(MAP); + + // add attributes + for (auto& kv : attrs) { + val.map[JsonVal(kv.first)] = kv.second; + } + + // sort graph nodes in topological order, create mapping of node to index + std::map nodeMap; + std::vector sorted = topological_sort(); + // nodes are in reverse topological order in the vector (back is first) + // so loop from end to front over the vector 'sorted' + for (int i = sorted.size()-1; i >= 0; i--) { + nodeMap[sorted[i]] = sorted.size()-1-i; + } + + // create node_row_ptr entry + val.map[JsonVal("node_row_ptr")] = JsonVal(LIST); + JsonVal& node_row_ptr = val.map[JsonVal("node_row_ptr")]; + for (int i = 0; i < nodes.size(); i++) + node_row_ptr.list.push_back(JsonVal(i)); + + // add all input nodes + val.map[JsonVal("arg_nodes")] = JsonVal(LIST); + JsonVal& arg_nodes = val.map[JsonVal("arg_nodes")]; + for (int i = 0; i < inputs.size(); i++) + arg_nodes.list.push_back(JsonVal(nodeMap[inputs[i]])); + + // add all 
output nodes + val.map[JsonVal("heads")] = JsonVal(LIST); + JsonVal& heads = val.map[JsonVal("heads")]; + for (int i = 0; i < outputs.size(); i++) { + heads.list.push_back(JsonVal(LIST)); + JsonVal& out = heads.list[i]; + out.list.push_back(JsonVal(nodeMap[outputs[i].node])); + out.list.push_back(JsonVal(outputs[i].entry)); + out.list.push_back(JsonVal(0)); + } + + // add all graph nodes + val.map[JsonVal("nodes")] = JsonVal(LIST); + JsonVal& nodes_ = val.map[JsonVal("nodes")]; + for (int i = sorted.size()-1; i >= 0; i--) { + // each node is a map + nodes_.list.push_back(JsonVal(MAP)); + Node* n = sorted[i]; + JsonVal& n_ = nodes_.list[nodes_.list.size()-1]; + + n_.map[JsonVal("op")] = JsonVal(n->op); + n_.map[JsonVal("name")] = JsonVal(n->name); + n_.map[JsonVal("inputs")] = JsonVal(LIST); + + // add inputs for this node + JsonVal& inputs_ = n_.map[JsonVal("inputs")]; + for (int j = 0; j < n->inputs.size(); j++) { + inputs_.list.push_back(JsonVal(LIST)); + NodeEntry& entry = n->inputs[j]; + JsonVal& in = inputs_.list[j]; + in.list.push_back(JsonVal(nodeMap[entry.node])); + in.list.push_back(JsonVal(entry.entry)); + in.list.push_back(JsonVal(0)); + } + + // add subgraphs for this node, convert each back to JSON + if (n->subgraphs.size() > 0) { + n_.map[JsonVal("subgraphs")] = JsonVal(LIST); + JsonVal &subgraphs_ = n_.map[JsonVal("subgraphs")]; + for (Graph *subgraph : n->subgraphs) { + subgraphs_.list.push_back(subgraph->toJson()); + } + } + + // add attributes for this node + n_.map[JsonVal("attrs")] = JsonVal(MAP); + JsonVal& attrs_ = n_.map[JsonVal("attrs")]; + for (auto& kv : n->attrs) { + attrs_.map[JsonVal(kv.first)] = JsonVal(kv.second); + } + } + return val; +} + +/* \brief convert graph object to JSON string */ +std::string mxnet::ext::Graph::toString() { + return toJson().dump(); +} + + /* \brief visits a node "n" */ +void mxnet::ext::Graph::_dfs_util(Node* n, std::unordered_set* to_visit, + std::function handler) const { + to_visit->erase(n); // remove node now that we're visiting it + for (NodeEntry& e : n->outputs) { + Node* o = e.node; + if (to_visit->count(o) != 0) { + _dfs_util(o, to_visit, handler); // visit neighbor + } + } + handler(n); // post-order visit this node +} + +/* \brief post-order DFS graph traversal */ +void mxnet::ext::Graph::DFS(std::function handler) const { + std::unordered_set to_visit; + // put all nodes in set to visit + for (auto& n : nodes) + to_visit.insert(n); + // visit all inputs first + for (auto& i : inputs) + if (to_visit.count(i) != 0) + _dfs_util(i, &to_visit, handler); + // visit any nodes left + while (to_visit.size() > 0) + _dfs_util(*(to_visit.begin()), &to_visit, handler); +} + +/* \brief sort graph nodes in topological order */ +std::vector mxnet::ext::Graph::topological_sort() const { + std::vector sorted; + auto handler = [&](mxnet::ext::Node* n) { + sorted.push_back(n); // when visiting each node, add it in order to the vector + }; + DFS(handler); + return sorted; +} + +/* \brief print out graph details */ +void mxnet::ext::Graph::print(int indent) const { + std::string space = ""; + for (int i = 0; i < indent; i++) space+=" "; + + std::cout << space << "########### Graph #############" << std::endl; + std::cout << space << "attributes: " << std::endl; + for (auto &kv : attrs) + std::cout << space << "\t" << kv.first << " : " << kv.second.str << std::endl; + std::cout << space << "inputs: " << inputs.size() << std::endl; + std::cout << space << "outputs: " << outputs.size() << std::endl; + std::cout << space << "nodes: " << 
nodes.size() << std::endl; + std::vector sorted = topological_sort(); + // loop over each node and print out its inputs/outputs + for (int i = sorted.size()-1; i >= 0; i--) { + std::cout << space << "Node: " << sorted[i]->name << std::endl; + for (int j = 0; j < sorted[i]->inputs.size(); j++) { + std::cout << space << "\tInput: " << sorted[i]->inputs[j].node->name << " " + << sorted[i]->inputs[j].entry << std::endl; + } + for (int j = 0; j < sorted[i]->outputs.size(); j++) { + std::cout << space << "\tOutput: " << sorted[i]->outputs[j].node->name << " " + << sorted[i]->outputs[j].entry << std::endl; + } + if (sorted[i]->subgraphs.size() > 0) { + for (auto &subgraph : sorted[i]->subgraphs) { + std::cout << space << "\tSubgraph:" << std::endl; + subgraph->print(indent+2); + } + } + } + std::cout << space << "###############################" << std::endl; +} + +/* \brief add a new node to this graph */ +mxnet::ext::Node* mxnet::ext::Graph::addNode(const std::string& name, const std::string& op) { + Node* n = new Node(); + n->name = name; + n->op = op; + if (res) + n->_setPassResource(res); + return n; +} + +/* \brief get node at index in graph */ +mxnet::ext::Node* mxnet::ext::Graph::getNode(size_t idx) { + return nodes[idx]; +} + +/* \brief get const node at index in const graph */ +const mxnet::ext::Node* mxnet::ext::Graph::getNode(size_t idx) const { + return nodes.at(idx); +} + +/* \brief get attribute on graph */ +const mxnet::ext::JsonVal& mxnet::ext::Graph::getAttr(const std::string& key) const { + return attrs.at(key); +} + +/* \brief get number of nodes in the graph */ +size_t mxnet::ext::Graph::size() const { + return nodes.size(); +} + +// internally set passResource to enable tensor allocation for graph passes +void mxnet::ext::Graph::_setPassResource(PassResource* res_) { + res = res_; + // set passResource for each node + for (Node* node : nodes) { + node->_setPassResource(res); + } +} + +// internally set arg/aux params when available +void mxnet::ext::Graph::_setParams(std::unordered_map* args, + std::unordered_map* aux) { + // set params for each input node + for (Node* node : inputs) { + if (args->count(node->name) > 0) + node->tensor = &args->at(node->name); + else if (aux->count(node->name) > 0) + node->tensor = &aux->at(node->name); + } +} + +mxnet::ext::CustomOp::CustomOp(const char* op_name) + : name(op_name), parse_attrs(NULL), infer_type(NULL), infer_storage_type(NULL), infer_shape(NULL), + mutate_inputs(NULL), isSGop(false) {} + +mxnet::ext::CustomOp& mxnet::ext::CustomOp::setForward(mxnet::ext::fcomp_t fcomp, const char* ctx) { + if (forward_ctx_map.count(ctx) > 0) + raiseDuplicateContextError(); + forward_ctx_map[ctx] = fcomp; + return *this; +} + +mxnet::ext::CustomOp& mxnet::ext::CustomOp::setBackward(mxnet::ext::fcomp_t fgrad, const char* ctx) { + if (backward_ctx_map.count(ctx) > 0) + raiseDuplicateContextError(); + backward_ctx_map[ctx] = fgrad; + return *this; +} + +mxnet::ext::CustomOp& mxnet::ext::CustomOp::setParseAttrs(mxnet::ext::parseAttrs_t func) { + parse_attrs = func; + return *this; +} + +mxnet::ext::CustomOp& mxnet::ext::CustomOp::setInferType(mxnet::ext::inferType_t func) { + infer_type = func; + return *this; +} + +mxnet::ext::CustomOp& mxnet::ext::CustomOp::setInferSType(mxnet::ext::inferSType_t func) { + infer_storage_type = func; + return *this; +} + +mxnet::ext::CustomOp& mxnet::ext::CustomOp::setInferShape(mxnet::ext::inferShape_t func) { + infer_shape = func; + return *this; +} + +mxnet::ext::CustomOp& 
mxnet::ext::CustomOp::setMutateInputs(mxnet::ext::mutateInputs_t func) {
+  mutate_inputs = func;
+  return *this;
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setCreateOpState(mxnet::ext::createOpState_t func,
+                                                             const char* ctx) {
+  if (create_op_ctx_map.count(ctx) > 0)
+    raiseDuplicateContextError();
+  create_op_ctx_map[ctx] = func;
+  return *this;
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setIsSubgraphOp() {
+  isSGop = true;
+  return *this;
+}
+
+void mxnet::ext::CustomOp::mapToVector() {
+  for (auto kv : forward_ctx_map) {
+    forward_ctx_cstr.push_back(kv.first);
+    forward_fp.push_back(kv.second);
+  }
+  for (auto kv : backward_ctx_map) {
+    backward_ctx_cstr.push_back(kv.first);
+    backward_fp.push_back(kv.second);
+  }
+  for (auto kv : create_op_ctx_map) {
+    create_op_ctx_cstr.push_back(kv.first);
+    create_op_fp.push_back(kv.second);
+  }
+}
+
+void mxnet::ext::CustomOp::raiseDuplicateContextError() {
+  std::string op_name_str(name);
+  throw std::runtime_error(
+    "Error! Cannot register multiple functions under same context for operator '"
+    + op_name_str + "'");
+}
+
+mxnet::ext::CustomPass::CustomPass() : name("ERROR") {}
+mxnet::ext::CustomPass::CustomPass(const char* pass_name)
+  : name(pass_name) {}
+mxnet::ext::CustomPass& mxnet::ext::CustomPass::setBody(graphPass_t fn) {
+  pass = fn;
+  return *this;
+}
+
+mxnet::ext::CustomPartitioner::CustomPartitioner() : name("ERROR") {}
+mxnet::ext::CustomPartitioner::CustomPartitioner(const char* backend_name) :
+  name(backend_name) {}
+
+mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::addStrategy(const char* prop_name,
+                                                                          const char* sg_name) {
+  strategies.push_back(prop_name);
+  op_names.push_back(sg_name);
+  return *this;
+}
+
+mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setSupportedOps(
+    const char* prop_name, mxnet::ext::supportedOps_t fn) {
+  supported_map[std::string(prop_name)] = fn;
+  return *this;
+}
+
+mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setCreateSelector(
+    const char* prop_name, mxnet::ext::createSelector_t fn) {
+  selector_map[std::string(prop_name)] = fn;
+  return *this;
+}
+
+mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setReviewSubgraph(
+    const char* prop_name, mxnet::ext::reviewSubgraph_t fn) {
+  review_map[std::string(prop_name)] = fn;
+  return *this;
+}
+
+mxnet::ext::supportedOps_t mxnet::ext::CustomPartitioner::getSupportedOps(int stg_id) {
+  std::string prop(strategies[stg_id]);
+  if (supported_map.count(prop) > 0)
+    return supported_map[prop];
+  else
+    return nullptr;
+}
+
+mxnet::ext::createSelector_t mxnet::ext::CustomPartitioner::getCreateSelector(int stg_id) {
+  std::string prop(strategies[stg_id]);
+  if (selector_map.count(prop) > 0)
+    return selector_map[prop];
+  else
+    return nullptr;
+}
+
+mxnet::ext::reviewSubgraph_t mxnet::ext::CustomPartitioner::getReviewSubgraph(int stg_id) {
+  std::string prop(strategies[stg_id]);
+  if (review_map.count(prop) > 0)
+    return review_map[prop];
+  else
+    return nullptr;
+}
+
+  /*! \brief returns MXNet library version */
+  MX_INT_RET _opVersion() {
+    return MX_LIBRARY_VERSION;
+  }
+
+  /*! \brief returns number of ops registered in this library */
+  MX_INT_RET _opRegSize() {
+    return mxnet::ext::Registry<mxnet::ext::CustomOp>::get()->size();
+  }
+
+  /*!
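+   * (Editor's note: a sketch of the registration that _opRegGet below walks;
+   * my_gemm and the callback names are illustrative, following
+   * example/extensions/lib_custom_op.)
+   * \code
+   * REGISTER_OP(my_gemm)
+   * .setParseAttrs(parseAttrs)
+   * .setInferType(inferType)
+   * .setInferShape(inferShape)
+   * .setForward(forwardCPU, "cpu");
+   * \endcode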
\brief returns operator registration at specified index */
+  MX_VOID_RET _opRegGet(int idx, const char** name, int *isSGop,
+                        const char*** forward_ctx, mxnet::ext::fcomp_t** forward_fp,
+                        int* forward_count, const char*** backward_ctx,
+                        mxnet::ext::fcomp_t** backward_fp, int* backward_count,
+                        const char*** create_op_ctx, mxnet::ext::createOpState_t** create_op_fp,
+                        int* create_op_count, mxnet::ext::parseAttrs_t* parse,
+                        mxnet::ext::inferType_t* type, mxnet::ext::inferSType_t* stype,
+                        mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate) {
+    mxnet::ext::CustomOp &op = mxnet::ext::Registry<mxnet::ext::CustomOp>::get()->get(idx);
+    *name = op.name;
+    *parse = op.parse_attrs;
+    *type = op.infer_type;
+    *stype = op.infer_storage_type;
+    *shape = op.infer_shape;
+    *mutate = op.mutate_inputs;
+    *isSGop = op.isSGop;
+    op.mapToVector();
+    *forward_ctx = op.forward_ctx_cstr.data();
+    *forward_fp = op.forward_fp.data();
+    *forward_count = op.forward_fp.size();
+    *backward_ctx = op.backward_ctx_cstr.data();
+    *backward_fp = op.backward_fp.data();
+    *backward_count = op.backward_fp.size();
+    *create_op_ctx = op.create_op_ctx_cstr.data();
+    *create_op_fp = op.create_op_fp.data();
+    *create_op_count = op.create_op_fp.size();
+  }
+
+  /*! \brief calls free from the external library for library allocated arrays */
+  MX_VOID_RET _opCallFree(void* ptr) {
+    free(ptr);
+  }
+
+  /*! \brief returns status of calling parse attributes function for operator from library */
+  MX_INT_RET _opCallParseAttrs(mxnet::ext::parseAttrs_t parseAttrs, const char* const* keys,
+                               const char* const* vals, int num,
+                               int* num_in, int* num_out) {
+    // create map of attributes from list
+    std::unordered_map<std::string, std::string> attrs;
+    for (int i = 0; i < num; i++) {
+      attrs[std::string(keys[i])] = std::string(vals[i]);
+    }
+
+    return parseAttrs(attrs, num_in, num_out);
+  }
+
+  /*!
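+   * (Editor's note: an illustrative user-side inferShape callback; the
+   * vector-of-vector shape parameters mirror how the shim below invokes it.)
+   * \code
+   * mxnet::ext::MXReturnValue myInferShape(
+   *     const std::unordered_map<std::string, std::string>& attrs,
+   *     std::vector<std::vector<unsigned int>>* in_shapes,
+   *     std::vector<std::vector<unsigned int>>* out_shapes) {
+   *   (*out_shapes)[0] = (*in_shapes)[0];  // a shape-preserving op
+   *   return mxnet::ext::MX_SUCCESS;
+   * }
+   * \endcode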
\brief returns status of calling inferShape function for operator from library */ + MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, const char* const* keys, + const char* const* vals, int num, + unsigned int** inshapes, int* indims, int num_in, + unsigned int*** mod_inshapes, int** mod_indims, + unsigned int*** outshapes, int** outdims, int num_out) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of shapes for inputs + std::vector > in_shapes(num_in); + for (int i = 0; i < num_in; i++) { + for (int j = 0; j < indims[i]; j++) { + in_shapes[i].push_back(inshapes[i][j]); + } + } + + // create a vector of shapes for outputs + std::vector > out_shapes(num_out); + + int retval = inferShape(attrs, &in_shapes, &out_shapes); + if (!retval) return retval; + + // allocate space for modified input dims, shape + *mod_indims = static_cast(malloc (num_in * sizeof(int))); + *mod_inshapes = static_cast(malloc (num_in * sizeof(unsigned*))); + + // copy modified input shapes + for (int i = 0; i < num_in; i++) { + (*mod_indims)[i] = in_shapes[i].size(); + (*mod_inshapes)[i] = static_cast(malloc ((*mod_indims)[i] * sizeof(unsigned))); + for (int j = 0; j < (*mod_indims)[i]; j++) { + (*mod_inshapes)[i][j] = in_shapes[i][j]; + } + } + + // allocate space for output dims, shape + *outdims = static_cast(malloc (num_out * sizeof(int))); + *outshapes = static_cast(malloc (num_out * sizeof(unsigned*))); + + // copy output shapes + for (int i = 0; i < num_out; i++) { + (*outdims)[i] = out_shapes[i].size(); + (*outshapes)[i] = static_cast(malloc ((*outdims)[i] * sizeof(unsigned))); + for (int j = 0; j < (*outdims)[i]; j++) { + (*outshapes)[i][j] = out_shapes[i][j]; + } + } + + return retval; + } + + /*! \brief returns status of calling inferType function for operator from library */ + MX_INT_RET _opCallInferType(mxnet::ext::inferType_t inferType, const char* const* keys, + const char* const* vals, int num, + int* intypes, int num_in, int* outtypes, int num_out) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of types for inputs + std::vector in_types(num_in); + for (int i = 0; i < num_in; i++) { + in_types[i] = intypes[i]; + } + + // create a vector of types for outputs + std::vector out_types(num_out, -1); + + int retval = inferType(attrs, &in_types, &out_types); + if (!retval) + return retval; + + // copy modified input types + for (int i = 0; i < num_in; i++) { + intypes[i] = in_types[i]; + } + // copy output types + for (int i = 0; i < num_out; i++) { + outtypes[i] = out_types[i]; + } + + return retval; + } + + /*! 
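+   * (Editor's note: an illustrative inferSType callback; storage types are
+   * passed as ints matching mxnet::ext::MXStorageType, e.g. kDefaultStorage.)
+   * \code
+   * mxnet::ext::MXReturnValue myInferSType(
+   *     const std::unordered_map<std::string, std::string>& attrs,
+   *     std::vector<int>* in_stypes, std::vector<int>* out_stypes) {
+   *   (*out_stypes)[0] = mxnet::ext::kDefaultStorage;  // produce a dense output
+   *   return mxnet::ext::MX_SUCCESS;
+   * }
+   * \endcode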
\brief returns status of calling inferSType function for operator from library */ + MX_INT_RET _opCallInferSType(mxnet::ext::inferSType_t inferSType, const char* const* keys, + const char* const* vals, int num, + int* instypes, int num_in, int* outstypes, int num_out) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of types for inputs + std::vector in_stypes(num_in); + for (int i = 0; i < num_in; i++) { + in_stypes[i] = instypes[i]; + } + + // create a vector of types for outputs + std::vector out_stypes(num_out, -1); + + int retval = inferSType(attrs, &in_stypes, &out_stypes); + + if (!retval) + return retval; + + // copy modified input storage types + for (int i = 0; i < num_in; i++) { + instypes[i] = in_stypes[i]; + } + // copy output storage types + for (int i = 0; i < num_out; i++) { + outstypes[i] = out_stypes[i]; + } + + return retval; + } + + /*! \brief returns status of calling Forward/Backward function for operator from library */ + MX_INT_RET _opCallFCompute(mxnet::ext::fcomp_t fcomp, const char* const* keys, + const char* const* vals, + int num, const int64_t** inshapes, int* indims, void** indata, + int* intypes, size_t* inIDs, const char** indev_type, int* indev_id, + int num_in, const int64_t** outshapes, int* outdims, void** outdata, + int* outtypes, size_t* outIDs, const char** outdev_type, + int* outdev_id, int num_out, mxnet::ext::xpu_malloc_t cpu_malloc, + void* cpu_alloc, + mxnet::ext::xpu_malloc_t gpu_malloc, void* gpu_alloc, + void* cuda_stream, + mxnet::ext::sparse_malloc_t sparse_malloc, void* sparse_alloc, + int* instypes, int* outstypes, void** in_indices, void** out_indices, + void** in_indptr, void** out_indptr, + int64_t* in_indices_shapes, int64_t* out_indices_shapes, + int64_t* in_indptr_shapes, int64_t* out_indptr_shapes, + void* rng_cpu_states, void* rng_gpu_states) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of tensors for inputs + std::vector inputs(num_in); + // create a vector for sparse inputs + std::vector in_sparse(num_in); + + for (int i = 0; i < num_in; i++) { + // Dense representation. + if (instypes[i] == 0) { + inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i], + inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]), + mxnet::ext::kDefaultStorage); + } else { + // Sparse representation. + mxnet::ext::MXStorageType type; + if (instypes[i] == 1) { + type = mxnet::ext::kRowSparseStorage; + in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]); + } else { + type = mxnet::ext::kCSRStorage; + in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], + in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]); + } + inputs[i].setTensor(reinterpret_cast(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i], + inshapes[i], indims[i], inIDs[i], + mxnet::ext::MXContext(indev_type[i], indev_id[i]), type); + } + } + + // create a vector of tensors for outputs + std::vector outputs(num_out); + std::vector out_sparse(num_out); + + for (int i = 0; i < num_out; i++) { + // Dense representation. 
+ if (outstypes[i] == 0) { + outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i], + outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), + mxnet::ext::kDefaultStorage); + } else { + // Sparse representation. + mxnet::ext::MXStorageType type; + if (outstypes[i] == 1) { + type = mxnet::ext::kRowSparseStorage; + out_sparse[i].set(outdata[i], outshapes[i], outdims[i], + out_indices[i], out_indices_shapes[i]); + } else { + type = mxnet::ext::kCSRStorage; + out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], + out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]); + } + outputs[i].setTensor(reinterpret_cast(&out_sparse[i]), + (mxnet::ext::MXDType)outtypes[i], + outshapes[i], outdims[i], outIDs[i], + mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type); + } + } + + mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, + cuda_stream, sparse_malloc, sparse_alloc, + rng_cpu_states, rng_gpu_states); + return fcomp(attrs, &inputs, &outputs, res); + } + + /*! \brief returns status of calling mutateInputs function for operator from library */ + MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, const char* const* keys, + const char* const* vals, int num, + int** mutate_indices, int* indices_size) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of mutate input indices + std::vector mut_ind; + + int retval = mutate(attrs, &mut_ind); + if (!retval) + return retval; + + // output the input indices + *indices_size = mut_ind.size(); + *mutate_indices = static_cast(malloc (*indices_size * sizeof(int))); + for (int i = 0; i < *indices_size; i++) { + (*mutate_indices)[i] = mut_ind[i]; + } + + return retval; + } + + /*! \brief returns status of calling createStatefulOp function for operator from library */ + MX_INT_RET _opCallCreateOpState(mxnet::ext::createOpState_t create_op, const char* const* keys, + const char* const* vals, int num, + void** state_op) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // void pointer to hold custom state op instance created in custom library + // eventually state_op pointer is populated by instance from custom library + mxnet::ext::CustomStatefulOp** op_ptr = + reinterpret_cast(state_op); + return create_op(attrs, op_ptr); + } + + /*! 
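+   * (Editor's note: a hedged sketch of the CustomStatefulOp subclass whose
+   * Forward/Backward this shim dispatches to; MyStatefulOp is illustrative.)
+   * \code
+   * class MyStatefulOp : public mxnet::ext::CustomStatefulOp {
+   *  public:
+   *   mxnet::ext::MXReturnValue Forward(std::vector<mxnet::ext::MXTensor>* inputs,
+   *                                     std::vector<mxnet::ext::MXTensor>* outputs,
+   *                                     const mxnet::ext::OpResource& res) override {
+   *     return mxnet::ext::MX_SUCCESS;  // compute outputs from inputs here
+   *   }
+   * };
+   * \endcode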
\brief returns status of calling Stateful Forward/Backward for operator from library */ + MX_INT_RET _opCallFStatefulCompute(int is_forward, void* state_op, const int64_t** inshapes, + int* indims, void** indata, int* intypes, size_t* inIDs, + const char** indev_type, int* indev_id, int num_in, + const int64_t** outshapes, int* outdims, void** outdata, + int* outtypes, size_t* outIDs, const char** outdev_type, + int* outdev_id, int num_out, + mxnet::ext::xpu_malloc_t cpu_malloc, + void* cpu_alloc, mxnet::ext::xpu_malloc_t gpu_malloc, + void* gpu_alloc, + void* stream, mxnet::ext::sparse_malloc_t sparse_malloc, + void* sparse_alloc, int* instypes, int* outstypes, + void** in_indices, void** out_indices, void** in_indptr, + void** out_indptr, int64_t* in_indices_shapes, + int64_t* out_indices_shapes, int64_t* in_indptr_shapes, + int64_t* out_indptr_shapes, + void* rng_cpu_states, void* rng_gpu_states) { + // create a vector of tensors for inputs + std::vector inputs(num_in); + // create a vector for sparse inputs + std::vector in_sparse(num_in); + + for (int i = 0; i < num_in; i++) { + if (instypes[i] == 0) { + // Dense representation. + inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i], + inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]), + mxnet::ext::kDefaultStorage); + } else { + // Sparse representation. + mxnet::ext::MXStorageType type; + if (instypes[i] == 1) { + type = mxnet::ext::kRowSparseStorage; + in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]); + } else { + type = mxnet::ext::kCSRStorage; + in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], + in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]); + } + inputs[i].setTensor(reinterpret_cast(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i], + inshapes[i], indims[i], inIDs[i], + mxnet::ext::MXContext(indev_type[i], indev_id[i]), type); + } + } + + // create a vector of tensors for outputs + std::vector outputs(num_out); + // create a vector for sparse outputs + std::vector out_sparse(num_out); + + for (int i = 0; i < num_out; i++) { + if (outstypes[i] == 0) { + // Dense representation. + outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i], + outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), + mxnet::ext::kDefaultStorage); + } else { + // Sparse representation. + mxnet::ext::MXStorageType type; + if (outstypes[i] == 1) { + type = mxnet::ext::kRowSparseStorage; + out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], + out_indices_shapes[i]); + } else { + type = mxnet::ext::kCSRStorage; + out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], + out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]); + } + outputs[i].setTensor(reinterpret_cast(&out_sparse[i]), + (mxnet::ext::MXDType)outtypes[i], + outshapes[i], outdims[i], outIDs[i], + mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type); + } + } + + mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, + stream, sparse_malloc, sparse_alloc, rng_cpu_states, rng_gpu_states); + + mxnet::ext::CustomStatefulOp* op_ptr = + reinterpret_cast(state_op); + if (is_forward) { + return op_ptr->Forward(&inputs, &outputs, res); + } + return op_ptr->Backward(&inputs, &outputs, res); + } + + /*! 
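+   * (Editor's note: a sketch of the partitioner registration these _partReg*
+   * shims enumerate; names follow example/extensions/lib_subgraph and are
+   * illustrative.)
+   * \code
+   * REGISTER_PARTITIONER(myProp)
+   * .addStrategy("strategy1", "_custom_subgraph_op")
+   * .setSupportedOps("strategy1", mySupportedOps)
+   * .setReviewSubgraph("strategy1", myReviewSubgraph);
+   * \endcode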
+  /*! \brief returns number of partitioners registered in this library */
+  MX_INT_RET _partRegSize() {
+    return mxnet::ext::Registry<mxnet::ext::CustomPartitioner>::get()->size();
+  }
+
+  /*! \brief returns number of strategies registered for partitioner
+   * at specified index */
+  MX_INT_RET _partRegGetCount(int idx, const char** name) {
+    mxnet::ext::CustomPartitioner part =
+      mxnet::ext::Registry<mxnet::ext::CustomPartitioner>::get()->get(idx);
+    *name = part.name;
+    return part.strategies.size();
+  }
+
+  /*! \brief returns partitioner registration at specified index */
+  MX_VOID_RET _partRegGet(int part_idx, int stg_idx, const char** strategy,
+                          mxnet::ext::supportedOps_t* supportedOps,
+                          mxnet::ext::createSelector_t* createSelector,
+                          mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name) {
+    mxnet::ext::CustomPartitioner part =
+      mxnet::ext::Registry<mxnet::ext::CustomPartitioner>::get()->get(part_idx);
+    *strategy = part.strategies[stg_idx];
+    *op_name = part.op_names[stg_idx];
+    *supportedOps = part.getSupportedOps(stg_idx);
+    *createSelector = part.getCreateSelector(stg_idx);
+    *reviewSubgraph = part.getReviewSubgraph(stg_idx);
+  }
+
+  /*! \brief returns status of calling supported ops function from library */
+  MX_INT_RET _partCallSupportedOps(mxnet::ext::supportedOps_t supportedOps, const char *json,
+                                   int num_ids, int *ids, const char* const* opt_keys,
+                                   const char* const* opt_vals, int num_opts) {
+    mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json);
+    // create map of options from list
+    std::unordered_map<std::string, std::string> opts;
+    for (int i = 0; i < num_opts; i++)
+      opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
+
+    // create array of subgraph IDs for operator support
+    std::vector<int> _ids(num_ids, -2);
+    // call user's supportedOps function
+    mxnet::ext::MXReturnValue retval = supportedOps(graph, &_ids, opts);
+    if (!retval) return retval;
+
+    // copy subgraph IDs to the output array
+    for (int i = 0; i < num_ids; i++)
+      ids[i] = _ids[i];
+
+    return retval;
+  }
+
+  /*! \brief returns status of calling create selector function from library */
+  MX_INT_RET _partCallCreateSelector(mxnet::ext::createSelector_t createSelector, const char *json,
+                                     void** selector, const char* const* opt_keys,
+                                     const char* const* opt_vals, int num_opts) {
+    mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json);
+    // create map of options from list
+    std::unordered_map<std::string, std::string> opts;
+    for (int i = 0; i < num_opts; i++)
+      opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
+
+    // void pointer to hold selector instance created in custom library
+    // eventually pointer is populated by instance from custom library
+    mxnet::ext::CustomOpSelector** sel_ptr =
+      reinterpret_cast<mxnet::ext::CustomOpSelector**>(selector);
+
+    // call user's createSelector function
+    return createSelector(graph, sel_ptr, opts);
+  }
+
+  /*! \brief calls select function from library */
+  MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected) {
+    mxnet::ext::CustomOpSelector* sel_ptr =
+      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
+    *selected = sel_ptr->Select(nodeID);
+  }
+
+  /*! \brief calls select input function from library */
+  MX_VOID_RET _partCallSelectInput(void* sel_inst, int nodeID,
+                                   int input_nodeID, int* selected) {
+    mxnet::ext::CustomOpSelector* sel_ptr =
+      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
+    *selected = sel_ptr->SelectInput(nodeID, input_nodeID);
+  }
+
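+  // For illustration, a supportedOps callback as invoked by _partCallSupportedOps
+  // above might look like this sketch (the operator name is hypothetical). Writing
+  // subgraph ID 0 into *ids marks a node for inclusion; -2, the initial value set
+  // above, leaves it unassigned.
+  //
+  //   MXReturnValue mySupportedOps(const Graph* graph, std::vector<int>* ids,
+  //                                const std::unordered_map<std::string,
+  //                                                         std::string>& opts) {
+  //     for (int i = 0; i < static_cast<int>(graph->size()); i++) {
+  //       if (graph->getNode(i)->op == "Convolution")
+  //         (*ids)[i] = 0;  // assign supported nodes to subgraph 0
+  //     }
+  //     return MX_SUCCESS;
+  //   }
+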
+  /*! \brief calls select output function from library */
+  MX_VOID_RET _partCallSelectOutput(void* sel_inst, int nodeID,
+                                    int output_nodeID, int* selected) {
+    mxnet::ext::CustomOpSelector* sel_ptr =
+      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
+    *selected = sel_ptr->SelectOutput(nodeID, output_nodeID);
+  }
+
+  /*! \brief calls filter function from library */
+  MX_VOID_RET _partCallFilter(void* sel_inst, int* candidates, int num_candidates,
+                              int** keep, int* num_keep) {
+    mxnet::ext::CustomOpSelector* sel_ptr =
+      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
+    std::vector<int> candidates_(num_candidates);
+    for (int i=0; i < num_candidates; i++) {
+      candidates_[i] = candidates[i];
+    }
+    std::vector<int> keep_;
+
+    sel_ptr->Filter(candidates_, &keep_);
+
+    *num_keep = keep_.size();
+    *keep = static_cast<int*>(malloc(keep_.size() * sizeof(int)));
+    for (unsigned i=0; i < keep_.size(); i++)
+      (*keep)[i] = keep_[i];
+  }
+
+  /*! \brief calls reset selector function from library */
+  MX_VOID_RET _partCallReset(void* sel_inst) {
+    mxnet::ext::CustomOpSelector* sel_ptr =
+      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
+    sel_ptr->Reset();
+  }
+
+  /*! \brief returns status of calling review subgraph function from library */
+  MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph, const char *json,
+                                     int subgraph_id, int *accept, const char* const* opt_keys,
+                                     const char* const* opt_vals, int num_opts,
+                                     char*** attr_keys, char*** attr_vals, int *num_attrs,
+                                     const char* const* arg_names, int num_args,
+                                     void* const* arg_data, const int64_t* const* arg_shapes,
+                                     const int* arg_dims, const int* arg_types,
+                                     const size_t* arg_IDs, const char* const* arg_dev_type,
+                                     const int* arg_dev_id,
+                                     const char* const* aux_names, int num_aux,
+                                     void* const* aux_data, const int64_t* const* aux_shapes,
+                                     const int* aux_dims, const int* aux_types,
+                                     const size_t* aux_IDs, const char* const* aux_dev_type,
+                                     const int* aux_dev_id) {
+    mxnet::ext::Graph *subgraph = mxnet::ext::Graph::fromString(json);
+    bool accept_bool = false;
+    // create map of attributes from list
+    std::unordered_map<std::string, std::string> opts;
+    for (int i = 0; i < num_opts; i++)
+      opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
+
+    // create a map of named tensors for args
+    std::unordered_map<std::string, mxnet::ext::MXTensor> args;
+    for (int i = 0; i < num_args; i++) {
+      std::vector<int64_t> shapes;
+      for (int j = 0; j < arg_dims[i]; j++)
+        shapes.push_back(arg_shapes[i][j]);
+
+      mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i],
+                                  arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], arg_dev_id[i]));
+      args[arg_names[i]] = tensor;
+    }
+    // create a map of named tensors for aux
+    std::unordered_map<std::string, mxnet::ext::MXTensor> aux;
+    for (int i = 0; i < num_aux; i++) {
+      std::vector<int64_t> shapes;
+      for (int j = 0; j < aux_dims[i]; j++)
+        shapes.push_back(aux_shapes[i][j]);
+
+      mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i],
+                                  aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i],
+                                                                    aux_dev_id[i]));
+      aux[aux_names[i]] = tensor;
+    }
+
+    subgraph->_setParams(&args, &aux);
+    mxnet::ext::MXReturnValue retval = reviewSubgraph(subgraph, subgraph_id, &accept_bool,
+                                                      opts);
+    if (!retval) return retval;
+
+    *accept = accept_bool;
+
+    if (subgraph->attrs.size() > 0) {
+      *num_attrs = subgraph->attrs.size();
+      // allocate space for attributes
+      *attr_keys = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));
+      *attr_vals = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));
+
+      // copy attributes
+      int i = 0;
+      for (auto kv : subgraph->attrs) {
+        (*attr_keys)[i] = static_cast<char*>(malloc ((kv.first.size()+1) * sizeof(char)));
+        std::string val = kv.second.dump();  // convert JsonVal back to string
+        (*attr_vals)[i] = static_cast<char*>(malloc ((val.size()+1) * sizeof(char)));
+        snprintf((*attr_keys)[i], kv.first.size()+1, "%s", kv.first.c_str());
+        snprintf((*attr_vals)[i], val.size()+1, "%s", val.c_str());
+        i++;
+      }
+    }
+
+    return retval;
+  }
+
+  /*! \brief returns number of graph passes registered in this library */
+  MX_INT_RET _passRegSize() {
+    return mxnet::ext::Registry<mxnet::ext::CustomPass>::get()->size();
+  }
+
+  /*! \brief returns pass registration at specified index */
+  MX_VOID_RET _passRegGet(int pass_idx, mxnet::ext::graphPass_t* graphPass,
+                          const char** pass_name) {
+    mxnet::ext::CustomPass pass =
+      mxnet::ext::Registry<mxnet::ext::CustomPass>::get()->get(pass_idx);
+    *graphPass = pass.pass;
+    *pass_name = pass.name;
+  }
+
+  /*! \brief returns status of calling graph pass function from library */
+  MX_INT_RET _passCallGraphPass(mxnet::ext::graphPass_t graphPass, const char *json,
+                                char** out_graph, const char* const* opt_keys,
+                                const char* const* opt_vals, int num_opts,
+                                const char* pass_name, const char* const* arg_names, int num_args,
+                                void* const* arg_data, const int64_t* const* arg_shapes,
+                                const int* arg_dims, const int* arg_types,
+                                const size_t* arg_IDs, const char* const* arg_dev_type,
+                                const int* arg_dev_id, const char* const* aux_names, int num_aux,
+                                void* const* aux_data, const int64_t* const* aux_shapes,
+                                const int* aux_dims, const int* aux_types,
+                                const size_t* aux_IDs, const char* const* aux_dev_type,
+                                const int* aux_dev_id, mxnet::ext::nd_malloc_t nd_malloc,
+                                const void* nd_alloc) {
+    mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json);
+    // create map of attributes from list
+    std::unordered_map<std::string, std::string> opts;
+    for (int i = 0; i < num_opts; i++)
+      opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
+
+    // create a map of named tensors for args
+    std::unordered_map<std::string, mxnet::ext::MXTensor> args;
+    for (int i = 0; i < num_args; i++) {
+      std::vector<int64_t> shapes;
+      for (int j = 0; j < arg_dims[i]; j++)
+        shapes.push_back(arg_shapes[i][j]);
+
+      mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i],
+                                  arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i],
+                                                                    arg_dev_id[i]));
+      args[arg_names[i]] = tensor;
+    }
+    // create a map of named tensors for aux
+    std::unordered_map<std::string, mxnet::ext::MXTensor> aux;
+    for (int i = 0; i < num_aux; i++) {
+      std::vector<int64_t> shapes;
+      for (int j = 0; j < aux_dims[i]; j++)
+        shapes.push_back(aux_shapes[i][j]);
+
+      mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i],
+                                  aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i],
+                                                                    aux_dev_id[i]));
+      aux[aux_names[i]] = tensor;
+    }
+
+    std::unordered_map<std::string, mxnet::ext::MXTensor> new_args, new_aux;
+    mxnet::ext::PassResource res(&new_args, &new_aux, nd_malloc, nd_alloc);
+    graph->_setParams(&args, &aux);
+    graph->_setPassResource(&res);
+    mxnet::ext::MXReturnValue retval = graphPass(graph, opts);
+    if (!retval) return retval;
+
+    // allocated with new so the JSON buffer outlives this call; the caller is
+    // expected to copy it
+    std::string *tmp = new std::string(graph->toString());
+    *out_graph = const_cast<char*>(tmp->c_str());
+    return retval;
+  }
+
+  /*!
+   * \brief Checks if the MXNet version is supported by the library.
+   * If supported, initializes the library.
+   * \param version MXNet version number passed to library and defined as:
+   *                MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH)
+   * \return Non-zero value on error i.e. library incompatible with passed MXNet version
+   */
+#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
+  __declspec(dllexport) mxnet::ext::MXReturnValue __cdecl
+#else
+  mxnet::ext::MXReturnValue
+#endif
+  initialize(int version);
+
+  MX_INT_RET _msgSize() {
+    return mxnet::ext::MXerrorMsgs::get()->size();
+  }
+
+  /*! \brief returns error message at specified index */
+  MX_VOID_RET _msgGet(int idx, const char** msg) {
+    *msg = mxnet::ext::MXerrorMsgs::get()->get(idx)->c_str();
+  }

From ead26847549c4cdaaa92b672eadc0bcfca10ed0a Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 06:28:11 +0000
Subject: [PATCH 03/43] fixed licenses

---
 example/extensions/lib_external_ops/README.md | 17 +++++++++++++++++
 .../extensions/lib_external_ops/min_ex-inl.h  | 19 +++++++++++++++++++
 example/extensions/lib_external_ops/min_ex.cc | 19 +++++++++++++++++++
 3 files changed, 55 insertions(+)

diff --git a/example/extensions/lib_external_ops/README.md b/example/extensions/lib_external_ops/README.md
index 2fd9f9570028..8239989b84e4 100644
--- a/example/extensions/lib_external_ops/README.md
+++ b/example/extensions/lib_external_ops/README.md
@@ -1 +1,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 TBD
\ No newline at end of file
diff --git a/example/extensions/lib_external_ops/min_ex-inl.h b/example/extensions/lib_external_ops/min_ex-inl.h
index 56784f6dbdd1..82a10e6540e5 100644
--- a/example/extensions/lib_external_ops/min_ex-inl.h
+++ b/example/extensions/lib_external_ops/min_ex-inl.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 #ifndef MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_
 #define MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_

diff --git a/example/extensions/lib_external_ops/min_ex.cc b/example/extensions/lib_external_ops/min_ex.cc
index 930704dcaad2..f8cf7003138a 100644
--- a/example/extensions/lib_external_ops/min_ex.cc
+++ b/example/extensions/lib_external_ops/min_ex.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + #include "min_ex-inl.h" namespace mxnet { From 51ce458d61b1bee78df1e4c076fa431e2eedb7f1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 16 Aug 2020 07:16:39 +0000 Subject: [PATCH 04/43] whitespace --- include/mxnet/lib_api.h | 74 +-- src/lib_api.cc | 1224 ++++++++++++++++++++------------------- 2 files changed, 653 insertions(+), 645 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 9e17056f4c44..21004c6f1603 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -340,7 +340,7 @@ struct MXTensor { /*! \brief populate DLTensor fields */ void setDLTensor(); - + /*! \brief helper function to cast data pointer */ template inline data_type* data() { @@ -406,11 +406,11 @@ class PassResource { PassResource(std::unordered_map* new_args, std::unordered_map* new_aux, nd_malloc_t nd_malloc, const void* nd_alloc); - + // allocate new arg param, adds to args map, returns newly allocated tensor MXTensor* alloc_arg(const std::string& name, const std::vector& shapes, const MXContext &ctx, MXDType dtype) const; - + // allocate new aux param, adds to aux map, returns newly allocated tensor MXTensor* alloc_aux(const std::string& name, const std::vector& shapes, const MXContext &ctx, MXDType dtype) const; @@ -431,7 +431,7 @@ class OpResource { xpu_malloc_t gpu_malloc_fp, void* gpu_alloc_fp, void* stream, sparse_malloc_t sparse_malloc_fp, void* sparse_alloc_fp, void* rng_cpu_states, void* rng_gpu_states); - + /*! \brief allocate cpu memory controlled by MXNet */ void* alloc_cpu(int size) const; @@ -516,16 +516,16 @@ struct JsonVal { // convert JSON object back to JSON-compatible string std::string dump() const; - + // convert JSON-compatible string to JSON object static JsonVal parse(const std::string& json); - + // parse a string JSON object static JsonVal parse_string(const std::string& json, unsigned int* idx); - + // parse a number JSON object static JsonVal parse_num(const std::string& json, unsigned int* idx); - + // parse a list of JSON objects static JsonVal parse_list(const std::string& json, unsigned int* idx); @@ -534,10 +534,10 @@ struct JsonVal { // generic parse function static JsonVal parse(const std::string& json, unsigned int *idx); - + // debug function to convert data structure to a debugstring std::string toString() const; - + JsonType type; int num; std::string str; @@ -561,18 +561,18 @@ struct NodeEntry { class Node { public: Node(); - + // internally set passResource to enable tensor allocation for graph passes void _setPassResource(PassResource* res_); /* \brief allocate an arg tensor for this node */ void alloc_arg(const std::vector& shapes, const MXContext &ctx, MXDType dtype); - + /* \brief allocate an aux tensor for this node */ void alloc_aux(const std::vector& shapes, const MXContext &ctx, MXDType dtype); - + std::string op; // operator name (ie. Convolution) std::string name; // unique node name (ie. 
conv_0 or conv_1) MXTensor* tensor; // tensor data for input nodes @@ -589,7 +589,7 @@ class Node { class Graph { public: Graph(); - + /* \brief deleted nodes when deleting the graph */ ~Graph(); @@ -604,7 +604,7 @@ class Graph { /* \brief convert graph object to JSON string */ std::string toString(); - + /* \brief visits a node "n" */ void _dfs_util(Node* n, std::unordered_set* to_visit, std::function handler) const; @@ -620,26 +620,26 @@ class Graph { /* \brief add a new node to this graph */ Node* addNode(const std::string& name, const std::string& op); - + /* \brief get node at index in graph */ Node* getNode(size_t idx); - + /* \brief get const node at index in const graph */ const Node* getNode(size_t idx) const; - + /* \brief get attribute on graph */ const JsonVal& getAttr(const std::string& key) const; - + /* \brief get number of nodes in the graph */ size_t size() const; - + // internally set passResource to enable tensor allocation for graph passes void _setPassResource(PassResource* res_); // internally set arg/aux params when available void _setParams(std::unordered_map* args, std::unordered_map* aux); - + std::vector inputs; std::vector outputs; std::map attrs; @@ -746,21 +746,21 @@ typedef MXReturnValue (*createOpState_t)(const std::unordered_map nd_malloc_t nd_malloc, const void* nd_alloc) : new_args_(new_args), new_aux_(new_aux), nd_malloc_(nd_malloc), nd_alloc_(nd_alloc) {} -mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_arg(const std::string& name, const std::vector& shapes, - const mxnet::ext::MXContext &ctx, mxnet::ext::MXDType dtype) const { +mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_arg(const std::string& name, + const std::vector& shapes, + const mxnet::ext::MXContext &ctx, + mxnet::ext::MXDType dtype) const { void* data; nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id, dtype, name.c_str(), 1, &data); @@ -178,8 +182,10 @@ mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_arg(const std::string& nam return &(new_args_->at(name)); } -mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_aux(const std::string& name, const std::vector& shapes, - const mxnet::ext::MXContext &ctx, mxnet::ext::MXDType dtype) const { +mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_aux(const std::string& name, + const std::vector& shapes, + const mxnet::ext::MXContext &ctx, + mxnet::ext::MXDType dtype) const { void* data; nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id, dtype, name.c_str(), 0, &data); @@ -209,7 +215,8 @@ mxnet::ext::mx_stream_t mxnet::ext::OpResource::get_cuda_stream() const { return static_cast(cuda_stream); } -void mxnet::ext::OpResource::alloc_sparse(mxnet::ext::MXSparse* sparse, int index, int indices_len, int indptr_len) const { +void mxnet::ext::OpResource::alloc_sparse(mxnet::ext::MXSparse* sparse, int index, + int indices_len, int indptr_len) const { sparse_malloc(sparse_alloc, index, indices_len, indptr_len, &(sparse->data), &(sparse->indices), &(sparse->indptr)); } @@ -662,7 +669,7 @@ std::vector mxnet::ext::Graph::topological_sort() const { void mxnet::ext::Graph::print(int indent) const { std::string space = ""; for (int i = 0; i < indent; i++) space+=" "; - + std::cout << space << "########### Graph #############" << std::endl; std::cout << space << "attributes: " << std::endl; for (auto &kv : attrs) @@ -754,7 +761,8 @@ mxnet::ext::CustomOp& mxnet::ext::CustomOp::setForward(mxnet::ext::fcomp_t fcomp return *this; } -mxnet::ext::CustomOp& 
mxnet::ext::CustomOp::setBackward(mxnet::ext::fcomp_t fgrad, const char* ctx) { +mxnet::ext::CustomOp& mxnet::ext::CustomOp::setBackward(mxnet::ext::fcomp_t fgrad, + const char* ctx) { if (backward_ctx_map.count(ctx) > 0) raiseDuplicateContextError(); backward_ctx_map[ctx] = fgrad; @@ -786,7 +794,8 @@ mxnet::ext::CustomOp& mxnet::ext::CustomOp::setMutateInputs(mxnet::ext::mutateIn return *this; } -mxnet::ext::CustomOp& mxnet::ext::CustomOp::setCreateOpState(mxnet::ext::createOpState_t func, const char* ctx) { +mxnet::ext::CustomOp& mxnet::ext::CustomOp::setCreateOpState(mxnet::ext::createOpState_t func, + const char* ctx) { if (create_op_ctx_map.count(ctx) > 0) raiseDuplicateContextError(); create_op_ctx_map[ctx] = func; @@ -815,8 +824,9 @@ void mxnet::ext::CustomOp::mapToVector() { void mxnet::ext::CustomOp::raiseDuplicateContextError() { std::string op_name_str(name); - throw std::runtime_error("Error! Error! Cannot register multiple functions under same context for operator '" - + op_name_str + "'"); + throw std::runtime_error( + "Error! Error! Cannot register multiple functions under same context for operator '" + + op_name_str + "'"); } mxnet::ext::CustomPass::CustomPass() : name("ERROR") {} @@ -839,19 +849,19 @@ mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::addStrategy(const } mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setSupportedOps(const char* prop_name, - mxnet::ext::supportedOps_t fn) { + mxnet::ext::supportedOps_t fn) { supported_map[std::string(prop_name)] = fn; return *this; } -mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setCreateSelector(const char* prop_name, - mxnet::ext::createSelector_t fn) { +mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setCreateSelector( + const char* prop_name, mxnet::ext::createSelector_t fn) { selector_map[std::string(prop_name)] = fn; return *this; } -mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setReviewSubgraph(const char* prop_name, - mxnet::ext::reviewSubgraph_t fn) { +mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setReviewSubgraph( + const char* prop_name, mxnet::ext::reviewSubgraph_t fn) { review_map[std::string(prop_name)] = fn; return *this; } @@ -880,688 +890,686 @@ mxnet::ext::reviewSubgraph_t mxnet::ext::CustomPartitioner::getReviewSubgraph(in return nullptr; } - /*! \brief returns MXNet library version */ - MX_INT_RET _opVersion() { - return MX_LIBRARY_VERSION; - } - - /*! \brief returns number of ops registered in this library */ - MX_INT_RET _opRegSize() { - return mxnet::ext::Registry::get()->size(); - } +/*! \brief returns MXNet library version */ +MX_INT_RET _opVersion() { + return MX_LIBRARY_VERSION; +} - /*! 
\brief returns operator registration at specified index */ - MX_VOID_RET _opRegGet(int idx, const char** name, int *isSGop, - const char*** forward_ctx, mxnet::ext::fcomp_t** forward_fp, - int* forward_count, const char*** backward_ctx, - mxnet::ext::fcomp_t** backward_fp, int* backward_count, - const char*** create_op_ctx, mxnet::ext::createOpState_t** create_op_fp, - int* create_op_count, mxnet::ext::parseAttrs_t* parse, - mxnet::ext::inferType_t* type, mxnet::ext::inferSType_t* stype, - mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate) { - mxnet::ext::CustomOp &op = mxnet::ext::Registry::get()->get(idx); - *name = op.name; - *parse = op.parse_attrs; - *type = op.infer_type; - *stype = op.infer_storage_type; - *shape = op.infer_shape; - *mutate = op.mutate_inputs; - *isSGop = op.isSGop; - op.mapToVector(); - *forward_ctx = op.forward_ctx_cstr.data(); - *forward_fp = op.forward_fp.data(); - *forward_count = op.forward_fp.size(); - *backward_ctx = op.backward_ctx_cstr.data(); - *backward_fp = op.backward_fp.data(); - *backward_count = op.backward_fp.size(); - *create_op_ctx = op.create_op_ctx_cstr.data(); - *create_op_fp = op.create_op_fp.data(); - *create_op_count = op.create_op_fp.size(); - } +/*! \brief returns number of ops registered in this library */ +MX_INT_RET _opRegSize() { + return mxnet::ext::Registry::get()->size(); +} - /*! \brief calls free from the external library for library allocated arrays */ - MX_VOID_RET _opCallFree(void* ptr) { - free(ptr); - } +/*! \brief returns operator registration at specified index */ +MX_VOID_RET _opRegGet(int idx, const char** name, int *isSGop, + const char*** forward_ctx, mxnet::ext::fcomp_t** forward_fp, + int* forward_count, const char*** backward_ctx, + mxnet::ext::fcomp_t** backward_fp, int* backward_count, + const char*** create_op_ctx, mxnet::ext::createOpState_t** create_op_fp, + int* create_op_count, mxnet::ext::parseAttrs_t* parse, + mxnet::ext::inferType_t* type, mxnet::ext::inferSType_t* stype, + mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate) { + mxnet::ext::CustomOp &op = mxnet::ext::Registry::get()->get(idx); + *name = op.name; + *parse = op.parse_attrs; + *type = op.infer_type; + *stype = op.infer_storage_type; + *shape = op.infer_shape; + *mutate = op.mutate_inputs; + *isSGop = op.isSGop; + op.mapToVector(); + *forward_ctx = op.forward_ctx_cstr.data(); + *forward_fp = op.forward_fp.data(); + *forward_count = op.forward_fp.size(); + *backward_ctx = op.backward_ctx_cstr.data(); + *backward_fp = op.backward_fp.data(); + *backward_count = op.backward_fp.size(); + *create_op_ctx = op.create_op_ctx_cstr.data(); + *create_op_fp = op.create_op_fp.data(); + *create_op_count = op.create_op_fp.size(); +} - /*! \brief returns status of calling parse attributes function for operator from library */ - MX_INT_RET _opCallParseAttrs(mxnet::ext::parseAttrs_t parseAttrs, const char* const* keys, - const char* const* vals, int num, - int* num_in, int* num_out) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } +/*! \brief calls free from the external library for library allocated arrays */ +MX_VOID_RET _opCallFree(void* ptr) { + free(ptr); +} - return parseAttrs(attrs, num_in, num_out); +/*! 
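\brief for illustration: a user-side registration that feeds these entry points (sketch; names hypothetical) */
+// The setter chain below follows the CustomOp API in lib_api.h; MX_SUCCESS is
+// assumed to be the success value of MXReturnValue. The parseAttrs signature
+// mirrors the parseAttrs(attrs, num_in, num_out) call made below.
+//
+//   MXReturnValue myParseAttrs(const std::unordered_map<std::string,
+//                                                       std::string>& attrs,
+//                              int* num_in, int* num_out) {
+//     *num_in = 1;   // operator signature: one input ...
+//     *num_out = 1;  // ... and one output
+//     return MX_SUCCESS;
+//   }
+//
+//   REGISTER_OP(my_op)
+//   .setParseAttrs(myParseAttrs)
+//   .setForward(myForward, "cpu");
+
+/*!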
\brief returns status of calling parse attributes function for operator from library */ +MX_INT_RET _opCallParseAttrs(mxnet::ext::parseAttrs_t parseAttrs, const char* const* keys, + const char* const* vals, int num, + int* num_in, int* num_out) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); } + return parseAttrs(attrs, num_in, num_out); +} - /*! \brief returns status of calling inferShape function for operator from library */ - MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, const char* const* keys, - const char* const* vals, int num, - unsigned int** inshapes, int* indims, int num_in, - unsigned int*** mod_inshapes, int** mod_indims, - unsigned int*** outshapes, int** outdims, int num_out) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } +/*! \brief returns status of calling inferShape function for operator from library */ +MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, const char* const* keys, + const char* const* vals, int num, + unsigned int** inshapes, int* indims, int num_in, + unsigned int*** mod_inshapes, int** mod_indims, + unsigned int*** outshapes, int** outdims, int num_out) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } - // create a vector of shapes for inputs - std::vector > in_shapes(num_in); - for (int i = 0; i < num_in; i++) { - for (int j = 0; j < indims[i]; j++) { - in_shapes[i].push_back(inshapes[i][j]); - } + // create a vector of shapes for inputs + std::vector > in_shapes(num_in); + for (int i = 0; i < num_in; i++) { + for (int j = 0; j < indims[i]; j++) { + in_shapes[i].push_back(inshapes[i][j]); } + } - // create a vector of shapes for outputs - std::vector > out_shapes(num_out); - - int retval = inferShape(attrs, &in_shapes, &out_shapes); - if (!retval) return retval; - - // allocate space for modified input dims, shape - *mod_indims = static_cast(malloc (num_in * sizeof(int))); - *mod_inshapes = static_cast(malloc (num_in * sizeof(unsigned*))); + // create a vector of shapes for outputs + std::vector > out_shapes(num_out); - // copy modified input shapes - for (int i = 0; i < num_in; i++) { - (*mod_indims)[i] = in_shapes[i].size(); - (*mod_inshapes)[i] = static_cast(malloc ((*mod_indims)[i] * sizeof(unsigned))); - for (int j = 0; j < (*mod_indims)[i]; j++) { - (*mod_inshapes)[i][j] = in_shapes[i][j]; - } - } + int retval = inferShape(attrs, &in_shapes, &out_shapes); + if (!retval) return retval; - // allocate space for output dims, shape - *outdims = static_cast(malloc (num_out * sizeof(int))); - *outshapes = static_cast(malloc (num_out * sizeof(unsigned*))); + // allocate space for modified input dims, shape + *mod_indims = static_cast(malloc (num_in * sizeof(int))); + *mod_inshapes = static_cast(malloc (num_in * sizeof(unsigned*))); - // copy output shapes - for (int i = 0; i < num_out; i++) { - (*outdims)[i] = out_shapes[i].size(); - (*outshapes)[i] = static_cast(malloc ((*outdims)[i] * sizeof(unsigned))); - for (int j = 0; j < (*outdims)[i]; j++) { - (*outshapes)[i][j] = out_shapes[i][j]; - } + // copy modified input shapes + for (int i = 0; i < num_in; i++) { + (*mod_indims)[i] = in_shapes[i].size(); + (*mod_inshapes)[i] = static_cast(malloc ((*mod_indims)[i] * sizeof(unsigned))); + for 
(int j = 0; j < (*mod_indims)[i]; j++) { + (*mod_inshapes)[i][j] = in_shapes[i][j]; } - - return retval; } - /*! \brief returns status of calling inferType function for operator from library */ - MX_INT_RET _opCallInferType(mxnet::ext::inferType_t inferType, const char* const* keys, - const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } + // allocate space for output dims, shape + *outdims = static_cast(malloc (num_out * sizeof(int))); + *outshapes = static_cast(malloc (num_out * sizeof(unsigned*))); - // create a vector of types for inputs - std::vector in_types(num_in); - for (int i = 0; i < num_in; i++) { - in_types[i] = intypes[i]; + // copy output shapes + for (int i = 0; i < num_out; i++) { + (*outdims)[i] = out_shapes[i].size(); + (*outshapes)[i] = static_cast(malloc ((*outdims)[i] * sizeof(unsigned))); + for (int j = 0; j < (*outdims)[i]; j++) { + (*outshapes)[i][j] = out_shapes[i][j]; } + } + return retval; +} - // create a vector of types for outputs - std::vector out_types(num_out, -1); +/*! \brief returns status of calling inferType function for operator from library */ +MX_INT_RET _opCallInferType(mxnet::ext::inferType_t inferType, const char* const* keys, + const char* const* vals, int num, + int* intypes, int num_in, int* outtypes, int num_out) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } - int retval = inferType(attrs, &in_types, &out_types); - if (!retval) - return retval; + // create a vector of types for inputs + std::vector in_types(num_in); + for (int i = 0; i < num_in; i++) { + in_types[i] = intypes[i]; + } - // copy modified input types - for (int i = 0; i < num_in; i++) { - intypes[i] = in_types[i]; - } - // copy output types - for (int i = 0; i < num_out; i++) { - outtypes[i] = out_types[i]; - } + // create a vector of types for outputs + std::vector out_types(num_out, -1); + int retval = inferType(attrs, &in_types, &out_types); + if (!retval) return retval; - } - /*! \brief returns status of calling inferSType function for operator from library */ - MX_INT_RET _opCallInferSType(mxnet::ext::inferSType_t inferSType, const char* const* keys, - const char* const* vals, int num, - int* instypes, int num_in, int* outstypes, int num_out) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } + // copy modified input types + for (int i = 0; i < num_in; i++) { + intypes[i] = in_types[i]; + } + // copy output types + for (int i = 0; i < num_out; i++) { + outtypes[i] = out_types[i]; + } - // create a vector of types for inputs - std::vector in_stypes(num_in); - for (int i = 0; i < num_in; i++) { - in_stypes[i] = instypes[i]; - } + return retval; +} - // create a vector of types for outputs - std::vector out_stypes(num_out, -1); +/*! 
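\brief for illustration: an inferShape callback for the _opCallInferShape entry point above (sketch) */
+// Illustrative only; the function name is hypothetical, and the vector-of-shapes
+// signature mirrors the inferShape(attrs, &in_shapes, &out_shapes) call above.
+//
+//   MXReturnValue myInferShape(const std::unordered_map<std::string,
+//                                                       std::string>& attrs,
+//                              std::vector<std::vector<unsigned int>>* in_shapes,
+//                              std::vector<std::vector<unsigned int>>* out_shapes) {
+//     out_shapes->at(0) = in_shapes->at(0);  // output shape mirrors the input
+//     return MX_SUCCESS;
+//   }
+
+/*!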
\brief returns status of calling inferSType function for operator from library */ +MX_INT_RET _opCallInferSType(mxnet::ext::inferSType_t inferSType, const char* const* keys, + const char* const* vals, int num, + int* instypes, int num_in, int* outstypes, int num_out) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } - int retval = inferSType(attrs, &in_stypes, &out_stypes); + // create a vector of types for inputs + std::vector in_stypes(num_in); + for (int i = 0; i < num_in; i++) { + in_stypes[i] = instypes[i]; + } - if (!retval) - return retval; + // create a vector of types for outputs + std::vector out_stypes(num_out, -1); - // copy modified input storage types - for (int i = 0; i < num_in; i++) { - instypes[i] = in_stypes[i]; - } - // copy output storage types - for (int i = 0; i < num_out; i++) { - outstypes[i] = out_stypes[i]; - } + int retval = inferSType(attrs, &in_stypes, &out_stypes); + if (!retval) return retval; + + // copy modified input storage types + for (int i = 0; i < num_in; i++) { + instypes[i] = in_stypes[i]; + } + // copy output storage types + for (int i = 0; i < num_out; i++) { + outstypes[i] = out_stypes[i]; } - /*! \brief returns status of calling Forward/Backward function for operator from library */ - MX_INT_RET _opCallFCompute(mxnet::ext::fcomp_t fcomp, const char* const* keys, - const char* const* vals, - int num, const int64_t** inshapes, int* indims, void** indata, - int* intypes, size_t* inIDs, const char** indev_type, int* indev_id, - int num_in, const int64_t** outshapes, int* outdims, void** outdata, - int* outtypes, size_t* outIDs, const char** outdev_type, - int* outdev_id, int num_out, mxnet::ext::xpu_malloc_t cpu_malloc, - void* cpu_alloc, - mxnet::ext::xpu_malloc_t gpu_malloc, void* gpu_alloc, - void* cuda_stream, - mxnet::ext::sparse_malloc_t sparse_malloc, void* sparse_alloc, - int* instypes, int* outstypes, void** in_indices, void** out_indices, - void** in_indptr, void** out_indptr, - int64_t* in_indices_shapes, int64_t* out_indices_shapes, - int64_t* in_indptr_shapes, int64_t* out_indptr_shapes, - void* rng_cpu_states, void* rng_gpu_states) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } + return retval; +} - // create a vector of tensors for inputs - std::vector inputs(num_in); - // create a vector for sparse inputs - std::vector in_sparse(num_in); +/*! 
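\brief for illustration: a forward fcomp_t callback dispatched by the next entry point (sketch) */
+// Names are hypothetical; the parameters mirror the fcomp(attrs, &inputs,
+// &outputs, res) call below, and MXTensor::data<T>()/size() are assumed from
+// lib_api.h.
+//
+//   MXReturnValue myForward(const std::unordered_map<std::string,
+//                                                    std::string>& attrs,
+//                           std::vector<MXTensor>* inputs,
+//                           std::vector<MXTensor>* outputs,
+//                           const OpResource& res) {
+//     float* in  = inputs->at(0).data<float>();
+//     float* out = outputs->at(0).data<float>();
+//     for (int64_t i = 0; i < inputs->at(0).size(); i++)
+//       out[i] = in[i] * 2;  // elementwise doubling, as a placeholder computation
+//     return MX_SUCCESS;
+//   }
+
+/*!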
\brief returns status of calling Forward/Backward function for operator from library */ +MX_INT_RET _opCallFCompute(mxnet::ext::fcomp_t fcomp, const char* const* keys, + const char* const* vals, + int num, const int64_t** inshapes, int* indims, void** indata, + int* intypes, size_t* inIDs, const char** indev_type, int* indev_id, + int num_in, const int64_t** outshapes, int* outdims, void** outdata, + int* outtypes, size_t* outIDs, const char** outdev_type, + int* outdev_id, int num_out, mxnet::ext::xpu_malloc_t cpu_malloc, + void* cpu_alloc, + mxnet::ext::xpu_malloc_t gpu_malloc, void* gpu_alloc, + void* cuda_stream, + mxnet::ext::sparse_malloc_t sparse_malloc, void* sparse_alloc, + int* instypes, int* outstypes, void** in_indices, void** out_indices, + void** in_indptr, void** out_indptr, + int64_t* in_indices_shapes, int64_t* out_indices_shapes, + int64_t* in_indptr_shapes, int64_t* out_indptr_shapes, + void* rng_cpu_states, void* rng_gpu_states) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } - for (int i = 0; i < num_in; i++) { - // Dense representation. - if (instypes[i] == 0) { - inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i], - inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]), - mxnet::ext::kDefaultStorage); + // create a vector of tensors for inputs + std::vector inputs(num_in); + // create a vector for sparse inputs + std::vector in_sparse(num_in); + + for (int i = 0; i < num_in; i++) { + // Dense representation. + if (instypes[i] == 0) { + inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i], + inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]), + mxnet::ext::kDefaultStorage); + } else { + // Sparse representation. + mxnet::ext::MXStorageType type; + if (instypes[i] == 1) { + type = mxnet::ext::kRowSparseStorage; + in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]); } else { - // Sparse representation. - mxnet::ext::MXStorageType type; - if (instypes[i] == 1) { - type = mxnet::ext::kRowSparseStorage; - in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]); - } else { - type = mxnet::ext::kCSRStorage; - in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], - in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]); - } - inputs[i].setTensor(reinterpret_cast(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i], - inshapes[i], indims[i], inIDs[i], - mxnet::ext::MXContext(indev_type[i], indev_id[i]), type); + type = mxnet::ext::kCSRStorage; + in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], + in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]); } + inputs[i].setTensor(reinterpret_cast(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i], + inshapes[i], indims[i], inIDs[i], + mxnet::ext::MXContext(indev_type[i], indev_id[i]), type); } + } - // create a vector of tensors for outputs - std::vector outputs(num_out); - std::vector out_sparse(num_out); + // create a vector of tensors for outputs + std::vector outputs(num_out); + std::vector out_sparse(num_out); - for (int i = 0; i < num_out; i++) { - // Dense representation. 
- if (outstypes[i] == 0) { - outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i], - outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), - mxnet::ext::kDefaultStorage); + for (int i = 0; i < num_out; i++) { + // Dense representation. + if (outstypes[i] == 0) { + outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i], + outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), + mxnet::ext::kDefaultStorage); + } else { + // Sparse representation. + mxnet::ext::MXStorageType type; + if (outstypes[i] == 1) { + type = mxnet::ext::kRowSparseStorage; + out_sparse[i].set(outdata[i], outshapes[i], outdims[i], + out_indices[i], out_indices_shapes[i]); } else { - // Sparse representation. - mxnet::ext::MXStorageType type; - if (outstypes[i] == 1) { - type = mxnet::ext::kRowSparseStorage; - out_sparse[i].set(outdata[i], outshapes[i], outdims[i], - out_indices[i], out_indices_shapes[i]); - } else { - type = mxnet::ext::kCSRStorage; - out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], - out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]); - } - outputs[i].setTensor(reinterpret_cast(&out_sparse[i]), - (mxnet::ext::MXDType)outtypes[i], - outshapes[i], outdims[i], outIDs[i], - mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type); + type = mxnet::ext::kCSRStorage; + out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], + out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]); } + outputs[i].setTensor(reinterpret_cast(&out_sparse[i]), + (mxnet::ext::MXDType)outtypes[i], + outshapes[i], outdims[i], outIDs[i], + mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type); } - - mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, - cuda_stream, sparse_malloc, sparse_alloc, - rng_cpu_states, rng_gpu_states); - return fcomp(attrs, &inputs, &outputs, res); } - /*! \brief returns status of calling mutateInputs function for operator from library */ - MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, const char* const* keys, - const char* const* vals, int num, - int** mutate_indices, int* indices_size) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } - - // create a vector of mutate input indices - std::vector mut_ind; + mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, + cuda_stream, sparse_malloc, sparse_alloc, + rng_cpu_states, rng_gpu_states); + return fcomp(attrs, &inputs, &outputs, res); +} - int retval = mutate(attrs, &mut_ind); - if (!retval) - return retval; +/*! 
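\brief for illustration: a mutateInputs callback for the next entry point (sketch) */
+// Marks which inputs the operator updates in place; the index used here is
+// illustrative. The signature mirrors the mutate(attrs, &mut_ind) call below.
+//
+//   MXReturnValue myMutateInputs(const std::unordered_map<std::string,
+//                                                         std::string>& attrs,
+//                                std::vector<int>* input_indices) {
+//     input_indices->push_back(1);  // e.g. the second input is updated in place
+//     return MX_SUCCESS;
+//   }
+
+/*!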
\brief returns status of calling mutateInputs function for operator from library */ +MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, const char* const* keys, + const char* const* vals, int num, + int** mutate_indices, int* indices_size) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } - // output the input indices - *indices_size = mut_ind.size(); - *mutate_indices = static_cast(malloc (*indices_size * sizeof(int))); - for (int i = 0; i < *indices_size; i++) { - (*mutate_indices)[i] = mut_ind[i]; - } + // create a vector of mutate input indices + std::vector mut_ind; + int retval = mutate(attrs, &mut_ind); + if (!retval) return retval; + + // output the input indices + *indices_size = mut_ind.size(); + *mutate_indices = static_cast(malloc (*indices_size * sizeof(int))); + for (int i = 0; i < *indices_size; i++) { + (*mutate_indices)[i] = mut_ind[i]; } - /*! \brief returns status of calling createStatefulOp function for operator from library */ - MX_INT_RET _opCallCreateOpState(mxnet::ext::createOpState_t create_op, const char* const* keys, - const char* const* vals, int num, - void** state_op) { - // create map of attributes from list - std::unordered_map attrs; - for (int i = 0; i < num; i++) { - attrs[std::string(keys[i])] = std::string(vals[i]); - } + return retval; +} - // void pointer to hold custom state op instance created in custom library - // eventually state_op pointer is populated by instance from custom library - mxnet::ext::CustomStatefulOp** op_ptr = - reinterpret_cast(state_op); - return create_op(attrs, op_ptr); +/*! \brief returns status of calling createStatefulOp function for operator from library */ +MX_INT_RET _opCallCreateOpState(mxnet::ext::createOpState_t create_op, const char* const* keys, + const char* const* vals, int num, + void** state_op) { + // create map of attributes from list + std::unordered_map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); } - /*! \brief returns status of calling Stateful Forward/Backward for operator from library */ - MX_INT_RET _opCallFStatefulCompute(int is_forward, void* state_op, const int64_t** inshapes, - int* indims, void** indata, int* intypes, size_t* inIDs, - const char** indev_type, int* indev_id, int num_in, - const int64_t** outshapes, int* outdims, void** outdata, - int* outtypes, size_t* outIDs, const char** outdev_type, - int* outdev_id, int num_out, - mxnet::ext::xpu_malloc_t cpu_malloc, - void* cpu_alloc, mxnet::ext::xpu_malloc_t gpu_malloc, - void* gpu_alloc, - void* stream, mxnet::ext::sparse_malloc_t sparse_malloc, - void* sparse_alloc, int* instypes, int* outstypes, - void** in_indices, void** out_indices, void** in_indptr, - void** out_indptr, int64_t* in_indices_shapes, - int64_t* out_indices_shapes, int64_t* in_indptr_shapes, - int64_t* out_indptr_shapes, - void* rng_cpu_states, void* rng_gpu_states) { - // create a vector of tensors for inputs - std::vector inputs(num_in); - // create a vector for sparse inputs - std::vector in_sparse(num_in); - - for (int i = 0; i < num_in; i++) { - if (instypes[i] == 0) { - // Dense representation. 
- inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i], - inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]), - mxnet::ext::kDefaultStorage); + // void pointer to hold custom state op instance created in custom library + // eventually state_op pointer is populated by instance from custom library + mxnet::ext::CustomStatefulOp** op_ptr = + reinterpret_cast(state_op); + return create_op(attrs, op_ptr); +} + +/*! \brief returns status of calling Stateful Forward/Backward for operator from library */ +MX_INT_RET _opCallFStatefulCompute(int is_forward, void* state_op, const int64_t** inshapes, + int* indims, void** indata, int* intypes, size_t* inIDs, + const char** indev_type, int* indev_id, int num_in, + const int64_t** outshapes, int* outdims, void** outdata, + int* outtypes, size_t* outIDs, const char** outdev_type, + int* outdev_id, int num_out, + mxnet::ext::xpu_malloc_t cpu_malloc, + void* cpu_alloc, mxnet::ext::xpu_malloc_t gpu_malloc, + void* gpu_alloc, + void* stream, mxnet::ext::sparse_malloc_t sparse_malloc, + void* sparse_alloc, int* instypes, int* outstypes, + void** in_indices, void** out_indices, void** in_indptr, + void** out_indptr, int64_t* in_indices_shapes, + int64_t* out_indices_shapes, int64_t* in_indptr_shapes, + int64_t* out_indptr_shapes, + void* rng_cpu_states, void* rng_gpu_states) { + // create a vector of tensors for inputs + std::vector inputs(num_in); + // create a vector for sparse inputs + std::vector in_sparse(num_in); + + for (int i = 0; i < num_in; i++) { + if (instypes[i] == 0) { + // Dense representation. + inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i], + inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]), + mxnet::ext::kDefaultStorage); + } else { + // Sparse representation. + mxnet::ext::MXStorageType type; + if (instypes[i] == 1) { + type = mxnet::ext::kRowSparseStorage; + in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]); } else { - // Sparse representation. - mxnet::ext::MXStorageType type; - if (instypes[i] == 1) { - type = mxnet::ext::kRowSparseStorage; - in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]); - } else { - type = mxnet::ext::kCSRStorage; - in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], - in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]); - } - inputs[i].setTensor(reinterpret_cast(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i], - inshapes[i], indims[i], inIDs[i], - mxnet::ext::MXContext(indev_type[i], indev_id[i]), type); + type = mxnet::ext::kCSRStorage; + in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], + in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]); } + inputs[i].setTensor(reinterpret_cast(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i], + inshapes[i], indims[i], inIDs[i], + mxnet::ext::MXContext(indev_type[i], indev_id[i]), type); } + } - // create a vector of tensors for outputs - std::vector outputs(num_out); - // create a vector for sparse outputs - std::vector out_sparse(num_out); - - for (int i = 0; i < num_out; i++) { - if (outstypes[i] == 0) { - // Dense representation. 
- outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i], - outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), - mxnet::ext::kDefaultStorage); + // create a vector of tensors for outputs + std::vector outputs(num_out); + // create a vector for sparse outputs + std::vector out_sparse(num_out); + + for (int i = 0; i < num_out; i++) { + if (outstypes[i] == 0) { + // Dense representation. + outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i], + outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), + mxnet::ext::kDefaultStorage); + } else { + // Sparse representation. + mxnet::ext::MXStorageType type; + if (outstypes[i] == 1) { + type = mxnet::ext::kRowSparseStorage; + out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], + out_indices_shapes[i]); } else { - // Sparse representation. - mxnet::ext::MXStorageType type; - if (outstypes[i] == 1) { - type = mxnet::ext::kRowSparseStorage; - out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], - out_indices_shapes[i]); - } else { - type = mxnet::ext::kCSRStorage; - out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], - out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]); - } - outputs[i].setTensor(reinterpret_cast(&out_sparse[i]), - (mxnet::ext::MXDType)outtypes[i], - outshapes[i], outdims[i], outIDs[i], - mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type); + type = mxnet::ext::kCSRStorage; + out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i], + out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]); } + outputs[i].setTensor(reinterpret_cast(&out_sparse[i]), + (mxnet::ext::MXDType)outtypes[i], + outshapes[i], outdims[i], outIDs[i], + mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type); } + } - mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, - stream, sparse_malloc, sparse_alloc, rng_cpu_states, rng_gpu_states); + mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, + stream, sparse_malloc, sparse_alloc, rng_cpu_states, rng_gpu_states); - mxnet::ext::CustomStatefulOp* op_ptr = - reinterpret_cast(state_op); - if (is_forward) { - return op_ptr->Forward(&inputs, &outputs, res); - } - return op_ptr->Backward(&inputs, &outputs, res); + mxnet::ext::CustomStatefulOp* op_ptr = + reinterpret_cast(state_op); + if (is_forward) { + return op_ptr->Forward(&inputs, &outputs, res); } + return op_ptr->Backward(&inputs, &outputs, res); +} - /*! \brief returns number of partitioners registered in this library */ - MX_INT_RET _partRegSize() { - return mxnet::ext::Registry::get()->size(); - } +/*! \brief returns number of partitioners registered in this library */ +MX_INT_RET _partRegSize() { + return mxnet::ext::Registry::get()->size(); +} - /* returns number of strategies registered for partitioner - * at specified index */ - MX_INT_RET _partRegGetCount(int idx, const char** name) { - mxnet::ext::CustomPartitioner part = - mxnet::ext::Registry::get()->get(idx); - *name = part.name; - return part.strategies.size(); - } +/* returns number of strategies registered for partitioner + * at specified index */ +MX_INT_RET _partRegGetCount(int idx, const char** name) { + mxnet::ext::CustomPartitioner part = + mxnet::ext::Registry::get()->get(idx); + *name = part.name; + return part.strategies.size(); +} - /*! 
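\brief for illustration: a partitioner registration that feeds the lookups below (sketch) */
+// The strategy and operator names are hypothetical; the macro and setter chain
+// follow the CustomPartitioner API in lib_api.h. Each strategy added this way
+// contributes one entry to the arrays that _partRegGetCount sizes and
+// _partRegGet returns.
+//
+//   REGISTER_PARTITIONER(myPart)
+//   .addStrategy("strategy1", "_custom_subgraph_op")
+//   .setSupportedOps("strategy1", mySupportedOps)
+//   .setReviewSubgraph("strategy1", myReviewSubgraph);
+
+/*!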
\brief returns partitioner registration at specified index */ - MX_VOID_RET _partRegGet(int part_idx, int stg_idx, const char** strategy, - mxnet::ext::supportedOps_t* supportedOps, - mxnet::ext::createSelector_t* createSelector, - mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name) { - mxnet::ext::CustomPartitioner part = - mxnet::ext::Registry::get()->get(part_idx); - *strategy = part.strategies[stg_idx]; - *op_name = part.op_names[stg_idx]; - *supportedOps = part.getSupportedOps(stg_idx); - *createSelector = part.getCreateSelector(stg_idx); - *reviewSubgraph = part.getReviewSubgraph(stg_idx); - } +/*! \brief returns partitioner registration at specified index */ +MX_VOID_RET _partRegGet(int part_idx, int stg_idx, const char** strategy, + mxnet::ext::supportedOps_t* supportedOps, + mxnet::ext::createSelector_t* createSelector, + mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name) { + mxnet::ext::CustomPartitioner part = + mxnet::ext::Registry::get()->get(part_idx); + *strategy = part.strategies[stg_idx]; + *op_name = part.op_names[stg_idx]; + *supportedOps = part.getSupportedOps(stg_idx); + *createSelector = part.getCreateSelector(stg_idx); + *reviewSubgraph = part.getReviewSubgraph(stg_idx); +} + +/*! \brief returns status of calling supported ops function from library */ +MX_INT_RET _partCallSupportedOps(mxnet::ext::supportedOps_t supportedOps, const char *json, + int num_ids, int *ids, const char* const* opt_keys, + const char* const* opt_vals, int num_opts) { + mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); + // create map of options from list + std::unordered_map opts; + for (int i = 0; i < num_opts; i++) + opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); + + // create array of subgraph IDs for operator support + std::vector _ids(num_ids, -2); + // call user's supportedOps function + mxnet::ext::MXReturnValue retval = supportedOps(graph, &_ids, opts); + if (!retval) return retval; + + // copy bools in ids to ints + for (int i = 0; i < num_ids; i++) + ids[i] = _ids[i]; + + return retval; +} - /*! \brief returns status of calling supported ops function from library */ - MX_INT_RET _partCallSupportedOps(mxnet::ext::supportedOps_t supportedOps, const char *json, - int num_ids, int *ids, const char* const* opt_keys, +/*! 
\brief returns status of calling create selector function from library */ +MX_INT_RET _partCallCreateSelector(mxnet::ext::createSelector_t createSelector, const char *json, + void** selector, const char* const* opt_keys, const char* const* opt_vals, int num_opts) { - mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); - // create map of options from list - std::unordered_map opts; - for (int i = 0; i < num_opts; i++) - opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - - // create array of subgraph IDs for operator support - std::vector _ids(num_ids, -2); - // call user's supportedOps function - mxnet::ext::MXReturnValue retval = supportedOps(graph, &_ids, opts); - if (!retval) return retval; - - // copy bools in ids to ints - for (int i = 0; i < num_ids; i++) - ids[i] = _ids[i]; + mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); + // create map of options from list + std::unordered_map opts; + for (int i = 0; i < num_opts; i++) + opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); + + // void pointer to hold selector instance created in custom library + // eventually pointer is populated by instance from custom library + mxnet::ext::CustomOpSelector** sel_ptr = + reinterpret_cast(selector); + + // call user's createSelector function + return createSelector(graph, sel_ptr, opts); +} - return retval; - } +/*! \brief returns status of calling select function from library */ +MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected) { + mxnet::ext::CustomOpSelector* sel_ptr = + reinterpret_cast(sel_inst); + *selected = sel_ptr->Select(nodeID); +} - /*! \brief returns status of calling create selector function from library */ - MX_INT_RET _partCallCreateSelector(mxnet::ext::createSelector_t createSelector, const char *json, - void** selector, const char* const* opt_keys, - const char* const* opt_vals, int num_opts) { - mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); - // create map of options from list - std::unordered_map opts; - for (int i = 0; i < num_opts; i++) - opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - - // void pointer to hold selector instance created in custom library - // eventually pointer is populated by instance from custom library - mxnet::ext::CustomOpSelector** sel_ptr = - reinterpret_cast(selector); - - // call user's createSelector function - return createSelector(graph, sel_ptr, opts); - } +/*! \brief returns status of calling select input function from library */ +MX_VOID_RET _partCallSelectInput(void* sel_inst, int nodeID, + int input_nodeID, int* selected) { + mxnet::ext::CustomOpSelector* sel_ptr = + reinterpret_cast(sel_inst); + *selected = sel_ptr->SelectInput(nodeID, input_nodeID); +} - /*! \brief returns status of calling select function from library */ - MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - *selected = sel_ptr->Select(nodeID); - } +/*! \brief returns status of calling select output function from library */ +MX_VOID_RET _partCallSelectOutput(void* sel_inst, int nodeID, + int output_nodeID, int* selected) { + mxnet::ext::CustomOpSelector* sel_ptr = + reinterpret_cast(sel_inst); + *selected = sel_ptr->SelectOutput(nodeID, output_nodeID); +} - /*! 
\brief returns status of calling select input function from library */ - MX_VOID_RET _partCallSelectInput(void* sel_inst, int nodeID, - int input_nodeID, int* selected) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - *selected = sel_ptr->SelectInput(nodeID, input_nodeID); +/*! \brief returns status of calling filter function from library */ +MX_VOID_RET _partCallFilter(void* sel_inst, int* candidates, int num_candidates, + int** keep, int* num_keep) { + mxnet::ext::CustomOpSelector* sel_ptr = + reinterpret_cast(sel_inst); + std::vector candidates_(num_candidates); + for (int i=0; i < num_candidates; i++) { + candidates_[i] = candidates[i]; } + std::vector keep_; - /*! \brief returns status of calling select output function from library */ - MX_VOID_RET _partCallSelectOutput(void* sel_inst, int nodeID, - int output_nodeID, int* selected) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - *selected = sel_ptr->SelectOutput(nodeID, output_nodeID); - } + sel_ptr->Filter(candidates_, &keep_); - /*! \brief returns status of calling filter function from library */ - MX_VOID_RET _partCallFilter(void* sel_inst, int* candidates, int num_candidates, - int** keep, int* num_keep) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - std::vector candidates_(num_candidates); - for (int i=0; i < num_candidates; i++) { - candidates_[i] = candidates[i]; - } - std::vector keep_; + *num_keep = keep_.size(); + *keep = static_cast(malloc(keep_.size() * sizeof(int))); + for (unsigned i=0; i < keep_.size(); i++) + (*keep)[i] = keep_[i]; +} - sel_ptr->Filter(candidates_, &keep_); +/*! \brief returns status of calling reset selector function from library */ +MX_VOID_RET _partCallReset(void* sel_inst) { + mxnet::ext::CustomOpSelector* sel_ptr = + reinterpret_cast(sel_inst); + sel_ptr->Reset(); +} - *num_keep = keep_.size(); - *keep = static_cast(malloc(keep_.size() * sizeof(int))); - for (unsigned i=0; i < keep_.size(); i++) - (*keep)[i] = keep_[i]; +/*! \brief returns status of calling review subgraph function from library */ +MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph, const char *json, + int subgraph_id, int *accept, const char* const* opt_keys, + const char* const* opt_vals, int num_opts, + char*** attr_keys, char*** attr_vals, int *num_attrs, + const char* const* arg_names, int num_args, + void* const* arg_data, const int64_t* const* arg_shapes, + const int* arg_dims, const int* arg_types, + const size_t* arg_IDs, const char* const* arg_dev_type, + const int* arg_dev_id, + const char* const* aux_names, int num_aux, + void* const* aux_data, const int64_t* const* aux_shapes, + const int* aux_dims, const int* aux_types, + const size_t* aux_IDs, const char* const* aux_dev_type, + const int* aux_dev_id) { + mxnet::ext::Graph *subgraph = mxnet::ext::Graph::fromString(json); + bool accept_bool = false; + // create map of attributes from list + std::unordered_map opts; + for (int i = 0; i < num_opts; i++) + opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); + + // create a map of named tensors for args + std::unordered_map args; + for (int i = 0; i < num_args; i++) { + std::vector shapes; + for (int j = 0; j < arg_dims[i]; j++) + shapes.push_back(arg_shapes[i][j]); + + mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i], + arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], arg_dev_id[i])); + args[arg_names[i]] = tensor; } - - /*! 
\brief returns status of calling reset selector function from library */ - MX_VOID_RET _partCallReset(void* sel_inst) { - mxnet::ext::CustomOpSelector* sel_ptr = - reinterpret_cast(sel_inst); - sel_ptr->Reset(); + // create a map of named tensors for aux + std::unordered_map aux; + for (int i = 0; i < num_aux; i++) { + std::vector shapes; + for (int j = 0; j < aux_dims[i]; j++) + shapes.push_back(aux_shapes[i][j]); + + mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i], + aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i], + aux_dev_id[i])); + aux[aux_names[i]] = tensor; } - /*! \brief returns status of calling review subgraph function from library */ - MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph, const char *json, - int subgraph_id, int *accept, const char* const* opt_keys, - const char* const* opt_vals, int num_opts, - char*** attr_keys, char*** attr_vals, int *num_attrs, - const char* const* arg_names, int num_args, - void* const* arg_data, const int64_t* const* arg_shapes, - const int* arg_dims, const int* arg_types, - const size_t* arg_IDs, const char* const* arg_dev_type, - const int* arg_dev_id, - const char* const* aux_names, int num_aux, - void* const* aux_data, const int64_t* const* aux_shapes, - const int* aux_dims, const int* aux_types, - const size_t* aux_IDs, const char* const* aux_dev_type, - const int* aux_dev_id) { - mxnet::ext::Graph *subgraph = mxnet::ext::Graph::fromString(json); - bool accept_bool = false; - // create map of attributes from list - std::unordered_map opts; - for (int i = 0; i < num_opts; i++) - opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - - // create a map of named tensors for args - std::unordered_map args; - for (int i = 0; i < num_args; i++) { - std::vector shapes; - for (int j = 0; j < arg_dims[i]; j++) - shapes.push_back(arg_shapes[i][j]); - - mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i], - arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], arg_dev_id[i])); - args[arg_names[i]] = tensor; - } - // create a map of named tensors for aux - std::unordered_map aux; - for (int i = 0; i < num_aux; i++) { - std::vector shapes; - for (int j = 0; j < aux_dims[i]; j++) - shapes.push_back(aux_shapes[i][j]); - - mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i], - aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i], - aux_dev_id[i])); - aux[aux_names[i]] = tensor; + subgraph->_setParams(&args, &aux); + mxnet::ext::MXReturnValue retval = reviewSubgraph(subgraph, subgraph_id, &accept_bool, + opts); + if (!retval) return retval; + + *accept = accept_bool; + + if (subgraph->attrs.size() > 0) { + *num_attrs = subgraph->attrs.size(); + // allocate space for attributes + *attr_keys = static_cast(malloc (*num_attrs * sizeof(char*))); + *attr_vals = static_cast(malloc (*num_attrs * sizeof(char*))); + + // copy attributes + int i = 0; + for (auto kv : subgraph->attrs) { + (*attr_keys)[i] = static_cast(malloc ((kv.first.size()+1) * sizeof(char))); + std::string val = kv.second.dump(); // convert JsonVal back to string + (*attr_vals)[i] = static_cast(malloc ((val.size()+1) * sizeof(char))); + snprintf((*attr_keys)[i], kv.first.size()+1, "%s", kv.first.c_str()); + snprintf((*attr_vals)[i], val.size()+1, "%s", val.c_str()); + i++; } + } - subgraph->_setParams(&args, &aux); - mxnet::ext::MXReturnValue retval = reviewSubgraph(subgraph, subgraph_id, &accept_bool, - opts); - if (!retval) return retval; - - *accept = accept_bool; - 
- if (subgraph->attrs.size() > 0) { - *num_attrs = subgraph->attrs.size(); - // allocate space for attributes - *attr_keys = static_cast(malloc (*num_attrs * sizeof(char*))); - *attr_vals = static_cast(malloc (*num_attrs * sizeof(char*))); - - // copy attributes - int i = 0; - for (auto kv : subgraph->attrs) { - (*attr_keys)[i] = static_cast(malloc ((kv.first.size()+1) * sizeof(char))); - std::string val = kv.second.dump(); // convert JsonVal back to string - (*attr_vals)[i] = static_cast(malloc ((val.size()+1) * sizeof(char))); - snprintf((*attr_keys)[i], kv.first.size()+1, "%s", kv.first.c_str()); - snprintf((*attr_vals)[i], val.size()+1, "%s", val.c_str()); - i++; - } - } + return retval; +} - return retval; - } +/*! \brief returns number of graph passes registered in this library */ +MX_INT_RET _passRegSize() { + return mxnet::ext::Registry::get()->size(); +} - /*! \brief returns number of graph passes registered in this library */ - MX_INT_RET _passRegSize() { - return mxnet::ext::Registry::get()->size(); - } +/*! \brief returns pass registration at specified index */ +MX_VOID_RET _passRegGet(int pass_idx, mxnet::ext::graphPass_t* graphPass, + const char** pass_name) { + mxnet::ext::CustomPass pass = + mxnet::ext::Registry::get()->get(pass_idx); + *graphPass = pass.pass; + *pass_name = pass.name; +} - /*! \brief returns pass registration at specified index */ - MX_VOID_RET _passRegGet(int pass_idx, mxnet::ext::graphPass_t* graphPass, - const char** pass_name) { - mxnet::ext::CustomPass pass = - mxnet::ext::Registry::get()->get(pass_idx); - *graphPass = pass.pass; - *pass_name = pass.name; +/*! \brief returns status of calling graph pass function from library */ +MX_INT_RET _passCallGraphPass(mxnet::ext::graphPass_t graphPass, const char *json, + char** out_graph, const char* const* opt_keys, + const char* const* opt_vals, int num_opts, + const char* pass_name, const char* const* arg_names, int num_args, + void* const* arg_data, const int64_t* const* arg_shapes, + const int* arg_dims, const int* arg_types, + const size_t* arg_IDs, const char* const* arg_dev_type, + const int* arg_dev_id, const char* const* aux_names, int num_aux, + void* const* aux_data, const int64_t* const* aux_shapes, + const int* aux_dims, const int* aux_types, + const size_t* aux_IDs, const char* const* aux_dev_type, + const int* aux_dev_id, mxnet::ext::nd_malloc_t nd_malloc, + const void* nd_alloc) { + mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); + // create map of attributes from list + std::unordered_map opts; + for (int i = 0; i < num_opts; i++) + opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); + + // create a map of named tensors for args + std::unordered_map args; + for (int i = 0; i < num_args; i++) { + std::vector shapes; + for (int j = 0; j < arg_dims[i]; j++) + shapes.push_back(arg_shapes[i][j]); + + mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i], + arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], + arg_dev_id[i])); + args[arg_names[i]] = tensor; + } + // create a map of named tensors for aux + std::unordered_map aux; + for (int i = 0; i < num_aux; i++) { + std::vector shapes; + for (int j = 0; j < aux_dims[i]; j++) + shapes.push_back(aux_shapes[i][j]); + + mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i], + aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i], + aux_dev_id[i])); + aux[aux_names[i]] = tensor; } - /*! 
\brief returns status of calling graph pass function from library */ - MX_INT_RET _passCallGraphPass(mxnet::ext::graphPass_t graphPass, const char *json, - char** out_graph, const char* const* opt_keys, - const char* const* opt_vals, int num_opts, - const char* pass_name, const char* const* arg_names, int num_args, - void* const* arg_data, const int64_t* const* arg_shapes, - const int* arg_dims, const int* arg_types, - const size_t* arg_IDs, const char* const* arg_dev_type, - const int* arg_dev_id, const char* const* aux_names, int num_aux, - void* const* aux_data, const int64_t* const* aux_shapes, - const int* aux_dims, const int* aux_types, - const size_t* aux_IDs, const char* const* aux_dev_type, - const int* aux_dev_id, mxnet::ext::nd_malloc_t nd_malloc, - const void* nd_alloc) { - mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json); - // create map of attributes from list - std::unordered_map opts; - for (int i = 0; i < num_opts; i++) - opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); - - // create a map of named tensors for args - std::unordered_map args; - for (int i = 0; i < num_args; i++) { - std::vector shapes; - for (int j = 0; j < arg_dims[i]; j++) - shapes.push_back(arg_shapes[i][j]); - - mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i], - arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], - arg_dev_id[i])); - args[arg_names[i]] = tensor; - } - // create a map of named tensors for aux - std::unordered_map aux; - for (int i = 0; i < num_aux; i++) { - std::vector shapes; - for (int j = 0; j < aux_dims[i]; j++) - shapes.push_back(aux_shapes[i][j]); - - mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i], - aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i], - aux_dev_id[i])); - aux[aux_names[i]] = tensor; - } - - std::unordered_map new_args, new_aux; - mxnet::ext::PassResource res(&new_args, &new_aux, nd_malloc, nd_alloc); - graph->_setParams(&args, &aux); - graph->_setPassResource(&res); - mxnet::ext::MXReturnValue retval = graphPass(graph, opts); - if (!retval) return retval; + std::unordered_map new_args, new_aux; + mxnet::ext::PassResource res(&new_args, &new_aux, nd_malloc, nd_alloc); + graph->_setParams(&args, &aux); + graph->_setPassResource(&res); + mxnet::ext::MXReturnValue retval = graphPass(graph, opts); + if (!retval) return retval; - std::string *tmp = new std::string(graph->toString()); - *out_graph = const_cast(tmp->c_str()); - return retval; - } + std::string *tmp = new std::string(graph->toString()); + *out_graph = const_cast(tmp->c_str()); + return retval; +} - /*! - * \brief Checks if the MXNet version is supported by the library. - * If supported, initializes the library. - * \param version MXNet version number passed to library and defined as: - * MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) - * \return Non-zero value on error i.e. library incompatible with passed MXNet version - */ +/*! + * \brief Checks if the MXNet version is supported by the library. + * If supported, initializes the library. + * \param version MXNet version number passed to library and defined as: + * MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) + * \return Non-zero value on error i.e. 
library incompatible with passed MXNet version
+ */
 #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
-  __declspec(dllexport) mxnet::ext::MXReturnValue __cdecl
+__declspec(dllexport) mxnet::ext::MXReturnValue __cdecl
 #else
-  mxnet::ext::MXReturnValue
+mxnet::ext::MXReturnValue
 #endif
-  initialize(int version);
+initialize(int version);

-  MX_INT_RET _msgSize() {
-    return mxnet::ext::MXerrorMsgs::get()->size();
-  }
+MX_INT_RET _msgSize() {
+  return mxnet::ext::MXerrorMsgs::get()->size();
+}

-  /*! \brief returns operator registration at specified index */
-  MX_VOID_RET _msgGet(int idx, const char** msg) {
-    *msg = mxnet::ext::MXerrorMsgs::get()->get(idx)->c_str();
-  }
+/*! \brief returns error message at specified index */
+MX_VOID_RET _msgGet(int idx, const char** msg) {
+  *msg = mxnet::ext::MXerrorMsgs::get()->get(idx)->c_str();
+}

From 1ec330b9b9e8cfcd537772786cbc42455b915dc9 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 07:23:43 +0000
Subject: [PATCH 05/43] whitespace

---
 include/mxnet/lib_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 21004c6f1603..498a21eb9881 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -764,7 +764,7 @@ class CustomOp {
   CustomOp& setCreateOpState(createOpState_t func, const char* ctx);
   CustomOp& setIsSubgraphOp();
-  
+
   void mapToVector();

   /*! \brief operator name */

From bee854bfcee8b132e44982d5a7eed60e01696c2b Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 08:19:02 +0000
Subject: [PATCH 06/43] modernize

---
 include/mxnet/lib_api.h |  2 +-
 src/lib_api.cc          | 59 +++++++++++++++++++++--------------------
 2 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 498a21eb9881..7c09d2bc28cf 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -331,7 +331,7 @@ struct MXSparse {
 struct MXTensor {
   MXTensor();
   MXTensor(const MXTensor& oth);
-  MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype,
+  MXTensor(void *data_ptr, const std::vector<int64_t> shape, MXDType dtype,
            size_t vID, MXContext mx_ctx, MXStorageType stype = kDefaultStorage);

   /*!
\brief populate internal tensor fields */ diff --git a/src/lib_api.cc b/src/lib_api.cc index 51fede35d989..f560ed41a291 100644 --- a/src/lib_api.cc +++ b/src/lib_api.cc @@ -35,7 +35,7 @@ mxnet::ext::MXContext::MXContext() : dev_type("error"), dev_id(-1) {} mxnet::ext::MXContext::MXContext(std::string dev_type_, int dev_id_) : dev_type(dev_type_), dev_id(dev_id_) {} mxnet::ext::MXContext::MXContext(const char* dev_type_, int dev_id_) - : dev_type(dev_type_), dev_id(dev_id_) {} + : dev_type(std::move(dev_type_)), dev_id(dev_id_) {} mxnet::ext::MXContext mxnet::ext::MXContext::CPU() { return MXContext("cpu", 0); } mxnet::ext::MXContext mxnet::ext::MXContext::GPU() { return MXContext("gpu", 0); } mxnet::ext::MXContext mxnet::ext::MXContext::CPU(int dev_id) { return MXContext("cpu", dev_id); } @@ -69,9 +69,10 @@ mxnet::ext::MXTensor::MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), sh setDLTensor(); } -mxnet::ext::MXTensor::MXTensor(void *data_ptr, const std::vector &shape, MXDType dtype, +mxnet::ext::MXTensor::MXTensor(void *data_ptr, const std::vector shape, MXDType dtype, size_t vID, MXContext mx_ctx, MXStorageType stype) - : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(mx_ctx), stype(stype) { + : data_ptr(data_ptr), shape(std::move(shape)), dtype(dtype), verID(vID), ctx(std::move(mx_ctx)), + stype(stype) { setDLTensor(); } @@ -149,9 +150,8 @@ void mxnet::ext::MXTensor::setDLTensor() { int64_t mxnet::ext::MXTensor::size() const { int64_t size = 1; - for (unsigned int i = 0; i < shape.size(); i++) { - size *= shape[i]; - } + for (auto &s : shape) + size *= s; return size; } @@ -251,9 +251,10 @@ std::string mxnet::ext::getDtypeAt(const std::string& dtype, unsigned index) { mxnet::ext::JsonVal::JsonVal() : type(ERR), num(-1), str("") {} mxnet::ext::JsonVal::JsonVal(mxnet::ext::JsonType t) : type(t), num(-1), str("") {} -mxnet::ext::JsonVal::JsonVal(std::string s) : type(STR), num(-1), str(s) {} +mxnet::ext::JsonVal::JsonVal(std::string s) : type(STR), num(-1), str(std::move(s)) {} mxnet::ext::JsonVal::JsonVal(int n) : type(NUM), num(n), str(std::to_string(n)) {} -mxnet::ext::JsonVal::JsonVal(JsonType t, int n, std::string s) : type(t), num(n), str(s) {} +mxnet::ext::JsonVal::JsonVal(JsonType t, int n, std::string s) : type(t), num(n), + str(std::move(s)) {} bool mxnet::ext::JsonVal::operator<(const mxnet::ext::JsonVal &o) const { // for string JSON objects compare the string @@ -457,8 +458,8 @@ void mxnet::ext::Node::alloc_aux(const std::vector& shapes, mxnet::ext::Graph::Graph() : res(nullptr) {} mxnet::ext::Graph::~Graph() { - for (int i = 0; i < nodes.size(); i++) - delete nodes[i]; + for (auto &node : nodes) + delete node; } mxnet::ext::Graph* mxnet::ext::Graph::fromString(const std::string& json) { @@ -560,23 +561,23 @@ mxnet::ext::JsonVal mxnet::ext::Graph::toJson() { val.map[JsonVal("node_row_ptr")] = JsonVal(LIST); JsonVal& node_row_ptr = val.map[JsonVal("node_row_ptr")]; for (int i = 0; i < nodes.size(); i++) - node_row_ptr.list.push_back(JsonVal(i)); + node_row_ptr.list.emplace_back(i); // add all input nodes val.map[JsonVal("arg_nodes")] = JsonVal(LIST); JsonVal& arg_nodes = val.map[JsonVal("arg_nodes")]; - for (int i = 0; i < inputs.size(); i++) - arg_nodes.list.push_back(JsonVal(nodeMap[inputs[i]])); + for (auto &input : inputs) + arg_nodes.list.emplace_back(nodeMap[input]); // add all output nodes val.map[JsonVal("heads")] = JsonVal(LIST); JsonVal& heads = val.map[JsonVal("heads")]; for (int i = 0; i < outputs.size(); i++) { - 
heads.list.push_back(JsonVal(LIST)); + heads.list.emplace_back(LIST); JsonVal& out = heads.list[i]; - out.list.push_back(JsonVal(nodeMap[outputs[i].node])); - out.list.push_back(JsonVal(outputs[i].entry)); - out.list.push_back(JsonVal(0)); + out.list.emplace_back(nodeMap[outputs[i].node]); + out.list.emplace_back(outputs[i].entry); + out.list.emplace_back(0); } // add all graph nodes @@ -584,7 +585,7 @@ mxnet::ext::JsonVal mxnet::ext::Graph::toJson() { JsonVal& nodes_ = val.map[JsonVal("nodes")]; for (int i = sorted.size()-1; i >= 0; i--) { // each node is a map - nodes_.list.push_back(JsonVal(MAP)); + nodes_.list.emplace_back(MAP); Node* n = sorted[i]; JsonVal& n_ = nodes_.list[nodes_.list.size()-1]; @@ -598,9 +599,9 @@ mxnet::ext::JsonVal mxnet::ext::Graph::toJson() { inputs_.list.push_back(JsonVal(LIST)); NodeEntry& entry = n->inputs[j]; JsonVal& in = inputs_.list[j]; - in.list.push_back(JsonVal(nodeMap[entry.node])); - in.list.push_back(JsonVal(entry.entry)); - in.list.push_back(JsonVal(0)); + in.list.emplace_back(nodeMap[entry.node]); + in.list.emplace_back(entry.entry); + in.list.emplace_back(0); } // add subgraphs for this node, convert each back to JSON @@ -681,13 +682,13 @@ void mxnet::ext::Graph::print(int indent) const { // loop over each node and print out its inputs/outputs for (int i = sorted.size()-1; i >= 0; i--) { std::cout << space << "Node: " << sorted[i]->name << std::endl; - for (int j = 0; j < sorted[i]->inputs.size(); j++) { - std::cout << space << "\tInput: " << sorted[i]->inputs[j].node->name << " " - << sorted[i]->inputs[j].entry << std::endl; + for (auto &input : sorted[i]->inputs) { + std::cout << space << "\tInput: " << input.node->name << " " + << input.entry << std::endl; } - for (int j = 0; j < sorted[i]->outputs.size(); j++) { - std::cout << space << "\tOutput: " << sorted[i]->outputs[j].node->name << " " - << sorted[i]->outputs[j].entry << std::endl; + for (auto &output : sorted[i]->outputs) { + std::cout << space << "\tOutput: " << output.node->name << " " + << output.entry << std::endl; } if (sorted[i]->subgraphs.size() > 0) { for (auto &subgraph : sorted[i]->subgraphs) { @@ -751,8 +752,8 @@ void mxnet::ext::Graph::_setParams(std::unordered_map 0) From c36363e9a04eb59fc3947ad6ee6e2b73f93df9ae Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 16 Aug 2020 16:26:58 +0000 Subject: [PATCH 07/43] fix modernize --- src/lib_api.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib_api.cc b/src/lib_api.cc index f560ed41a291..70a9cd5040e9 100644 --- a/src/lib_api.cc +++ b/src/lib_api.cc @@ -33,9 +33,9 @@ mxnet::ext::MXContext::MXContext() : dev_type("error"), dev_id(-1) {} mxnet::ext::MXContext::MXContext(std::string dev_type_, int dev_id_) - : dev_type(dev_type_), dev_id(dev_id_) {} -mxnet::ext::MXContext::MXContext(const char* dev_type_, int dev_id_) : dev_type(std::move(dev_type_)), dev_id(dev_id_) {} +mxnet::ext::MXContext::MXContext(const char* dev_type_, int dev_id_) + : dev_type(dev_type_), dev_id(dev_id_) {} mxnet::ext::MXContext mxnet::ext::MXContext::CPU() { return MXContext("cpu", 0); } mxnet::ext::MXContext mxnet::ext::MXContext::GPU() { return MXContext("gpu", 0); } mxnet::ext::MXContext mxnet::ext::MXContext::CPU(int dev_id) { return MXContext("cpu", dev_id); } @@ -69,7 +69,7 @@ mxnet::ext::MXTensor::MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), sh setDLTensor(); } -mxnet::ext::MXTensor::MXTensor(void *data_ptr, const std::vector shape, MXDType dtype, +mxnet::ext::MXTensor::MXTensor(void *data_ptr, const 
std::vector<int64_t> &shape, MXDType dtype,
                               size_t vID, MXContext mx_ctx, MXStorageType stype)
-  : data_ptr(data_ptr), shape(std::move(shape)), dtype(dtype), verID(vID), ctx(std::move(mx_ctx)),
+  : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(std::move(mx_ctx)),
     stype(stype) {
   setDLTensor();
 }
@@ -596,7 +596,7 @@ mxnet::ext::JsonVal mxnet::ext::Graph::toJson() {
   // add inputs for this node
   JsonVal& inputs_ = n_.map[JsonVal("inputs")];
   for (int j = 0; j < n->inputs.size(); j++) {
-    inputs_.list.push_back(JsonVal(LIST));
+    inputs_.list.emplace_back(LIST);
     NodeEntry& entry = n->inputs[j];
     JsonVal& in = inputs_.list[j];
     in.list.emplace_back(nodeMap[entry.node]);

From 53b41368e4e5ea94ea4359fda13ea5c91986c5ca Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 16:46:03 +0000
Subject: [PATCH 08/43] fix modernize

---
 include/mxnet/lib_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 7c09d2bc28cf..498a21eb9881 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -331,7 +331,7 @@ struct MXSparse {
 struct MXTensor {
   MXTensor();
   MXTensor(const MXTensor& oth);
-  MXTensor(void *data_ptr, const std::vector<int64_t> shape, MXDType dtype,
+  MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype,
            size_t vID, MXContext mx_ctx, MXStorageType stype = kDefaultStorage);

   /*! \brief populate internal tensor fields */

From 295389104d90d0a6147a0f970dd0793dee5bb4e7 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 17:17:09 +0000
Subject: [PATCH 09/43] fix modernize

---
 src/lib_api.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib_api.cc b/src/lib_api.cc
index 70a9cd5040e9..48fc86d4c2e8 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -71,7 +71,7 @@ mxnet::ext::MXTensor::MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), sh
 mxnet::ext::MXTensor::MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype,
                                size_t vID, MXContext mx_ctx, MXStorageType stype)
-  : data_ptr(data_ptr), shape(std::move(shape)), dtype(dtype), verID(vID), ctx(std::move(mx_ctx)),
+  : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(std::move(mx_ctx)),
     stype(stype) {
   setDLTensor();
 }

From 0c0cceb1ad7f9e7d4efaf5839a1a8f47996e5ccf Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 17:50:05 +0000
Subject: [PATCH 10/43] fixed move

---
 include/mxnet/lib_api.h | 2 +-
 src/lib_api.cc          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 498a21eb9881..a219d658bb31 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -331,7 +331,7 @@ struct MXSparse {
 struct MXTensor {
   MXTensor();
   MXTensor(const MXTensor& oth);
-  MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype,
+  MXTensor(void *data_ptr, std::vector<int64_t> shape, MXDType dtype,
           size_t vID, MXContext mx_ctx, MXStorageType stype = kDefaultStorage);

   /*! \brief populate internal tensor fields */
diff --git a/src/lib_api.cc b/src/lib_api.cc
index 48fc86d4c2e8..59957352b267 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -69,9 +69,9 @@ mxnet::ext::MXTensor::MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), sh
   setDLTensor();
 }

-mxnet::ext::MXTensor::MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype,
+mxnet::ext::MXTensor::MXTensor(void *data_ptr, std::vector<int64_t> shape, MXDType dtype,
                                size_t vID, MXContext mx_ctx, MXStorageType stype)
-  : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(std::move(mx_ctx)),
+  : data_ptr(data_ptr), shape(std::move(shape)), dtype(dtype), verID(vID), ctx(std::move(mx_ctx)),
     stype(stype) {
   setDLTensor();
 }

From 7cbc99b132949594c0f2cc1bef0801ab9b46658c Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 18:21:38 +0000
Subject: [PATCH 11/43] added lib_api.cc to CMakeLists.txt for example libs

---
 CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b494fdd75ae7..dd79db133433 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -708,18 +708,18 @@ endif()
 target_compile_definitions(mxnet PUBLIC DMLC_LOG_FATAL_THROW=$<BOOL:${LOG_FATAL_THROW}>)

 # extension libraries (custom operators, custom subgraphs) are built by default
-add_library(customop_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc)
-add_library(transposecsr_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposecsr_lib.cc)
-add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposerowsp_lib.cc)
-add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc)
-add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc)
+add_library(customop_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(transposecsr_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposecsr_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposerowsp_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
 target_include_directories(customop_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 target_include_directories(transposecsr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 target_include_directories(subgraph_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 target_include_directories(pass_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 if(USE_CUDA)
-  add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu)
+  add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
   target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 endif()
 if(UNIX)

From 6965cd7d68a939996503f6da7dcc3ad0ad08b137 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sun, 16 Aug 2020 22:44:10 +0000
Subject: [PATCH 12/43] working example

---
 example/extensions/lib_external_ops/Makefile  |  2 +-
 example/extensions/lib_external_ops/README.md | 26 ++++++-
 .../extensions/lib_external_ops/init_lib.cc   | 39 +++++++++++
 .../extensions/lib_external_ops/libtest.cc    | 69 -------------------
 .../lib_external_ops/test_loading.py          | 41 +++++++++++
 python/mxnet/base.py                          |  2 +-
 src/initialize.cc                             |  8 ++-
 7 files changed, 114 insertions(+), 73 deletions(-)
 create mode 100644 example/extensions/lib_external_ops/init_lib.cc
 delete mode 100644 example/extensions/lib_external_ops/libtest.cc
 create mode 100644 example/extensions/lib_external_ops/test_loading.py

diff --git a/example/extensions/lib_external_ops/Makefile b/example/extensions/lib_external_ops/Makefile
index c3e82635ec87..70e6addd3336 100644
--- a/example/extensions/lib_external_ops/Makefile
+++ b/example/extensions/lib_external_ops/Makefile
@@ -16,7 +16,7 @@
 # under the License.

 all:
-	g++ -shared -fPIC -std=c++11 min_ex.cc.o -o libmin_ex.so
+	g++ -shared -fPIC -std=c++11 init_lib.cc min_ex.cc.o ../../../src/lib_api.cc -o libmin_ex.so -I../../../include

 test:
 	g++ -std=c++11 -O3 -o libtest libtest.cc -ldl

diff --git a/example/extensions/lib_external_ops/README.md b/example/extensions/lib_external_ops/README.md
index 8239989b84e4..db655535c94f 100644
--- a/example/extensions/lib_external_ops/README.md
+++ b/example/extensions/lib_external_ops/README.md
@@ -15,4 +15,28 @@
 
-TBD
\ No newline at end of file
+External Operators Example and Tutorial
+=======================================
+
+## Introduction
+
+TBD
+
+## Getting Started
+
+### Have MXNet Ready
+
+For this tutorial, clone MXNet from source but don't build it yet.
+
+### Run An Example
+
+This example shows compiling a custom backend operator and then dynamically loading it into MXNet at runtime. Go to the **lib_external_ops** directory and follow these steps:
+
+1. Copy **min_ex.cc** and **min_ex-inl.h** into the src/operator directory.
+2. Build MXNet.
+3. Find the **min_ex.cc.o** file and copy it back to the **lib_external_ops** directory.
+4. Delete **min_ex.cc** and **min_ex-inl.h** from the src/operator directory.
+5. Clean the build folder.
+6. Rebuild MXNet.
+7. Run `make` in the **lib_external_ops** directory to produce libmin_ex.so with your custom operator inside.
+8. Run `python test_loading.py`.
\ No newline at end of file
diff --git a/example/extensions/lib_external_ops/init_lib.cc b/example/extensions/lib_external_ops/init_lib.cc
new file mode 100644
index 000000000000..a21c481bee2f
--- /dev/null
+++ b/example/extensions/lib_external_ops/init_lib.cc
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file init_lib.cc
+ * \brief Sample library file
+ */
+
+#include <iostream>
+#include "mxnet/lib_api.h"
+
+using namespace mxnet::ext;
+
+MXReturnValue initialize(int version) {
+  if (version >= 10700) {
+    std::cout << "MXNet version " << version << " supported" << std::endl;
+    return MX_SUCCESS;
+  } else {
+    MX_ERROR_MSG << "MXNet version " << version << " not supported";
+    return MX_FAIL;
+  }
+}
diff --git a/example/extensions/lib_external_ops/libtest.cc b/example/extensions/lib_external_ops/libtest.cc
deleted file mode 100644
index 3453c2815dd3..000000000000
--- a/example/extensions/lib_external_ops/libtest.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file libtest.cc
- * \brief This test checks if the library is implemented correctly
- * and does not involve dynamic loading of library into MXNet
- * This test is supposed to be run before test.py
- */
-
-#include <dlfcn.h>
-
-#include <iostream>
-
-
-int main(void) {
-  dlerror();
-  void *mx;
-  mx = dlopen("libmxnet.so", RTLD_LAZY | RTLD_GLOBAL);
-
-  if (!mx) {
-    std::cerr << "Unable to load libmxnet.so" << std::endl;
-    char* err = dlerror();
-    if(err)
-      std::cerr << err << std::endl;
-    return 1;
-  }
-
-  // Get a handle to the library.
-  void *handle;
-  handle = dlopen("libmin_ex.so", RTLD_LAZY);
-
-  if (!handle) {
-    std::cerr << "Unable to load library" << std::endl;
-    char* err = dlerror();
-    if(err)
-      std::cerr << err << std::endl;
-    return 1;
-  }
-
-  // get initialize function address from the library
-  void* init_lib = dlsym(handle, "initialize");
-
-  if (!init_lib) {
-    std::cerr << "Unable to get function 'intialize' from library" << std::endl;
-    return 1;
-  }
-
-  dlclose(handle);
-
-  return 0;
-}
diff --git a/example/extensions/lib_external_ops/test_loading.py b/example/extensions/lib_external_ops/test_loading.py
new file mode 100644
index 000000000000..c644d8eaffeb
--- /dev/null
+++ b/example/extensions/lib_external_ops/test_loading.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=arguments-differ + +# This test checks if dynamic loading of library into MXNet is successful + +import mxnet as mx +import os + +try: + print(mx.nd.min_ex()) +except Exception as ex: + print('Operator not registered yet. %s: %s' %(type(ex).__name__,ex)) + +# test loading library +if (os.name=='posix'): + path = os.path.abspath('libmin_ex.so') + mx.library.load(path) +elif (os.name=='nt'): + path = os.path.abspath('libmin_ex.dll') + mx.library.load(path) + +print(mx.nd.min_ex()) diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 0b4bdf9a97c2..73b36c079678 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -276,7 +276,7 @@ class MXCallbackList(ctypes.Structure): def _load_lib(): """Load library by searching possible path.""" lib_path = libinfo.find_lib_path() - lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_LOCAL) + lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_GLOBAL) # DMatrix functions lib.MXGetLastError.restype = ctypes.c_char_p return lib diff --git a/src/initialize.cc b/src/initialize.cc index 784e54f9a9d7..71776708c2cc 100644 --- a/src/initialize.cc +++ b/src/initialize.cc @@ -126,7 +126,13 @@ void* LibraryInitializer::lib_load(const char* path) { return nullptr; } #else - handle = dlopen(path, RTLD_LAZY); + /* library loading flags: + * RTLD_LAZY - Perform lazy binding. Only resolve symbols as the code that + * references them is executed. + * RTLD_LOCAL - Symbols defined in this library are not made available to + * resolve references in subsequently loaded libraries. + */ + handle = dlopen(path, RTLD_LAZY | RTLD_LOCAL); if (!handle) { LOG(FATAL) << "Error loading library: '" << path << "'\n" << dlerror(); return nullptr; From 42d00d023dab2a74939cdd93a91deb778d4a7f52 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Aug 2020 05:18:43 +0000 Subject: [PATCH 13/43] remove GLOBAL to fix protobuf issue --- python/mxnet/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 73b36c079678..0b4bdf9a97c2 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -276,7 +276,7 @@ class MXCallbackList(ctypes.Structure): def _load_lib(): """Load library by searching possible path.""" lib_path = libinfo.find_lib_path() - lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_GLOBAL) + lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_LOCAL) # DMatrix functions lib.MXGetLastError.restype = ctypes.c_char_p return lib From 2379eedde1b4a807bb31a95b7659722ec6d29d0c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Aug 2020 07:59:56 +0000 Subject: [PATCH 14/43] fixed library unload --- example/extensions/lib_external_ops/Makefile | 2 +- include/mxnet/c_api.h | 2 +- python/mxnet/library.py | 16 ++++++++++++++-- src/c_api/c_api.cc | 20 ++++++++++---------- src/initialize.cc | 7 ------- 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/example/extensions/lib_external_ops/Makefile b/example/extensions/lib_external_ops/Makefile index 70e6addd3336..c967f44a080a 100644 --- a/example/extensions/lib_external_ops/Makefile +++ b/example/extensions/lib_external_ops/Makefile @@ -16,7 +16,7 @@ # under the License. 
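+# Note (a sketch of the assumptions here): min_ex.cc.o references MXNet/NNVM
+# symbols, so the recipe below links against libmxnet directly; this assumes
+# MXNet was built into ../../../build as described in README.md.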
all: - g++ -shared -fPIC -std=c++11 init_lib.cc min_ex.cc.o ../../../src/lib_api.cc -o libmin_ex.so -I../../../include + g++ -shared -fPIC -std=c++11 init_lib.cc min_ex.cc.o ../../../src/lib_api.cc -o libmin_ex.so -I../../../include -L../../../build -lmxnet test: g++ -std=c++11 -O3 -o libtest libtest.cc -ldl diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 01ce18859ef4..4b56244cff94 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -242,7 +242,7 @@ MXNET_DLL const char *MXGetLastError(); * \param 0 for quiet, 1 for verbose * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXLoadLib(const char *path, unsigned verbose); +MXNET_DLL int MXLoadLib(const char *path, unsigned verbose, void** lib); /*! * \brief Get list of features supported on the runtime diff --git a/python/mxnet/library.py b/python/mxnet/library.py index 487fff940fda..e1e6e3b1e561 100644 --- a/python/mxnet/library.py +++ b/python/mxnet/library.py @@ -24,6 +24,15 @@ from .ndarray.register import _make_ndarray_function from .symbol.register import _make_symbol_function +class MXlib: + def __init__(self,handle): + self.handle = handle + def __del__(self): + libdl = ctypes.CDLL("libdl.so") + libdl.dlclose(handle) + +loaded_libs = [] + def load(path, verbose=True): """Loads library dynamically. @@ -39,6 +48,8 @@ def load(path, verbose=True): --------- void """ + global loaded_libs + #check if path exists if not os.path.exists(path): raise MXNetError("load path %s does NOT exist" % path) @@ -53,7 +64,9 @@ def load(path, verbose=True): verbose_val = 1 if verbose else 0 byt_obj = path.encode('utf-8') chararr = ctypes.c_char_p(byt_obj) - check_call(_LIB.MXLoadLib(chararr, mx_uint(verbose_val))) + lib_ptr = ctypes.c_void_p(0) + check_call(_LIB.MXLoadLib(chararr, mx_uint(verbose_val),ctypes.byref(lib_ptr))) + loaded_libs.append(MXlib(lib_ptr)) #regenerate operators _init_op_module('mxnet', 'ndarray', _make_ndarray_function) @@ -73,7 +86,6 @@ def load(path, verbose=True): func = getattr(mx_sym_op, op) setattr(mx_sym, op, func) - def compiled_with_gcc_cxx11_abi(): """Check if the library is compiled with _GLIBCXX_USE_CXX11_ABI. 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 23049f1b8867..012eda19830f 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1462,15 +1462,15 @@ void registerPasses(void *lib, int verbose, mxnet::ext::msgSize_t msgSize,
  * \brief Loads dynamic custom library and initializes it
  * \param path library path
  */
-int MXLoadLib(const char *path, unsigned verbose) {
+int MXLoadLib(const char *path, unsigned verbose, void** lib) {
   API_BEGIN();
-  void *lib = LibraryInitializer::Get()->lib_load(path);
-  if (!lib)
+  *lib = LibraryInitializer::Get()->lib_load(path);
+  if (!*lib)
     LOG(FATAL) << "Unable to load library";

   // check that library and MXNet use same version of library API
   mxnet::ext::opVersion_t opVersion =
-    get_func<mxnet::ext::opVersion_t>(lib, const_cast<char*>(MXLIB_OPVERSION_STR));
+    get_func<mxnet::ext::opVersion_t>(*lib, const_cast<char*>(MXLIB_OPVERSION_STR));
   int libVersion = opVersion();
   if (MX_LIBRARY_VERSION != libVersion)
     LOG(FATAL) << "Library version (" << libVersion << ") does not match MXNet version ("
@@ -1478,22 +1478,22 @@ int MXLoadLib(const char *path, unsigned verbose) {

   // get error messaging APIs
   mxnet::ext::msgSize_t msgSize =
-    get_func<mxnet::ext::msgSize_t>(lib, const_cast<char*>(MXLIB_MSGSIZE_STR));
+    get_func<mxnet::ext::msgSize_t>(*lib, const_cast<char*>(MXLIB_MSGSIZE_STR));
   mxnet::ext::msgGet_t msgGet =
-    get_func<mxnet::ext::msgGet_t>(lib, const_cast<char*>(MXLIB_MSGGET_STR));
+    get_func<mxnet::ext::msgGet_t>(*lib, const_cast<char*>(MXLIB_MSGGET_STR));

   // initialize library by passing MXNet version
   mxnet::ext::initialize_t initialize =
-    get_func<mxnet::ext::initialize_t>(lib, const_cast<char*>(MXLIB_INITIALIZE_STR));
+    get_func<mxnet::ext::initialize_t>(*lib, const_cast<char*>(MXLIB_INITIALIZE_STR));
   if (!initialize(static_cast<int>(MXNET_VERSION))) {
     std::string msgs = getExtensionMsgs(msgSize, msgGet);
     LOG(FATAL) << "Library failed to initialize" << msgs;
   }

   // find ops, partitioners, and passes in library
-  registerOperators(lib, verbose, msgSize, msgGet);
-  registerPartitioners(lib, verbose, msgSize, msgGet);
-  registerPasses(lib, verbose, msgSize, msgGet);
+  registerOperators(*lib, verbose, msgSize, msgGet);
+  registerPartitioners(*lib, verbose, msgSize, msgGet);
+  registerPasses(*lib, verbose, msgSize, msgGet);
   API_END();
 }

diff --git a/src/initialize.cc b/src/initialize.cc
index 71776708c2cc..e166a1a11b94 100644
--- a/src/initialize.cc
+++ b/src/initialize.cc
@@ -99,7 +99,6 @@ LibraryInitializer::LibraryInitializer()
 }

 LibraryInitializer::~LibraryInitializer() {
-  close_open_libs();
 }

 bool LibraryInitializer::lib_is_loaded(const std::string& path) const {
@@ -241,12 +240,6 @@ void LibraryInitializer::install_signal_handlers() {
 #endif
 }

-void LibraryInitializer::close_open_libs() {
-  for (const auto& l : loaded_libs) {
-    lib_close(l.second);
-  }
-}
-
 /**
  * Perform static initialization
 */

From afa87a760986b0f6d500c1c5a9ae5a0e3048e1d2 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 17 Aug 2020 16:07:12 +0000
Subject: [PATCH 15/43] added test target

---
 example/extensions/lib_external_ops/Makefile | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/example/extensions/lib_external_ops/Makefile b/example/extensions/lib_external_ops/Makefile
index c967f44a080a..6f04fdf9cc70 100644
--- a/example/extensions/lib_external_ops/Makefile
+++ b/example/extensions/lib_external_ops/Makefile
@@ -15,11 +15,22 @@
 # specific language governing permissions and limitations
 # under the License.
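+# Usage sketch, assuming MXNet has already been built into ../../../build as
+# described in README.md: run `make` (or `make test`, which compiles ${SRCS}
+# from source) to produce libmin_ex.so, then run `python test_loading.py` to
+# load it and call mx.nd.min_ex().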
 
+SRCS=init_lib.cc \
+     min_ex.cc \
+     ../../../src/lib_api.cc
+
+INC_PATH=-I../../../include \
+         -I../../../src/operator/
+
+LIB_PATH=-L../../../build
+
+LIBS=-lmxnet
+
 all:
 	g++ -shared -fPIC -std=c++11 init_lib.cc min_ex.cc.o ../../../src/lib_api.cc -o libmin_ex.so -I../../../include -L../../../build -lmxnet

 test:
-	g++ -std=c++11 -O3 -o libtest libtest.cc -ldl
+	g++ -shared -fPIC -std=c++11 ${SRCS} -o libmin_ex.so ${INC_PATH} ${LIB_PATH} ${LIBS}

 clean:
 	rm -rf libmin_ex.so

From b2f5a19357c034e4dacd3fc42a0066aa60e55ed2 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 17 Aug 2020 16:19:06 +0000
Subject: [PATCH 16/43] fixed sanity

---
 python/mxnet/library.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/mxnet/library.py b/python/mxnet/library.py
index e1e6e3b1e561..f15f8f239e2d 100644
--- a/python/mxnet/library.py
+++ b/python/mxnet/library.py
@@ -25,12 +25,14 @@ from .symbol.register import _make_symbol_function

 class MXlib:
-    def __init__(self,handle):
+    """Holds a pointer to a loaded shared library and closes it on destruction"""
+    def __init__(self, handle):
         self.handle = handle
     def __del__(self):
         libdl = ctypes.CDLL("libdl.so")
-        libdl.dlclose(handle)
-
+        libdl.dlclose(self.handle)
+        print("DLClose'd")
+# set of libraries loaded
 loaded_libs = []

 def load(path, verbose=True):
@@ -65,7 +67,8 @@ def load(path, verbose=True):
     byt_obj = path.encode('utf-8')
     chararr = ctypes.c_char_p(byt_obj)
     lib_ptr = ctypes.c_void_p(0)
-    check_call(_LIB.MXLoadLib(chararr, mx_uint(verbose_val),ctypes.byref(lib_ptr)))
+    check_call(_LIB.MXLoadLib(chararr, mx_uint(verbose_val), ctypes.byref(lib_ptr)))
+    # add library pointer to list so it can be closed later
     loaded_libs.append(MXlib(lib_ptr))

     #regenerate operators

From 68a37337424ddf92b6370398cd0c7d4d325d54c9 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 17 Aug 2020 17:07:22 +0000
Subject: [PATCH 17/43] changed destructor to default

---
 src/initialize.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/initialize.cc b/src/initialize.cc
index e166a1a11b94..693c047717f9 100644
--- a/src/initialize.cc
+++ b/src/initialize.cc
@@ -98,8 +98,7 @@ LibraryInitializer::LibraryInitializer()
   install_pthread_atfork_handlers();
 }

-LibraryInitializer::~LibraryInitializer() {
-}
+LibraryInitializer::~LibraryInitializer() = default;

 bool LibraryInitializer::lib_is_loaded(const std::string& path) const {
   return loaded_libs.count(path) > 0;

From 04e88fb3c3f9daaa2bf7448b65678a20eff929be Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 19 Aug 2020 04:09:55 +0000
Subject: [PATCH 18/43] added /LD option for customop_gpu_lib target

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dd79db133433..490630179cda 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -738,7 +738,7 @@ elseif(MSVC)
   set_target_properties(subgraph_lib PROPERTIES PREFIX "lib")
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
-    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fPIC>")
+    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fPIC> /LD")
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()
 endif()

From b9f67efeac482afb51caee21bea81699067b4342 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 19 Aug 2020 06:41:22 +0000
Subject: [PATCH 19/43] moved /LD inside the <>

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 490630179cda..65046263070a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -738,7 +738,7 @@ elseif(MSVC)
   set_target_properties(subgraph_lib PROPERTIES PREFIX "lib")
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
-    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fPIC> /LD")
+    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fPIC /LD>")
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()
 endif()

From 4b9a4dc514fde548e9eaaa7ba0fcb243ae59a628 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 19 Aug 2020 15:57:18 +0000
Subject: [PATCH 20/43] diff compile flags for relu_lib.cu and lib_api.cc

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65046263070a..c643d62c4c3c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -738,7 +738,9 @@ elseif(MSVC)
   set_target_properties(subgraph_lib PROPERTIES PREFIX "lib")
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
-    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fPIC /LD>")
+    #target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD>")
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu PROPERTIES COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD>")
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc PROPERTIES COMPILE_OPTIONS /LD)
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()
 endif()

From 4afe1827ef65218c6690f09f6e77b14ccfb97c98 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 20 Aug 2020 02:46:07 +0000
Subject: [PATCH 21/43] set CMAKE_VERBOSE_MAKEFILE for debug

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c643d62c4c3c..48b9c0d69a65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -160,6 +160,7 @@ if("$ENV{VERBOSE}" STREQUAL "1")
   message(STATUS " Verbose Makefile ACTIVATED")
   set(CMAKE_VERBOSE_MAKEFILE ON)
 endif()
+set(CMAKE_VERBOSE_MAKEFILE ON)

 #Switch off modern thread local for dmlc-core, please see: https://github.com/dmlc/dmlc-core/issues/571#issuecomment-543467484
 add_definitions(-DDMLC_MODERN_THREAD_LOCAL=0)

From 7a36a40be017ab5bf37051a34d066efa1cf2e730 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 20 Aug 2020 04:10:18 +0000
Subject: [PATCH 22/43] added -v to ninja

---
 ci/build_windows.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/build_windows.py b/ci/build_windows.py
index c8d3af515b5a..6d77bce5bd99 100755
--- a/ci/build_windows.py
+++ b/ci/build_windows.py
@@ -192,7 +192,7 @@ def windows_build(args):
         logging.info("Generating project with CMake:\n{}".format(cmd))
         check_call(cmd, shell=True, env=env)

-        cmd = "\"{}\" && ninja".format(args.vcvars)
+        cmd = "\"{}\" && ninja -v".format(args.vcvars)

         logging.info("Building:\n{}".format(cmd))
         t0 = int(time.time())

From 8607847d9b1a3b5bb6dcf478e15f1cf558f24db6 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 20 Aug 2020 06:37:19 +0000
Subject: [PATCH 23/43] added /MT

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48b9c0d69a65..bebe9fcdbe54 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -740,7 +740,7 @@ elseif(MSVC)
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
     #target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD>")
-    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu PROPERTIES COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD>")
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu PROPERTIES COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD /MT>")
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc PROPERTIES COMPILE_OPTIONS /LD)
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()

From 4165d02c7fa9a8f9dff3a1f1830e7d8a273b9d27 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 20 Aug 2020 16:17:20 +0000
Subject: [PATCH 24/43] another try

---
 CMakeLists.txt | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bebe9fcdbe54..43b90663848c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -739,9 +739,10 @@ elseif(MSVC)
   set_target_properties(subgraph_lib PROPERTIES PREFIX "lib")
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
-    #target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD>")
-    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu PROPERTIES COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD /MT>")
-    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc PROPERTIES COMPILE_OPTIONS /LD)
+    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD /MT>")
+    target_compile_options(customop_gpu_lib PUBLIC /LD /MT)
+    #set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu PROPERTIES COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD /MT>")
+    #set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc PROPERTIES COMPILE_OPTIONS /LD)
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()
 endif()

From f7de08ebb113e2cc18d0749e027889082e47393d Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 20 Aug 2020 18:43:31 +0000
Subject: [PATCH 25/43] changed /MT to -MT

---
 CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 43b90663848c..e52fac146f8b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -739,8 +739,9 @@ elseif(MSVC)
   set_target_properties(subgraph_lib PROPERTIES PREFIX "lib")
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
-    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD /MT>")
-    target_compile_options(customop_gpu_lib PUBLIC /LD /MT)
+    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-LD -MT>")
+    target_compile_options(customop_gpu_lib PUBLIC /LD)
+    target_compile_options(customop_gpu_lib PUBLIC /MT)
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()
 endif()

From 4b7d1197909a33faa0cd360c58f325b3c49c67fe Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 20 Aug 2020 21:59:10 +0000
Subject: [PATCH 26/43] set flags for cxx separately

---
 CMakeLists.txt | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e52fac146f8b..8ae48d305da3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -739,11 +739,8 @@ elseif(MSVC)
   set_target_properties(subgraph_lib PROPERTIES PREFIX "lib")
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
-    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-LD -MT>")
-    target_compile_options(customop_gpu_lib PUBLIC /LD)
-    target_compile_options(customop_gpu_lib PUBLIC /MT)
-    #set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu PROPERTIES COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/LD /MT>")
-    #set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc PROPERTIES COMPILE_OPTIONS /LD)
+    target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-LD -MT>")
+    target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/LD /MT>")
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()
 endif()

From c3719fd44ed5c01d0b3c14ae71f00bdd9e35cfeb Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 20 Aug 2020 22:16:41 +0000
Subject: [PATCH 27/43] split /LD /MT flags

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ae48d305da3..bf6d2bfe80a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -740,7 +740,8 @@ elseif(MSVC)
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
     target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-LD -MT>")
-    target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/LD /MT>")
+    target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/LD>")
+    target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/MT>")
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()
 endif()

From 4335985bda1d468bbdecd4929b3edd5fd598bab0 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 21 Aug 2020 08:04:51 +0000
Subject: [PATCH 28/43] refactored cuda APIs into header file

---
 include/mxnet/lib_api.h | 11 ++++++++---
 src/lib_api.cc          |  8 --------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index a219d658bb31..ee028dfe60e2 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -48,11 +48,12 @@
 #include

 #if defined(__NVCC__)
+  #include
   #include
 #endif

 /* Make sure to update the version number everytime you make changes */
-#define MX_LIBRARY_VERSION 8
+#define MX_LIBRARY_VERSION 9

 /*!
  * \brief For loading multiple custom op libraries in Linux, exporting same symbol multiple
@@ -439,7 +440,9 @@ class OpResource {
   void* alloc_gpu(int size) const;

   /*! \brief return the cuda stream object with correct type */
-  mx_stream_t get_cuda_stream() const;
+  inline mx_stream_t get_cuda_stream() const {
+    return static_cast<mx_stream_t>(cuda_stream);
+  }

   /*! \brief allocate sparse memory controlled by MXNet */
   void alloc_sparse(MXSparse* sparse, int index, int indices_len, int indptr_len = 0) const;
@@ -451,7 +454,9 @@ class OpResource {
   /*! \brief get pointer to initialized and seeded random number states located on GPU */
   /* Access each state by states[id], but this id should be <= MX_NUM_GPU_RANDOM_STATES */
   /* Note that if you are using cpu build, it will return a nullptr */
-  mx_gpu_rand_t* get_gpu_rand_states() const;
+  inline mx_gpu_rand_t* get_gpu_rand_states() const {
+    return static_cast<mx_gpu_rand_t*>(rand_gpu_states);
+  }

  private:
   /*! \brief allocation lambda function */
\brief allocation lambda function */ diff --git a/src/lib_api.cc b/src/lib_api.cc index 59957352b267..b00dcfaf9472 100644 --- a/src/lib_api.cc +++ b/src/lib_api.cc @@ -211,10 +211,6 @@ void* mxnet::ext::OpResource::alloc_gpu(int size) const { return gpu_malloc(gpu_alloc, size); } -mxnet::ext::mx_stream_t mxnet::ext::OpResource::get_cuda_stream() const { - return static_cast(cuda_stream); -} - void mxnet::ext::OpResource::alloc_sparse(mxnet::ext::MXSparse* sparse, int index, int indices_len, int indptr_len) const { sparse_malloc(sparse_alloc, index, indices_len, indptr_len, @@ -225,10 +221,6 @@ mxnet::ext::mx_cpu_rand_t* mxnet::ext::OpResource::get_cpu_rand_states() const { return static_cast(rand_cpu_states); } -mxnet::ext::mx_gpu_rand_t* mxnet::ext::OpResource::get_gpu_rand_states() const { - return static_cast(rand_gpu_states); -} - std::string mxnet::ext::getShapeAt(const std::string& shape, unsigned index) { int idx = 1; // start at 1 to skip the first square bracket [ // find the beginning of the output shape for the particular output index From 1ef7b0a088e629b1de6a788c14a1be472e28b156 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 21 Aug 2020 14:20:32 +0000 Subject: [PATCH 29/43] removed debugging stuff --- CMakeLists.txt | 1 - ci/build_windows.py | 2 +- python/mxnet/library.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d2be050de29..3153e0c4a0ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,7 +159,6 @@ if("$ENV{VERBOSE}" STREQUAL "1") message(STATUS " Verbose Makefile ACTIVATED") set(CMAKE_VERBOSE_MAKEFILE ON) endif() -set(CMAKE_VERBOSE_MAKEFILE ON) #Switch off modern thread local for dmlc-core, please see: https://github.com/dmlc/dmlc-core/issues/571#issuecomment-543467484 add_definitions(-DDMLC_MODERN_THREAD_LOCAL=0) diff --git a/ci/build_windows.py b/ci/build_windows.py index 868fcb81d6a9..0a195b50f77a 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -186,7 +186,7 @@ def windows_build(args): logging.info("Generating project with CMake:\n{}".format(cmd)) check_call(cmd, shell=True, env=env) - cmd = "\"{}\" && ninja -v".format(args.vcvars) + cmd = "\"{}\" && ninja".format(args.vcvars) logging.info("Building:\n{}".format(cmd)) t0 = int(time.time()) diff --git a/python/mxnet/library.py b/python/mxnet/library.py index f15f8f239e2d..22528a08dc01 100644 --- a/python/mxnet/library.py +++ b/python/mxnet/library.py @@ -31,7 +31,7 @@ def __init__(self, handle): def __del__(self): libdl = ctypes.CDLL("libdl.so") libdl.dlclose(self.handle) - print("DLClose'd") + # set of libraries loaded loaded_libs = [] From cac8fbaadec6b731a2f6d59797085fa2dd48a876 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 01:02:49 +0000 Subject: [PATCH 30/43] updated instructions for gpu build --- config/linux_gpu.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake index 442ac6cb3578..56712edbdff6 100644 --- a/config/linux_gpu.cmake +++ b/config/linux_gpu.cmake @@ -33,6 +33,14 @@ # Specify `cmake --build . --parallel N` to set the number of parallel compilation jobs. # Default is derived from CPUs available. # +# By default, cmake will try and discover which GPU architecture to use by looking at +# the available GPUs on the machine that you're building on. If you want to build for +# a specific GPU architecture or are building on a machine without a GPU, then +# specify the MXNET_CUDA_ARCH option like: +# +# $ cmake .. 
-DMXNET_CUDA_ARCH=7.0 +# +# In the example above we're building for sm_70 which is the Volta architecture. #------------------------------------------------------------------------------- #--------------------------------------------- From 36e0a6a9f35aeb98eb764b5b64b28804c19c69a1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 03:21:26 +0000 Subject: [PATCH 31/43] moved building into cmakelists --- CMakeLists.txt | 12 +++++++ config/linux_gpu.cmake | 12 +++---- example/extensions/lib_external_ops/Makefile | 36 ------------------- example/extensions/lib_external_ops/README.md | 16 ++++----- .../extensions/lib_external_ops/min_ex-inl.h | 6 ++-- .../lib_external_ops/test_loading.py | 4 +-- 6 files changed, 27 insertions(+), 59 deletions(-) delete mode 100644 example/extensions/lib_external_ops/Makefile diff --git a/CMakeLists.txt b/CMakeLists.txt index 3153e0c4a0ed..d9327c3f8668 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -705,11 +705,18 @@ add_library(transposecsr_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensio add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposerowsp_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) + +FILE(GLOB_RECURSE EXTRA_SRC "${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_external_ops/*.cc") +FILE(GLOB_RECURSE EXTRA_CUSRC "${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_external_ops/*.cu") +add_library(external_lib SHARED ${EXTRA_SRC} ${EXTRA_CUSRC} ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) +target_link_libraries(external_lib PUBLIC mshadow mxnet) + target_include_directories(customop_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(transposecsr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(subgraph_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(pass_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) +target_include_directories(external_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) if(USE_CUDA) add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) @@ -717,6 +724,7 @@ endif() if(UNIX) if (USE_CUDA) target_compile_options(customop_gpu_lib PUBLIC -shared) + target_compile_options(external_lib PUBLIC -shared) endif() elseif(MSVC) target_compile_options(customop_lib PUBLIC /LD) @@ -724,15 +732,19 @@ elseif(MSVC) target_compile_options(transposerowsp_lib PUBLIC /LD) target_compile_options(subgraph_lib PUBLIC /LD) target_compile_options(pass_lib PUBLIC /LD) + target_compile_options(external_lib PRIVATE "$<$:/LD>") + target_compile_options(external_lib PRIVATE "$<$:/MT>") set_target_properties(customop_lib PROPERTIES PREFIX "lib") set_target_properties(transposecsr_lib PROPERTIES PREFIX "lib") set_target_properties(transposerowsp_lib PROPERTIES PREFIX "lib") set_target_properties(subgraph_lib PROPERTIES PREFIX "lib") set_target_properties(pass_lib PROPERTIES PREFIX "lib") + 
set_target_properties(external_lib PROPERTIES PREFIX "lib") if(USE_CUDA) target_compile_options(customop_gpu_lib PRIVATE "$<$:-Xcompiler=-LD -MT>") target_compile_options(customop_gpu_lib PRIVATE "$<$:/LD>") target_compile_options(customop_gpu_lib PRIVATE "$<$:/MT>") + target_compile_options(external_lib PRIVATE "$<$:-Xcompiler=-LD -MT>") set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib") endif() endif() diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake index 56712edbdff6..e46db8ba6d92 100644 --- a/config/linux_gpu.cmake +++ b/config/linux_gpu.cmake @@ -30,17 +30,13 @@ # $ cmake .. # $ cmake --build . # -# Specify `cmake --build . --parallel N` to set the number of parallel compilation jobs. -# Default is derived from CPUs available. +# or you can specify the particular GPU architecture by # -# By default, cmake will try and discover which GPU architecture to use by looking at -# the available GPUs on the machine that you're building on. If you want to build for -# a specific GPU architecture or are building on a machine without a GPU, then -# specify the MXNET_CUDA_ARCH option like: +# $ cmake .. -DMXNET_CUDA_ARCH=7.0 # -# $ cmake .. -DMXNET_CUDA_ARCH=7.0 +# Specify `cmake --build . --parallel N` to set the number of parallel compilation jobs. +# Default is derived from CPUs available. # -# In the example above we're building for sm_70 which is the Volta architecture. #------------------------------------------------------------------------------- #--------------------------------------------- diff --git a/example/extensions/lib_external_ops/Makefile b/example/extensions/lib_external_ops/Makefile deleted file mode 100644 index 6f04fdf9cc70..000000000000 --- a/example/extensions/lib_external_ops/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -SRCS=init_lib.cc \ - min_ex.cc \ - ../../../src/lib_api.cc - -INC_PATH=-I../../../include \ - -I../../../src/operator/ - -LIB_PATH=-L../../../build - -LIBS=-lmxnet - -all: - g++ -shared -fPIC -std=c++11 init_lib.cc min_ex.cc.o ../../../src/lib_api.cc -o libmin_ex.so -I../../../include -L../../../build -lmxnet - -test: - g++ -shared -fPIC -std=c++11 ${SRCS} -o libmin_ex.so ${INC_PATH} ${LIB_PATH} ${LIBS} - -clean: - rm -rf libmin_ex.so diff --git a/example/extensions/lib_external_ops/README.md b/example/extensions/lib_external_ops/README.md index db655535c94f..1a911a9406e0 100644 --- a/example/extensions/lib_external_ops/README.md +++ b/example/extensions/lib_external_ops/README.md @@ -20,23 +20,19 @@ External Operators Example and Tutorial ## Introduction -TBD +Extending MXNet with custom components used to mean distributing a custom fork. This feature allows adding custom components to MXNet by dynamically loading external libraries at runtime. 
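+
+As a quick illustration, the end-to-end flow from Python looks like this (a minimal
+sketch based on the test_loading.py script in this example; the exact library path
+depends on where MXNet placed the built library):
+
+```python
+import mxnet as mx
+import os
+
+# load the compiled extension library into the running MXNet process
+path = os.path.abspath('../../../build/libexternal_lib.so')
+mx.library.load(path)
+
+# the externally-compiled operator is now registered and callable
+print(mx.nd.min_ex())
+```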
## Getting Started ### Have MXNet Ready -For this tutorial, clone MXNet from source but dont build it yet. +For this tutorial, clone MXNet from source and build it. ### Run An Example This example shows compiling a custom backend operator and then dynamically loading it into MXNet at runtime. Go to the **lib_external_ops** directory and follow these steps: -1. Copy **min_ex.cc** and **min_ex-inl.h** into the src/operator directory. -2. Build MXNet. -3. Find the **min_ex.cc.o** file and copy it back to the **lib_external_ops** directory. -4. Delete the **min_ex.cc** and **min_ex-inl.h** from the src/operator directory. -5. Clean the build folder. -6. Rebuild MXNet. -7. Run `make` in the **lib_external_ops** directory to produce the libmin_ex.so with your custom operator inside. -8. Run `python test_loading.py`. \ No newline at end of file +1. Touch or modify the **min_ex.cc** and/or **min_ex-inl.h** file(s) +2. Go into the **build** directory that was created when building MXNet. +3. Run `make external_lib`. Notice that **libexternal_lib.so** has been rebuilt +4. Go to the **example/extensions/lib_external_ops** directory again and run `python test_loading.py`. \ No newline at end of file diff --git a/example/extensions/lib_external_ops/min_ex-inl.h b/example/extensions/lib_external_ops/min_ex-inl.h index 82a10e6540e5..868ccfabca73 100644 --- a/example/extensions/lib_external_ops/min_ex-inl.h +++ b/example/extensions/lib_external_ops/min_ex-inl.h @@ -23,9 +23,9 @@ #include #include #include -#include "mxnet_op.h" -#include "operator_common.h" -#include "elemwise_op_common.h" +#include "operator/mxnet_op.h" +#include "operator/operator_common.h" +#include "operator/elemwise_op_common.h" namespace mxnet { namespace op { diff --git a/example/extensions/lib_external_ops/test_loading.py b/example/extensions/lib_external_ops/test_loading.py index c644d8eaffeb..09a8a061b851 100644 --- a/example/extensions/lib_external_ops/test_loading.py +++ b/example/extensions/lib_external_ops/test_loading.py @@ -32,10 +32,10 @@ # test loading library if (os.name=='posix'): - path = os.path.abspath('libmin_ex.so') + path = os.path.abspath('../../../build/libexternal_lib.so') mx.library.load(path) elif (os.name=='nt'): - path = os.path.abspath('libmin_ex.dll') + path = os.path.abspath('../../../build/libexternal_lib.dll') mx.library.load(path) print(mx.nd.min_ex()) From caaa01129528a127295010ad7f7aacebc8c74426 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 04:10:32 +0000 Subject: [PATCH 32/43] moved build stuff into separate CMakeLists.txt --- CMakeLists.txt | 11 +----- .../lib_external_ops/CMakeLists.txt | 34 +++++++++++++++++++ .../lib_external_ops/test_loading.py | 4 +-- 3 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 example/extensions/lib_external_ops/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index d9327c3f8668..d6db70af1f17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -706,17 +706,13 @@ add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extens add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) -FILE(GLOB_RECURSE EXTRA_SRC "${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_external_ops/*.cc") -FILE(GLOB_RECURSE EXTRA_CUSRC 
"${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_external_ops/*.cu") -add_library(external_lib SHARED ${EXTRA_SRC} ${EXTRA_CUSRC} ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) -target_link_libraries(external_lib PUBLIC mshadow mxnet) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_external_ops ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_external_ops/build) target_include_directories(customop_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(transposecsr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(subgraph_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(pass_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) -target_include_directories(external_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) if(USE_CUDA) add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) @@ -724,7 +720,6 @@ endif() if(UNIX) if (USE_CUDA) target_compile_options(customop_gpu_lib PUBLIC -shared) - target_compile_options(external_lib PUBLIC -shared) endif() elseif(MSVC) target_compile_options(customop_lib PUBLIC /LD) @@ -732,19 +727,15 @@ elseif(MSVC) target_compile_options(transposerowsp_lib PUBLIC /LD) target_compile_options(subgraph_lib PUBLIC /LD) target_compile_options(pass_lib PUBLIC /LD) - target_compile_options(external_lib PRIVATE "$<$:/LD>") - target_compile_options(external_lib PRIVATE "$<$:/MT>") set_target_properties(customop_lib PROPERTIES PREFIX "lib") set_target_properties(transposecsr_lib PROPERTIES PREFIX "lib") set_target_properties(transposerowsp_lib PROPERTIES PREFIX "lib") set_target_properties(subgraph_lib PROPERTIES PREFIX "lib") set_target_properties(pass_lib PROPERTIES PREFIX "lib") - set_target_properties(external_lib PROPERTIES PREFIX "lib") if(USE_CUDA) target_compile_options(customop_gpu_lib PRIVATE "$<$:-Xcompiler=-LD -MT>") target_compile_options(customop_gpu_lib PRIVATE "$<$:/LD>") target_compile_options(customop_gpu_lib PRIVATE "$<$:/MT>") - target_compile_options(external_lib PRIVATE "$<$:-Xcompiler=-LD -MT>") set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib") endif() endif() diff --git a/example/extensions/lib_external_ops/CMakeLists.txt b/example/extensions/lib_external_ops/CMakeLists.txt new file mode 100644 index 000000000000..3d9ae6549078 --- /dev/null +++ b/example/extensions/lib_external_ops/CMakeLists.txt @@ -0,0 +1,34 @@ +# specify CXX sources +FILE(GLOB CXX_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/min_ex.cc + ${CMAKE_CURRENT_SOURCE_DIR}/init_lib.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../../src/lib_api.cc) + +# specify GPU sources (optional) +FILE(GLOB CU_SRCS "*.cu") + +# create library & set libraries +add_library(external_lib SHARED ${CXX_SRCS}) +target_link_libraries(external_lib PUBLIC mshadow mxnet) + +# generic GPU stuff +if(USE_CUDA) + target_sources(external_lib $CU_SRCS) +endif(USE_CUDA) + +if(UNIX) + # unix-specific stuff + if(USE_CUDA) + # unix+GPU-specific stuff + target_compile_options(external_lib PUBLIC -shared) + endif(USE_CUDA) +elseif(MSVC) + # windows-specific stuff + target_compile_options(external_lib PRIVATE "$<$:/LD>") + target_compile_options(external_lib PRIVATE "$<$:/MT>") + set_target_properties(external_lib PROPERTIES PREFIX 
"lib") + if(USE_CUDA) + # windows+GPU-specific stuff + target_compile_options(external_lib PRIVATE "$<$:-Xcompiler=-LD -MT>") + endif(USE_CUDA) +endif() diff --git a/example/extensions/lib_external_ops/test_loading.py b/example/extensions/lib_external_ops/test_loading.py index 09a8a061b851..cb94f3252a2f 100644 --- a/example/extensions/lib_external_ops/test_loading.py +++ b/example/extensions/lib_external_ops/test_loading.py @@ -32,10 +32,10 @@ # test loading library if (os.name=='posix'): - path = os.path.abspath('../../../build/libexternal_lib.so') + path = os.path.abspath('build/libexternal_lib.so') mx.library.load(path) elif (os.name=='nt'): - path = os.path.abspath('../../../build/libexternal_lib.dll') + path = os.path.abspath('build/libexternal_lib.dll') mx.library.load(path) print(mx.nd.min_ex()) From 27c46d772a439b2047cbaf4e8b28bd9d704ece68 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 04:59:44 +0000 Subject: [PATCH 33/43] fixed gpu example --- example/extensions/lib_external_ops/CMakeLists.txt | 14 ++++++++------ example/extensions/lib_external_ops/min_ex-inl.h | 1 + example/extensions/lib_external_ops/min_ex.cc | 2 +- example/extensions/lib_external_ops/min_ex.cu | 10 ++++++++++ 4 files changed, 20 insertions(+), 7 deletions(-) create mode 100644 example/extensions/lib_external_ops/min_ex.cu diff --git a/example/extensions/lib_external_ops/CMakeLists.txt b/example/extensions/lib_external_ops/CMakeLists.txt index 3d9ae6549078..5f46f82e7de8 100644 --- a/example/extensions/lib_external_ops/CMakeLists.txt +++ b/example/extensions/lib_external_ops/CMakeLists.txt @@ -1,11 +1,11 @@ # specify CXX sources FILE(GLOB CXX_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/min_ex.cc + # Required files ${CMAKE_CURRENT_SOURCE_DIR}/init_lib.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../../../src/lib_api.cc) - -# specify GPU sources (optional) -FILE(GLOB CU_SRCS "*.cu") + ${CMAKE_CURRENT_SOURCE_DIR}/../../../src/lib_api.cc + # Your custom files + ${CMAKE_CURRENT_SOURCE_DIR}/min_ex.cc + ) # create library & set libraries add_library(external_lib SHARED ${CXX_SRCS}) @@ -13,7 +13,9 @@ target_link_libraries(external_lib PUBLIC mshadow mxnet) # generic GPU stuff if(USE_CUDA) - target_sources(external_lib $CU_SRCS) + # specify GPU sources (optional) + FILE(GLOB CU_SRCS "*.cu") + target_sources(external_lib PUBLIC ${CU_SRCS}) endif(USE_CUDA) if(UNIX) diff --git a/example/extensions/lib_external_ops/min_ex-inl.h b/example/extensions/lib_external_ops/min_ex-inl.h index 868ccfabca73..98d93f60dc22 100644 --- a/example/extensions/lib_external_ops/min_ex-inl.h +++ b/example/extensions/lib_external_ops/min_ex-inl.h @@ -30,6 +30,7 @@ namespace mxnet { namespace op { +template void MinExForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, diff --git a/example/extensions/lib_external_ops/min_ex.cc b/example/extensions/lib_external_ops/min_ex.cc index f8cf7003138a..2ffc1a8f7c68 100644 --- a/example/extensions/lib_external_ops/min_ex.cc +++ b/example/extensions/lib_external_ops/min_ex.cc @@ -28,7 +28,7 @@ NNVM_REGISTER_OP(min_ex) .set_num_outputs(0) .set_attr("FInferShape", MinExOpShape) .set_attr("FInferType", MinExOpType) -.set_attr("FCompute", MinExForward); +.set_attr("FCompute", MinExForward); } // namespace op } // namespace mxnet diff --git a/example/extensions/lib_external_ops/min_ex.cu b/example/extensions/lib_external_ops/min_ex.cu new file mode 100644 index 000000000000..34c86b6b21d5 --- /dev/null +++ b/example/extensions/lib_external_ops/min_ex.cu @@ -0,0 +1,10 @@ +#include 
"./min_ex-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(min_ex) +.set_attr("FCompute", MinExForward); + +} // namespace op +} // namespace mxnet From 591141aacf94f892db0cae45164c64ebedfd1b1c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 05:13:51 +0000 Subject: [PATCH 34/43] fixed license --- example/extensions/lib_external_ops/min_ex.cu | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/example/extensions/lib_external_ops/min_ex.cu b/example/extensions/lib_external_ops/min_ex.cu index 34c86b6b21d5..1183d28c4877 100644 --- a/example/extensions/lib_external_ops/min_ex.cu +++ b/example/extensions/lib_external_ops/min_ex.cu @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + #include "./min_ex-inl.h" namespace mxnet { From 0a4621db9dbf6b8b18e690849ee9856c177cd8cc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 07:12:03 +0000 Subject: [PATCH 35/43] added dlmc library dependency --- example/extensions/lib_external_ops/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/extensions/lib_external_ops/CMakeLists.txt b/example/extensions/lib_external_ops/CMakeLists.txt index 5f46f82e7de8..fb0a496698dc 100644 --- a/example/extensions/lib_external_ops/CMakeLists.txt +++ b/example/extensions/lib_external_ops/CMakeLists.txt @@ -9,7 +9,7 @@ FILE(GLOB CXX_SRCS # create library & set libraries add_library(external_lib SHARED ${CXX_SRCS}) -target_link_libraries(external_lib PUBLIC mshadow mxnet) +target_link_libraries(external_lib PUBLIC mshadow mxnet dmlc) # generic GPU stuff if(USE_CUDA) From c2e534b2c47c956092055fa2fbf451a604365abd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 20:02:22 +0000 Subject: [PATCH 36/43] added nnvm dependency --- config/linux_gpu.cmake | 20 +++++++++---------- .../lib_external_ops/CMakeLists.txt | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake index e46db8ba6d92..af72ca2238c1 100644 --- a/config/linux_gpu.cmake +++ b/config/linux_gpu.cmake @@ -24,16 +24,13 @@ # # $ cp config/linux_gpu.cmake config.cmake # -# Next modify the according entries, and then compile by +# Next modify the entries in the config.cmake like MXNET_CUDA_ARCH to set the specific +# GPU architecture, and then compile by # # $ mkdir build; cd build # $ cmake .. # $ cmake --build . # -# or you can specify the particular GPU architecture by -# -# $ cmake .. -DMXNET_CUDA_ARCH=7.0 -# # Specify `cmake --build . --parallel N` to set the number of parallel compilation jobs. # Default is derived from CPUs available. 
# @@ -46,15 +43,18 @@ set(USE_CUDA ON CACHE BOOL "Build with CUDA support") set(USE_CUDNN ON CACHE BOOL "Build with cudnn support, if found") # Target NVIDIA GPU achitecture. -# Valid options are "Auto" for autodetection, "All" for all available -# architectures or a list of architectures by compute capability number, such as -# "7.0" or "7.0;7.5" as well as name, such as "Volta" or "Volta;Turing". +# Valid options are: +# - "Auto" for autodetection, will try and discover which GPU architecture to use by +# looking at the available GPUs on the machine that you're building on +# - "All" for all available GPU architectures supported by the version of CUDA installed +# - "specific GPU architectures" by giving the compute capability number such as +# "7.0" or "7.0;7.5" (ie. sm_70 or sm_75) or you can specify the name like: +# "Volta" or "Volta;Turing". # The value specified here is passed to cmake's CUDA_SELECT_NVCC_ARCH_FLAGS to # obtain the compilation flags for nvcc. # # When compiling on a machine without GPU, autodetection will fail and you -# should instead specify the target architecture manually to avoid excessive -# compilation times. +# should instead specify the target architecture manually. set(MXNET_CUDA_ARCH "Auto" CACHE STRING "Target NVIDIA GPU achitecture") #--------------------------------------------- diff --git a/example/extensions/lib_external_ops/CMakeLists.txt b/example/extensions/lib_external_ops/CMakeLists.txt index fb0a496698dc..e2c3518796c7 100644 --- a/example/extensions/lib_external_ops/CMakeLists.txt +++ b/example/extensions/lib_external_ops/CMakeLists.txt @@ -8,7 +8,7 @@ FILE(GLOB CXX_SRCS ) # create library & set libraries -add_library(external_lib SHARED ${CXX_SRCS}) +add_library(external_lib SHARED $ ${CXX_SRCS}) target_link_libraries(external_lib PUBLIC mshadow mxnet dmlc) # generic GPU stuff From fd9f836b8941155594cb18e066610881ae5cb77b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 22:09:34 +0000 Subject: [PATCH 37/43] removed nnvm dmlc dependencies, added WINDOWS_EXPORT_ALL_SYMBOLS option --- CMakeLists.txt | 2 ++ example/extensions/lib_external_ops/CMakeLists.txt | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d6db70af1f17..e75934e86fa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -696,6 +696,8 @@ elseif(MSVC) add_library(mxnet SHARED ${SOURCE}) target_link_libraries(mxnet PUBLIC mshadow) endif() + # export all symbols for external ops + set_target_properties(mxnet PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS) endif() target_compile_definitions(mxnet PUBLIC DMLC_LOG_FATAL_THROW=$) diff --git a/example/extensions/lib_external_ops/CMakeLists.txt b/example/extensions/lib_external_ops/CMakeLists.txt index e2c3518796c7..5f46f82e7de8 100644 --- a/example/extensions/lib_external_ops/CMakeLists.txt +++ b/example/extensions/lib_external_ops/CMakeLists.txt @@ -8,8 +8,8 @@ FILE(GLOB CXX_SRCS ) # create library & set libraries -add_library(external_lib SHARED $ ${CXX_SRCS}) -target_link_libraries(external_lib PUBLIC mshadow mxnet dmlc) +add_library(external_lib SHARED ${CXX_SRCS}) +target_link_libraries(external_lib PUBLIC mshadow mxnet) # generic GPU stuff if(USE_CUDA) From 545aff6d1e18012b5413abbf96b57c4ca4896ea8 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Aug 2020 22:19:13 +0000 Subject: [PATCH 38/43] fixed WINDOWS_EXPORT_ALL_SYMBOLS --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e75934e86fa6..82f7fdbc3911 
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -697,7 +697,7 @@ elseif(MSVC) target_link_libraries(mxnet PUBLIC mshadow) endif() # export all symbols for external ops - set_target_properties(mxnet PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS) + set_target_properties(mxnet PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) endif() target_compile_definitions(mxnet PUBLIC DMLC_LOG_FATAL_THROW=$) From df85c383545148bb54549aac1d8646a3f0ec9b59 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 26 Aug 2020 00:27:23 +0000 Subject: [PATCH 39/43] changed nnvm to shared library --- CMakeLists.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 82f7fdbc3911..aa18b5c54878 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -483,9 +483,9 @@ FILE(GLOB_RECURSE NNVMSOURCE 3rdparty/tvm/nnvm/src/core/*.h 3rdparty/tvm/nnvm/src/pass/*.h 3rdparty/tvm/nnvm/include/*.h) -add_library(nnvm OBJECT ${NNVMSOURCE}) +add_library(nnvm SHARED ${NNVMSOURCE}) set_target_properties(nnvm PROPERTIES CXX_CLANG_TIDY "") # don't lint 3rdparty dependency -list(APPEND SOURCE $) +#list(APPEND SOURCE $) # add source group FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/tvm/nnvm/*.cc" "plugin/*.cc") @@ -791,13 +791,13 @@ if(MSVC) if(USE_SPLIT_ARCH_DLL AND USE_CUDA) foreach(arch ${arch_code_list}) target_link_libraries(mxnet_${arch} PUBLIC ${mxnet_LINKER_LIBS}) - target_link_libraries(mxnet_${arch} PUBLIC dmlc) + target_link_libraries(mxnet_${arch} PUBLIC dmlc nnvm) endforeach() endif() endif() target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) -target_link_libraries(mxnet PUBLIC dmlc) +target_link_libraries(mxnet PUBLIC dmlc nnvm) if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2) add_executable(im2rec "tools/im2rec.cc") @@ -806,6 +806,7 @@ if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2) ${OpenCV_LIBS} mxnet dmlc + nnvm ) else() message(WARNING "OpenCV_VERSION_MAJOR: ${OpenCV_VERSION_MAJOR}, version 3 with imgcodecs \ From bc80960f274206013a9f0104fe178fda8705b64e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 26 Aug 2020 04:28:20 +0000 Subject: [PATCH 40/43] backed out external ops changes --- CMakeLists.txt | 13 ++-- .../lib_external_ops/CMakeLists.txt | 36 ----------- example/extensions/lib_external_ops/README.md | 38 ------------ .../extensions/lib_external_ops/init_lib.cc | 39 ------------ .../extensions/lib_external_ops/min_ex-inl.h | 60 ------------------- example/extensions/lib_external_ops/min_ex.cc | 34 ----------- example/extensions/lib_external_ops/min_ex.cu | 29 --------- .../lib_external_ops/test_loading.py | 41 ------------- 8 files changed, 4 insertions(+), 286 deletions(-) delete mode 100644 example/extensions/lib_external_ops/CMakeLists.txt delete mode 100644 example/extensions/lib_external_ops/README.md delete mode 100644 example/extensions/lib_external_ops/init_lib.cc delete mode 100644 example/extensions/lib_external_ops/min_ex-inl.h delete mode 100644 example/extensions/lib_external_ops/min_ex.cc delete mode 100644 example/extensions/lib_external_ops/min_ex.cu delete mode 100644 example/extensions/lib_external_ops/test_loading.py diff --git a/CMakeLists.txt b/CMakeLists.txt index aa18b5c54878..039750a5ba73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -483,9 +483,9 @@ FILE(GLOB_RECURSE NNVMSOURCE 3rdparty/tvm/nnvm/src/core/*.h 3rdparty/tvm/nnvm/src/pass/*.h 3rdparty/tvm/nnvm/include/*.h) -add_library(nnvm SHARED ${NNVMSOURCE}) +add_library(nnvm OBJECT ${NNVMSOURCE}) set_target_properties(nnvm PROPERTIES CXX_CLANG_TIDY "") # don't lint 
3rdparty dependency -#list(APPEND SOURCE $) +list(APPEND SOURCE $) # add source group FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/tvm/nnvm/*.cc" "plugin/*.cc") @@ -696,8 +696,6 @@ elseif(MSVC) add_library(mxnet SHARED ${SOURCE}) target_link_libraries(mxnet PUBLIC mshadow) endif() - # export all symbols for external ops - set_target_properties(mxnet PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) endif() target_compile_definitions(mxnet PUBLIC DMLC_LOG_FATAL_THROW=$) @@ -708,8 +706,6 @@ add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extens add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_external_ops ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_external_ops/build) - target_include_directories(customop_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(transposecsr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) @@ -791,13 +787,13 @@ if(MSVC) if(USE_SPLIT_ARCH_DLL AND USE_CUDA) foreach(arch ${arch_code_list}) target_link_libraries(mxnet_${arch} PUBLIC ${mxnet_LINKER_LIBS}) - target_link_libraries(mxnet_${arch} PUBLIC dmlc nnvm) + target_link_libraries(mxnet_${arch} PUBLIC dmlc) endforeach() endif() endif() target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) -target_link_libraries(mxnet PUBLIC dmlc nnvm) +target_link_libraries(mxnet PUBLIC dmlc) if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2) add_executable(im2rec "tools/im2rec.cc") @@ -806,7 +802,6 @@ if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2) ${OpenCV_LIBS} mxnet dmlc - nnvm ) else() message(WARNING "OpenCV_VERSION_MAJOR: ${OpenCV_VERSION_MAJOR}, version 3 with imgcodecs \ diff --git a/example/extensions/lib_external_ops/CMakeLists.txt b/example/extensions/lib_external_ops/CMakeLists.txt deleted file mode 100644 index 5f46f82e7de8..000000000000 --- a/example/extensions/lib_external_ops/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -# specify CXX sources -FILE(GLOB CXX_SRCS - # Required files - ${CMAKE_CURRENT_SOURCE_DIR}/init_lib.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../../../src/lib_api.cc - # Your custom files - ${CMAKE_CURRENT_SOURCE_DIR}/min_ex.cc - ) - -# create library & set libraries -add_library(external_lib SHARED ${CXX_SRCS}) -target_link_libraries(external_lib PUBLIC mshadow mxnet) - -# generic GPU stuff -if(USE_CUDA) - # specify GPU sources (optional) - FILE(GLOB CU_SRCS "*.cu") - target_sources(external_lib PUBLIC ${CU_SRCS}) -endif(USE_CUDA) - -if(UNIX) - # unix-specific stuff - if(USE_CUDA) - # unix+GPU-specific stuff - target_compile_options(external_lib PUBLIC -shared) - endif(USE_CUDA) -elseif(MSVC) - # windows-specific stuff - target_compile_options(external_lib PRIVATE "$<$:/LD>") - target_compile_options(external_lib PRIVATE "$<$:/MT>") - set_target_properties(external_lib PROPERTIES PREFIX "lib") - if(USE_CUDA) - # windows+GPU-specific stuff - target_compile_options(external_lib PRIVATE "$<$:-Xcompiler=-LD -MT>") - endif(USE_CUDA) -endif() diff --git a/example/extensions/lib_external_ops/README.md b/example/extensions/lib_external_ops/README.md deleted file mode 100644 index 1a911a9406e0..000000000000 --- 
a/example/extensions/lib_external_ops/README.md +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - - - - - - - - - - - - -External Operators Example and Tutorial -======================================= - -## Introduction - -Extending MXNet with custom components used to mean distributing a custom fork. This feature allows adding custom components to MXNet by dynamically loading external libraries at runtime. - -## Getting Started - -### Have MXNet Ready - -For this tutorial, clone MXNet from source and build it. - -### Run An Example - -This example shows compiling a custom backend operator and then dynamically loading it into MXNet at runtime. Go to the **lib_external_ops** directory and follow these steps: - -1. Touch or modify the **min_ex.cc** and/or **min_ex-inl.h** file(s) -2. Go into the **build** directory that was created when building MXNet. -3. Run `make external_lib`. Notice that **libexternal_lib.so** has been rebuilt -4. Go to the **example/extensions/lib_external_ops** directory again and run `python test_loading.py`. \ No newline at end of file diff --git a/example/extensions/lib_external_ops/init_lib.cc b/example/extensions/lib_external_ops/init_lib.cc deleted file mode 100644 index a21c481bee2f..000000000000 --- a/example/extensions/lib_external_ops/init_lib.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2015 by Contributors - * \file init_lib.cc - * \brief Sample library file - */ - -#include -#include "mxnet/lib_api.h" - -using namespace mxnet::ext; - -MXReturnValue initialize(int version) { - if (version >= 10700) { - std::cout << "MXNet version " << version << " supported" << std::endl; - return MX_SUCCESS; - } else { - MX_ERROR_MSG << "MXNet version " << version << " not supported"; - return MX_FAIL; - } -} diff --git a/example/extensions/lib_external_ops/min_ex-inl.h b/example/extensions/lib_external_ops/min_ex-inl.h deleted file mode 100644 index 98d93f60dc22..000000000000 --- a/example/extensions/lib_external_ops/min_ex-inl.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_ -#define MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_ - -#include -#include -#include -#include "operator/mxnet_op.h" -#include "operator/operator_common.h" -#include "operator/elemwise_op_common.h" - -namespace mxnet { -namespace op { - -template -void MinExForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - //do nothing -} - - -inline bool MinExOpShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_attrs, - mxnet::ShapeVector* out_attrs) { - //do nothing - return true; -} - -inline bool MinExOpType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - //do nothing - return true; -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_ diff --git a/example/extensions/lib_external_ops/min_ex.cc b/example/extensions/lib_external_ops/min_ex.cc deleted file mode 100644 index 2ffc1a8f7c68..000000000000 --- a/example/extensions/lib_external_ops/min_ex.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "min_ex-inl.h" - -namespace mxnet { -namespace op { - -NNVM_REGISTER_OP(min_ex) -.describe("some description") -.set_num_inputs(0) -.set_num_outputs(0) -.set_attr("FInferShape", MinExOpShape) -.set_attr("FInferType", MinExOpType) -.set_attr("FCompute", MinExForward); - -} // namespace op -} // namespace mxnet diff --git a/example/extensions/lib_external_ops/min_ex.cu b/example/extensions/lib_external_ops/min_ex.cu deleted file mode 100644 index 1183d28c4877..000000000000 --- a/example/extensions/lib_external_ops/min_ex.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#include "./min_ex-inl.h" - -namespace mxnet { -namespace op { - -NNVM_REGISTER_OP(min_ex) -.set_attr("FCompute", MinExForward); - -} // namespace op -} // namespace mxnet diff --git a/example/extensions/lib_external_ops/test_loading.py b/example/extensions/lib_external_ops/test_loading.py deleted file mode 100644 index cb94f3252a2f..000000000000 --- a/example/extensions/lib_external_ops/test_loading.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=arguments-differ - -# This test checks if dynamic loading of library into MXNet is successful - -import mxnet as mx -import os - -try: - print(mx.nd.min_ex()) -except Exception as ex: - print('Operator not registered yet. %s: %s' %(type(ex).__name__,ex)) - -# test loading library -if (os.name=='posix'): - path = os.path.abspath('build/libexternal_lib.so') - mx.library.load(path) -elif (os.name=='nt'): - path = os.path.abspath('build/libexternal_lib.dll') - mx.library.load(path) - -print(mx.nd.min_ex()) From ab0cc435d8ac01da01e0f563eb7a1029880ee212 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 28 Aug 2020 00:06:57 +0000 Subject: [PATCH 41/43] split relu example into separate files to test separate lib_api.h/cc --- CMakeLists.txt | 4 +- example/extensions/lib_custom_op/Makefile | 5 +- example/extensions/lib_custom_op/relu_lib.cc | 171 ++++++++++++++++++ example/extensions/lib_custom_op/relu_lib.cu | 173 +------------------ example/extensions/lib_custom_op/relu_lib.h | 90 ++++++++++ include/mxnet/lib_api.h | 35 ++-- src/lib_api.cc | 25 +++ 7 files changed, 311 insertions(+), 192 deletions(-) create mode 100644 example/extensions/lib_custom_op/relu_lib.cc create mode 100644 example/extensions/lib_custom_op/relu_lib.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 039750a5ba73..3a73745edfb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -712,8 +712,8 @@ target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} target_include_directories(subgraph_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) target_include_directories(pass_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) if(USE_CUDA) - add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) - target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) + add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc) + target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet 
${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op)
 endif()
 if(UNIX)
   if (USE_CUDA)
diff --git a/example/extensions/lib_custom_op/Makefile b/example/extensions/lib_custom_op/Makefile
index c16d7cd0207e..97dabf8a0759 100644
--- a/example/extensions/lib_custom_op/Makefile
+++ b/example/extensions/lib_custom_op/Makefile
@@ -21,7 +21,10 @@ gemm_lib:
     g++ -shared -fPIC -std=c++11 gemm_lib.cc ../../../src/lib_api.cc -o libgemm_lib.so -I ../../../include
 
 relu_lib:
-    nvcc -shared -std=c++11 -Xcompiler -fPIC relu_lib.cu ../../../src/lib_api.cc -o librelu_lib.so -I ../../../include
+    g++ -fPIC -c -std=c++11 relu_lib.cc -o relu_lib.cc.o -I ../../../include
+    g++ -fPIC -c -std=c++11 ../../../src/lib_api.cc -o lib_api.cc.o -I ../../../include
+    nvcc -c -std=c++11 -Xcompiler -fPIC relu_lib.cu -o relu_lib.cu.o -I ../../../include
+    nvcc -shared relu_lib.cc.o lib_api.cc.o relu_lib.cu.o -o librelu_lib.so
 
 transposecsr_lib:
     g++ -shared -fPIC -std=c++11 transposecsr_lib.cc ../../../src/lib_api.cc -o libtransposecsr_lib.so -I ../../../include
diff --git a/example/extensions/lib_custom_op/relu_lib.cc b/example/extensions/lib_custom_op/relu_lib.cc
new file mode 100644
index 000000000000..4d9533d85465
--- /dev/null
+++ b/example/extensions/lib_custom_op/relu_lib.cc
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2020 by Contributors
+ * \file relu_lib.cc
+ * \brief CPU implementations and registrations for the custom relu and noisy relu operators
+ */
+
+#include <iostream>
+#include "relu_lib.h"
+
+using namespace mxnet::ext;
+
+MXReturnValue parseAttrs(const std::unordered_map<std::string, std::string>& attrs,
+                         int* num_in, int* num_out) {
+  *num_in = 1;
+  *num_out = 1;
+  return MX_SUCCESS;
+}
+
+MXReturnValue inferType(const std::unordered_map<std::string, std::string>& attrs,
+                        std::vector<int>* intypes,
+                        std::vector<int>* outtypes) {
+  outtypes->at(0) = intypes->at(0);
+  return MX_SUCCESS;
+}
+
+MXReturnValue inferShape(const std::unordered_map<std::string, std::string>& attrs,
+                         std::vector<std::vector<unsigned int>>* inshapes,
+                         std::vector<std::vector<unsigned int>>* outshapes) {
+  outshapes->at(0) = inshapes->at(0);
+  return MX_SUCCESS;
+}
+
+MXReturnValue forwardCPU(const std::unordered_map<std::string, std::string>& attrs,
+                         std::vector<MXTensor>* inputs,
+                         std::vector<MXTensor>* outputs,
+                         const OpResource& res) {
+  float* in_data = inputs->at(0).data<float>();
+  float* out_data = outputs->at(0).data<float>();
+  for (int i=0; i<inputs->at(0).size(); i++) {
+    out_data[i] = in_data[i] > 0 ? in_data[i] : 0;
+  }
+  return MX_SUCCESS;
+}
+
+MXReturnValue backwardCPU(const std::unordered_map<std::string, std::string>& attrs,
+                          std::vector<MXTensor>* inputs,
+                          std::vector<MXTensor>* outputs,
+                          const OpResource& res) {
+  float* out_grad = inputs->at(0).data<float>();
+  float* in_data = inputs->at(1).data<float>();
+  float* in_grad = outputs->at(0).data<float>();
+  for (int i=0; i<inputs->at(1).size(); i++) {
+    in_grad[i] = in_data[i] > 0 ? 1 * out_grad[i] : 0;
+  }
+  return MX_SUCCESS;
+}
+
+REGISTER_OP(my_relu)
+.setParseAttrs(parseAttrs)
+.setInferType(inferType)
+.setInferShape(inferShape)
+.setForward(forwardCPU, "cpu")
+.setForward(forwardGPU, "gpu")
+.setBackward(backwardCPU, "cpu")
+.setBackward(backwardGPU, "gpu");
+
+
+MyStatefulReluCPU::MyStatefulReluCPU(const std::unordered_map<std::string, std::string>& attrs)
+  : attrs_(attrs) {}
+
+MXReturnValue MyStatefulReluCPU::Forward(std::vector<MXTensor>* inputs,
+                                         std::vector<MXTensor>* outputs,
+                                         const OpResource& op_res) {
+  return forwardCPU(attrs_, inputs, outputs, op_res);
+}
+
+MXReturnValue MyStatefulReluCPU::Backward(std::vector<MXTensor>* inputs,
+                                          std::vector<MXTensor>* outputs,
+                                          const OpResource& op_res) {
+  return backwardCPU(attrs_, inputs, outputs, op_res);
+}
+
+MyStatefulReluGPU::MyStatefulReluGPU(const std::unordered_map<std::string, std::string>& attrs)
+  : attrs_(attrs) {}
+
+MXReturnValue MyStatefulReluGPU::Forward(std::vector<MXTensor>* inputs,
+                                         std::vector<MXTensor>* outputs,
+                                         const OpResource& op_res) {
+  return forwardGPU(attrs_, inputs, outputs, op_res);
+}
+
+MXReturnValue MyStatefulReluGPU::Backward(std::vector<MXTensor>* inputs,
+                                          std::vector<MXTensor>* outputs,
+                                          const OpResource& op_res) {
+  return backwardGPU(attrs_, inputs, outputs, op_res);
+}
+
+
+MXReturnValue createOpStateCPU(const std::unordered_map<std::string, std::string>& attrs,
+                               CustomStatefulOp** op_inst) {
+  *op_inst = new MyStatefulReluCPU(attrs);
+  return MX_SUCCESS;
+}
+
+MXReturnValue createOpStateGPU(const std::unordered_map<std::string, std::string>& attrs,
+                               CustomStatefulOp** op_inst) {
+  *op_inst = new MyStatefulReluGPU(attrs);
+  return MX_SUCCESS;
+}
+
+REGISTER_OP(my_state_relu)
+.setParseAttrs(parseAttrs)
+.setInferType(inferType)
+.setInferShape(inferShape)
+.setCreateOpState(createOpStateCPU, "cpu")
+.setCreateOpState(createOpStateGPU, "gpu");
+
+MXReturnValue noisyForwardCPU(const std::unordered_map<std::string, std::string>& attrs,
+                              std::vector<MXTensor>* inputs,
+                              std::vector<MXTensor>* outputs,
+                              const OpResource& res) {
+  float* in_data = inputs->at(0).data<float>();
+  float* out_data = outputs->at(0).data<float>();
+
+  mx_cpu_rand_t* states = res.get_cpu_rand_states();
+  std::normal_distribution<float> dist_normal;
+
+  for (int i=0; i<inputs->at(0).size(); ++i) {
+    float noise = dist_normal(*states);
+    out_data[i] = in_data[i] + noise > 0 ? in_data[i] + noise : 0;
+  }
+  return MX_SUCCESS;
+}
+
+REGISTER_OP(my_noisy_relu)
+.setParseAttrs(parseAttrs)
+.setInferType(inferType)
+.setInferShape(inferShape)
+.setForward(noisyForwardCPU, "cpu")
+.setForward(noisyForwardGPU, "gpu")
+.setBackward(backwardCPU, "cpu")
+.setBackward(backwardGPU, "gpu");
+
+MXReturnValue initialize(int version) {
+  if (version >= 20000) {
+    std::cout << "MXNet version " << version << " supported" << std::endl;
+    return MX_SUCCESS;
+  } else {
+    MX_ERROR_MSG << "MXNet version " << version << " not supported";
+    return MX_FAIL;
+  }
+}
diff --git a/example/extensions/lib_custom_op/relu_lib.cu b/example/extensions/lib_custom_op/relu_lib.cu
index 34ce08db6373..c309274e61c6 100644
--- a/example/extensions/lib_custom_op/relu_lib.cu
+++ b/example/extensions/lib_custom_op/relu_lib.cu
@@ -24,49 +24,16 @@
  */
 
 #include <iostream>
-#include "mxnet/lib_api.h"
+#include "relu_lib.h"
 
 using namespace mxnet::ext;
 
-#define NumThreadPerBlock 256 // mxnet recommended cuda thread number per block
-
 __global__ void relu_gpu_forward(float *out, float *in, int64_t N) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < N)
     out[tid] = in[tid] > 0 ? in[tid] : 0;
 }
 
-__global__ void relu_gpu_backward(float *ingrad, float *outgrad, float *indata, int64_t N) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < N)
-    ingrad[tid] = indata[tid] > 0 ? 
1 * outgrad[tid] : 0; -} - -MXReturnValue forwardCPU(const std::unordered_map& attrs, - std::vector* inputs, - std::vector* outputs, - const OpResource& res) { - float* in_data = inputs->at(0).data(); - float* out_data = outputs->at(0).data(); - for (int i=0; iat(0).size(); i++) { - out_data[i] = in_data[i] > 0 ? in_data[i] : 0; - } - return MX_SUCCESS; -} - -MXReturnValue backwardCPU(const std::unordered_map& attrs, - std::vector* inputs, - std::vector* outputs, - const OpResource& res) { - float* out_grad = inputs->at(0).data(); - float* in_data = inputs->at(1).data(); - float* in_grad = outputs->at(0).data(); - for (int i=0; iat(1).size(); i++) { - in_grad[i] = in_data[i] > 0 ? 1 * out_grad[i] : 0; - } - return MX_SUCCESS; -} - MXReturnValue forwardGPU(const std::unordered_map& attrs, std::vector* inputs, std::vector* outputs, @@ -83,6 +50,12 @@ MXReturnValue forwardGPU(const std::unordered_map& att return MX_SUCCESS; } +__global__ void relu_gpu_backward(float *ingrad, float *outgrad, float *indata, int64_t N) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < N) + ingrad[tid] = indata[tid] > 0 ? 1 * outgrad[tid] : 0; +} + MXReturnValue backwardGPU(const std::unordered_map& attrs, std::vector* inputs, std::vector* outputs, @@ -99,102 +72,6 @@ MXReturnValue backwardGPU(const std::unordered_map& at return MX_SUCCESS; } -MXReturnValue parseAttrs(const std::unordered_map& attrs, - int* num_in, int* num_out) { - *num_in = 1; - *num_out = 1; - return MX_SUCCESS; -} - -MXReturnValue inferType(const std::unordered_map& attrs, - std::vector* intypes, - std::vector* outtypes) { - outtypes->at(0) = intypes->at(0); - return MX_SUCCESS; -} - -MXReturnValue inferShape(const std::unordered_map& attrs, - std::vector>* inshapes, - std::vector>* outshapes) { - outshapes->at(0) = inshapes->at(0); - return MX_SUCCESS; -} - -REGISTER_OP(my_relu) -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferShape(inferShape) -.setForward(forwardCPU, "cpu") -.setForward(forwardGPU, "gpu") -.setBackward(backwardCPU, "cpu") -.setBackward(backwardGPU, "gpu"); - -class MyStatefulReluCPU : public CustomStatefulOp { - public: - explicit MyStatefulReluCPU(const std::unordered_map& attrs) - : attrs_(attrs) {} - MXReturnValue Forward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res) { - return forwardCPU(attrs_, inputs, outputs, op_res); - } - MXReturnValue Backward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res) { - return backwardCPU(attrs_, inputs, outputs, op_res); - } - ~MyStatefulReluCPU() {} - private: - const std::unordered_map attrs_; -}; - -class MyStatefulReluGPU : public CustomStatefulOp { - public: - explicit MyStatefulReluGPU(const std::unordered_map& attrs) - : attrs_(attrs) {} - MXReturnValue Forward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res) { - return forwardGPU(attrs_, inputs, outputs, op_res); - } - MXReturnValue Backward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res) { - return backwardGPU(attrs_, inputs, outputs, op_res); - } - ~MyStatefulReluGPU() {} - private: - const std::unordered_map attrs_; -}; - -MXReturnValue createOpStateCPU(const std::unordered_map& attrs, - CustomStatefulOp** op_inst) { - *op_inst = new MyStatefulReluCPU(attrs); - return MX_SUCCESS; -} - -MXReturnValue createOpStateGPU(const std::unordered_map& attrs, - CustomStatefulOp** op_inst) { - *op_inst = new MyStatefulReluGPU(attrs); - return MX_SUCCESS; -} - -REGISTER_OP(my_state_relu) 
-.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferShape(inferShape) -.setCreateOpState(createOpStateCPU, "cpu") -.setCreateOpState(createOpStateGPU, "gpu"); - -/* - * Below is noisy ReLU operator example - * noisy ReLU is made from ReLU extended to include Gaussian noise - * forward - add Gaussian noise generated from normal distribution to each unit - * backward - gradient doesn't need to change since noise is constant - */ - -#define NumRandomPerThread 64 // mxnet recommended random numbers generated per thread - __global__ void noisy_relu_gpu_forward(float *out, float *in, int64_t N, mx_gpu_rand_t* states, int step) { // the launcher logic ensures tid less than NumGPURandomStates int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -209,23 +86,6 @@ __global__ void noisy_relu_gpu_forward(float *out, float *in, int64_t N, mx_gpu_ } } -MXReturnValue noisyForwardCPU(const std::unordered_map& attrs, - std::vector* inputs, - std::vector* outputs, - const OpResource& res) { - float* in_data = inputs->at(0).data(); - float* out_data = outputs->at(0).data(); - - mx_cpu_rand_t* states = res.get_cpu_rand_states(); - std::normal_distribution dist_normal; - - for (int i=0; iat(0).size(); ++i) { - float noise = dist_normal(*states); - out_data[i] = in_data[i] + noise > 0 ? in_data[i] + noise : 0; - } - return MX_SUCCESS; -} - MXReturnValue noisyForwardGPU(const std::unordered_map& attrs, std::vector* inputs, std::vector* outputs, @@ -250,22 +110,3 @@ MXReturnValue noisyForwardGPU(const std::unordered_map return MX_SUCCESS; } - -REGISTER_OP(my_noisy_relu) -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferShape(inferShape) -.setForward(noisyForwardCPU, "cpu") -.setForward(noisyForwardGPU, "gpu") -.setBackward(backwardCPU, "cpu") -.setBackward(backwardGPU, "gpu"); - -MXReturnValue initialize(int version) { - if (version >= 10700) { - std::cout << "MXNet version " << version << " supported" << std::endl; - return MX_SUCCESS; - } else { - MX_ERROR_MSG << "MXNet version " << version << " not supported"; - return MX_FAIL; - } -} diff --git a/example/extensions/lib_custom_op/relu_lib.h b/example/extensions/lib_custom_op/relu_lib.h new file mode 100644 index 000000000000..5aadfe930340 --- /dev/null +++ b/example/extensions/lib_custom_op/relu_lib.h @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
diff --git a/example/extensions/lib_custom_op/relu_lib.h b/example/extensions/lib_custom_op/relu_lib.h
new file mode 100644
index 000000000000..5aadfe930340
--- /dev/null
+++ b/example/extensions/lib_custom_op/relu_lib.h
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2020 by Contributors
+ * \file relu_lib.h
+ * \brief simple custom relu and noisy relu operators implemented using CUDA functions
+ */
+
+#ifndef __EXAMPLE__RELU_LIB_H__
+#define __EXAMPLE__RELU_LIB_H__
+
+#include <iostream>
+#include "mxnet/lib_api.h"
+
+using namespace mxnet::ext;
+
+#define NumThreadPerBlock 256 // mxnet recommended cuda thread number per block
+#define NumRandomPerThread 64 // mxnet recommended random numbers generated per thread
+
+class MyStatefulReluCPU : public CustomStatefulOp {
+ public:
+  explicit MyStatefulReluCPU(const std::unordered_map<std::string, std::string>& attrs);
+
+  MXReturnValue Forward(std::vector<MXTensor>* inputs,
+                        std::vector<MXTensor>* outputs,
+                        const OpResource& op_res);
+  MXReturnValue Backward(std::vector<MXTensor>* inputs,
+                         std::vector<MXTensor>* outputs,
+                         const OpResource& op_res);
+
+ private:
+  const std::unordered_map<std::string, std::string> attrs_;
+};
+
+class MyStatefulReluGPU : public CustomStatefulOp {
+ public:
+  explicit MyStatefulReluGPU(const std::unordered_map<std::string, std::string>& attrs);
+
+  MXReturnValue Forward(std::vector<MXTensor>* inputs,
+                        std::vector<MXTensor>* outputs,
+                        const OpResource& op_res);
+
+  MXReturnValue Backward(std::vector<MXTensor>* inputs,
+                         std::vector<MXTensor>* outputs,
+                         const OpResource& op_res);
+
+ private:
+  const std::unordered_map<std::string, std::string> attrs_;
+};
+
+MXReturnValue forwardGPU(const std::unordered_map<std::string, std::string>& attrs,
+                         std::vector<MXTensor>* inputs,
+                         std::vector<MXTensor>* outputs,
+                         const OpResource& res);
+
+MXReturnValue backwardGPU(const std::unordered_map<std::string, std::string>& attrs,
+                          std::vector<MXTensor>* inputs,
+                          std::vector<MXTensor>* outputs,
+                          const OpResource& res);
+
+/*
+ * Below is noisy ReLU operator example
+ * noisy ReLU is made from ReLU extended to include Gaussian noise
+ * forward - add Gaussian noise generated from normal distribution to each unit
+ * backward - gradient doesn't need to change since noise is constant
+ */
+
+MXReturnValue noisyForwardGPU(const std::unordered_map<std::string, std::string>& attrs,
+                              std::vector<MXTensor>* inputs,
+                              std::vector<MXTensor>* outputs,
+                              const OpResource& res);
+
+#endif
diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index ee028dfe60e2..a28a45f3595d 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -220,29 +220,18 @@ namespace ext {
 /* \brief Class to store error messages from extensions to pass to MXNet */
 class MXerrorMsgs {
  public:
-  /*!
-   * \brief get singleton pointer to class
-   * \returns pointer to class
-   */
-  static MXerrorMsgs* get() {
-    static MXerrorMsgs inst;
-    return &inst;
-  }
-  /*!
-   * \brief add a new error message
-   */
-  std::stringstream& add(const char* file, int line) {
-    messages.push_back(std::stringstream());
-    messages.back() << file << "[" << line << "]: ";
-    return messages.back();
-  }
-  int size() {
-    return messages.size();
-  }
-  const std::string* get(int idx) {
-    return new std::string(messages.at(idx).str());
-  }
-
+  /* \brief get singleton pointer to class */
+  static MXerrorMsgs* get();
+
+  /* \brief add a new error message */
+  std::stringstream& add(const char* file, int line);
+
+  /* \brief return number of error messages */
+  int size();
+
+  /* \brief get error message at index */
+  const std::string* get(int idx);
+
  private:
   /*! \brief constructor */
   MXerrorMsgs() {}
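
For context on how extension code reaches this singleton: the MX_ERROR_MSG macro used by the examples (see the removed initialize() in relu_lib.cu above) presumably expands to the add() method declared here. A sketch, not the verbatim macro definition:

    // sketch: MX_ERROR_MSG routes a message through the singleton with file/line context
    #define MX_ERROR_MSG mxnet::ext::MXerrorMsgs::get()->add(__FILE__, __LINE__)

    // typical use in an extension library:
    //   MX_ERROR_MSG << "MXNet version " << version << " not supported";
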
diff --git a/src/lib_api.cc b/src/lib_api.cc
index b00dcfaf9472..fbc3580184a7 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -31,14 +31,39 @@
 
 #include "mxnet/lib_api.h"
 
+mxnet::ext::MXerrorMsgs* mxnet::ext::MXerrorMsgs::get() {
+  static MXerrorMsgs inst;
+  return &inst;
+}
+
+std::stringstream& mxnet::ext::MXerrorMsgs::add(const char* file, int line) {
+  messages.push_back(std::stringstream());
+  messages.back() << file << "[" << line << "]: ";
+  return messages.back();
+}
+
+int mxnet::ext::MXerrorMsgs::size() {
+  return messages.size();
+}
+
+const std::string* mxnet::ext::MXerrorMsgs::get(int idx) {
+  return new std::string(messages.at(idx).str());
+}
+
 mxnet::ext::MXContext::MXContext() : dev_type("error"), dev_id(-1) {}
+
 mxnet::ext::MXContext::MXContext(std::string dev_type_, int dev_id_)
   : dev_type(std::move(dev_type_)), dev_id(dev_id_) {}
+
 mxnet::ext::MXContext::MXContext(const char* dev_type_, int dev_id_)
   : dev_type(dev_type_), dev_id(dev_id_) {}
+
 mxnet::ext::MXContext mxnet::ext::MXContext::CPU() { return MXContext("cpu", 0); }
+
 mxnet::ext::MXContext mxnet::ext::MXContext::GPU() { return MXContext("gpu", 0); }
+
 mxnet::ext::MXContext mxnet::ext::MXContext::CPU(int dev_id) { return MXContext("cpu", dev_id); }
+
 mxnet::ext::MXContext mxnet::ext::MXContext::GPU(int dev_id) { return MXContext("gpu", dev_id); }
 
 void mxnet::ext::MXSparse::set(void *data_ptr, const int64_t* dims, int ndims, void *idx,
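
A minimal usage sketch of the MXContext helpers defined above, assuming only what the definitions themselves show (the four factory functions and public dev_type/dev_id members):

    #include "mxnet/lib_api.h"

    int main() {
      using mxnet::ext::MXContext;
      MXContext cpu0 = MXContext::CPU();   // device id defaults to 0
      MXContext gpu3 = MXContext::GPU(3);  // explicit device id
      return (gpu3.dev_id == 3 && cpu0.dev_type == "cpu") ? 0 : 1;
    }
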
From 8fd4e2dcf5d878aaa54dc992a8a8825a52e9a322 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 28 Aug 2020 00:19:39 +0000
Subject: [PATCH 42/43] sanity

---
 include/mxnet/lib_api.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index a28a45f3595d..57267d420a33 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -222,7 +222,7 @@ class MXerrorMsgs {
  public:
   /* \brief get singleton pointer to class */
   static MXerrorMsgs* get();
-  
+
   /* \brief add a new error message */
   std::stringstream& add(const char* file, int line);
 
@@ -231,7 +231,7 @@ class MXerrorMsgs {
 
   /* \brief get error message at index */
   const std::string* get(int idx);
-  
+
  private:
   /*! \brief constructor */
   MXerrorMsgs() {}

From 2431c9e9543624d915f4887ff7a53ee6ddf743cf Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 28 Aug 2020 17:20:12 +0000
Subject: [PATCH 43/43] addressed initial review items

---
 config/linux_gpu.cmake                                   | 2 +-
 example/extensions/lib_custom_op/test_transposerowsp.py | 5 -----
 src/lib_api.cc                                           | 2 +-
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake
index af72ca2238c1..c75d2947d421 100644
--- a/config/linux_gpu.cmake
+++ b/config/linux_gpu.cmake
@@ -49,7 +49,7 @@ set(USE_CUDNN ON CACHE BOOL "Build with cudnn support, if found")
 #   - "All" for all available GPU architectures supported by the version of CUDA installed
 #   - "specific GPU architectures" by giving the compute capability number such as
 #     "7.0" or "7.0;7.5" (ie. sm_70 or sm_75) or you can specify the name like:
-#     "Volta" or "Volta;Turing".
+#     "Volta" or "Volta;Turing". Be sure not to quote the value (i.e. set 7.0, not "7.0").
 #   The value specified here is passed to cmake's CUDA_SELECT_NVCC_ARCH_FLAGS to
 #   obtain the compilation flags for nvcc.
 #
diff --git a/example/extensions/lib_custom_op/test_transposerowsp.py b/example/extensions/lib_custom_op/test_transposerowsp.py
index d2d2c2eeeb32..ef51deaba24a 100644
--- a/example/extensions/lib_custom_op/test_transposerowsp.py
+++ b/example/extensions/lib_custom_op/test_transposerowsp.py
@@ -56,20 +56,15 @@
 e = mx.sym.my_transposerowsp(d)
 f = mx.sym.my_state_transposerowsp(d, test_kw=200)
 
-#exe = e.bind(ctx=mx.cpu(),args={'d':a})
 block = mx.gluon.nn.SymbolBlock(e,[d])
-#out = exe.forward()
 out = block(a)
 print("Compute Results:")
 print(out)
 print("data:", out.data.asnumpy())
 print("indices:", out.indices.asnumpy())
 
-#exe2 = f.bind(ctx=mx.cpu(),args={'d':a})
 block2 = mx.gluon.nn.SymbolBlock(f,[d])
-#out2 = exe2.forward()
 out2 = block2(a)
-#out2 = exe2.forward()
 out2 = block2(a)
 print("Stateful Compute Result:")
 print("data:", out2.data.asnumpy())
diff --git a/src/lib_api.cc b/src/lib_api.cc
index fbc3580184a7..4181d8b70213 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -37,7 +37,7 @@ mxnet::ext::MXerrorMsgs* mxnet::ext::MXerrorMsgs::get() {
 }
 
 std::stringstream& mxnet::ext::MXerrorMsgs::add(const char* file, int line) {
-  messages.push_back(std::stringstream());
+  messages.emplace_back();
   messages.back() << file << "[" << line << "]: ";
   return messages.back();
 }
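
A note on the final hunk: std::stringstream is not copyable, so push_back(std::stringstream()) compiles only via the move constructor, while emplace_back() constructs the stream directly inside the vector and skips the temporary entirely. A standalone illustration:

    #include <sstream>
    #include <vector>

    int main() {
      std::vector<std::stringstream> messages;
      messages.emplace_back();                // constructed in place; no copy, no move
      messages.back() << __FILE__ << "[" << __LINE__ << "]: example message";
      return messages.size() == 1 ? 0 : 1;
    }
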