diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 4c63b35bade5..e6add64d9bad 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -79,6 +79,18 @@ struct Context { */ Context(int dev_mask, int dev_id) : dev_mask(dev_mask), dev_id(dev_id) {} + /*! + * \brief Comparator, used to enable Context as std::map key. + * \param b another context to compare + * \return true if this context is ordered before b, comparing dev_mask first and dev_id second + */ + inline bool operator<(const Context &b) const { + if (dev_mask == b.dev_mask) { + return dev_id < b.dev_id; + } else { + return dev_mask < b.dev_mask; + } + } /*! * \brief check if current context equals another one * \param b another context to compare diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index 943a50f63f6a..6983beab1e90 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include "./graph_executor.h" #include "./graph_algorithm.h" @@ -361,7 +362,7 @@ void GraphExecutor::InitDataEntryMemory() { } // use allocator to allocate memory. 
- GraphStorageAllocator allocator(&graph_); + GraphStorageAllocator allocator(&graph_, topo_order_); for (size_t i = 0; i < topo_order_.size(); ++i) { uint32_t nid = topo_order_[i]; if (!op_nodes_[nid].activated) continue; @@ -453,13 +454,9 @@ void GraphExecutor::InitDataEntryMemory() { } } -// simple unique context index of context -inline uint32_t UniqueContextIndex(const Context &ctx) { - if (ctx.dev_mask == cpu::kDevMask) return 0; - return ctx.dev_id + 1; -} - void GraphExecutor::InitResources() { + // maximum number of colors allowed in the coloring algorithm + const uint32_t kMaxNumColor = 8; // prepare for temp space allocation std::vector<uint32_t> req_temp_cnt(topo_order_.size(), 0); for (size_t i = 0; i < topo_order_.size(); ++i) { @@ -473,14 +470,14 @@ void GraphExecutor::InitResources() { CHECK_LE(cnt, 1) << "Node can only have one temp space request"; req_temp_cnt[nid] = cnt; } - uint32_t num_color = 16; + uint32_t num_color = kMaxNumColor; std::vector<uint32_t> req_temp_color; // use graph coloring to find node that won't run in parallel num_color = graph::ColorNodeGroup(graph_, topo_order_, req_temp_cnt, num_color, &req_temp_color); // cached resources temp space - std::map<uint32_t, std::map<uint32_t, Resource> > cached_temp; + std::map<Context, std::map<uint32_t, Resource> > cached_temp; total_allocated_temp_ = 0; // Resource allocation @@ -496,9 +493,8 @@ void GraphExecutor::InitResources() { const Context &ctx = op_nodes_[nid].ctx; if (req.type == ResourceRequest::kTempSpace) { uint32_t color = req_temp_color[nid]; - uint32_t ctx_id = UniqueContextIndex(ctx); // try to reuse graph in same color - std::map<uint32_t, Resource> &cmap = cached_temp[ctx_id]; + std::map<uint32_t, Resource> &cmap = cached_temp[ctx]; if (cmap.count(color) != 0) { requested.push_back(cmap.at(color)); } else { diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h index 5812e5c94b86..f6a989b27099 100644 --- a/src/symbol/graph_memory_allocator.h +++ b/src/symbol/graph_memory_allocator.h @@ -11,6 +11,7 @@ #include #include #include +#include "./graph_algorithm.h" namespace mxnet { 
/*! @@ -34,7 +35,9 @@ class GraphStorageAllocator { /*! \brief bad storage id */ static const StorageID kBadStorageID = -1; /*! \brief constructor to the graph memory allocator */ - explicit GraphStorageAllocator(StaticGraph *graph); + explicit GraphStorageAllocator( + StaticGraph *graph, + const std::vector<uint32_t>& topo_order) noexcept(false); /*! * \brief Request a memory. * \param ctx the context of the graph @@ -69,10 +72,12 @@ class GraphStorageAllocator { Context ctx; /*! \brief maximum size of the storage that is requested */ size_t max_size; + /*! \brief node index that released it last time */ + uint32_t released_by_node; /*! \brief the actual NDArray to hold the data */ NDArray data; /*! \brief constructor */ - StorageEntry() : max_size(0) {} + StorageEntry() : max_size(0), released_by_node(0) {} }; /*! * \brief Allocate a StorageID when Request cannot found existing ones. @@ -80,7 +85,11 @@ class GraphStorageAllocator { * \param shape shape of the NDArray we want */ StorageID Alloc(Context ctx, size_t size); - + /*! + * \brief Initialize the colors of graph nodes. + * \param topo_order the topological order in the graph. + */ + void InitColor(const std::vector<uint32_t> &topo_order); /*! \brief reference to the computation graph */ StaticGraph *graph_; /*! \brief all the resources available */ @@ -91,12 +100,39 @@ class GraphStorageAllocator { * \brief free list of storage entries, maps size to free list */ std::multimap<size_t, StorageEntry*> free_; + + /*! + * \brief color of nodes in the graph, used for auxiliary policy making. + */ + std::vector<uint32_t> node_color_; + /*! 
\brief number of colors used by the color based match algorithm; 1 means no color based match */ + uint32_t num_match_color_; }; // put implementation in header files for now -GraphStorageAllocator::GraphStorageAllocator(StaticGraph *graph) - : graph_(graph) { +GraphStorageAllocator::GraphStorageAllocator( + StaticGraph *graph, + const std::vector<uint32_t>& topo_order) noexcept(false) + : graph_(graph) , num_match_color_(0) { match_range_ = dmlc::GetEnv("MXNET_EXEC_MATCH_RANGE", 16); + // if we set this to 1, this means no color based match. + // color based match will cost a bit more memory usually + // but also enables more parallelization. + num_match_color_ = dmlc::GetEnv("MXNET_EXEC_MATCH_NUM_COLOR", 4); + this->InitColor(topo_order); +} + +void GraphStorageAllocator::InitColor(const std::vector<uint32_t>& topo_order) { + std::vector<uint32_t> importance(graph_->nodes.size(), 0); + for (size_t i = 0; i < topo_order.size(); ++i) { + uint32_t nid = topo_order[i]; + if (graph_->nodes[nid].is_variable()) continue; + importance[nid] = 1; + } + num_match_color_ = graph::ColorNodeGroup( + *graph_, topo_order, + importance, num_match_color_, + &node_color_); } GraphStorageAllocator::StorageID @@ -114,6 +150,7 @@ GraphStorageAllocator::StorageID GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) { // search memory block in [size / match_range_, size * match_range_) size_t size = shape.Size(); + if (match_range_ == 0) return this->Alloc(ctx, size); auto begin = free_.lower_bound(size / match_range_); auto mid = free_.lower_bound(size); auto end = free_.upper_bound(size * match_range_); @@ -122,6 +159,7 @@ GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) { for (auto it = mid; it != end; ++it) { StorageEntry *e = it->second; if (e->ctx != ctx) continue; + if (node_color_[e->released_by_node] != node_color_[node_id]) continue; // Use exect matching strategy e->max_size = std::max(size, e->max_size); // find a exact match, erase from map and return @@ -133,6 +171,7 @@ 
GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) { --it; StorageEntry *e = it->second; if (e->ctx != ctx) continue; + if (node_color_[e->released_by_node] != node_color_[node_id]) continue; // Use exect matching strategy e->max_size = std::max(size, e->max_size); // find a exact match, erase from map and return @@ -146,6 +185,7 @@ GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) { void GraphStorageAllocator::Release(StorageID id, uint32_t node_id) { CHECK_NE(id, kBadStorageID); StorageEntry *e = data_[id].get(); + e->released_by_node = node_id; free_.insert({e->max_size, e}); }