From c1868633c711b0b495cf9e956e94a5dc47d8a823 Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Fri, 19 Jul 2019 18:01:05 +0800 Subject: [PATCH 01/24] MKLDNN LBR-GRU Integration --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 381 +++++++++++++---------- src/operator/rnn-inl.h | 28 +- src/operator/rnn.cc | 141 +++++---- 3 files changed, 293 insertions(+), 257 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index ea8e07ea617c..2db46ea84fc7 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -39,6 +39,26 @@ namespace mxnet { namespace op { +struct MKLDNNRNNMemory { + std::vector concat_weight_memory; + std::vector concat_iter_memory; + std::vector x_memory; + std::vector hcx_memory; + std::vector wx_memory; + std::vector wh_memory; + std::vector bias_memory; + std::vector y_memory; + std::vector hcy_memory; + std::vector uni_states_memory; + std::vector concat_states_memory; + std::vector weight_layer_mems; + std::vector weight_iter_mems; + mkldnn::memory user_src_layer_memory_l; + + MKLDNNRNNMemory() : user_src_layer_memory_l( + null_memory(CpuEngine::Get()->get_engine())) {} +}; + static algorithm GetMKLDNNRNNAlgo(int mode, int* ngates, int* nstates) { @@ -52,7 +72,7 @@ static algorithm GetMKLDNNRNNAlgo(int mode, case rnn_enum::kGru: *ngates = 3; *nstates = 1; - algo = algorithm::vanilla_gru; + algo = algorithm::gru_linear_before_reset; break; case rnn_enum::kRnnRelu: case rnn_enum::kRnnTanh: @@ -73,35 +93,48 @@ static void ConcatData(mkldnn::memory::format src_format, mkldnn::memory::dims dst_cds, mkldnn::memory::data_type mkldnn_dtype, int concat_dimension, - std::vector srcs_data, - const mkldnn::memory &dst) { + const std::vector &srcs_data, + const mkldnn::memory &dst, + std::vector *tmp_src_mems) { auto cpu_engine = CpuEngine::Get()->get_engine(); std::vector srcs_pd; - std::vector srcs; + bool initialized = tmp_src_mems->size() > 0; for (size_t i = 0; i < srcs_cds.size(); i++) { auto desc = mkldnn::memory::desc(srcs_cds[i], mkldnn_dtype, src_format); auto mpd = mkldnn::memory::primitive_desc(desc, cpu_engine); - auto src_memory = mkldnn::memory(mpd, srcs_data[i]); srcs_pd.push_back(mpd); - srcs.push_back(src_memory); - } - std::vector inputs; - for (size_t i = 0; i < srcs_cds.size(); i++) { - inputs.push_back(srcs[i]); + if (initialized) { + tmp_src_mems->at(i).set_data_handle(srcs_data[i]); + } else { + auto src_memory = mkldnn::memory(mpd, srcs_data[i]); + tmp_src_mems->push_back(src_memory); + } } + std::vector inputs(tmp_src_mems->begin(), tmp_src_mems->end()); auto dst_desc = mkldnn::memory::desc(dst_cds, mkldnn_dtype, dst_format); auto concat_pd = concat::primitive_desc(dst_desc, concat_dimension, srcs_pd); MKLDNNStream::Get()->RegisterPrim(concat(concat_pd, inputs, dst)); - MKLDNNStream::Get()->Submit(); } -// cached mkldnn memory -// first layer wx, wh with next L - 1 layers wx and wh -// with L layers hx and cx, src and dst data/iter etc. -// it will prepare memory on before and after reorder and concat. -// for unidirectional, it will fused as dim like 1 + (L - 1) when I != H. -// for bidirectional, it will fused as data + back_data (weight, bias, iter etc), -// also need to identify first layer and next layers +/** + * Size of cached memory + * + * Cache memory of wx, wh from the first layer and next L - 1 layers + * seperately, as well as the layer and iter memory for src and dst. + * Output states memory hx, hc and bias memory are also cached. 
It + * will prepare memory on before and after reorder and concat. For + * unidirectional, it will fused as dim like 1 + (L - 1) when I != H. + * For bidirectional, it will fused as data + back_data (weight, bias, + * iter etc) + * + * @param L Number of Layers + * @param D Direction of the RNN implement. It should be 1 or 2. + * @param T The maximum sequence length. + * @param N Batch size. + * @param I Input channel. Also the dimension of the input feature. + * @param H Hidden state size. + * @return The required cache size. + */ static size_t GetMKLDNNRNNCacheMemorySize(int L, int D, int T, @@ -118,7 +151,7 @@ static size_t GetMKLDNNRNNCacheMemorySize(int L, break; case rnn_enum::kGru: size = 2 * (D * (I + H) * 3 * H + (L - 1) * D * (D * H + H) * 3 * H + - L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 3 * H + (L + 2) * D * 2 * N * H + + L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 4 * H + (L + 2) * D * 2 * N * H + 6 * D * (I + H + 2) * 3 * H + T * N * I * 2; break; case rnn_enum::kRnnRelu: @@ -177,7 +210,6 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, const int I, const int H, DType* x_ptr, - mkldnn::memory *user_src_layer_memory, DType* hx_ptr, DType* cx_ptr, DType* w_ptr, @@ -185,15 +217,7 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - std::vector *concat_weight_memory, - std::vector *concat_iter_memory, - std::vector *x_memory, - std::vector *hcx_memory, - std::vector *wx_memory, - std::vector *wh_memory, - std::vector *bias_memory, - std::vector *y_memory, - std::vector *hcy_memory, + MKLDNNRNNMemory *mkldnn_mems, std::vector *rnn_forward_prim, int layer_index, bool *has_cache, @@ -203,16 +227,17 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, int mode) { int ngates = 0, nstates = 0; algorithm nalgorithm = GetMKLDNNRNNAlgo(mode, &ngates, &nstates); + const int nbias = mode == rnn_enum::kGru ? 
ngates + 1 : ngates; mkldnn::memory::data_type mkldnn_dtype = get_mkldnn_type(dtype); const int single_cell_size = N * H; - const int single_b_size = ngates * H; + const int mx_single_b_sz = ngates * H; DType* wx = w_ptr; // ngates * H, I DType* wh = w_ptr + I * H * ngates; // ngates * H, H DType* back_wx = w_ptr + ngates * H * (I + H); DType* back_wh = back_wx + I * H * ngates; DType* bx = b_ptr; DType* bh = b_ptr + H * ngates; - DType* back_bx = b_ptr + single_b_size * 2; + DType* back_bx = b_ptr + mx_single_b_sz * 2; DType* back_bh = back_bx + H * ngates; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); auto cpu_engine = CpuEngine::Get()->get_engine(); @@ -225,54 +250,76 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, mkldnn::memory::dims weights_layer_r_tz = {1, 1, I, ngates, H}; // ldigo for reorder mkldnn::memory::dims weights_iter_tz = {1, 2, H, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_r_tz = {1, 1, H, ngates, H}; // ldigo for reorder - mkldnn::memory::dims bias_tz = {1, 2, ngates, H}; + mkldnn::memory::dims bias_tz = {1, 2, nbias, H}; // ldgo mkldnn::memory::dims src_iter_tz = {1, 2, nstates, N, H}; // ldsnc mkldnn::memory::dims dst_iter_tz = {1, 2, nstates, N, H}; // ldsnc - if (!initialized) { + bool has_adjusted = false; + if (!initialized || is_train) { if (mode == rnn_enum::kGru) { AdjustGruWeightGateOrder(wx, I, H); AdjustGruWeightGateOrder(back_wx, I, H); AdjustGruWeightGateOrder(wh, H, H); AdjustGruWeightGateOrder(back_wh, H, H); - AdjustGruBiasGateOrder(bx, H); - AdjustGruBiasGateOrder(back_bx, H); - AdjustGruBiasGateOrder(bh, H); - AdjustGruBiasGateOrder(back_bh, H); + has_adjusted = true; } - auto src_wx = (*concat_weight_memory)[2 * layer_index]; - auto src_wh = (*concat_weight_memory)[2 * layer_index + 1]; + auto src_wx = mkldnn_mems->concat_weight_memory[2 * layer_index]; + auto src_wh = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; std::vector srcs_data1; srcs_data1.push_back(wx); srcs_data1.push_back(back_wx); ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, {weights_layer_r_tz, weights_layer_r_tz}, weights_layer_tz, - mkldnn_dtype, 1, srcs_data1, src_wx); + mkldnn_dtype, 1, srcs_data1, src_wx, &(mkldnn_mems->weight_layer_mems)); srcs_data1.clear(); srcs_data1.push_back(wh); srcs_data1.push_back(back_wh); ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, {weights_iter_r_tz, weights_iter_r_tz}, weights_iter_tz, - mkldnn_dtype, 1, srcs_data1, src_wh); + mkldnn_dtype, 1, srcs_data1, src_wh, &(mkldnn_mems->weight_iter_mems)); int tmpvalue = 0; if (lvalue > 0) { tmpvalue = lvalue + 1; } - MKLDNNStream::Get()->RegisterPrim(reorder(src_wx, (*wx_memory)[tmpvalue])); - MKLDNNStream::Get()->RegisterPrim(reorder(src_wh, (*wh_memory)[tmpvalue])); + MKLDNNStream::Get()->RegisterPrim(reorder(src_wx, mkldnn_mems->wx_memory[tmpvalue])); + MKLDNNStream::Get()->RegisterPrim(reorder(src_wh, mkldnn_mems->wh_memory[tmpvalue])); DType* user_bias = reinterpret_cast - ((*bias_memory)[tmpvalue].get_data_handle()); - #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < single_b_size; j++) { - user_bias[j] = bx[j] + bh[j]; - user_bias[single_b_size + j] = back_bx[j] + back_bh[j]; + (mkldnn_mems->bias_memory[tmpvalue].get_data_handle()); + if (mode == rnn_enum::kGru) { + // While mxnet gru gate order is reset, update and new gates, + // mkldnn gru gate order is update, reset and new gates. 
So + // we need to swap the order of reset and update from mxnet. + const index_t single_b_sz = nbias * H; + #pragma omp parallel for num_threads(omp_threads) + for (int j = 0; j < H; j++) { + user_bias[j + H] = bx[j] + bh[j]; + user_bias[single_b_sz + j + H] = back_bx[j] + back_bh[j]; + user_bias[j] = bx[j + H] + bh[j + H]; + user_bias[single_b_sz + j] = back_bx[j + H] + back_bh[j + H]; + } + #pragma omp parallel for num_threads(omp_threads) + for (int j = 2 * H; j < 3 * H; j++) { + user_bias[j] = bx[j]; + user_bias[j + H] = bh[j]; + user_bias[single_b_sz + j] = back_bx[j]; + user_bias[single_b_sz + j + H] = back_bh[j]; + } + } else { + #pragma omp parallel for num_threads(omp_threads) + for (int j = 0; j < mx_single_b_sz; j++) { + user_bias[j] = bx[j] + bh[j]; + user_bias[mx_single_b_sz + j] = back_bx[j] + back_bh[j]; + } } } if (lvalue > 0) { - (*wx_memory)[layer_index].set_data_handle((*wx_memory)[lvalue + 1].get_data_handle()); - (*wh_memory)[layer_index].set_data_handle((*wh_memory)[lvalue + 1].get_data_handle()); - (*bias_memory)[layer_index].set_data_handle((*bias_memory)[lvalue + 1].get_data_handle()); + mkldnn_mems->wx_memory[layer_index].set_data_handle( + mkldnn_mems->wx_memory[lvalue + 1].get_data_handle()); + mkldnn_mems->wh_memory[layer_index].set_data_handle( + mkldnn_mems->wh_memory[lvalue + 1].get_data_handle()); + mkldnn_mems->bias_memory[layer_index].set_data_handle( + mkldnn_mems->bias_memory[lvalue + 1].get_data_handle()); } auto src_layer_md = mkldnn::memory::desc( @@ -290,32 +337,32 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, auto bias_md = mkldnn::memory::desc({bias_tz}, mkldnn_dtype, mkldnn::memory::format::ldgo); - auto user_src_iter_memory = (*concat_iter_memory)[2]; + auto user_src_iter_memory = mkldnn_mems->concat_iter_memory[2]; if (mode == rnn_enum::kLstm) { std::vector srcs_data1; srcs_data1.push_back(hx_ptr); srcs_data1.push_back(cx_ptr); - auto tmp1_src_iter_memory = (*concat_iter_memory)[0]; + auto tmp1_src_iter_memory = mkldnn_mems->concat_iter_memory[0]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, 2, - srcs_data1, tmp1_src_iter_memory); + srcs_data1, tmp1_src_iter_memory, &(mkldnn_mems->uni_states_memory)); std::vector srcs_data2; srcs_data2.push_back(hx_ptr + single_cell_size); srcs_data2.push_back(cx_ptr + single_cell_size); - auto tmp2_src_iter_memory = (*concat_iter_memory)[1]; + auto tmp2_src_iter_memory = mkldnn_mems->concat_iter_memory[1]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, 2, - srcs_data2, tmp2_src_iter_memory); + srcs_data2, tmp2_src_iter_memory, &(mkldnn_mems->uni_states_memory)); std::vector srcs_data3; srcs_data3.push_back(reinterpret_cast(tmp1_src_iter_memory.get_data_handle())); srcs_data3.push_back(reinterpret_cast(tmp2_src_iter_memory.get_data_handle())); ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, {{1, 1, nstates, N, H}, {1, 1, nstates, N, H}}, {1, 2, nstates, N, H}, - mkldnn_dtype, 1, srcs_data3, user_src_iter_memory); + mkldnn_dtype, 1, srcs_data3, user_src_iter_memory, &(mkldnn_mems->concat_states_memory)); } else { user_src_iter_memory.set_data_handle(hx_ptr); } - (*hcx_memory)[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); + mkldnn_mems->hcx_memory[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); rnn_cell::desc 
rnn_cell(nalgorithm, mode == rnn_enum::kRnnRelu ? algorithm::eltwise_relu : algorithm::eltwise_tanh); @@ -329,25 +376,25 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, = rnn_forward::primitive_desc(layer_desc, cpu_engine); if (x_ptr && layer_index == 0) { - (*x_memory)[layer_index].set_data_handle(x_ptr); + mkldnn_mems->x_memory[layer_index].set_data_handle(x_ptr); } else { - (*x_memory)[layer_index].set_data_handle((*user_src_layer_memory).get_data_handle()); + mkldnn_mems->x_memory[layer_index].set_data_handle( + mkldnn_mems->user_src_layer_memory_l.get_data_handle()); } - (*y_memory)[layer_index].set_data_handle(y_ptr); - + mkldnn_mems->y_memory[layer_index].set_data_handle(y_ptr); if (rnn_forward_prim->size() <= (size_t)layer_index) { - primitive rnn_prim = rnn_forward(prim_desc, (*x_memory)[layer_index], - (*hcx_memory)[layer_index], (*wx_memory)[layer_index], - (*wh_memory)[layer_index], (*bias_memory)[layer_index], - (*y_memory)[layer_index], - (*hcy_memory)[layer_index], null_memory_); + primitive rnn_prim = rnn_forward(prim_desc, mkldnn_mems->x_memory[layer_index], + mkldnn_mems->hcx_memory[layer_index], mkldnn_mems->wx_memory[layer_index], + mkldnn_mems->wh_memory[layer_index], mkldnn_mems->bias_memory[layer_index], + mkldnn_mems->y_memory[layer_index], + mkldnn_mems->hcy_memory[layer_index], null_memory_); rnn_forward_prim->push_back(rnn_prim); } MKLDNNStream::Get()->RegisterPrim((*rnn_forward_prim)[layer_index]); MKLDNNStream::Get()->Submit(); if (state_outputs) { - DType* dst_hcy = reinterpret_cast ((*hcy_memory)[layer_index].get_data_handle()); + DType* dst_hcy = reinterpret_cast(mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { offset1 = nstates * single_cell_size; offset2 = (nstates + 1) * single_cell_size; @@ -365,6 +412,12 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, } } } + if (has_adjusted) { + AdjustGruWeightGateOrder(wx, I, H); + AdjustGruWeightGateOrder(back_wx, I, H); + AdjustGruWeightGateOrder(wh, H, H); + AdjustGruWeightGateOrder(back_wh, H, H); + } } @@ -376,7 +429,6 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, const int I, const int H, DType* x_ptr, - mkldnn::memory *user_src_layer_memory, DType* hx_ptr, DType* cx_ptr, DType* w_ptr, @@ -384,15 +436,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - std::vector *concat_weight_memory, - std::vector *concat_iter_memory, - std::vector *x_memory, - std::vector *hcx_memory, - std::vector *wx_memory, - std::vector *wh_memory, - std::vector *bias_memory, - std::vector *y_memory, - std::vector *hcy_memory, + MKLDNNRNNMemory *mkldnn_mems, std::vector *rnn_forward_prim, int layer_index, bool *has_cache, @@ -401,10 +445,11 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, int mode) { int ngates = 0, nstates = 0; algorithm nalgorithm = GetMKLDNNRNNAlgo(mode, &ngates, &nstates); + const int nbias = (mode == rnn_enum::kGru ? 
ngates + 1 : ngates); mkldnn::memory::data_type mkldnn_dtype = get_mkldnn_type(dtype); const int cell_size = N * H; const int single_cell_size = N * H; - const int single_b_size = ngates * H; + const int single_b_size = nbias * H; int w_size = (I + H) * H * ngates; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); auto cpu_engine = CpuEngine::Get()->get_engine(); @@ -416,7 +461,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, mkldnn::memory::dims dst_layer_tz = {T, N, H}; mkldnn::memory::dims weights_layer_tz = {L, 1, I, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz = {L, 1, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {L, 1, ngates, H}; + mkldnn::memory::dims bias_tz = {L, 1, nbias, H}; // ldgo mkldnn::memory::dims src_iter_tz = {L, 1, nstates, N, H}; // ldsnc mkldnn::memory::dims dst_iter_tz = {L, 1, nstates, N, H}; // ldsnc mkldnn::memory::dims weights_layer_r_tz = {1, 1, I, ngates, H}; // ldigo for reorder @@ -442,12 +487,12 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, std::vector srcs_data; srcs_data.push_back(hx_ptr); srcs_data.push_back(cx_ptr); - auto tmp_src_iter_memory = (*concat_iter_memory)[l + layer_index]; + auto tmp_src_iter_memory = mkldnn_mems->concat_iter_memory[l + layer_index]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, - 2, srcs_data, tmp_src_iter_memory); + 2, srcs_data, tmp_src_iter_memory, &(mkldnn_mems->uni_states_memory)); } else { - (*concat_iter_memory)[l + layer_index].set_data_handle(hx_ptr); + mkldnn_mems->concat_iter_memory[l + layer_index].set_data_handle(hx_ptr); } hx_ptr += cell_size; if (mode == rnn_enum::kLstm) { @@ -457,71 +502,95 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, auto user_src_iter_memory = null_memory_; if (L == 1) { - user_src_iter_memory = (*concat_iter_memory)[layer_index]; + user_src_iter_memory = mkldnn_mems->concat_iter_memory[layer_index]; } else { - user_src_iter_memory = (*concat_iter_memory)[L + layer_index]; + user_src_iter_memory = mkldnn_mems->concat_iter_memory[L + layer_index]; std::vector src_l_data; std::vector src_l_dim; for (int l = 0; l < L; l++) { src_l_data.push_back(reinterpret_cast - ((*concat_iter_memory)[l + layer_index].get_data_handle())); + (mkldnn_mems->concat_iter_memory[l + layer_index].get_data_handle())); src_l_dim.push_back({1, 1, nstates, N, H}); } ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, src_l_dim, - {L, 1, nstates, N, H}, mkldnn_dtype, 0, src_l_data, user_src_iter_memory); + {L, 1, nstates, N, H}, mkldnn_dtype, 0, src_l_data, user_src_iter_memory, + &(mkldnn_mems->concat_states_memory)); } - (*hcx_memory)[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); + mkldnn_mems->hcx_memory[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); - auto src_wx_f = (*concat_weight_memory)[2 * layer_index]; - auto src_wh_f = (*concat_weight_memory)[2 * layer_index + 1]; + auto src_wx_f = mkldnn_mems->concat_weight_memory[2 * layer_index]; + auto src_wh_f = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; std::vector srcs_data_x; std::vector srcs_data_h; std::vector src_l_dim_x; std::vector src_l_dim_h; + + bool has_adjusted = false; if (!initialized) { if (L == 1) { DType* wx = w_ptr; - DType* wh = w_ptr + I * H * ngates; + DType* wh = wx + I * H * ngates; if (mode == rnn_enum::kGru) { AdjustGruWeightGateOrder(wx, I, H); AdjustGruWeightGateOrder(wh, 
H, H); - AdjustGruBiasGateOrder(b_ptr, H); - AdjustGruBiasGateOrder(b_ptr + H * ngates, H); + has_adjusted = true; } src_wx_f.set_data_handle(wx); src_wh_f.set_data_handle(wh); } else { for (int l = 0; l < L; l++) { - DType* wx = w_ptr; - DType* wh = w_ptr + I * H * ngates; - DType* bx = b_ptr + l * ngates * H * 2; - DType* bh = b_ptr + l * ngates * H * 2 + H * ngates; + DType* wx = w_ptr + l * w_size; + DType* wh = wx + I * H * ngates; if (mode == rnn_enum::kGru) { AdjustGruWeightGateOrder(wx, I, H); AdjustGruWeightGateOrder(wh, H, H); - AdjustGruBiasGateOrder(bx, H); - AdjustGruBiasGateOrder(bh, H); + has_adjusted = true; } srcs_data_x.push_back(wx); srcs_data_h.push_back(wh); src_l_dim_x.push_back(weights_layer_r_tz); src_l_dim_h.push_back(weights_iter_r_tz); - w_ptr = w_ptr + w_size; } ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, - src_l_dim_x, weights_layer_tz, mkldnn_dtype, 0, srcs_data_x, src_wx_f); + src_l_dim_x, weights_layer_tz, mkldnn_dtype, 0, srcs_data_x, src_wx_f, + &(mkldnn_mems->weight_layer_mems)); ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, - src_l_dim_h, weights_iter_tz, mkldnn_dtype, 0, srcs_data_h, src_wh_f); + src_l_dim_h, weights_iter_tz, mkldnn_dtype, 0, srcs_data_h, src_wh_f, + &(mkldnn_mems->weight_iter_mems)); } - MKLDNNStream::Get()->RegisterPrim(reorder(src_wx_f, (*wx_memory)[layer_index])); - MKLDNNStream::Get()->RegisterPrim(reorder(src_wh_f, (*wh_memory)[layer_index])); - - DType* user_bias_f = reinterpret_cast ((*bias_memory)[layer_index].get_data_handle()); - #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < L * single_b_size; j++) { - int k = j / single_b_size; - user_bias_f[j] = b_ptr[j + k * single_b_size] + b_ptr[j + k * single_b_size + single_b_size]; + MKLDNNStream::Get()->RegisterPrim(reorder(src_wx_f, mkldnn_mems->wx_memory[layer_index])); + MKLDNNStream::Get()->RegisterPrim(reorder(src_wh_f, mkldnn_mems->wh_memory[layer_index])); + + DType* user_bias_f = reinterpret_cast(mkldnn_mems->bias_memory[layer_index].get_data_handle()); + if (mode == rnn_enum::kGru) { + const int mx_single_b_sz = ngates * H; + for (int l = 0; l < L; l++) { + #pragma omp parallel for num_threads(omp_threads) + for (int g = 0; g < H; g++) { + // While mxnet gru gate order is reset, update and new gates, + // mkldnn gru gate order is update, reset and new gates. So + // we need to swap the order of reset and update from mxnet. 
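+          // The reset and update biases can simply be summed (bx + bh).
+          // The two new-gate biases are handled separately in the next loop:
+          // with the linear-before-reset formulation the hidden-side new-gate
+          // bias is applied inside the reset product, so mkldnn keeps it in an
+          // extra bias slot (hence nbias = ngates + 1).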
+ user_bias_f[g + H + l * single_b_size] = + b_ptr[g + l * mx_single_b_sz * 2] + + b_ptr[g + l * mx_single_b_sz * 2 + mx_single_b_sz]; + user_bias_f[g + l * single_b_size] = + b_ptr[g + H + l * mx_single_b_sz * 2] + + b_ptr[g + H + l * mx_single_b_sz * 2 + mx_single_b_sz]; + } + #pragma omp parallel for num_threads(omp_threads) + for (int g = 2 * H; g < 3 * H; g++) { + user_bias_f[g + l * single_b_size] = b_ptr[g + l * mx_single_b_sz * 2]; + user_bias_f[g + l * single_b_size + H] = + b_ptr[g + l * mx_single_b_sz * 2 + mx_single_b_sz]; + } + } + } else { + #pragma omp parallel for num_threads(omp_threads) + for (int j = 0; j < L * single_b_size; j++) { + int k = j / single_b_size; + user_bias_f[j] = b_ptr[j + k * single_b_size] + b_ptr[j + k * single_b_size + single_b_size]; + } } } @@ -537,25 +606,25 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, = rnn_forward::primitive_desc(layer_desc, cpu_engine); if (x_ptr && layer_index == 0) { - (*x_memory)[layer_index].set_data_handle(x_ptr); + mkldnn_mems->x_memory[layer_index].set_data_handle(x_ptr); } else { - (*x_memory)[layer_index].set_data_handle((*user_src_layer_memory).get_data_handle()); + mkldnn_mems->x_memory[layer_index].set_data_handle( + mkldnn_mems->user_src_layer_memory_l.get_data_handle()); } - (*y_memory)[layer_index].set_data_handle(y_ptr); - + mkldnn_mems->y_memory[layer_index].set_data_handle(y_ptr); if (rnn_forward_prim->size() <= (size_t)layer_index) { - primitive rnn_prim = rnn_forward(prim_desc, (*x_memory)[layer_index], - (*hcx_memory)[layer_index], (*wx_memory)[layer_index], - (*wh_memory)[layer_index], (*bias_memory)[layer_index], - (*y_memory)[layer_index], - (*hcy_memory)[layer_index], null_memory_); + primitive rnn_prim = rnn_forward(prim_desc, mkldnn_mems->x_memory[layer_index], + mkldnn_mems->hcx_memory[layer_index], mkldnn_mems->wx_memory[layer_index], + mkldnn_mems->wh_memory[layer_index], mkldnn_mems->bias_memory[layer_index], + mkldnn_mems->y_memory[layer_index], + mkldnn_mems->hcy_memory[layer_index], null_memory_); rnn_forward_prim->push_back(rnn_prim); } MKLDNNStream::Get()->RegisterPrim((*rnn_forward_prim)[layer_index]); MKLDNNStream::Get()->Submit(); if (state_outputs) { - DType* dst_hcy = reinterpret_cast ((*hcy_memory)[layer_index].get_data_handle()); + DType* dst_hcy = reinterpret_cast(mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { for (int l = 0; l < L; l++) { offset1 = l * single_cell_size; @@ -573,6 +642,14 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, } } } + if (has_adjusted) { + for (int l = 0; l < L; l++) { + DType* wx = w_ptr + l * w_size; + DType* wh = wx + I * H * ngates; + AdjustGruWeightGateOrder(wx, I, H); + AdjustGruWeightGateOrder(wh, H, H); + } + } } template @@ -591,15 +668,7 @@ static void MKLDNNRNNForward(bool state_outputs, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - std::vector *concat_weight_memory, - std::vector *concat_iter_memory, - std::vector *x_memory, - std::vector *hcx_memory, - std::vector *wx_memory, - std::vector *wh_memory, - std::vector *bias_memory, - std::vector *y_memory, - std::vector *hcy_memory, + MKLDNNRNNMemory *mkldnn_mems, std::vector *rnn_forward_prim, bool *has_cache, int dtype, @@ -611,33 +680,27 @@ static void MKLDNNRNNForward(bool state_outputs, const int cell_size = N * H * D; // First layer int w_size = (I + H) * H * ngates * D; - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto null_memory_ = null_memory(cpu_engine); DType* tmpNull = NULL; // when D = 1 and I == H, L layers can be 
fused together if (D == 1 && I == H) { - MKLDNNRNNForwardUnidi(state_outputs, L, T, N, I, H, x_ptr, &null_memory_, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, concat_weight_memory, - concat_iter_memory, x_memory, hcx_memory, wx_memory, wh_memory, - bias_memory, y_memory, hcy_memory, rnn_forward_prim, + MKLDNNRNNForwardUnidi(state_outputs, L, T, N, I, H, x_ptr, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } else { - auto user_src_layer_memory_l = null_memory_; if (D == 2) { - MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, I, H, x_ptr, &user_src_layer_memory_l, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, concat_weight_memory, - concat_iter_memory, x_memory, hcx_memory, wx_memory, wh_memory, - bias_memory, y_memory, hcy_memory, rnn_forward_prim, + MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, I, H, x_ptr, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, 0, dtype, is_train, mode); } else { - MKLDNNRNNForwardUnidi(state_outputs, 1, T, N, I, H, x_ptr, &user_src_layer_memory_l, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, concat_weight_memory, - concat_iter_memory, x_memory, hcx_memory, wx_memory, wh_memory, - bias_memory, y_memory, hcy_memory, rnn_forward_prim, + MKLDNNRNNForwardUnidi(state_outputs, 1, T, N, I, H, x_ptr, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } if (L > 1) { - user_src_layer_memory_l = (*y_memory)[0]; + mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[0]; // go to next L - 1 layers. // If D = 2, do it layer by layer. If D = 1, fused L - 1 layers w_ptr += w_size; @@ -656,12 +719,10 @@ static void MKLDNNRNNForward(bool state_outputs, cx_ptr += cell_size; } MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, D * H, H, tmpNull, - &user_src_layer_memory_l, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, - cy_ptr, concat_weight_memory, concat_iter_memory, x_memory, - hcx_memory, wx_memory, wh_memory, bias_memory, - y_memory, hcy_memory, rnn_forward_prim, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, + cy_ptr, mkldnn_mems, rnn_forward_prim, 1, has_cache, l + 1, dtype, is_train, mode); - user_src_layer_memory_l = (*y_memory)[1]; + mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[1]; w_ptr += w_size; b_ptr += b_size; } @@ -674,10 +735,8 @@ static void MKLDNNRNNForward(bool state_outputs, } } w_size = (H + H) * H * ngates; - MKLDNNRNNForwardUnidi(state_outputs, L - 1, T, N, H, H, tmpNull, &user_src_layer_memory_l, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, concat_weight_memory, - concat_iter_memory, x_memory, hcx_memory, wx_memory, - wh_memory, bias_memory, y_memory, hcy_memory, + MKLDNNRNNForwardUnidi(state_outputs, L - 1, T, N, H, H, tmpNull, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, 1, has_cache, dtype, is_train, mode); } } @@ -701,15 +760,7 @@ static void MKLDNNRNNForwardInference(bool state_outputs, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - std::vector* concat_weight_memory, - std::vector* concat_iter_memory, - std::vector *x_memory, - std::vector *hcx_memory, - std::vector *wx_memory, - std::vector *wh_memory, - std::vector *bias_memory, - std::vector *y_memory, - std::vector *hcy_memory, + MKLDNNRNNMemory *mkldnn_mems, std::vector *rnn_forward_prim, bool *has_cache, int dtype, @@ -723,9 +774,7 @@ static void MKLDNNRNNForwardInference(bool 
state_outputs, MKLDNNRNNForward(state_outputs, num_layers, direction, seq_length, batch_size, input_size, state_size, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, - concat_weight_memory, concat_iter_memory, x_memory, - hcx_memory, wx_memory, wh_memory, - bias_memory, y_memory, hcy_memory, rnn_forward_prim, + mkldnn_mems, rnn_forward_prim, has_cache, dtype, is_train, mode); break; default: diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 328e28de8537..e3a2bfb6a322 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -397,20 +397,12 @@ class RNNOp { RNNParam param_; Context ctx_; #if MXNET_USE_MKLDNN == 1 - std::vector concat_weight_memory; - std::vector concat_iter_memory; - std::vector rnn_forward_prim; - std::vector x_memory; - std::vector hcx_memory; - std::vector wx_memory; - std::vector wh_memory; - std::vector bias_memory; - std::vector y_memory; - std::vector hcy_memory; bool has_cache; bool init_mem_; size_t reserve_mem_size_; Storage::Handle mem_space_; + MKLDNNRNNMemory mkldnn_mems; + std::vector rnn_forward_prim; #endif explicit RNNOp(RNNParam param, Context ctx) { this->param_ = param; @@ -908,9 +900,7 @@ class RNNOp { param_.mode); } else { #if MXNET_USE_MKLDNN == 1 - if (dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1) && param_.mode != rnn_enum::kGru) { - // TODO(zixuanweeei): MKLDNN GRU has precision issue. A stable one - // will be added to MXNet when we figure out the issue. + if (dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1)) { int dtype = in_data[rnn_enum::kData].type_flag_; MKLDNNRNNForwardInference(param_.state_outputs, param_.num_layers, @@ -927,15 +917,7 @@ class RNNOp { y.dptr_, hy_ptr, cy_ptr, - &concat_weight_memory, - &concat_iter_memory, - &x_memory, - &hcx_memory, - &wx_memory, - &wh_memory, - &bias_memory, - &y_memory, - &hcy_memory, + &mkldnn_mems, &rnn_forward_prim, &has_cache, dtype, @@ -943,8 +925,6 @@ class RNNOp { param_.mode); } else { #endif - // Before integrating MKLDNN GRU fp32 inference - // using below code for keep func being OK const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 6a0dbd7a4e23..7edcbe5c61a9 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -260,13 +260,14 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, const RNNParam& param = op.param_; int ngates = 0, nstates = 0; GetMKLDNNRNNAlgo(param.mode, &ngates, &nstates); - int D = param.bidirectional ? 2 : 1; + const int D = param.bidirectional ? 2 : 1; Tensor x = in_blobs[rnn_enum::kData].get(s); - int T = x.shape_[0]; - int N = x.shape_[1]; - int I = x.shape_[2]; - int H = param.state_size; - int L = param.num_layers; + const int T = x.shape_[0]; + const int N = x.shape_[1]; + const int I = x.shape_[2]; + const int H = param.state_size; + const int L = param.num_layers; + const int nbias = param.mode == rnn_enum::kGru ? 
ngates + 1 : ngates; const size_t r_size = GetMKLDNNRNNCacheMemorySize(L, D, T, N, I, H, param.mode); if (op.init_mem_ && op.reserve_mem_size_ < r_size) { @@ -281,7 +282,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, op.init_mem_ = true; op.has_cache = false; } - if (op.has_cache && op.x_memory.size() == 0) { + if (op.has_cache && op.mkldnn_mems.x_memory.size() == 0) { op.has_cache = false; } @@ -291,16 +292,16 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, mkldnn::memory::dims dst_layer_tz = {T, N, D * H}; auto dst_layer_md = mkldnn::memory::desc( { dst_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); - if (op.x_memory.size() == 0) { + if (op.mkldnn_mems.x_memory.size() == 0) { if (D == 1 && I == H) { auto user_src_layer_md = mkldnn::memory::desc( { src_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); auto user_src_layer_memory_n = mkldnn::memory({ user_src_layer_md, cpu_engine }); - op.x_memory.push_back(user_src_layer_memory_n); + op.mkldnn_mems.x_memory.push_back(user_src_layer_memory_n); mkldnn::memory::dims weights_layer_tz = {L, 1, I, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz = {L, 1, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {L, 1, ngates, H}; + mkldnn::memory::dims bias_tz = {L, 1, nbias, H}; auto user_weight_layer_md = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldigo); auto user_weight_iter_md = mkldnn::memory::desc( @@ -310,21 +311,22 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* weight_layer_n = workptr; // L * I * ngates * H auto user_weight_layer_memory_n = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); - op.wx_memory.push_back(user_weight_layer_memory_n); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); DType* weight_iter_n = weight_layer_n + L * I * ngates * H; // L * H * ngates * H auto user_weight_iter_memory_n = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); - op.wh_memory.push_back(user_weight_iter_memory_n); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - DType* bias_n = weight_iter_n + L * H * ngates * H; // L * ngates * H + DType* bias_n = weight_iter_n + L * H * ngates * H; // Generally, L * ngates * H + // LBR-Gru, L * (ngates + 1) * H auto user_bias_memory_n = mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); - op.bias_memory.push_back(user_bias_memory_n); + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); auto wx_md_n = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldgoi); - DType* wx_n = bias_n + L * ngates * H; // L * ngates * I * H + DType* wx_n = bias_n + L * nbias * H; // L * ngates * I * H auto wx_memory_n = mkldnn::memory({ wx_md_n, cpu_engine }, wx_n); DType* wh_n = wx_n + L * ngates * I * H; // L * ngates * H * H @@ -333,8 +335,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto wh_memory_n = mkldnn::memory({ wh_md_n, cpu_engine }, wh_n); - op.concat_weight_memory.push_back(wx_memory_n); - op.concat_weight_memory.push_back(wh_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wx_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wh_memory_n); workptr = wh_n + L * ngates * H * H; mkldnn::memory::dims src_iter_tz_n1 = {1, 1, nstates, N, H}; // ldsnc @@ -344,7 +346,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_n1 = workptr; // nstates * N * H auto src_iter_memory_n1 = mkldnn::memory({ src_iter_md_n1, 
cpu_engine }, src_iter_n1); - op.concat_iter_memory.push_back(src_iter_memory_n1); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_n1); workptr = src_iter_n1 + nstates * N * H; } mkldnn::memory::dims src_iter_tz_n = {L, 1, nstates, N, H}; // ldsnc @@ -353,12 +355,12 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_n = workptr; // L * nstates * N * H auto src_iter_memory_n = mkldnn::memory({ src_iter_md_n, cpu_engine }, src_iter_n); - op.concat_iter_memory.push_back(src_iter_memory_n); - op.hcx_memory.push_back(src_iter_memory_n); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_n); + op.mkldnn_mems.hcx_memory.push_back(src_iter_memory_n); DType* dst_layer_n = src_iter_n + L * nstates * N * H; // T * N * D * H auto dst_layer_memory_n = mkldnn::memory({ dst_layer_md, cpu_engine }, dst_layer_n); - op.y_memory.push_back(dst_layer_memory_n); + op.mkldnn_mems.y_memory.push_back(dst_layer_memory_n); mkldnn::memory::dims dst_iter_tz_n = {L, 1, nstates, N, H}; // ldsnc auto dst_iter_md_n = mkldnn::memory::desc( @@ -366,18 +368,18 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* dst_iter_n = dst_layer_n + T * N * D * H; // L * nstates * N * H auto dst_iter_memory_n = mkldnn::memory({ dst_iter_md_n, cpu_engine }, dst_iter_n); - op.hcy_memory.push_back(dst_iter_memory_n); + op.mkldnn_mems.hcy_memory.push_back(dst_iter_memory_n); workptr = dst_iter_n + L * nstates * N * H; } else { auto user_src_layer_md_0 = mkldnn::memory::desc( { src_layer_tz_0 }, mkldnn_dtype, mkldnn::memory::format::tnc); auto user_src_layer_memory_0 = mkldnn::memory({ user_src_layer_md_0, cpu_engine }); - op.x_memory.push_back(user_src_layer_memory_0); + op.mkldnn_mems.x_memory.push_back(user_src_layer_memory_0); mkldnn::memory::dims weights_layer_tz_0 = {1, D, I, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz_0 = {1, D, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz_0 = {1, D, ngates, H}; + mkldnn::memory::dims bias_tz_0 = {1, D, nbias, H}; auto user_weight_layer_md_0 = mkldnn::memory::desc( { weights_layer_tz_0 }, mkldnn_dtype, mkldnn::memory::format::ldigo); auto user_weight_iter_md_0 = mkldnn::memory::desc( @@ -388,18 +390,19 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* weight_layer_0 = workptr; // D * I * ngates * H auto user_weight_layer_memory_0 = mkldnn::memory({ user_weight_layer_md_0, cpu_engine }, weight_layer_0); - op.wx_memory.push_back(user_weight_layer_memory_0); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_0); DType* weight_iter_0 = weight_layer_0 + D * I * ngates * H; // D * H * ngates * H auto user_weight_iter_memory_0 = mkldnn::memory({ user_weight_iter_md_0, cpu_engine }, weight_iter_0); - op.wh_memory.push_back(user_weight_iter_memory_0); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_0); - DType* bias_0 = weight_iter_0 + D * H * ngates * H; // D * ngates * H + DType* bias_0 = weight_iter_0 + D * H * ngates * H; // Generally, D * ngates * H + // LBR-Gru, D * (ngates + 1) * H auto user_bias_memory_0 = mkldnn::memory({ user_bias_md_0, cpu_engine }, bias_0); - op.bias_memory.push_back(user_bias_memory_0); - workptr = bias_0 + D * ngates * H; + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_0); + workptr = bias_0 + D * nbias * H; auto wx_md_0 = mkldnn::memory::desc( { weights_layer_tz_0 }, mkldnn_dtype, mkldnn::memory::format::ldgoi); @@ -416,8 +419,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, 
wh_memory_0.set_data_handle(wh_0); workptr = wh_0 + D * ngates * H * H; } - op.concat_weight_memory.push_back(wx_memory_0); - op.concat_weight_memory.push_back(wh_memory_0); + op.mkldnn_mems.concat_weight_memory.push_back(wx_memory_0); + op.mkldnn_mems.concat_weight_memory.push_back(wh_memory_0); mkldnn::memory::dims src_iter_undi_tz_0 = {1, 1, nstates, N, H}; // ldsnc auto src_iter_undi_md_0 = mkldnn::memory::desc( @@ -425,15 +428,15 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_undi_0 = workptr; // nstates * N * H auto src_iter_undi_memory_0 = mkldnn::memory({ src_iter_undi_md_0, cpu_engine }, src_iter_undi_0); - op.concat_iter_memory.push_back(src_iter_undi_memory_0); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_undi_memory_0); workptr = src_iter_undi_0 + nstates * N * H; if (D == 1) { - op.hcx_memory.push_back(src_iter_undi_memory_0); + op.mkldnn_mems.hcx_memory.push_back(src_iter_undi_memory_0); } else { DType* src_iter_undi2_0 = workptr; // nstates * N * H auto src_iter_undi2_memory_0 = mkldnn::memory({ src_iter_undi_md_0, cpu_engine }, src_iter_undi2_0); - op.concat_iter_memory.push_back(src_iter_undi2_memory_0); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_undi2_memory_0); mkldnn::memory::dims src_iter_tz_0 = {1, D, nstates, N, H}; // ldsnc auto src_iter_md_0 = mkldnn::memory::desc( @@ -441,15 +444,15 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_0 = src_iter_undi2_0 + nstates * N * H; // D * nstates * N * H auto src_iter_memory_0 = mkldnn::memory({ src_iter_md_0, cpu_engine }, src_iter_0); - op.concat_iter_memory.push_back(src_iter_memory_0); - op.hcx_memory.push_back(src_iter_memory_0); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_0); + op.mkldnn_mems.hcx_memory.push_back(src_iter_memory_0); workptr = src_iter_0 + D * nstates * N * H; } DType* dst_layer_0 = workptr; // T * N * D * H auto dst_layer_memory_0 = mkldnn::memory({ dst_layer_md, cpu_engine }, dst_layer_0); - op.y_memory.push_back(dst_layer_memory_0); + op.mkldnn_mems.y_memory.push_back(dst_layer_memory_0); mkldnn::memory::dims dst_iter_tz_0 = {1, D, nstates, N, H}; // ldsnc auto dst_iter_md_0 = mkldnn::memory::desc( @@ -457,7 +460,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* dst_iter_0 = dst_layer_0 + T * N * D * H; // D * nstates * N * H auto dst_iter_memory_0 = mkldnn::memory({ dst_iter_md_0, cpu_engine }, dst_iter_0); - op.hcy_memory.push_back(dst_iter_memory_0); + op.mkldnn_mems.hcy_memory.push_back(dst_iter_memory_0); workptr = dst_iter_0 + D * nstates * N * H; // next L - 1 layers @@ -465,11 +468,11 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto user_src_layer_md = mkldnn::memory::desc( { src_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); auto user_src_layer_memory = mkldnn::memory({ user_src_layer_md, cpu_engine }); - op.x_memory.push_back(user_src_layer_memory); + op.mkldnn_mems.x_memory.push_back(user_src_layer_memory); mkldnn::memory::dims weights_layer_tz = {L - 1, 1, H, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz = {L - 1, 1, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {L - 1, 1, ngates, H}; + mkldnn::memory::dims bias_tz = {L - 1, 1, nbias, H}; auto user_weight_layer_md = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldigo); auto user_weight_iter_md = mkldnn::memory::desc( @@ -480,22 +483,23 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* 
weight_layer_n = workptr; // (L - 1) * H * ngates * H auto user_weight_layer_memory_n = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); - op.wx_memory.push_back(user_weight_layer_memory_n); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); DType* weight_iter_n = weight_layer_n + (L - 1) * H * ngates * H; // (L - 1) * H * ngates * H auto user_weight_iter_memory_n = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); - op.wh_memory.push_back(user_weight_iter_memory_n); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - DType* bias_n = weight_iter_n + (L - 1) * H * ngates * H; // (L - 1) * ngates * H + DType* bias_n = weight_iter_n + (L - 1) * H * ngates * H; // Generally, (L - 1) * ngates * H + // LBR-Gru, (L -1) * (ngates + 1) * H auto user_bias_memory_n = mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); - op.bias_memory.push_back(user_bias_memory_n); + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); auto wx_md_n = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldgoi); - DType* wx_n = bias_n + (L - 1) * ngates * H; // (L - 1) * ngates * H * H + DType* wx_n = bias_n + (L - 1) * nbias * H; // (L - 1) * ngates * H * H auto wx_memory_n = mkldnn::memory({ wx_md_n, cpu_engine }, wx_n); DType* wh_n = wx_n + (L - 1) * ngates * H * H; // (L - 1) * ngates * H * H @@ -504,8 +508,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto wh_memory_n = mkldnn::memory({ wh_md_n, cpu_engine }, wh_n); - op.concat_weight_memory.push_back(wx_memory_n); - op.concat_weight_memory.push_back(wh_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wx_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wh_memory_n); workptr = wh_n + (L - 1) * ngates * H * H; mkldnn::memory::dims src_iter_tz_n1 = {1, 1, nstates, N, H}; // ldsnc @@ -515,7 +519,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_n1 = workptr; // nstates * N * H auto src_iter_memory_n1 = mkldnn::memory({ src_iter_md_n1, cpu_engine }, src_iter_n1); - op.concat_iter_memory.push_back(src_iter_memory_n1); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_n1); workptr = src_iter_n1 + nstates * N * H; } mkldnn::memory::dims src_iter_tz_n = {L - 1, 1, nstates, N, H}; // ldsnc @@ -524,13 +528,13 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_n = workptr; // (L - 1) * nstates * N * H auto src_iter_memory_n = mkldnn::memory({ src_iter_md_n, cpu_engine }, src_iter_n); - op.concat_iter_memory.push_back(src_iter_memory_n); - op.hcx_memory.push_back(src_iter_memory_n); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_n); + op.mkldnn_mems.hcx_memory.push_back(src_iter_memory_n); DType* dst_layer_n = src_iter_n + (L - 1) * nstates * N * H; // T * N * D * H auto dst_layer_memory_n = mkldnn::memory({ dst_layer_md, cpu_engine }, dst_layer_n); - op.y_memory.push_back(dst_layer_memory_n); + op.mkldnn_mems.y_memory.push_back(dst_layer_memory_n); mkldnn::memory::dims dst_iter_tz_n = {L - 1, 1, nstates, N, H}; // ldsnc auto dst_iter_md_n = mkldnn::memory::desc( @@ -538,13 +542,14 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* dst_iter_n = dst_layer_n + T * N * D * H; // (L - 1) * nstates * N * H auto dst_iter_memory_n = mkldnn::memory({ dst_iter_md_n, cpu_engine }, dst_iter_n); - op.hcy_memory.push_back(dst_iter_memory_n); + op.mkldnn_mems.hcy_memory.push_back(dst_iter_memory_n); + workptr = dst_iter_n + (L - 1) * 
nstates * N * H; } if (L > 1 && D == 2) { mkldnn::memory::dims weights_layer_tz = {1, D, H * D, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz = {1, D, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {1, D, ngates, H}; + mkldnn::memory::dims bias_tz = {1, D, nbias, H}; auto user_weight_layer_md = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldigo); auto user_weight_iter_md = mkldnn::memory::desc( @@ -555,7 +560,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto user_src_layer_md = mkldnn::memory::desc( { src_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); auto user_src_layer_memory = mkldnn::memory({ user_src_layer_md, cpu_engine }); - op.x_memory.push_back(user_src_layer_memory); + op.mkldnn_mems.x_memory.push_back(user_src_layer_memory); auto wx_md_n = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldgoi); @@ -566,19 +571,20 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* weight_layer_n = workptr; // D * (H * D) * ngates * H auto user_weight_layer_memory_n = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); - op.wx_memory.push_back(user_weight_layer_memory_n); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); DType* weight_iter_n = weight_layer_n + D * (H * D) * ngates * H; // D * H * ngates * H auto user_weight_iter_memory_n = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); - op.wh_memory.push_back(user_weight_iter_memory_n); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - DType* bias_n = weight_iter_n + D * H * ngates * H; // D * ngates * H + DType* bias_n = weight_iter_n + D * H * ngates * H; // Generally, D * ngates * H + // LBR-Gru, D * (ngates + 1) * H auto user_bias_memory_n = mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); - op.bias_memory.push_back(user_bias_memory_n); - workptr = bias_n + D * ngates * H; + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); + workptr = bias_n + D * nbias * H; } DType* wx_n = workptr; // D * ngates * (D * H) * H @@ -587,8 +593,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, mkldnn::memory({ wx_md_n, cpu_engine }, wx_n); auto wh_memory_n = mkldnn::memory({ wh_md_n, cpu_engine }, wh_n); - op.concat_weight_memory.push_back(wx_memory_n); - op.concat_weight_memory.push_back(wh_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wx_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wh_memory_n); mkldnn::memory::dims src_iter_undi_tz = {1, 1, nstates, N, H}; // ldsnc auto src_iter_undi_md = mkldnn::memory::desc( @@ -596,12 +602,12 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_undi = wh_n + D * ngates * H * H; // nstates * N * H auto src_iter_undi_memory = mkldnn::memory({ src_iter_undi_md, cpu_engine }, src_iter_undi); - op.concat_iter_memory.push_back(src_iter_undi_memory_0); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_undi_memory_0); DType* src_iter_undi2 = src_iter_undi + nstates * N * H; // nstates * N * H auto src_iter_undi2_memory = mkldnn::memory({ src_iter_undi_md, cpu_engine }, src_iter_undi2); - op.concat_iter_memory.push_back(src_iter_undi2_memory); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_undi2_memory); mkldnn::memory::dims src_iter_tz = {1, D, nstates, N, H}; // ldsnc auto src_iter_md = mkldnn::memory::desc( @@ -609,13 +615,13 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter = 
src_iter_undi2 + nstates * N * H; // D * nstates * N * H auto src_iter_memory = mkldnn::memory({ src_iter_md, cpu_engine }, src_iter); - op.concat_iter_memory.push_back(src_iter_memory); - op.hcx_memory.push_back(src_iter_memory); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory); + op.mkldnn_mems.hcx_memory.push_back(src_iter_memory); DType* dst_layer_n = src_iter + D * nstates * N * H; // T * N * D * H auto dst_layer_memory_n = mkldnn::memory({ dst_layer_md, cpu_engine }, dst_layer_n); - op.y_memory.push_back(dst_layer_memory_n); + op.mkldnn_mems.y_memory.push_back(dst_layer_memory_n); mkldnn::memory::dims dst_iter_tz_n = {1, D, nstates, N, H}; // ldsnc auto dst_iter_md_n = mkldnn::memory::desc( @@ -623,7 +629,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* dst_iter_n = dst_layer_n + T * N * D * H; // D * nstates * N * H auto dst_iter_memory_n = mkldnn::memory({ dst_iter_md_n, cpu_engine }, dst_iter_n); - op.hcy_memory.push_back(dst_iter_memory_n); + op.mkldnn_mems.hcy_memory.push_back(dst_iter_memory_n); + workptr = dst_iter_n + D * nstates * N * H; } } } From 1e1f799c322a7886f826a2a765f770c9ee443afd Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Thu, 25 Jul 2019 18:56:57 +0800 Subject: [PATCH 02/24] Readable params and UT supplement --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 86 +++----- src/operator/rnn.cc | 36 ++-- tests/python/unittest/test_operator.py | 255 +++++++++++++---------- 3 files changed, 191 insertions(+), 186 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index 2db46ea84fc7..ecb4b536516a 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -127,43 +127,35 @@ static void ConcatData(mkldnn::memory::format src_format, * For bidirectional, it will fused as data + back_data (weight, bias, * iter etc) * - * @param L Number of Layers - * @param D Direction of the RNN implement. It should be 1 or 2. - * @param T The maximum sequence length. - * @param N Batch size. - * @param I Input channel. Also the dimension of the input feature. - * @param H Hidden state size. + * @param num_layer Number of Layers + * @param direction Direction of the RNN implement. It should be 1 or 2. + * @param seq_len The maximum sequence length. + * @param batch_size Batch size. + * @param input_size Input channel. Also the dimension of the input feature. + * @param hidden_size Hidden state size. * @return The required cache size. 
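+ *
+ * Illustrative example (parameter values picked arbitrarily, not taken from
+ * the patch): a 1-layer unidirectional LSTM (n_gates = 4, n_states = 2,
+ * n_bias = 4) with seq_len = 1, batch_size = 1, input_size = 2 and
+ * hidden_size = 2 gives, per the formula below,
+ * (1*(2+2)*2)*4*2 + (1*2)*1*4 + (1*1*2)*1*2*2 + (1*1*2)*1*2 + (1*1*1*2)*2
+ * = 64 + 8 + 8 + 4 + 4 = 88 elements.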
*/ -static size_t GetMKLDNNRNNCacheMemorySize(int L, - int D, - int T, - int N, - int I, - int H, +static size_t GetMKLDNNRNNCacheMemorySize(int num_layer, + int direction, + int seq_len, + int batch_size, + int input_size, + int hidden_size, int mode) { - size_t size = 0; - switch (mode) { - case rnn_enum::kLstm: - size = 2 * (D * (I + H) * 4 * H + (L - 1) * D * (D * H + H) * 4 * H + - L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 4 * H + (L + 2) * D * 2 * N * H + - 6 * D * (I + H + 2) * 4 * H + T * N * I * 2; - break; - case rnn_enum::kGru: - size = 2 * (D * (I + H) * 3 * H + (L - 1) * D * (D * H + H) * 3 * H + - L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 4 * H + (L + 2) * D * 2 * N * H + - 6 * D * (I + H + 2) * 3 * H + T * N * I * 2; - break; - case rnn_enum::kRnnRelu: - case rnn_enum::kRnnTanh: - size = 2 * (D * (I + H) * 1 * H + (L - 1) * D * (D * H + H) * 1 * H + - L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 1 * H + (L + 2) * D * 2 * N * H + - 6 * D * (I + H + 2) * 1 * H + T * N * I * 2; - break; - default: - LOG(FATAL) << "unknown RNN mode " << mode; - break; - } + int n_gates = 0, n_states = 0; + GetMKLDNNRNNAlgo(mode, &n_gates, &n_states); + int n_bias = mode == rnn_enum::kGru ? n_gates + 1 : n_gates; + // sizes of single gates from a single cell + const size_t weights_size_0 = direction * (input_size + hidden_size) * hidden_size; + const size_t weights_size_n = direction * (direction * hidden_size + hidden_size) * hidden_size; + const size_t bias_size = direction * hidden_size; + const size_t src_iter_size = direction * batch_size * hidden_size; + const size_t dst_iter_size = direction * batch_size * hidden_size; + const size_t dst_layer_size = seq_len * batch_size * direction * hidden_size; + + size_t size = (weights_size_0 + weights_size_n * (num_layer - 1)) * n_gates * 2 + + bias_size * num_layer * n_bias + src_iter_size * num_layer * n_states * 2 + + dst_iter_size * num_layer * n_states + dst_layer_size * 2; return size; } @@ -221,7 +213,6 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, std::vector *rnn_forward_prim, int layer_index, bool *has_cache, - int lvalue, int dtype, bool is_train, int mode) { @@ -277,15 +268,12 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, {weights_iter_r_tz, weights_iter_r_tz}, weights_iter_tz, mkldnn_dtype, 1, srcs_data1, src_wh, &(mkldnn_mems->weight_iter_mems)); - int tmpvalue = 0; - if (lvalue > 0) { - tmpvalue = lvalue + 1; - } - MKLDNNStream::Get()->RegisterPrim(reorder(src_wx, mkldnn_mems->wx_memory[tmpvalue])); - MKLDNNStream::Get()->RegisterPrim(reorder(src_wh, mkldnn_mems->wh_memory[tmpvalue])); + + MKLDNNStream::Get()->RegisterPrim(reorder(src_wx, mkldnn_mems->wx_memory[layer_index])); + MKLDNNStream::Get()->RegisterPrim(reorder(src_wh, mkldnn_mems->wh_memory[layer_index])); DType* user_bias = reinterpret_cast - (mkldnn_mems->bias_memory[tmpvalue].get_data_handle()); + (mkldnn_mems->bias_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kGru) { // While mxnet gru gate order is reset, update and new gates, // mkldnn gru gate order is update, reset and new gates. 
So @@ -313,14 +301,6 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, } } } - if (lvalue > 0) { - mkldnn_mems->wx_memory[layer_index].set_data_handle( - mkldnn_mems->wx_memory[lvalue + 1].get_data_handle()); - mkldnn_mems->wh_memory[layer_index].set_data_handle( - mkldnn_mems->wh_memory[lvalue + 1].get_data_handle()); - mkldnn_mems->bias_memory[layer_index].set_data_handle( - mkldnn_mems->bias_memory[lvalue + 1].get_data_handle()); - } auto src_layer_md = mkldnn::memory::desc( { src_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); @@ -577,7 +557,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, user_bias_f[g + l * single_b_size] = b_ptr[g + H + l * mx_single_b_sz * 2] + b_ptr[g + H + l * mx_single_b_sz * 2 + mx_single_b_sz]; - } + } #pragma omp parallel for num_threads(omp_threads) for (int g = 2 * H; g < 3 * H; g++) { user_bias_f[g + l * single_b_size] = b_ptr[g + l * mx_single_b_sz * 2]; @@ -692,7 +672,7 @@ static void MKLDNNRNNForward(bool state_outputs, MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, I, H, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, - 0, has_cache, 0, dtype, is_train, mode); + 0, has_cache, dtype, is_train, mode); } else { MKLDNNRNNForwardUnidi(state_outputs, 1, T, N, I, H, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, @@ -721,7 +701,7 @@ static void MKLDNNRNNForward(bool state_outputs, MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, D * H, H, tmpNull, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, - 1, has_cache, l + 1, dtype, is_train, mode); + 1, has_cache, dtype, is_train, mode); mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[1]; w_ptr += w_size; b_ptr += b_size; diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 7edcbe5c61a9..c6d6890fd3d3 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -567,25 +567,23 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto wh_md_n = mkldnn::memory::desc( { weights_iter_tz }, mkldnn_dtype, mkldnn::memory::format::ldgoi); - for (int l = 0; l < L; l++) { - DType* weight_layer_n = workptr; // D * (H * D) * ngates * H - auto user_weight_layer_memory_n - = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); - op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); - - DType* weight_iter_n = weight_layer_n + - D * (H * D) * ngates * H; // D * H * ngates * H - auto user_weight_iter_memory_n - = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); - op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - - DType* bias_n = weight_iter_n + D * H * ngates * H; // Generally, D * ngates * H - // LBR-Gru, D * (ngates + 1) * H - auto user_bias_memory_n = - mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); - op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); - workptr = bias_n + D * nbias * H; - } + DType* weight_layer_n = workptr; // D * (H * D) * ngates * H + auto user_weight_layer_memory_n + = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); + + DType* weight_iter_n = weight_layer_n + + D * (H * D) * ngates * H; // D * H * ngates * H + auto user_weight_iter_memory_n + = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); + + DType* bias_n = weight_iter_n + D * H * ngates * H; // Generally, D * ngates * H + // LBR-Gru, D * (ngates + 1) 
* H + auto user_bias_memory_n = + mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); + workptr = bias_n + D * nbias * H; DType* wx_n = workptr; // D * ngates * (D * H) * H DType* wh_n = wx_n + D * ngates * (D * H) * H; // D * ngates * H * H diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d195ea9ef2f3..d875b224cd75 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -79,148 +79,175 @@ def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_lstm_sym(): - T, N, I, H = 5, 32, 800, 800 - fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='lstm', get_next_state=True, prefix='') - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.LSTMCell(H, prefix='l0_')) - stack.add(mx.rnn.LSTMCell(H, prefix='l1_')) - stack.add(mx.rnn.LSTMCell(H, prefix='l2_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='lstm', get_next_state=True, prefix='') + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.LSTMCell(H, prefix='l0_')) + stack.add(mx.rnn.LSTMCell(H, prefix='l1_')) + stack.add(mx.rnn.LSTMCell(H, prefix='l2_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_lstm_bidirectional(): - T, N, I, H = 5, 20, 800, 800 - fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='lstm', - bidirectional=True, get_next_state=True, prefix='') - - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.LSTMCell(H, prefix='l0_'), - mx.rnn.LSTMCell(H, prefix='r0_'), - output_prefix='bi_lstm_0_')) - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.LSTMCell(H, prefix='l1_'), - mx.rnn.LSTMCell(H, prefix='r1_'), - output_prefix='bi_lstm_1_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='lstm', + bidirectional=True, get_next_state=True, prefix='') + + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.LSTMCell(H, prefix='l0_'), + mx.rnn.LSTMCell(H, prefix='r0_'), + output_prefix='bi_lstm_0_')) + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.LSTMCell(H, prefix='l1_'), + mx.rnn.LSTMCell(H, prefix='r1_'), + output_prefix='bi_lstm_1_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_gru_sym(): - T, N, I, H = 5, 32, 800, 800 - fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='gru', get_next_state=True, prefix='') - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.GRUCell(H, prefix='l0_')) - 
stack.add(mx.rnn.GRUCell(H, prefix='l1_')) - stack.add(mx.rnn.GRUCell(H, prefix='l2_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='gru', get_next_state=True, prefix='') + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.GRUCell(H, prefix='l0_')) + stack.add(mx.rnn.GRUCell(H, prefix='l1_')) + stack.add(mx.rnn.GRUCell(H, prefix='l2_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_gru_bidirectional(): - T, N, I, H = 5, 20, 800, 800 - - fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='gru', - bidirectional=True, get_next_state=True, prefix='') - - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.GRUCell(H, prefix='l0_'), - mx.rnn.GRUCell(H, prefix='r0_'), - output_prefix='bi_gru_0_')) - - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.GRUCell(H, prefix='l1_'), - mx.rnn.GRUCell(H, prefix='r1_'), - output_prefix='bi_gru_1_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='gru', + bidirectional=True, get_next_state=True, prefix='') + + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.GRUCell(H, prefix='l0_'), + mx.rnn.GRUCell(H, prefix='r0_'), + output_prefix='bi_gru_0_')) + + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.GRUCell(H, prefix='l1_'), + mx.rnn.GRUCell(H, prefix='r1_'), + output_prefix='bi_gru_1_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnntanh_sym(): - T, N, I, H = 5, 32, 800, 800 - - fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_tanh', get_next_state=True, prefix='') - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l0_')) - stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l1_')) - stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l2_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_tanh', get_next_state=True, prefix='') + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l0_')) + stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l1_')) + stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l2_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 
'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnntanh_bidirectional(): - T, N, I, H = 5, 20, 800, 800 - - fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_tanh', - bidirectional=True, get_next_state=True, prefix='') - - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.RNNCell(H, activation='tanh', prefix='l0_'), - mx.rnn.RNNCell(H, activation='tanh', prefix='r0_'), - output_prefix='bi_rnntanh_0_')) - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.RNNCell(H, activation='tanh', prefix='l1_'), - mx.rnn.RNNCell(H, activation='tanh', prefix='r1_'), - output_prefix='bi_rnntanh_1_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_tanh', + bidirectional=True, get_next_state=True, prefix='') + + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.RNNCell(H, activation='tanh', prefix='l0_'), + mx.rnn.RNNCell(H, activation='tanh', prefix='r0_'), + output_prefix='bi_rnntanh_0_')) + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.RNNCell(H, activation='tanh', prefix='l1_'), + mx.rnn.RNNCell(H, activation='tanh', prefix='r1_'), + output_prefix='bi_rnntanh_1_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnnrelu_sym(): - T, N, I, H = 5, 32, 200, 200 - - fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_relu', get_next_state=True, prefix='') - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l0_')) - stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l1_')) - stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l2_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_relu', get_next_state=True, prefix='') + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l0_')) + stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l1_')) + stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l2_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnnrelu_bidirectional(): - T, N, I, H = 5, 20, 200, 200 - - fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_relu', - bidirectional=True, get_next_state=True, prefix='') - - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.RNNCell(H, activation='relu', prefix='l0_'), - mx.rnn.RNNCell(H, activation='relu', prefix='r0_'), - output_prefix='bi_rnnrelu_0_')) - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.RNNCell(H, activation='relu', prefix='l1_'), - mx.rnn.RNNCell(H, activation='relu', prefix='r1_'), - 
output_prefix='bi_rnnrelu_1_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write', rtol=1e-2, atol=1e-2) - check_rnn_consistency(fused, stack, T, N, I, H, 'add', rtol=1e-2, atol=1e-2) - check_rnn_consistency(fused, stack, T, N, I, H, 'null', rtol=1e-2, atol=1e-2) + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_relu', + bidirectional=True, get_next_state=True, prefix='') + + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.RNNCell(H, activation='relu', prefix='l0_'), + mx.rnn.RNNCell(H, activation='relu', prefix='r0_'), + output_prefix='bi_rnnrelu_0_')) + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.RNNCell(H, activation='relu', prefix='l1_'), + mx.rnn.RNNCell(H, activation='relu', prefix='r1_'), + output_prefix='bi_rnnrelu_1_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write', rtol=1e-2, atol=1e-2) + check_rnn_consistency(fused, stack, T, N, I, H, 'add', rtol=1e-2, atol=1e-2) + check_rnn_consistency(fused, stack, T, N, I, H, 'null', rtol=1e-2, atol=1e-2) @with_seed() def test_lstm_dropout(): From 49ebe014037b87b335706c91b13b4948a4172d7a Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Thu, 25 Jul 2019 19:18:51 +0800 Subject: [PATCH 03/24] Fix lint errors --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 12 ++++++++---- src/operator/rnn.cc | 5 +++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index ecb4b536516a..5422636f82f6 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -374,7 +374,8 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, MKLDNNStream::Get()->Submit(); if (state_outputs) { - DType* dst_hcy = reinterpret_cast(mkldnn_mems->hcy_memory[layer_index].get_data_handle()); + DType* dst_hcy = reinterpret_cast( + mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { offset1 = nstates * single_cell_size; offset2 = (nstates + 1) * single_cell_size; @@ -542,7 +543,8 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, MKLDNNStream::Get()->RegisterPrim(reorder(src_wx_f, mkldnn_mems->wx_memory[layer_index])); MKLDNNStream::Get()->RegisterPrim(reorder(src_wh_f, mkldnn_mems->wh_memory[layer_index])); - DType* user_bias_f = reinterpret_cast(mkldnn_mems->bias_memory[layer_index].get_data_handle()); + DType* user_bias_f = reinterpret_cast( + mkldnn_mems->bias_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kGru) { const int mx_single_b_sz = ngates * H; for (int l = 0; l < L; l++) { @@ -569,7 +571,8 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, #pragma omp parallel for num_threads(omp_threads) for (int j = 0; j < L * single_b_size; j++) { int k = j / single_b_size; - user_bias_f[j] = b_ptr[j + k * single_b_size] + b_ptr[j + k * single_b_size + single_b_size]; + user_bias_f[j] = b_ptr[j + k * single_b_size] + + b_ptr[j + k * single_b_size + single_b_size]; } } } @@ -604,7 +607,8 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, MKLDNNStream::Get()->Submit(); if (state_outputs) { - DType* dst_hcy = reinterpret_cast(mkldnn_mems->hcy_memory[layer_index].get_data_handle()); + DType* dst_hcy = reinterpret_cast( + mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { for (int l = 0; l < L; l++) { offset1 = l * single_cell_size; diff --git 
a/src/operator/rnn.cc b/src/operator/rnn.cc index c6d6890fd3d3..cc70beeb8e79 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -491,8 +491,9 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - DType* bias_n = weight_iter_n + (L - 1) * H * ngates * H; // Generally, (L - 1) * ngates * H - // LBR-Gru, (L -1) * (ngates + 1) * H + DType* bias_n = weight_iter_n + (L - 1) * H * ngates * H; // Generally, (L - 1) * + // ngates * H. LBR-Gru, + // (L -1) * (ngates + 1) * H auto user_bias_memory_n = mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); From 71a822a14a8fb6059677952d875007aa35b485f6 Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Mon, 29 Jul 2019 15:11:21 +0800 Subject: [PATCH 04/24] Retrigger CI From 2facb29459191c35d5231e432bf9a91cc4122bb7 Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Wed, 31 Jul 2019 09:26:30 +0800 Subject: [PATCH 05/24] Enable re-initialization with training path --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index 5422636f82f6..89f871719031 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -508,7 +508,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, std::vector src_l_dim_h; bool has_adjusted = false; - if (!initialized) { + if (!initialized || is_train) { if (L == 1) { DType* wx = w_ptr; DType* wh = wx + I * H * ngates; From a6ee56c1736122b2cf4beba50be3a6c704e6da30 Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Wed, 31 Jul 2019 18:43:55 +0800 Subject: [PATCH 06/24] Trigger CI From 302f8dc350030ec72503109fa45d60889ca0b8ec Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Fri, 2 Aug 2019 16:35:31 +0800 Subject: [PATCH 07/24] Type refine and meaningful params --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 370 +++++++++++------------ 1 file changed, 178 insertions(+), 192 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index 89f871719031..d71cbdfeacdf 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -92,13 +92,13 @@ static void ConcatData(mkldnn::memory::format src_format, std::vector srcs_cds, mkldnn::memory::dims dst_cds, mkldnn::memory::data_type mkldnn_dtype, - int concat_dimension, + const int concat_dimension, const std::vector &srcs_data, const mkldnn::memory &dst, std::vector *tmp_src_mems) { auto cpu_engine = CpuEngine::Get()->get_engine(); std::vector srcs_pd; - bool initialized = tmp_src_mems->size() > 0; + const bool initialized = tmp_src_mems->size() > 0; for (size_t i = 0; i < srcs_cds.size(); i++) { auto desc = mkldnn::memory::desc(srcs_cds[i], mkldnn_dtype, src_format); auto mpd = mkldnn::memory::primitive_desc(desc, cpu_engine); @@ -119,13 +119,13 @@ static void ConcatData(mkldnn::memory::format src_format, /** * Size of cached memory * - * Cache memory of wx, wh from the first layer and next L - 1 layers + * Cache memory of wx, wh from the first layer and next num_layer - 1 layers * seperately, as well as the layer and iter memory for src and dst. * Output states memory hx, hc and bias memory are also cached. It * will prepare memory on before and after reorder and concat. 
For - * unidirectional, it will fused as dim like 1 + (L - 1) when I != H. - * For bidirectional, it will fused as data + back_data (weight, bias, - * iter etc) + * unidirectional, it will fused as dim like 1 + (num_layer - 1) when + * input_size != hidden_size. For bidirectional, it will fused as data + + * back_data (weight, bias, iter etc) * * @param num_layer Number of Layers * @param direction Direction of the RNN implement. It should be 1 or 2. @@ -135,16 +135,16 @@ static void ConcatData(mkldnn::memory::format src_format, * @param hidden_size Hidden state size. * @return The required cache size. */ -static size_t GetMKLDNNRNNCacheMemorySize(int num_layer, - int direction, - int seq_len, - int batch_size, - int input_size, - int hidden_size, - int mode) { +static size_t GetMKLDNNRNNCacheMemorySize(const size_t num_layer, + const size_t direction, + const size_t seq_len, + const size_t batch_size, + const size_t input_size, + const size_t hidden_size, + const size_t mode) { int n_gates = 0, n_states = 0; GetMKLDNNRNNAlgo(mode, &n_gates, &n_states); - int n_bias = mode == rnn_enum::kGru ? n_gates + 1 : n_gates; + const size_t n_bias = mode == rnn_enum::kGru ? n_gates + 1 : n_gates; // sizes of single gates from a single cell const size_t weights_size_0 = direction * (input_size + hidden_size) * hidden_size; const size_t weights_size_n = direction * (direction * hidden_size + hidden_size) * hidden_size; @@ -161,46 +161,31 @@ static size_t GetMKLDNNRNNCacheMemorySize(int num_layer, template static void AdjustGruWeightGateOrder(DType* weight, - const int I, - const int H) { + const int input_size, + const int hidden_size) { // mxnet gru gate order is reset, update and new gates // mkldnn gru gate order is update, reset and new gates const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); DType* weight_reset = weight; - DType* weight_update = weight + I * H; + DType* weight_update = weight + input_size * hidden_size; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < I * H; i++) { + for (int i = 0; i < input_size * hidden_size; i++) { DType tmp = weight_update[i]; weight_update[i] = weight_reset[i]; weight_reset[i] = tmp; } } -template -static void AdjustGruBiasGateOrder(DType* bias, - const int H) { - // mxnet gru gate order is reset, update and new gates - // mkldnn gru gate order is update, reset and new gates - const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - DType* bias_reset = bias; - DType* bias_update = bias + H; - #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H; i++) { - DType tmp = bias_update[i]; - bias_update[i] = bias_reset[i]; - bias_reset[i] = tmp; - } -} // since there is different sematics of MKLDNN's Fused RNN and MXNet FusedRNN, // bidirectional will be fused layer by layer, -// unidirectional will be done by fused 1 + fused (L - 1) layers or fused L layers(when I = H) - +// unidirectional will be done by fused 1 + fused (num_layer - 1) layers or fused num_layer +// layers(when input_size = hidden_size) template static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, - const int T, - const int N, - const int I, - const int H, + const int seq_len, + const int batch_size, + const int input_size, + const int hidden_size, DType* x_ptr, DType* hx_ptr, DType* cx_ptr, @@ -220,42 +205,42 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, algorithm nalgorithm = GetMKLDNNRNNAlgo(mode, &ngates, &nstates); const int nbias = mode == rnn_enum::kGru ? 
ngates + 1 : ngates; mkldnn::memory::data_type mkldnn_dtype = get_mkldnn_type(dtype); - const int single_cell_size = N * H; - const int mx_single_b_sz = ngates * H; - DType* wx = w_ptr; // ngates * H, I - DType* wh = w_ptr + I * H * ngates; // ngates * H, H - DType* back_wx = w_ptr + ngates * H * (I + H); - DType* back_wh = back_wx + I * H * ngates; + const int single_cell_size = batch_size * hidden_size; + const int mx_single_b_sz = ngates * hidden_size; + DType* wx = w_ptr; // ngates * hidden_size, input_size + DType* wh = w_ptr + input_size * hidden_size * ngates; // ngates * hidden_size, hidden_size + DType* back_wx = w_ptr + ngates * hidden_size * (input_size + hidden_size); + DType* back_wh = back_wx + input_size * hidden_size * ngates; DType* bx = b_ptr; - DType* bh = b_ptr + H * ngates; + DType* bh = b_ptr + hidden_size * ngates; DType* back_bx = b_ptr + mx_single_b_sz * 2; - DType* back_bh = back_bx + H * ngates; + DType* back_bh = back_bx + hidden_size * ngates; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); auto cpu_engine = CpuEngine::Get()->get_engine(); auto null_memory_ = null_memory(cpu_engine); int offset1 = 0, offset2 = 0; bool initialized = *has_cache; - mkldnn::memory::dims src_layer_tz = {T, N, I}; - mkldnn::memory::dims dst_layer_tz = {T, N, 2 * H}; - mkldnn::memory::dims weights_layer_tz = {1, 2, I, ngates, H}; // ldigo - mkldnn::memory::dims weights_layer_r_tz = {1, 1, I, ngates, H}; // ldigo for reorder - mkldnn::memory::dims weights_iter_tz = {1, 2, H, ngates, H}; // ldigo - mkldnn::memory::dims weights_iter_r_tz = {1, 1, H, ngates, H}; // ldigo for reorder - mkldnn::memory::dims bias_tz = {1, 2, nbias, H}; // ldgo - mkldnn::memory::dims src_iter_tz = {1, 2, nstates, N, H}; // ldsnc - mkldnn::memory::dims dst_iter_tz = {1, 2, nstates, N, H}; // ldsnc + mkldnn::memory::dims src_layer_tz = {seq_len, batch_size, input_size}; + mkldnn::memory::dims dst_layer_tz = {seq_len, batch_size, 2 * hidden_size}; + mkldnn::memory::dims weights_layer_tz = {1, 2, input_size, ngates, hidden_size}; // ldigo + mkldnn::memory::dims weights_iter_tz = {1, 2, hidden_size, ngates, hidden_size}; // ldigo + mkldnn::memory::dims bias_tz = {1, 2, nbias, hidden_size}; // ldgo + mkldnn::memory::dims src_iter_tz = {1, 2, nstates, batch_size, hidden_size}; // ldsnc + mkldnn::memory::dims dst_iter_tz = {1, 2, nstates, batch_size, hidden_size}; // ldsnc + mkldnn::memory::dims weights_layer_r_tz = {1, 1, input_size, ngates, hidden_size}; + mkldnn::memory::dims weights_iter_r_tz = {1, 1, hidden_size, ngates, hidden_size}; bool has_adjusted = false; if (!initialized || is_train) { if (mode == rnn_enum::kGru) { - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(back_wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); - AdjustGruWeightGateOrder(back_wh, H, H); + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(back_wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); + AdjustGruWeightGateOrder(back_wh, hidden_size, hidden_size); has_adjusted = true; } - auto src_wx = mkldnn_mems->concat_weight_memory[2 * layer_index]; - auto src_wh = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; + mkldnn::memory& src_wx = mkldnn_mems->concat_weight_memory[2 * layer_index]; + mkldnn::memory& src_wh = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; std::vector srcs_data1; srcs_data1.push_back(wx); srcs_data1.push_back(back_wx); @@ -278,20 +263,19 @@ static void 
MKLDNNRNNForwardSingleLayerBi(bool state_outputs, // While mxnet gru gate order is reset, update and new gates, // mkldnn gru gate order is update, reset and new gates. So // we need to swap the order of reset and update from mxnet. - const index_t single_b_sz = nbias * H; - #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < H; j++) { - user_bias[j + H] = bx[j] + bh[j]; - user_bias[single_b_sz + j + H] = back_bx[j] + back_bh[j]; - user_bias[j] = bx[j + H] + bh[j + H]; - user_bias[single_b_sz + j] = back_bx[j + H] + back_bh[j + H]; - } + const index_t single_b_sz = nbias * hidden_size; #pragma omp parallel for num_threads(omp_threads) - for (int j = 2 * H; j < 3 * H; j++) { - user_bias[j] = bx[j]; - user_bias[j + H] = bh[j]; - user_bias[single_b_sz + j] = back_bx[j]; - user_bias[single_b_sz + j + H] = back_bh[j]; + for (int j = 0; j < hidden_size; j++) { + user_bias[j + hidden_size] = bx[j] + bh[j]; + user_bias[single_b_sz + j + hidden_size] = back_bx[j] + back_bh[j]; + + user_bias[j] = bx[j + hidden_size] + bh[j + hidden_size]; + user_bias[single_b_sz + j] = back_bx[j + hidden_size] + back_bh[j + hidden_size]; + + user_bias[j + 2 * hidden_size] = bx[j + 2 * hidden_size]; + user_bias[j + 3 * hidden_size] = bh[j + 2 * hidden_size]; + user_bias[single_b_sz + j + 2 * hidden_size] = back_bx[j + 2 * hidden_size]; + user_bias[single_b_sz + j + 3 * hidden_size] = back_bh[j + 2 * hidden_size]; } } else { #pragma omp parallel for num_threads(omp_threads) @@ -313,32 +297,35 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, auto dst_iter_md = mkldnn::memory::desc( { dst_iter_tz }, mkldnn_dtype, mkldnn::memory::format::ldsnc); auto src_iter_md = mkldnn::memory::desc( - {src_iter_tz}, mkldnn_dtype, mkldnn::memory::format::ldsnc); - auto bias_md = mkldnn::memory::desc({bias_tz}, - mkldnn_dtype, mkldnn::memory::format::ldgo); + { src_iter_tz }, mkldnn_dtype, mkldnn::memory::format::ldsnc); + auto bias_md = mkldnn::memory::desc( + { bias_tz }, mkldnn_dtype, mkldnn::memory::format::ldgo); - auto user_src_iter_memory = mkldnn_mems->concat_iter_memory[2]; + mkldnn::memory& user_src_iter_memory = mkldnn_mems->concat_iter_memory[2]; if (mode == rnn_enum::kLstm) { std::vector srcs_data1; srcs_data1.push_back(hx_ptr); srcs_data1.push_back(cx_ptr); - auto tmp1_src_iter_memory = mkldnn_mems->concat_iter_memory[0]; + mkldnn::memory& tmp1_src_iter_memory = mkldnn_mems->concat_iter_memory[0]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, - {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, 2, - srcs_data1, tmp1_src_iter_memory, &(mkldnn_mems->uni_states_memory)); + {{1, 1, 1, batch_size, hidden_size}, {1, 1, 1, batch_size, hidden_size}}, + {1, 1, nstates, batch_size, hidden_size}, mkldnn_dtype, 2, srcs_data1, + tmp1_src_iter_memory, &(mkldnn_mems->uni_states_memory)); std::vector srcs_data2; srcs_data2.push_back(hx_ptr + single_cell_size); srcs_data2.push_back(cx_ptr + single_cell_size); - auto tmp2_src_iter_memory = mkldnn_mems->concat_iter_memory[1]; + mkldnn::memory& tmp2_src_iter_memory = mkldnn_mems->concat_iter_memory[1]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, - {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, 2, - srcs_data2, tmp2_src_iter_memory, &(mkldnn_mems->uni_states_memory)); + {{1, 1, 1, batch_size, hidden_size}, {1, 1, 1, batch_size, hidden_size}}, + {1, 1, nstates, batch_size, hidden_size}, mkldnn_dtype, 2, srcs_data2, + tmp2_src_iter_memory, 
&(mkldnn_mems->uni_states_memory)); std::vector srcs_data3; srcs_data3.push_back(reinterpret_cast(tmp1_src_iter_memory.get_data_handle())); srcs_data3.push_back(reinterpret_cast(tmp2_src_iter_memory.get_data_handle())); ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, - {{1, 1, nstates, N, H}, {1, 1, nstates, N, H}}, {1, 2, nstates, N, H}, - mkldnn_dtype, 1, srcs_data3, user_src_iter_memory, &(mkldnn_mems->concat_states_memory)); + {{1, 1, nstates, batch_size, hidden_size}, {1, 1, nstates, batch_size, hidden_size}}, + {1, 2, nstates, batch_size, hidden_size}, mkldnn_dtype, 1, srcs_data3, + user_src_iter_memory, &(mkldnn_mems->concat_states_memory)); } else { user_src_iter_memory.set_data_handle(hx_ptr); } @@ -394,21 +381,21 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, } } if (has_adjusted) { - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(back_wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); - AdjustGruWeightGateOrder(back_wh, H, H); + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(back_wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); + AdjustGruWeightGateOrder(back_wh, hidden_size, hidden_size); } } template -static void MKLDNNRNNForwardUnidi(bool state_outputs, - const int L, - const int T, - const int N, - const int I, - const int H, +static void MKLDNNRNNForwardUnidi(const bool state_outputs, + const int num_layer, + const int seq_len, + const int batch_size, + const int input_size, + const int hidden_size, DType* x_ptr, DType* hx_ptr, DType* cx_ptr, @@ -428,25 +415,25 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, algorithm nalgorithm = GetMKLDNNRNNAlgo(mode, &ngates, &nstates); const int nbias = (mode == rnn_enum::kGru ? 
ngates + 1 : ngates); mkldnn::memory::data_type mkldnn_dtype = get_mkldnn_type(dtype); - const int cell_size = N * H; - const int single_cell_size = N * H; - const int single_b_size = nbias * H; - int w_size = (I + H) * H * ngates; + const int cell_size = batch_size * hidden_size; + const int single_cell_size = batch_size * hidden_size; + const int single_b_size = nbias * hidden_size; + const int w_size = (input_size + hidden_size) * hidden_size * ngates; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); auto cpu_engine = CpuEngine::Get()->get_engine(); auto null_memory_ = null_memory(cpu_engine); int offset1 = 0, offset2 = 0; bool initialized = *has_cache; - mkldnn::memory::dims src_layer_tz = {T, N, I}; - mkldnn::memory::dims dst_layer_tz = {T, N, H}; - mkldnn::memory::dims weights_layer_tz = {L, 1, I, ngates, H}; // ldigo - mkldnn::memory::dims weights_iter_tz = {L, 1, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {L, 1, nbias, H}; // ldgo - mkldnn::memory::dims src_iter_tz = {L, 1, nstates, N, H}; // ldsnc - mkldnn::memory::dims dst_iter_tz = {L, 1, nstates, N, H}; // ldsnc - mkldnn::memory::dims weights_layer_r_tz = {1, 1, I, ngates, H}; // ldigo for reorder - mkldnn::memory::dims weights_iter_r_tz = {1, 1, H, ngates, H}; // ldigo for reorder + mkldnn::memory::dims src_layer_tz = {seq_len, batch_size, input_size}; + mkldnn::memory::dims dst_layer_tz = {seq_len, batch_size, hidden_size}; + mkldnn::memory::dims weights_layer_tz = {num_layer, 1, input_size, ngates, hidden_size}; // ldigo + mkldnn::memory::dims weights_iter_tz = {num_layer, 1, hidden_size, ngates, hidden_size}; // ldigo + mkldnn::memory::dims bias_tz = {num_layer, 1, nbias, hidden_size}; // ldgo + mkldnn::memory::dims src_iter_tz = {num_layer, 1, nstates, batch_size, hidden_size}; // ldsnc + mkldnn::memory::dims dst_iter_tz = {num_layer, 1, nstates, batch_size, hidden_size}; // ldsnc + mkldnn::memory::dims weights_layer_r_tz = {1, 1, input_size, ngates, hidden_size}; + mkldnn::memory::dims weights_iter_r_tz = {1, 1, hidden_size, ngates, hidden_size}; auto weight_layer_md = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldigo); @@ -463,15 +450,16 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, auto dst_iter_md = mkldnn::memory::desc( {dst_iter_tz}, mkldnn_dtype, mkldnn::memory::format::ldsnc); - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { if (mode == rnn_enum::kLstm) { std::vector srcs_data; srcs_data.push_back(hx_ptr); srcs_data.push_back(cx_ptr); - auto tmp_src_iter_memory = mkldnn_mems->concat_iter_memory[l + layer_index]; + mkldnn::memory& tmp_src_iter_memory = mkldnn_mems->concat_iter_memory[l + layer_index]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, - {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, - 2, srcs_data, tmp_src_iter_memory, &(mkldnn_mems->uni_states_memory)); + {{1, 1, 1, batch_size, hidden_size}, {1, 1, 1, batch_size, hidden_size}}, + {1, 1, nstates, batch_size, hidden_size}, mkldnn_dtype, 2, srcs_data, + tmp_src_iter_memory, &(mkldnn_mems->uni_states_memory)); } else { mkldnn_mems->concat_iter_memory[l + layer_index].set_data_handle(hx_ptr); } @@ -481,26 +469,26 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, } } - auto user_src_iter_memory = null_memory_; - if (L == 1) { - user_src_iter_memory = mkldnn_mems->concat_iter_memory[layer_index]; + mkldnn::memory* user_src_iter_memory; + if (num_layer == 1) { + 
user_src_iter_memory = &(mkldnn_mems->concat_iter_memory[layer_index]); } else { - user_src_iter_memory = mkldnn_mems->concat_iter_memory[L + layer_index]; + user_src_iter_memory = &(mkldnn_mems->concat_iter_memory[num_layer + layer_index]); std::vector src_l_data; std::vector src_l_dim; - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { src_l_data.push_back(reinterpret_cast (mkldnn_mems->concat_iter_memory[l + layer_index].get_data_handle())); - src_l_dim.push_back({1, 1, nstates, N, H}); + src_l_dim.push_back({1, 1, nstates, batch_size, hidden_size}); } ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, src_l_dim, - {L, 1, nstates, N, H}, mkldnn_dtype, 0, src_l_data, user_src_iter_memory, - &(mkldnn_mems->concat_states_memory)); + {num_layer, 1, nstates, batch_size, hidden_size}, mkldnn_dtype, 0, src_l_data, + *user_src_iter_memory, &(mkldnn_mems->concat_states_memory)); } - mkldnn_mems->hcx_memory[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); + mkldnn_mems->hcx_memory[layer_index].set_data_handle(user_src_iter_memory->get_data_handle()); - auto src_wx_f = mkldnn_mems->concat_weight_memory[2 * layer_index]; - auto src_wh_f = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; + mkldnn::memory& src_wx_f = mkldnn_mems->concat_weight_memory[2 * layer_index]; + mkldnn::memory& src_wh_f = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; std::vector srcs_data_x; std::vector srcs_data_h; @@ -509,23 +497,23 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, bool has_adjusted = false; if (!initialized || is_train) { - if (L == 1) { + if (num_layer == 1) { DType* wx = w_ptr; - DType* wh = wx + I * H * ngates; + DType* wh = wx + input_size * hidden_size * ngates; if (mode == rnn_enum::kGru) { - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); has_adjusted = true; } src_wx_f.set_data_handle(wx); src_wh_f.set_data_handle(wh); } else { - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { DType* wx = w_ptr + l * w_size; - DType* wh = wx + I * H * ngates; + DType* wh = wx + input_size * hidden_size * ngates; if (mode == rnn_enum::kGru) { - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); has_adjusted = true; } srcs_data_x.push_back(wx); @@ -546,30 +534,30 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, DType* user_bias_f = reinterpret_cast( mkldnn_mems->bias_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kGru) { - const int mx_single_b_sz = ngates * H; - for (int l = 0; l < L; l++) { + const int mx_single_b_sz = ngates * hidden_size; + for (int l = 0; l < num_layer; l++) { #pragma omp parallel for num_threads(omp_threads) - for (int g = 0; g < H; g++) { + for (int g = 0; g < hidden_size; g++) { // While mxnet gru gate order is reset, update and new gates, // mkldnn gru gate order is update, reset and new gates. So // we need to swap the order of reset and update from mxnet. 
- user_bias_f[g + H + l * single_b_size] = + user_bias_f[g + hidden_size + l * single_b_size] = b_ptr[g + l * mx_single_b_sz * 2] + b_ptr[g + l * mx_single_b_sz * 2 + mx_single_b_sz]; + user_bias_f[g + l * single_b_size] = - b_ptr[g + H + l * mx_single_b_sz * 2] - + b_ptr[g + H + l * mx_single_b_sz * 2 + mx_single_b_sz]; - } - #pragma omp parallel for num_threads(omp_threads) - for (int g = 2 * H; g < 3 * H; g++) { - user_bias_f[g + l * single_b_size] = b_ptr[g + l * mx_single_b_sz * 2]; - user_bias_f[g + l * single_b_size + H] = - b_ptr[g + l * mx_single_b_sz * 2 + mx_single_b_sz]; + b_ptr[g + hidden_size + l * mx_single_b_sz * 2] + + b_ptr[g + hidden_size + l * mx_single_b_sz * 2 + mx_single_b_sz]; + + user_bias_f[g + l * single_b_size + 2 * hidden_size] = + b_ptr[g + l * mx_single_b_sz * 2 + 2 * hidden_size]; + user_bias_f[g + l * single_b_size + 3 * hidden_size] = + b_ptr[g + 2 * hidden_size + l * mx_single_b_sz * 2 + mx_single_b_sz]; } } } else { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < L * single_b_size; j++) { + for (int j = 0; j < num_layer * single_b_size; j++) { int k = j / single_b_size; user_bias_f[j] = b_ptr[j + k * single_b_size] + b_ptr[j + k * single_b_size + single_b_size]; @@ -610,7 +598,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, DType* dst_hcy = reinterpret_cast( mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { offset1 = l * single_cell_size; offset2 = l * nstates * single_cell_size; #pragma omp parallel for num_threads(omp_threads) @@ -621,29 +609,29 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, } } else { #pragma omp parallel for num_threads(omp_threads) - for (int n = 0; n < L * single_cell_size; n++) { + for (int n = 0; n < num_layer * single_cell_size; n++) { hy_ptr[n] = dst_hcy[n]; } } } if (has_adjusted) { - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { DType* wx = w_ptr + l * w_size; - DType* wh = wx + I * H * ngates; - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); + DType* wh = wx + input_size * hidden_size * ngates; + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); } } } template -static void MKLDNNRNNForward(bool state_outputs, - const int L, - const int D, - const int T, - const int N, - const int I, - const int H, +static void MKLDNNRNNForward(const bool state_outputs, + const int num_layer, + const int direction, + const int seq_len, + const int batch_size, + const int input_size, + const int hidden_size, DType* x_ptr, DType* hx_ptr, DType* cx_ptr, @@ -660,38 +648,35 @@ static void MKLDNNRNNForward(bool state_outputs, int mode) { int ngates = 0, nstates = 0; GetMKLDNNRNNAlgo(mode, &ngates, &nstates); - const int b_size = 2 * H * ngates * D; - const int cell_size = N * H * D; + const int b_size = 2 * hidden_size * ngates * direction; + const int cell_size = batch_size * hidden_size * direction; // First layer - int w_size = (I + H) * H * ngates * D; + int w_size = (input_size + hidden_size) * hidden_size * ngates * direction; DType* tmpNull = NULL; - // when D = 1 and I == H, L layers can be fused together - if (D == 1 && I == H) { - MKLDNNRNNForwardUnidi(state_outputs, L, T, N, I, H, x_ptr, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, - mkldnn_mems, rnn_forward_prim, - 0, has_cache, dtype, is_train, mode); + // when direction = 1 and input_size == hidden_size, 
num_layer layers can be fused together + if (direction == 1 && input_size == hidden_size) { + MKLDNNRNNForwardUnidi(state_outputs, num_layer, seq_len, batch_size, input_size, + hidden_size, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } else { - if (D == 2) { - MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, I, H, x_ptr, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, - mkldnn_mems, rnn_forward_prim, - 0, has_cache, dtype, is_train, mode); + if (direction == 2) { + MKLDNNRNNForwardSingleLayerBi(state_outputs, seq_len, batch_size, input_size, + hidden_size, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } else { - MKLDNNRNNForwardUnidi(state_outputs, 1, T, N, I, H, x_ptr, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, - mkldnn_mems, rnn_forward_prim, + MKLDNNRNNForwardUnidi(state_outputs, 1, seq_len, batch_size, input_size, hidden_size, x_ptr, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } - if (L > 1) { + if (num_layer > 1) { mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[0]; - // go to next L - 1 layers. - // If D = 2, do it layer by layer. If D = 1, fused L - 1 layers + // go to next num_layer - 1 layers. + // If direction = 2, do it layer by layer. If direction = 1, fused num_layer - 1 layers w_ptr += w_size; b_ptr += b_size; - if (D == 2) { - w_size = (H * D + H) * H * ngates * D; - for (int l = 0; l < L - 1; l++) { + if (direction == 2) { + w_size = (hidden_size * direction + hidden_size) * hidden_size * ngates * direction; + for (int l = 0; l < num_layer - 1; l++) { if (state_outputs) { hy_ptr += cell_size; if (mode == rnn_enum::kLstm) { @@ -702,26 +687,27 @@ static void MKLDNNRNNForward(bool state_outputs, if (mode == rnn_enum::kLstm) { cx_ptr += cell_size; } - MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, D * H, H, tmpNull, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, - cy_ptr, mkldnn_mems, rnn_forward_prim, - 1, has_cache, dtype, is_train, mode); + MKLDNNRNNForwardSingleLayerBi(state_outputs, seq_len, batch_size, + direction * hidden_size, hidden_size, tmpNull, hx_ptr, cx_ptr, w_ptr, b_ptr, + y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, 1, has_cache, dtype, + is_train, mode); mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[1]; w_ptr += w_size; b_ptr += b_size; } } - if (D == 1) { + if (direction == 1) { if (state_outputs) { hy_ptr += cell_size; if (mode == rnn_enum::kLstm) { cy_ptr += cell_size; } } - w_size = (H + H) * H * ngates; - MKLDNNRNNForwardUnidi(state_outputs, L - 1, T, N, H, H, tmpNull, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, - rnn_forward_prim, 1, has_cache, dtype, is_train, mode); + w_size = (hidden_size + hidden_size) * hidden_size * ngates; + MKLDNNRNNForwardUnidi(state_outputs, num_layer - 1, seq_len, batch_size, + hidden_size, hidden_size, tmpNull, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, + hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, 1, has_cache, dtype, + is_train, mode); } } } @@ -729,7 +715,7 @@ static void MKLDNNRNNForward(bool state_outputs, } template -static void MKLDNNRNNForwardInference(bool state_outputs, +static void MKLDNNRNNForwardInference(const bool state_outputs, const int num_layers, const int direction, const int seq_length, From 794b1909a86ebd762584beb857e954250344054c Mon Sep 17 00:00:00 2001 From: Leonard Lausen 
Date: Wed, 31 Jul 2019 20:26:55 +0200 Subject: [PATCH 08/24] Add missing default axis value to symbol.squeeze op (#15707) * Add missing default arg * Add test * add test --- python/mxnet/symbol/symbol.py | 2 +- tests/python/unittest/test_gluon.py | 24 ++++++++++++++++++++---- tests/python/unittest/test_symbol.py | 1 + 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index deedf0fe83d2..1e2defab3713 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -2539,7 +2539,7 @@ def softmin(self, *args, **kwargs): """ return op.softmin(self, *args, **kwargs) - def squeeze(self, axis, inplace=False, **kwargs): # pylint: disable=unused-argument + def squeeze(self, axis=None, inplace=False, **kwargs): # pylint: disable=unused-argument """Convenience fluent method for :py:func:`squeeze`. The arguments are the same as for :py:func:`squeeze`, with diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index b59ce2d0864c..af30980b10ea 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -115,7 +115,7 @@ def test_parameter_dict(): params1.get('w1', shape=(10, 10), stype='row_sparse') params1.load('test_parameter_dict.params', ctx) trainer1 = mx.gluon.Trainer(params1, 'sgd') - + # compare the values before and after save/load cur_w0 = params1.get('w0').data(ctx) cur_w1 = params1.get('w1').row_sparse_data(all_row_ids) @@ -134,7 +134,7 @@ def test_parameter_dict(): cur_w1 = params2.get('w1').data(ctx) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) - + # test the dtype casting functionality params0 = gluon.ParameterDict('') params0.get('w0', shape=(10, 10), dtype='float32') @@ -386,7 +386,7 @@ def hybrid_forward(self, F, x): if 'conv' in param_name and 'weight' in param_name: break assert np.dtype(net_fp64.params[param_name].dtype) == np.dtype(np.float64) - + # 3.b Verify same functionnality with the imports API net_fp_64 = mx.gluon.SymbolBlock.imports(sym_file, 'data', params_file, ctx=ctx) @@ -2788,7 +2788,7 @@ def test_gluon_param_load(): net.cast('float16') net.load_parameters('test_gluon_param_load.params', cast_dtype=True) mx.nd.waitall() - + @with_seed() def test_gluon_param_load_dtype_source(): net = mx.gluon.nn.Dense(10, in_units=10) @@ -2800,6 +2800,22 @@ def test_gluon_param_load_dtype_source(): assert net.weight.dtype == np.float16 mx.nd.waitall() +@with_seed() +def test_squeeze_consistency(): + class Foo(gluon.HybridBlock): + def __init__(self, inplace, **kwargs): + super(Foo, self).__init__(**kwargs) + self.inplace = inplace + + def forward(self, x): + return x.squeeze(inplace=self.inplace) + + for inplace in (True, False): + block = Foo(inplace) + block.hybridize() + shape = (np.random.randint(1, 10), np.random.randint(1, 10), 1) + block(mx.nd.ones(shape)) + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py index 2dfe3e44eedb..0c97c68b0880 100644 --- a/tests/python/unittest/test_symbol.py +++ b/tests/python/unittest/test_symbol.py @@ -242,6 +242,7 @@ def check_fluent_regular(func, kwargs, shape=(5, 17, 1), equal_nan=False): check_fluent_regular('reshape', {'shape': (17, 1, 5)}) check_fluent_regular('broadcast_to', {'shape': (5, 17, 47)}) check_fluent_regular('squeeze', {'axis': (1, 3)}, shape=(2, 1, 3, 1, 4)) + 
check_fluent_regular('squeeze', {}, shape=(2, 1, 3, 1, 4)) def check_symbol_consistency(sym1, sym2, ctx, skip_grad=False, equal_nan=False): assert sym1.list_arguments() == sym2.list_arguments() From 0042c49aa91ac2c9e2f721336beb7b708fa5e806 Mon Sep 17 00:00:00 2001 From: Haohuan Wang Date: Wed, 31 Jul 2019 14:36:14 -0700 Subject: [PATCH 09/24] add deconv in TRT subgraph (#15666) --- .../subgraph/tensorrt/nnvm_to_onnx-inl.h | 19 +++++- .../subgraph/tensorrt/nnvm_to_onnx.cc | 46 ++++++++++---- src/operator/subgraph/tensorrt/tensorrt-inl.h | 2 + tests/python/tensorrt/test_tensorrt_deconv.py | 63 +++++++++++++++++++ 4 files changed, 116 insertions(+), 14 deletions(-) create mode 100644 tests/python/tensorrt/test_tensorrt_deconv.py diff --git a/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h index 55b3d938df0a..5a433f1d9820 100644 --- a/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h @@ -41,6 +41,8 @@ namespace mxnet { namespace op { namespace nnvm_to_onnx { +enum ConvDeconvType {Convolution, Deconvolution}; + using namespace nnvm; using namespace ::onnx; using int64 = ::google::protobuf::int64; @@ -48,8 +50,7 @@ using int64 = ::google::protobuf::int64; std::unordered_map GetPlaceholderShapes(const ShapeVector& shape_inputs, const nnvm::IndexedGraph& ig); -std::unordered_map GetPlaceholderDTypes(const DTypeVector& -dtype_inputs, +std::unordered_map GetPlaceholderDTypes(const DTypeVector& dtype_inputs, const nnvm::IndexedGraph& ig); std::unordered_map GetOutputLookup(const nnvm::IndexedGraph& ig); @@ -74,12 +75,25 @@ typedef void (*ConverterFunction)(NodeProto *node_proto, const nnvm::IndexedGraph &ig, const array_view &inputs); +template +void ConvDeconvConvertHelper(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs, + const ConvDeconvParam& param, + ConvDeconvType type); + // Forward declarations void ConvertConvolution(NodeProto *node_proto, const NodeAttrs &attrs, const nnvm::IndexedGraph &ig, const array_view &inputs); +void ConvertDeconvolution(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + void ConvertPooling(NodeProto *node_proto, const NodeAttrs &attrs, const nnvm::IndexedGraph &ig, @@ -158,6 +172,7 @@ static const std::unordered_map converter_map = {"BatchNorm", ConvertBatchNorm}, {"clip", ConvertClip}, {"Convolution", ConvertConvolution}, + {"Deconvolution", ConvertDeconvolution}, {"Concat", ConvertConcatenate}, {"Dropout", ConvertDropout}, {"elemwise_add", ConvertElementwiseAdd}, diff --git a/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc index 6116f296e300..84580d0b05d0 100644 --- a/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../../common/utils.h" #include "../../../ndarray/ndarray_function.h" @@ -170,20 +171,25 @@ std::string ConvertNnvmGraphToOnnx( return serialized_onnx_graph; } -void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, - const nnvm::IndexedGraph& /*ig*/, - const array_view& /*inputs*/) { - const auto& conv_param = nnvm::get(attrs.parsed); - - node_proto->set_op_type("Conv"); +template +void ConvDeconvConvertHelper(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*input*/, + const ConvDeconvParam& 
param, + ConvDeconvType type) { + if (type == ConvDeconvType::Convolution) { + node_proto->set_op_type("Conv"); + } else { + node_proto->set_op_type("ConvTranspose"); + } - const mxnet::TShape kernel = conv_param.kernel; - const mxnet::TShape stride = conv_param.stride; - const mxnet::TShape dilate = conv_param.dilate; - const mxnet::TShape pad = conv_param.pad; - const uint32_t num_group = conv_param.num_group; + const mxnet::TShape kernel = param.kernel; + const mxnet::TShape stride = param.stride; + const mxnet::TShape dilate = param.dilate; + const mxnet::TShape pad = param.pad; + const uint32_t num_group = param.num_group; // const bool no_bias = conv_param.no_bias; - const dmlc::optional layout = conv_param.layout; + const dmlc::optional layout = param.layout; // dilations AttributeProto* const dilations = node_proto->add_attribute(); @@ -226,8 +232,24 @@ void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, for (const dim_t kval : stride) { strides->add_ints(static_cast(kval)); } +} + +void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& ig, + const array_view& inputs) { + const auto& conv_param = nnvm::get(attrs.parsed); + ConvDeconvConvertHelper(node_proto, attrs, ig, inputs, conv_param, + ConvDeconvType::Convolution); } // end ConvertConvolution +void ConvertDeconvolution(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& ig, + const array_view& inputs) { + const auto& deconv_param = nnvm::get(attrs.parsed); + ConvDeconvConvertHelper(node_proto, attrs, ig, inputs, deconv_param, + ConvDeconvType::Deconvolution); +} // end ConvertDeconvolution + void ConvertPooling(NodeProto* node_proto, const NodeAttrs& attrs, const nnvm::IndexedGraph& /*ig*/, const array_view& /*inputs*/) { diff --git a/src/operator/subgraph/tensorrt/tensorrt-inl.h b/src/operator/subgraph/tensorrt/tensorrt-inl.h index e258d892aaba..a6b93f10598a 100644 --- a/src/operator/subgraph/tensorrt/tensorrt-inl.h +++ b/src/operator/subgraph/tensorrt/tensorrt-inl.h @@ -88,6 +88,7 @@ class TensorrtSelector : public SubgraphSelector { "clip", "Concat", "Convolution", + "Deconvolution", "Dropout", "elemwise_add", "elemwise_sub", @@ -104,6 +105,7 @@ class TensorrtSelector : public SubgraphSelector { const std::unordered_set withWeightsOps = { "BatchNorm", "Convolution", + "Deconvolution", "FullyConnected" }; diff --git a/tests/python/tensorrt/test_tensorrt_deconv.py b/tests/python/tensorrt/test_tensorrt_deconv.py new file mode 100644 index 000000000000..ef567d1dae3c --- /dev/null +++ b/tests/python/tensorrt/test_tensorrt_deconv.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +from mxnet.test_utils import assert_almost_equal + +def get_params(): + arg_params = {} + aux_params = {} + arg_params["trt_bn_test_conv_weight"] = mx.nd.ones((1, 1, 3, 3)) + arg_params["trt_bn_test_deconv_weight"] = mx.nd.ones((1, 1, 3, 3)) + return arg_params, aux_params + +def get_symbol(): + data = mx.sym.Variable("data") + conv = mx.sym.Convolution(data=data, kernel=(3,3), no_bias=True, num_filter=1, num_group=1, + name="trt_bn_test_conv") + deconv = mx.sym.Deconvolution(data=conv, kernel=(3, 3), no_bias=True, num_filter=1, + num_group=1, name="trt_bn_test_deconv") + return deconv + +def test_deconvolution_produce_same_output_as_tensorrt(): + arg_params, aux_params = get_params() + arg_params_trt, aux_params_trt = get_params() + + sym = get_symbol() + sym_trt = get_symbol().get_backend_symbol("TensorRT") + + mx.contrib.tensorrt.init_tensorrt_params(sym_trt, arg_params_trt, aux_params_trt) + + executor = sym.simple_bind(ctx=mx.gpu(), data=(1, 1, 3, 3), grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + + executor_trt = sym_trt.simple_bind(ctx=mx.gpu(), data=(1, 1, 3, 3), grad_req='null', + force_rebind=True) + executor_trt.copy_params_from(arg_params_trt, aux_params_trt) + + input_data = mx.nd.random.uniform(low=0, high=1, shape=(1, 1, 3, 3)) + + y = executor.forward(is_train=False, data=input_data) + y_trt = executor_trt.forward(is_train=False, data=input_data) + + print(y[0].asnumpy()) + print(y_trt[0].asnumpy()) + assert_almost_equal(y[0].asnumpy(), y_trt[0].asnumpy(), 1e-4, 1e-4) + +if __name__ == '__main__': + import nose + nose.runmodule() From 0bfac7d11a39358ba1ec8ff578d9388b0bcad53a Mon Sep 17 00:00:00 2001 From: Cody Allen Date: Wed, 31 Jul 2019 16:09:23 -0700 Subject: [PATCH 10/24] Fix Scala Symbolic API some/Some typo (#15687) --- docs/api/scala/symbol.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/scala/symbol.md b/docs/api/scala/symbol.md index aaddc2a8a2f0..f92548e4820d 100644 --- a/docs/api/scala/symbol.md +++ b/docs/api/scala/symbol.md @@ -41,7 +41,7 @@ The following example configures a two-layer neural network. 
val data = Symbol.Variable("data") val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1") val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1") - val fc2 = Symbol.api.FullyConnected(some(act1), num_hidden = 64, name = "fc2") + val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2") val net = Symbol.api.SoftmaxOutput(Some(fc2), name = "out") :type net // org.apache.mxnet.Symbol From d6c17faf2f2d1cab51b6a0700581174df115e059 Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Thu, 1 Aug 2019 12:43:36 +0800 Subject: [PATCH 11/24] Add MKLDNN 4c layout to fix gluoncv se_resnext101_64x4d (#15692) * add 4c type * trigger --- src/operator/nn/mkldnn/mkldnn_base.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index e36a0f008821..a13337b122c3 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -329,6 +329,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_nchw: case mkldnn_nhwc: case mkldnn_chwn: + case mkldnn_nChw4c: case mkldnn_nChw8c: case mkldnn_nChw16c: return mkldnn_nchw; @@ -338,6 +339,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_iohw: case mkldnn_oIhw8i: case mkldnn_oIhw16i: + case mkldnn_OIhw4i4o: case mkldnn_OIhw8i8o: case mkldnn_hwio_s8s8: case mkldnn_OIhw16i16o: @@ -376,6 +378,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_giohw: case mkldnn_hwigo: case mkldnn_hwigo_s8s8: + case mkldnn_gOIhw4i4o: case mkldnn_gOIhw8i8o: case mkldnn_gOIhw16i16o: case mkldnn_gOIhw4i16o4i: @@ -383,6 +386,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_gOIhw8i16o2i: case mkldnn_gOIhw8o16i2o: case mkldnn_gOIhw8o8i: + case mkldnn_gOIhw4o4i: case mkldnn_gOIhw16o16i: case mkldnn_gIOhw16o16i: case mkldnn_gOihw8o: From 29ba4fb3662eccd2e383fe77127b4acfb8e7dbdd Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Thu, 1 Aug 2019 13:54:15 +0800 Subject: [PATCH 12/24] Fix _copy_to on MKLDNN backend (#15637) * Fix _copy_to * Add comment --- src/imperative/imperative_utils.h | 34 ++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index b867162abc9b..07fe04782bd0 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -419,7 +419,14 @@ inline void PushFCompute(const FCompute& fn, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; #if MXNET_USE_MKLDNN == 1 - InvalidateOutputs(outputs, req); + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. 
+ InvalidateOutputs(outputs, req); + } #endif std::vector tmp_req = req; // setup blobs @@ -461,7 +468,14 @@ inline void PushFComputeEx(const FComputeEx& fn, const auto& run = [=](RunContext rctx) { OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested}; #if MXNET_USE_MKLDNN == 1 - InvalidateOutputs(outputs, req); + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. + InvalidateOutputs(outputs, req); + } #endif fn(attrs, opctx, inputs, req, outputs); if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync && !rctx.is_bulk) { @@ -508,7 +522,14 @@ inline void PushOperator(const OpStatePtr& state, engine::CallbackOnComplete on_complete) { OpContext opctx{need_grad, is_train, rctx, on_complete, requested}; #if MXNET_USE_MKLDNN == 1 - InvalidateOutputs(outputs, req); + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. + InvalidateOutputs(outputs, req); + } #endif fcompute_ex(state, opctx, inputs, req, outputs); if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync @@ -547,7 +568,14 @@ inline void PushOperator(const OpStatePtr& state, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; #if MXNET_USE_MKLDNN == 1 + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. InvalidateOutputs(outputs, req); + } #endif std::vector tmp_req = req; // populate input blobs and output blobs From 862423a70eaa79feea3e40872dc2b33587974c7a Mon Sep 17 00:00:00 2001 From: Pedro Larroy Date: Thu, 1 Aug 2019 12:59:15 -0700 Subject: [PATCH 13/24] [DOC] refine autograd docs (#15109) * refine autograd docs * CR comments * Fix examples * CR comments * Followup CR * CR --- docs/api/python/autograd/autograd.md | 75 +++++++++++++++++++++++++--- python/mxnet/autograd.py | 3 ++ python/mxnet/ndarray/ndarray.py | 2 + 3 files changed, 74 insertions(+), 6 deletions(-) diff --git a/docs/api/python/autograd/autograd.md b/docs/api/python/autograd/autograd.md index 1905831b16d9..82da4eac05b5 100644 --- a/docs/api/python/autograd/autograd.md +++ b/docs/api/python/autograd/autograd.md @@ -42,16 +42,28 @@ to allocate space for the gradient. Then, start a `with autograd.record()` block and do some computation. 
Finally, call `backward()` on the result: ```python ->>> x = mx.nd.array([1,2,3,4]) ->>> x.attach_grad() ->>> with mx.autograd.record(): -... y = x * x + 1 ->>> y.backward() ->>> print(x.grad) +import mxnet as mx +x = mx.nd.array([1,2,3,4]) +x.attach_grad() +with mx.autograd.record(): + y = x * x + 1 +y.backward() +print(x.grad) +``` + +Which outputs: + +``` [ 2. 4. 6. 8.] ``` +Gradient recording is enabled during the scope of the `with mx.autograd.record():` statement, then +disabled when we go out of that scope. + +It can be also set manually by executing `mx.autograd.set_recording(True)`, and turning it off after +we no longer want to record operations with `mx.autograd.set_recording(False)`. + ## Train mode and Predict Mode @@ -76,8 +88,59 @@ Detailed tutorials are available in Part 1 of [the MXNet gluon book](http://gluon.mxnet.io/). +# Higher order gradient + +Some operators support higher order gradients. Some operators support differentiating multiple +times, and others two, most just once. + +For calculating higher order gradients, we can use the `mx.autograd.grad` function while recording +and then call backward, or call `mx.autograd.grad` two times. If we do the latter, is important that +the first call uses `create_graph=True` and `retain_graph=True` and the second call uses +`create_graph=False` and `retain_graph=True`. Otherwise we will not get the results that we want. If +we would be to recreate the graph in the second call, we would end up with a graph of just the +backward nodes, not the full initial graph that includes the forward nodes. + +The pattern to calculate higher order gradients is the following: + +```python +from mxnet import ndarray as nd +from mxnet import autograd as ag +x = nd.array([1,2,3]) +x.attach_grad() +def f(x): + # Any function which supports higher oder gradient + return nd.log(x) +``` + +If the operators used in `f` don't support higher order gradients you will get an error like +`operator ... is non-differentiable because it didn't register FGradient attribute.`. This means +that it doesn't support getting the gradient of the gradient. Which is, running backward on +the backward graph. + +Using mxnet.autograd.grad multiple times: + +```python +with ag.record(): + y = f(x) + x_grad = ag.grad(heads=y, variables=x, create_graph=True, retain_graph=True)[0] + x_grad_grad = ag.grad(heads=x_grad, variables=x, create_graph=False, retain_graph=False)[0] +``` + +Running backward on the backward graph: + +```python +with ag.record(): + y = f(x) + x_grad = ag.grad(heads=y, variables=x, create_graph=True, retain_graph=True)[0] +x_grad.backward() +x_grad_grad = x.grad +``` +Both methods are equivalent, except that in the second case, retain_graph on running backward is set +to False by default. But both calls are running a backward pass as on the graph as usual to get the +gradient of the first gradient `x_grad` with respect to `x` evaluated at the value of `x`. +For more examples, check the [higher order gradient unit tests](https://github.com/apache/incubator-mxnet/blob/master/tests/python/unittest/test_higher_order_grad.py). diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py index f461b77e2818..6f1cc4367821 100644 --- a/python/mxnet/autograd.py +++ b/python/mxnet/autograd.py @@ -197,6 +197,9 @@ def predict_mode(): def mark_variables(variables, gradients, grad_reqs='write'): """Mark NDArrays as variables to compute gradient for autograd. 
+ This is equivalent to the function .attach_grad() in a variable, but with this + call we can set the gradient to any value. + Parameters ---------- variables: NDArray or list of NDArray diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 53c563854511..3d8a7aa98c94 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -2243,6 +2243,8 @@ def attach_grad(self, grad_req='write', stype=None): """Attach a gradient buffer to this NDArray, so that `backward` can compute gradient with respect to it. + The gradient is initialized to zeros. + Parameters ---------- grad_req : {'write', 'add', 'null'} From 4f6f124f55ee20f988e73f6b650f364324fd0ba1 Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Fri, 2 Aug 2019 04:37:58 +0800 Subject: [PATCH 14/24] Fix quantized concat when inputs are mixed int8 and uint8 (#15693) --- .../quantization/mkldnn/mkldnn_quantized_concat.cc | 12 +++++++++++- tests/python/mkl/test_subgraph.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc index d9e884e82806..2a4c6d612e65 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc +++ b/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc @@ -64,22 +64,32 @@ static void MKLDNNQuantizedConcatForward(const nnvm::NodeAttrs& attrs, const OpC std::vector data_mem; // new_data_mem is for auto-free new created mkldnn memory std::vector> new_data_mem; + const auto out_dtype = out_data[quantized_concat_enum::kOut].dtype(); for (int i = 0; i < param_.num_args; ++i) { auto i_scale = GetScale(in_data[i], data_min[i], data_max[i]); if (i_scale == out_scale) { + CHECK(in_data[i].dtype() == out_dtype); auto mem = in_data[i].GetMKLDNNData(); data_mem.push_back(mem); data_md.push_back(mem->get_primitive_desc()); } else { auto mem = in_data[i].GetMKLDNNData(); auto pd = mem->get_primitive_desc(); + if (in_data[i].dtype() != out_dtype) { + auto mem_desc = pd.desc(); + mkldnn::memory::desc new_md( + mkldnn::memory::dims(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims), + get_mkldnn_type(out_dtype), static_cast(mem_desc.data.format)); + pd = mkldnn::memory::primitive_desc(new_md, CpuEngine::Get()->get_engine()); + } const auto rescaled_mem = std::make_shared(pd); new_data_mem.push_back(rescaled_mem); std::vector reorder_scale = {out_scale / i_scale}; primitive_attr reorder_attr; reorder_attr.set_int_output_round_mode(round_mode::round_nearest); reorder_attr.set_output_scales(0, reorder_scale); - const auto reorder_pd = mkldnn::reorder::primitive_desc(pd, pd, reorder_attr); + const auto reorder_pd = + mkldnn::reorder::primitive_desc(mem->get_primitive_desc(), pd, reorder_attr); MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *mem, *rescaled_mem)); data_mem.push_back(rescaled_mem.get()); data_md.push_back(pd); diff --git a/tests/python/mkl/test_subgraph.py b/tests/python/mkl/test_subgraph.py index b25fefc6cc0e..563fff1a6aa1 100644 --- a/tests/python/mkl/test_subgraph.py +++ b/tests/python/mkl/test_subgraph.py @@ -401,6 +401,15 @@ def single_concat(data_shape, input_num, dim): concat = mx.symbol.Concat(*inputs, name="concat", dim=dim) return concat +def single_concat_pos_neg(data_shape): + data, weight = head_symbol(data_shape) + conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=4, + kernel=(1, 1), stride=(1, 1), no_bias=True) + relu = 
mx.symbol.Activation(data=conv, name='relu', act_type='relu') + inputs = [data, relu] + concat = mx.symbol.Concat(*inputs, name="concat", dim=1) + return concat + # concat scale alignment case def concat_scale_align(data_shape): data, weight = head_symbol(data_shape) @@ -738,6 +747,8 @@ def test_pos_single_concat(): net = single_concat(data_shape, 4, 3) check_quantize(net, data_shape, out_type, name='conv', check_calibration=False) check_quantize(net, data_shape, out_type, name='conv', check_calibration=False, gluon_forward=True) + net = single_concat_pos_neg(data_shape) + check_quantize(net, data_shape, out_type, name='', check_calibration=False) @with_seed() def test_pos_concat_scale_align(): From ffcfce587a3085ae221b4458a900c26a48e92bfc Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Fri, 2 Aug 2019 04:41:35 +0800 Subject: [PATCH 15/24] [MKLDNN]Enhance Quantization APIs and Tutorial (#15448) * enhance api and new tutorial * Update MKLDNN_QUANTIZATION.md update * fix lint * modify pics * skip test * add quantize layer in graph * update * remove center css flag * change requantize color * fix markdown pics * change to use png * Update MKLDNN_QUANTIZATION.md update * enable ipython script * fix png * fix lint * Update MKLDNN_QUANTIZATION.md * change title * trigger * use lower case * some typo * some typo * use dmlc web data * trigger * trigger --- docs/tutorials/index.md | 3 +- docs/tutorials/mkldnn/mkldnn_quantization.md | 259 ++++++++++++++++++ example/quantization/README.md | 95 +++++-- .../quantization/imagenet_gen_qsym_mkldnn.py | 168 ++++++------ python/mxnet/contrib/quantization.py | 237 ++++++++++++++++ tests/tutorials/test_tutorials.py | 3 + 6 files changed, 657 insertions(+), 108 deletions(-) create mode 100644 docs/tutorials/mkldnn/mkldnn_quantization.md diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 6e31e825e2ca..e01a30dbe68c 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -105,6 +105,8 @@ Select API:  * [Module to Gluon API](/tutorials/python/module_to_gluon.html) * [Gluon end to end from training to inference](/tutorials/gluon/gluon_from_experiment_to_deployment.html) * [Automatic Mixed Precision in Gluon](/tutorials/amp/amp_tutorial.html) + * [How to build and install MXNet with MKL-DNN backend](/tutorials/mkldnn/MKLDNN_README.html) + * [How to quantize custom models with MKL-DNN backend](/tutorials/mkldnn/mkldnn_quantization.html) (new!) * API Guides * Core APIs * NDArray @@ -157,7 +159,6 @@ Select API:  * [Large-Scale Multi-Host Multi-GPU Image Classification](/tutorials/vision/large_scale_classification.html) * [Importing an ONNX model into MXNet](/tutorials/onnx/super_resolution.html) * [Optimizing Deep Learning Computation Graphs with TensorRT](/tutorials/tensorrt/inference_with_trt.html) - * [How to build and install MXNet with MKL-DNN backend](/tutorials/mkldnn/MKLDNN_README.html) * API Guides * Core APIs * NDArray diff --git a/docs/tutorials/mkldnn/mkldnn_quantization.md b/docs/tutorials/mkldnn/mkldnn_quantization.md new file mode 100644 index 000000000000..459bf2a17d40 --- /dev/null +++ b/docs/tutorials/mkldnn/mkldnn_quantization.md @@ -0,0 +1,259 @@ + + + + + + + + + + + + + + + + + + +# Quantize custom models with MKL-DNN backend + +This document is to introduce how to quantize the customer models from FP32 to INT8 with Apache/MXNet toolkit and APIs under Intel CPU. 
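For orientation, the sketch below condenses the whole flow into its simplest form, using `calib_mode='none'` (no calibration). It is illustrative only: the prefix `my_model` is a placeholder for your own exported symbol/params files, and every call is explained step by step in the sections that follow.

```python
import mxnet as mx
from mxnet.contrib.quantization import *

# Load an exported FP32 symbolic model (placeholder prefix).
sym, arg_params, aux_params = mx.model.load_checkpoint('my_model', 0)

# Fuse the FP32 graph for the MKL-DNN backend.
sym = sym.get_backend_symbol('MKLDNN_QUANTIZE')

# Quantize without calibration; requantization thresholds are then computed at runtime.
qsym, qarg_params, aux_params, _ = quantize_graph(
    sym=sym, arg_params=arg_params, aux_params=aux_params,
    excluded_sym_names=[], calib_mode='none', quantized_dtype='auto')

# Fuse the quantized graph and save the INT8 model.
qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE')
mx.model.save_checkpoint('my_model-quantized', 0, qsym, qarg_params, aux_params)
```

For deployment, calibrated quantization (covered later in this tutorial) is recommended, since it avoids computing thresholds at runtime.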
+ +If you are not familiar with Apache/MXNet quantization flow, please reference [quantization blog](https://medium.com/apache-mxnet/model-quantization-for-production-level-neural-network-inference-f54462ebba05) first, and the performance data is shown in [Apache/MXNet C++ interface](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) and [GluonCV](https://gluon-cv.mxnet.io/build/examples_deployment/int8_inference.html). + +## Installation and Prerequisites + +Installing MXNet with MKLDNN backend is an easy and essential process. You can follow [How to build and install MXNet with MKL-DNN backend](https://mxnet.incubator.apache.org/tutorials/mkldnn/MKLDNN_README.html) to build and install MXNet from source. Also, you can install the release or nightly version via PyPi and pip directly by running: + +``` +# release version +pip install mxnet-mkl +# nightly version +pip install mxnet-mkl --pre +``` + +## Image Classification Demo + +A quantization script [imagenet_gen_qsym_mkldnn.py](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/imagenet_gen_qsym_mkldnn.py) has been designed to launch quantization for image-classification models. This script is integrated with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that all pre-trained models can be downloaded from Gluon-CV and then converted for quantization. For details, you can refer [Model Quantization with Calibration Examples](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md). + +## Integrate Quantization Flow to Your Project + +Quantization flow works for both symbolic and Gluon models. If you're using Gluon, you can first refer [Saving and Loading Gluon Models](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/save_load_params.html) to hybridize your computation graph and export it as a symbol before running quantization. + +In general, the quantization flow includes 4 steps. The user can get the acceptable accuracy from step 1 to 3 with minimum effort. Most of thing in this stage is out-of-box and the data scientists and researchers only need to focus on how to represent data and layers in their model. After a quantized model is generated, you may want to deploy it online and the performance will be the next key point. Thus, step 4, calibration, can improve the performance a lot by reducing lots of runtime calculation. + +![quantization flow](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/quantization.png) + +Now, we are going to take Gluon ResNet18 as an example to show how each step work. + +### Initialize Model + +```python +import logging +import mxnet as mx +from mxnet.gluon.model_zoo import vision +from mxnet.contrib.quantization import * + +logging.basicConfig() +logger = logging.getLogger('logger') +logger.setLevel(logging.INFO) + +batch_shape = (1, 3, 224, 224) +resnet18 = vision.resnet18_v1(pretrained=True) +resnet18.hybridize() +resnet18.forward(mx.nd.zeros(batch_shape)) +resnet18.export('resnet18_v1') +sym, arg_params, aux_params = mx.model.load_checkpoint('resnet18_v1', 0) +# (optional) visualize float32 model +mx.viz.plot_network(sym) +``` +First, we download resnet18-v1 model from gluon modelzoo and export it as a symbol. You can visualize float32 model. Below is a raw residual block. 
+ +![float32 model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/fp32_raw.png) + +#### Model Fusion + +```python +sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') +# (optional) visualize fused float32 model +mx.viz.plot_network(sym) +``` +It's important to add this line to enable graph fusion before quantization to get better performance. Below is a fused residual block. Batchnorm, Activation and elemwise_add are fused into Convolution. + +![float32 fused model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/fp32_fusion.png) + +### Quantize Model + +A python interface `quantize_graph` is provided for the user. Thus, it is very flexible for the data scientist to construct the expected models based on different requirements in a real deployment. + +```python +# quantize configs +# set exclude layers +excluded_names = [] +# set calib mode. +calib_mode = 'none' +# set calib_layer +calib_layer = None +# set quantized_dtype +quantized_dtype = 'auto' +logger.info('Quantizing FP32 model Resnet18-V1') +qsym, qarg_params, aux_params, collector = quantize_graph(sym=sym, arg_params=arg_params, aux_params=aux_params, + excluded_sym_names=excluded_names, + calib_mode=calib_mode, calib_layer=calib_layer, + quantized_dtype=quantized_dtype, logger=logger) +# (optional) visualize quantized model +mx.viz.plot_network(qsym) +# save quantized model +mx.model.save_checkpoint('quantized-resnet18_v1', 0, qsym, qarg_params, aux_params) +``` + +By applying `quantize_graph` to the symbolic model, a new quantized model can be generated, named `qsym` along with its parameters. We can see `_contrib_requantize` operators are inserted after `Convolution` to convert the INT32 output to FP32. + +![none calibrated model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/none_calib.png) + +Below table gives some descriptions. + +| param | type | description| +|--------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| excluded_sym_names | list of strings | A list of strings representing the names of the symbols that users want to excluding from being quantized.| +| calib_mode | str | If calib_mode='none', no calibration will be used and the thresholds for requantization after the corresponding layers will be calculated at runtime by calling min and max operators. The quantized models generated in this mode are normally 10-20% slower than those with calibrations during inference.
If calib_mode='naive', the min and max values of the layer outputs from a calibration dataset will be directly taken as the thresholds for quantization.<br>If calib_mode='entropy', the thresholds for quantization will be derived such that the KL divergence between the distributions of FP32 layer outputs and quantized layer outputs is minimized based upon the calibration dataset. |
+| calib_layer | function | Given a layer's output name in string, return True or False for deciding whether to calibrate this layer.<br>If yes, the statistics of the layer's output will be collected; otherwise, no information of the layer's output will be collected.<br>If not provided, all the layers' outputs that need requantization will be collected.|
+| quantized_dtype | str | The quantized destination type for input data. Currently support 'int8', 'uint8' and 'auto'.
'auto' means automatically select output type according to calibration result.| + +### Evaluate & Tune + +Now, you get a pair of quantized symbol and params file for inference. For Gluon inference, only difference is to load model and params by a SymbolBlock as below example: + +```python +quantized_net = mx.gluon.SymbolBlock.imports('quantized-resnet18_v1-symbol.json', 'data', 'quantized-resnet18_v1-0000.params') +quantized_net.hybridize(static_shape=True, static_alloc=True) +batch_size = 1 +data = mx.nd.ones((batch_size,3,224,224)) +quantized_net(data) +``` + +Now, you can get the accuracy from a quantized network. Furthermore, you can try to select different layers or OPs to be quantized by `excluded_sym_names` parameter and figure out an acceptable accuracy. + +### Calibrate Model (optional for performance) + +The quantized model generated in previous steps can be very slow during inference since it will calculate min and max at runtime. We recommend using offline calibration for better performance by setting `calib_mode` to `naive` or `entropy`. And then calling `set_monitor_callback` api to collect layer information with a subset of the validation datasets before int8 inference. + +```python +# quantization configs +# set exclude layers +excluded_names = [] +# set calib mode. +calib_mode = 'naive' +# set calib_layer +calib_layer = None +# set quantized_dtype +quantized_dtype = 'auto' +logger.info('Quantizing FP32 model resnet18-V1') +cqsym, cqarg_params, aux_params, collector = quantize_graph(sym=sym, arg_params=arg_params, aux_params=aux_params, + excluded_sym_names=excluded_names, + calib_mode=calib_mode, calib_layer=calib_layer, + quantized_dtype=quantized_dtype, logger=logger) + +# download imagenet validation dataset +mx.test_utils.download('http://data.mxnet.io/data/val_256_q90.rec', 'dataset.rec') +# set rgb info for data +mean_std = {'mean_r': 123.68, 'mean_g': 116.779, 'mean_b': 103.939, 'std_r': 58.393, 'std_g': 57.12, 'std_b': 57.375} +# set batch size +batch_size = 16 +# create DataIter +data = mx.io.ImageRecordIter(path_imgrec='dataset.rec', batch_size=batch_size, data_shape=batch_shape[1:], rand_crop=False, rand_mirror=False, **mean_std) +# create module +mod = mx.mod.Module(symbol=sym, label_names=None, context=mx.cpu()) +mod.bind(for_training=False, data_shapes=data.provide_data, label_shapes=None) +mod.set_params(arg_params, aux_params) + +# calibration configs +# set num_calib_batches +num_calib_batches = 5 +max_num_examples = num_calib_batches * batch_size +# monitor FP32 Inference +mod._exec_group.execs[0].set_monitor_callback(collector.collect, monitor_all=True) +num_batches = 0 +num_examples = 0 +for batch in data: + mod.forward(data_batch=batch, is_train=False) + num_batches += 1 + num_examples += batch_size + if num_examples >= max_num_examples: + break +if logger is not None: + logger.info("Collected statistics from %d batches with batch_size=%d" + % (num_batches, batch_size)) +``` + +After that, layer information will be filled into the `collector` returned by `quantize_graph` api. Then, you need to write the layer information into int8 model by calling `calib_graph` api. 
+ + +```python +# write scaling factor into quantized symbol +cqsym, cqarg_params, aux_params = calib_graph(qsym=cqsym, arg_params=arg_params, aux_params=aux_params, + collector=collector, calib_mode=calib_mode, + quantized_dtype=quantized_dtype, logger=logger) +# (optional) visualize quantized model +mx.viz.plot_network(cqsym) +``` + +Below is a quantized residual block with naive calibration. We can see `min_calib_range` and `max_calib_range` are written into `_contrib_requantize` operators. + +![naive calibrated model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/naive_calib.png) + +When you get a quantized model with calibration, keeping sure to call fusion api again since this can fuse some `requantize` or `dequantize` operators for further performance improvement. + +```python +# perform post-quantization fusion +cqsym = cqsym.get_backend_symbol('MKLDNN_QUANTIZE') +# (optional) visualize post-quantized model +mx.viz.plot_network(cqsym) +# save quantized model +mx.model.save_checkpoint('quantized-resnet18_v1', 0, cqsym, cqarg_params, aux_params) +``` + +Below is a post-quantized residual block. We can see `_contrib_requantize` operators are fused into `Convolution` operators. + +![post-quantized model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/post_quantize.png) + +BTW, You can also modify the `min_calib_range` and `max_calib_range` in the JSON file directly. + +``` + { + "op": "_sg_mkldnn_conv", + "name": "quantized_sg_mkldnn_conv_bn_act_6", + "attrs": { + "max_calib_range": "3.562147", + "min_calib_range": "0.000000", + "quantized": "true", + "with_act": "true", + "with_bn": "true" + }, +...... +``` + +### Tips for Model Calibration + +#### Accuracy Tuning + +- Try to use `entropy` calib mode; + +- Try to exclude some layers which may cause obvious accuracy drop; + +- Change calibration dataset by setting different `num_calib_batches` or shuffle your validation dataset; + +#### Performance Tuning + +- Keep sure to perform graph fusion before quantization; + +- If lots of `requantize` layers exist, keep sure to perform post-quantization fusion after calibration; + +- Compare the MXNet profile or `MKLDNN_VERBOSE` of float32 and int8 inference; + +## Deploy with Python/C++ + +MXNet also supports deploy quantized models with C++. Refer [MXNet C++ Package](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/README.md) for more details. + + diff --git a/example/quantization/README.md b/example/quantization/README.md index 09321beb7997..1ae58fbb3a69 100644 --- a/example/quantization/README.md +++ b/example/quantization/README.md @@ -9,13 +9,76 @@ This folder contains examples of quantizing a FP32 model with Intel® MKL-DNN or
<h2 id="1">Model Quantization with Intel® MKL-DNN</h2>

-Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for CNN models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. This script also supports custom models. - -Calibration is used for generating a calibration table for the quantized symbol. The quantization script supports three methods: - -- **none:** No calibration will be used. The thresholds for quantization will be calculated on the fly. This will result in inference speed slowdown and loss of accuracy in general. -- **naive:** Simply take min and max values of layer outputs as thresholds for quantization. In general, the inference accuracy worsens with more examples used in calibration. It is recommended to use `entropy` mode as it produces more accurate inference results. -- **entropy:** Calculate KL divergence of the fp32 output and quantized output for optimal thresholds. This mode is expected to produce the best inference accuracy of all three kinds of quantized models if the calibration dataset is representative enough of the inference dataset. +Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply quantization flow to your project directly, please refer [Quantize custom models with MKL-DNN backend](https://mxnet.incubator.apache.org/tutorials/mkldnn/mkldnn_quantization.html). + +``` +usage: imagenet_gen_qsym_mkldnn.py [-h] [--model MODEL] [--epoch EPOCH] + [--no-pretrained] [--batch-size BATCH_SIZE] + [--label-name LABEL_NAME] + [--calib-dataset CALIB_DATASET] + [--image-shape IMAGE_SHAPE] + [--data-nthreads DATA_NTHREADS] + [--num-calib-batches NUM_CALIB_BATCHES] + [--exclude-first-conv] [--shuffle-dataset] + [--shuffle-chunk-seed SHUFFLE_CHUNK_SEED] + [--shuffle-seed SHUFFLE_SEED] + [--calib-mode CALIB_MODE] + [--quantized-dtype {auto,int8,uint8}] + [--enable-calib-quantize ENABLE_CALIB_QUANTIZE] + +Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN +support + +optional arguments: + -h, --help show this help message and exit + --model MODEL model to be quantized. + --epoch EPOCH number of epochs, default is 0 + --no-pretrained If enabled, will not download pretrained model from + MXNet or Gluon-CV modelzoo. 
+ --batch-size BATCH_SIZE + --label-name LABEL_NAME + --calib-dataset CALIB_DATASET + path of the calibration dataset + --image-shape IMAGE_SHAPE + --data-nthreads DATA_NTHREADS + number of threads for data decoding + --num-calib-batches NUM_CALIB_BATCHES + number of batches for calibration + --exclude-first-conv excluding quantizing the first conv layer since the + input data may have negative value which doesn't + support at moment + --shuffle-dataset shuffle the calibration dataset + --shuffle-chunk-seed SHUFFLE_CHUNK_SEED + shuffling chunk seed, see https://mxnet.incubator.apac + he.org/api/python/io/io.html?highlight=imager#mxnet.io + .ImageRecordIter for more details + --shuffle-seed SHUFFLE_SEED + shuffling seed, see https://mxnet.incubator.apache.org + /api/python/io/io.html?highlight=imager#mxnet.io.Image + RecordIter for more details + --calib-mode CALIB_MODE + calibration mode used for generating calibration table + for the quantized symbol; supports 1. none: no + calibration will be used. The thresholds for + quantization will be calculated on the fly. This will + result in inference speed slowdown and loss of + accuracy in general. 2. naive: simply take min and max + values of layer outputs as thresholds for + quantization. In general, the inference accuracy + worsens with more examples used in calibration. It is + recommended to use `entropy` mode as it produces more + accurate inference results. 3. entropy: calculate KL + divergence of the fp32 output and quantized output for + optimal thresholds. This mode is expected to produce + the best inference accuracy of all three kinds of + quantized models if the calibration dataset is + representative enough of the inference dataset. + --quantized-dtype {auto,int8,uint8} + quantization destination data type for input data + --enable-calib-quantize ENABLE_CALIB_QUANTIZE + If enabled, the quantize op will be calibrated offline + if calibration mode is enabled +``` Use the following command to install [Gluon-CV](https://gluon-cv.mxnet.io/): @@ -23,12 +86,13 @@ Use the following command to install [Gluon-CV](https://gluon-cv.mxnet.io/): pip install gluoncv ``` -The following models have been tested on Linux systems. +Below are some quantization demos. These models have been tested on Linux systems. 
| Model | Source | Dataset | FP32 Accuracy (top-1/top-5)| INT8 Accuracy (top-1/top-5)| |:---|:---|---|:---:|:---:| | [ResNet18-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |70.15%/89.38%|69.92%/89.26%| | [ResNet50-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 76.34%/93.13% | 75.91%/92.95% | +| [ResNet50-V1b](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 76.82%/93.38% | 76.39%/93.24% | | [ResNet101-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 77.33%/93.59% | 77.05%/93.43% | |[Squeezenet 1.0](#4)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|56.98%/79.20%|52.98%/77.21%| |[MobileNet 1.0](#5)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.23%/90.64%|72.03%/90.42%| @@ -39,7 +103,7 @@ The following models have been tested on Linux systems. | [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | VOC2007/2012 | 0.8366 mAP | 0.8364 mAP | | [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | COCO2014 | 0.2552 mAP | 0.253 mAP | -
<h3 id='3'>ResNet18/50/101-V1</h3>
+<h3 id='3'>ResNetV1</h3>
The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models: @@ -47,7 +111,7 @@ The following command is to download the pre-trained model from Gluon-CV and tra python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive ``` -The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference. +The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. Set `--model` to `resnet18_v1/resnet50_v1b/resnet101_v1` to quantize other models. The following command is to launch inference. ``` # Launch FP32 Inference @@ -204,17 +268,14 @@ SSD model is located in [example/ssd](https://github.com/apache/incubator-mxnet/ This script also supports custom symbolic models. You can easily add some quantization layer configs in `imagenet_gen_qsym_mkldnn.py` like below: ``` -elif args.model == 'custom': +else: + logger.info('Please set proper RGB configs for model %s' % args.model) # add rgb mean/std of your model. rgb_mean = '0,0,0' rgb_std = '0,0,0' - calib_layer = lambda name: name.endswith('_output') # add layer names you donnot want to quantize. - # add conv/pool layer names that has negative inputs - # since Intel® MKL-DNN only support uint8 quantization temporary. - # add all fc layer names since Intel® MKL-DNN does not support temporary. + logger.info('Please set proper excluded_sym_names for model %s' % args.model) excluded_sym_names += ['layers'] - # add your first conv layer names since Intel® MKL-DNN only support uint8 quantization temporary. if exclude_first_conv: excluded_sym_names += ['layers'] ``` @@ -230,7 +291,7 @@ Some tips on quantization configs: python imagenet_inference.py --symbol-file=./model/custom-symbol.json --param-file=./model/custom-0000.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu ``` -3. Then, you should add `rgb_mean`, `rgb_std` and `excluded_sym_names` in this script. Notice that you should exclude conv/pool layers that have negative data since Intel® MKL-DNN only supports `uint8` quantization temporarily. You should also exclude all fc layers in your model. +3. Then, you should add `rgb_mean`, `rgb_std` and `excluded_sym_names` in this script. 4. 
Then, you can run the following command for quantization: diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py index 482127ba355c..302a04449885 100644 --- a/example/quantization/imagenet_gen_qsym_mkldnn.py +++ b/example/quantization/imagenet_gen_qsym_mkldnn.py @@ -92,21 +92,12 @@ def save_params(fname, arg_params, aux_params, logger=None): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN support') - parser.add_argument('--model', type=str, choices=['resnet18_v1', - 'resnet50_v1', - 'resnet101_v1', - 'inceptionv3', - 'squeezenet1.0', - 'mobilenet1.0', - 'mobilenetv2_1.0', - 'imagenet1k-resnet-152', - 'imagenet1k-inception-bn', - 'custom'], - help='currently only supports imagenet1k-resnet-50_v1, imagenet1k-resnet-152 or imagenet1k-inception-bn.' - 'you can set to custom to load your pre-trained model.') - parser.add_argument('--use-gluon-model', type=bool, default=False, - help='If enabled, will download pretrained model from Gluon-CV ' - 'and convert to symbolic model ') + parser.add_argument('--model', type=str, default='resnet50_v1', + help='model to be quantized.') + parser.add_argument('--epoch', type=int, default=0, + help='number of epochs, default is 0') + parser.add_argument('--no-pretrained', action='store_true', default=False, + help='If enabled, will not download pretrained model from MXNet or Gluon-CV modelzoo.') parser.add_argument('--batch-size', type=int, default=32) parser.add_argument('--label-name', type=str, default='softmax_label') parser.add_argument('--calib-dataset', type=str, default='data/val_256_q90.rec', @@ -155,6 +146,7 @@ def save_params(fname, arg_params, aux_params, logger=None): logger = logging.getLogger('logger') logger.setLevel(logging.INFO) + logger.info(args) logger.info('shuffle_dataset=%s' % args.shuffle_dataset) calib_mode = args.calib_mode @@ -165,29 +157,24 @@ def save_params(fname, arg_params, aux_params, logger=None): download_calib_dataset('http://data.mxnet.io/data/val_256_q90.rec', args.calib_dataset) # download model - if args.model in ['resnet18_v1', - 'resnet50_v1', - 'resnet101_v1', - 'squeezenet1.0', - 'mobilenet1.0', - 'mobilenetv2_1.0', - 'inceptionv3']: - logger.info('model %s is converted from GluonCV' % args.model) - args.use_gluon_model = True - if args.use_gluon_model == True: - prefix = convert_from_gluon(model_name=args.model, image_shape=args.image_shape, classes=1000, logger=logger) - epoch = 0 - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - elif args.model == 'custom': + if not args.no_pretrained: + logger.info('Get pre-trained model from MXNet or Gluoncv modelzoo.') + logger.info('If you want to use custom model, please set --no-pretrained.') + if args.model in ['imagenet1k-resnet-152', 'imagenet1k-inception-bn']: + logger.info('model %s is downloaded from MXNet modelzoo' % args.model) + prefix, epoch = download_model(model_name=args.model, logger=logger) + else: + logger.info('model %s is converted from GluonCV' % args.model) + prefix = convert_from_gluon(model_name=args.model, image_shape=args.image_shape, classes=1000, logger=logger) + rgb_mean = '123.68,116.779,103.939' + rgb_std = '58.393, 57.12, 57.375' + epoch = 0 + else: dir_path = os.path.dirname(os.path.realpath(__file__)) prefix = os.path.join(dir_path, 'model', args.model) - epoch = 0 - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - else: - prefix, epoch = 
download_model(model_name=args.model, logger=logger) - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + epoch = args.epoch - sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) # get batch size batch_size = args.batch_size @@ -212,57 +199,59 @@ def save_params(fname, arg_params, aux_params, logger=None): logger.info('quantized dtype is set to uint8, will exclude first conv.') exclude_first_conv = True excluded_sym_names = [] - if args.model == 'imagenet1k-resnet-152': - rgb_mean = '0,0,0' - rgb_std = '1,1,1' - excluded_sym_names += ['flatten0'] - if exclude_first_conv: - excluded_sym_names += ['conv0'] - elif args.model == 'imagenet1k-inception-bn': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '1,1,1' - excluded_sym_names += ['flatten'] - if exclude_first_conv: - excluded_sym_names += ['conv_1'] - elif args.model in ['resnet18_v1', 'resnet50_v1', 'resnet101_v1']: - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - if exclude_first_conv: - excluded_sym_names += ['resnetv10_conv0_fwd'] - elif args.model == 'squeezenet1.0': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - excluded_sym_names += ['squeezenet0_flatten0_flatten0'] - if exclude_first_conv: - excluded_sym_names += ['squeezenet0_conv0_fwd'] - elif args.model == 'mobilenet1.0': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - excluded_sym_names += ['mobilenet0_flatten0_flatten0', - 'mobilenet0_pool0_fwd'] - if exclude_first_conv: - excluded_sym_names += ['mobilenet0_conv0_fwd'] - elif args.model == 'mobilenetv2_1.0': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - excluded_sym_names += ['mobilenetv20_output_flatten0_flatten0'] - if exclude_first_conv: - excluded_sym_names += ['mobilenetv20_conv0_fwd'] - elif args.model == 'inceptionv3': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - if exclude_first_conv: - excluded_sym_names += ['inception30_conv0_fwd'] - elif args.model == 'custom': + if not args.no_pretrained: + if args.model == 'imagenet1k-resnet-152': + rgb_mean = '0,0,0' + rgb_std = '1,1,1' + excluded_sym_names += ['flatten0'] + if exclude_first_conv: + excluded_sym_names += ['conv0'] + elif args.model == 'imagenet1k-inception-bn': + rgb_mean = '123.68,116.779,103.939' + rgb_std = '1,1,1' + excluded_sym_names += ['flatten'] + if exclude_first_conv: + excluded_sym_names += ['conv_1'] + elif args.model.find('resnet') != -1 and args.model.find('v1') != -1: + if exclude_first_conv: + excluded_sym_names += ['resnetv10_conv0_fwd'] + elif args.model.find('resnet') != -1 and args.model.find('v2') != -1: + excluded_sym_names += ['resnetv20_flatten0_flatten0'] + if exclude_first_conv: + excluded_sym_names += ['resnetv20_conv0_fwd'] + elif args.model.find('vgg') != -1: + if exclude_first_conv: + excluded_sym_names += ['vgg0_conv0_fwd'] + elif args.model.find('squeezenet1') != -1: + excluded_sym_names += ['squeezenet0_flatten0_flatten0'] + if exclude_first_conv: + excluded_sym_names += ['squeezenet0_conv0_fwd'] + elif args.model.find('mobilenet') != -1 and args.model.find('v2') == -1: + excluded_sym_names += ['mobilenet0_flatten0_flatten0', + 'mobilenet0_pool0_fwd'] + if exclude_first_conv: + excluded_sym_names += ['mobilenet0_conv0_fwd'] + elif args.model.find('mobilenet') != -1 and args.model.find('v2') != -1: + excluded_sym_names += ['mobilenetv20_output_flatten0_flatten0'] + if exclude_first_conv: + 
excluded_sym_names += ['mobilenetv20_conv0_fwd'] + elif args.model == 'inceptionv3': + if exclude_first_conv: + excluded_sym_names += ['inception30_conv0_fwd'] + else: + raise ValueError('Currently, model %s is not supported in this script' % args.model) + else: + logger.info('Please set proper RGB configs for model %s' % args.model) # add rgb mean/std of your model. rgb_mean = '0,0,0' rgb_std = '0,0,0' # add layer names you donnot want to quantize. + logger.info('Please set proper excluded_sym_names for model %s' % args.model) excluded_sym_names += ['layers'] if exclude_first_conv: excluded_sym_names += ['layers'] - else: - raise ValueError('model %s is not supported in this script' % args.model) + + logger.info('These layers have been excluded %s' % excluded_sym_names) label_name = args.label_name logger.info('label_name = %s' % label_name) @@ -281,10 +270,10 @@ def save_params(fname, arg_params, aux_params, logger=None): combine_mean_std.update(std_args) if calib_mode == 'none': logger.info('Quantizing FP32 model %s' % args.model) - qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=ctx, excluded_sym_names=excluded_sym_names, - calib_mode=calib_mode, quantized_dtype=args.quantized_dtype, - logger=logger) + qsym, qarg_params, aux_params = quantize_model_mkldnn(sym=sym, arg_params=arg_params, aux_params=aux_params, + ctx=ctx, excluded_sym_names=excluded_sym_names, + calib_mode=calib_mode, quantized_dtype=args.quantized_dtype, + logger=logger) sym_name = '%s-symbol.json' % (prefix + '-quantized') else: logger.info('Creating ImageRecordIter for reading calibration dataset') @@ -301,12 +290,12 @@ def save_params(fname, arg_params, aux_params, logger=None): seed=args.shuffle_seed, **combine_mean_std) - qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=ctx, excluded_sym_names=excluded_sym_names, - calib_mode=calib_mode, calib_data=data, - num_calib_examples=num_calib_batches * batch_size, - calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, - label_names=(label_name,), logger=logger) + qsym, qarg_params, aux_params = quantize_model_mkldnn(sym=sym, arg_params=arg_params, aux_params=aux_params, + ctx=ctx, excluded_sym_names=excluded_sym_names, + calib_mode=calib_mode, calib_data=data, + num_calib_examples=num_calib_batches * batch_size, + calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, + label_names=(label_name,), logger=logger) if calib_mode == 'entropy': suffix = '-quantized-%dbatches-entropy' % num_calib_batches elif calib_mode == 'naive': @@ -315,7 +304,6 @@ def save_params(fname, arg_params, aux_params, logger=None): raise ValueError('unknow calibration mode %s received, only supports `none`, `naive`, and `entropy`' % calib_mode) sym_name = '%s-symbol.json' % (prefix + suffix) - qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE') save_symbol(sym_name, qsym, logger) param_name = '%s-%04d.params' % (prefix + '-quantized', epoch) save_params(param_name, qarg_params, aux_params, logger) diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index b94b5a8da32a..fa2ab1842f5f 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -543,3 +543,240 @@ def quantize_model(sym, arg_params, aux_params, qarg_params = _quantize_params(qsym, arg_params, th_dict) return qsym, qarg_params, aux_params + +def quantize_model_mkldnn(sym, arg_params, aux_params, + data_names=('data',), 
label_names=('softmax_label',), + ctx=cpu(), excluded_sym_names=None, calib_mode='entropy', + calib_data=None, num_calib_examples=None, calib_layer=None, + quantized_dtype='int8', logger=logging): + """User-level API for generating a fusion + quantized model from a FP32 model + w/ or w/o calibration with Intel MKL-DNN. + The backend quantized operators are only enabled for Linux systems. Please do not run + inference using the quantized models on Windows for now. + + Parameters + ---------- + sym : str or Symbol + Defines the structure of a neural network for FP32 data types. + arg_params : dict + Dictionary of name to `NDArray`. + aux_params : dict + Dictionary of name to `NDArray`. + data_names : a list of strs + Data names required for creating a Module object to run forward propagation on the + calibration dataset. + label_names : a list of strs + Label names required for creating a Module object to run forward propagation on the + calibration dataset. + ctx : Context + Defines the device that users want to run forward propagation on the calibration + dataset for collecting layer output statistics. Currently, only supports single context. + excluded_sym_names : list of strings + A list of strings representing the names of the symbols that users want to excluding + from being quantized. + calib_mode : str + If calib_mode='none', no calibration will be used and the thresholds for + requantization after the corresponding layers will be calculated at runtime by + calling min and max operators. The quantized models generated in this + mode are normally 10-20% slower than those with calibrations during inference. + If calib_mode='naive', the min and max values of the layer outputs from a calibration + dataset will be directly taken as the thresholds for quantization. + If calib_mode='entropy' (default mode), the thresholds for quantization will be + derived such that the KL divergence between the distributions of FP32 layer outputs and + quantized layer outputs is minimized based upon the calibration dataset. + calib_data : DataIter + A data iterator initialized by the calibration dataset. + num_calib_examples : int or None + The maximum number of examples that user would like to use for calibration. If not provided, + the whole calibration dataset will be used. + calib_layer : function + Given a layer's output name in string, return True or False for deciding whether to + calibrate this layer. If yes, the statistics of the layer's output will be collected; + otherwise, no information of the layer's output will be collected. If not provided, + all the layers' outputs that need requantization will be collected. + quantized_dtype : str + The quantized destination type for input data. Currently support 'int8' + , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. + Default value is 'int8'. + logger : Object + A logging object for printing information during the process of quantization. + + Returns + ------- + tuple + A tuple of quantized symbol, quantized arg_params, and aux_params. 
+ ------- + """ + if ctx != cpu(): + raise ValueError( + 'quantize_model_mkldnn only support Intel cpu platform with MKL-DNN Backend') + + sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') + + qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, + data_names=data_names, label_names=label_names, + ctx=ctx, excluded_sym_names=excluded_sym_names, + calib_mode=calib_mode, calib_data=calib_data, + num_calib_examples=num_calib_examples, calib_layer=calib_layer, + quantized_dtype=quantized_dtype, logger=logger) + + qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE') + + return qsym, qarg_params, aux_params + +def quantize_graph(sym, arg_params, aux_params, + excluded_sym_names=None, calib_mode='entropy', + calib_layer=None, quantized_dtype='int8', logger=logging): + """User-level API for generating a quantized model from a FP32 model w/o calibration + and a collector for naive or entropy calibration. + The backend quantized operators are only enabled for Linux systems. Please do not run + inference using the quantized models on Windows for now. + The quantization implementation adopts the TensorFlow's approach: + https://www.tensorflow.org/performance/quantization. + The calibration implementation borrows the idea of Nvidia's 8-bit Inference with TensorRT: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + and adapts the method to MXNet. + Parameters + ---------- + sym : str or Symbol + Defines the structure of a neural network for FP32 data types. + arg_params : dict + Dictionary of name to `NDArray`. + aux_params : dict + Dictionary of name to `NDArray`. + excluded_sym_names : list of strings + A list of strings representing the names of the symbols that users want to excluding + from being quantized. + calib_mode : str + If calib_mode='none', no calibration will be used and the thresholds for + requantization after the corresponding layers will be calculated at runtime by + calling min and max operators. The quantized models generated in this + mode are normally 10-20% slower than those with calibrations during inference. + If calib_mode='naive', the min and max values of the layer outputs from a calibration + dataset will be directly taken as the thresholds for quantization. + If calib_mode='entropy' (default mode), the thresholds for quantization will be + derived such that the KL divergence between the distributions of FP32 layer outputs and + quantized layer outputs is minimized based upon the calibration dataset. + calib_layer : function + Given a layer's output name in string, return True or False for deciding whether to + calibrate this layer. If yes, the statistics of the layer's output will be collected; + otherwise, no information of the layer's output will be collected. If not provided, + all the layers' outputs that need requantization will be collected. + quantized_dtype : str + The quantized destination type for input data. Currently support 'int8' + , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. + Default value is 'int8'. + logger : Object + A logging object for printing information during the process of quantization. + Returns + ------- + tuple + A tuple of quantized symbol, quantized arg_params, aux_params and collector. 
+ ------- + """ + if excluded_sym_names is None: + excluded_sym_names = [] + if not isinstance(excluded_sym_names, list): + raise ValueError('excluded_sym_names must be a list of strings representing' + ' the names of the symbols that will not be quantized,' + ' while received type %s' % str(type(excluded_sym_names))) + + logger.info('Quantizing graph') + if quantized_dtype not in ('int8', 'uint8', 'auto'): + raise ValueError('unknown quantized_dtype %s received,' + ' expected `int8`, `uint8` or `auto`' % quantized_dtype) + qsym = _quantize_symbol(sym, excluded_symbols=excluded_sym_names, + offline_params=list(arg_params.keys()), + quantized_dtype=quantized_dtype) + + th_dict = {} + collector = None + if calib_mode is not None and calib_mode != 'none': + if calib_mode == 'entropy': + collector = _LayerOutputCollector( + include_layer=calib_layer, logger=logger) + logger.info( + 'Create a layer output collector for entropy calibration.') + elif calib_mode == 'naive': + collector = _LayerOutputMinMaxCollector( + include_layer=calib_layer, logger=logger) + logger.info( + 'Create a layer output minmax collector for naive calibration') + else: + raise ValueError('unknown calibration mode %s received,' + ' expected `none`, `naive`, or `entropy`' % calib_mode) + logger.info('Collector created, please use set_monitor_callback' + ' to collect calibration information.') + + logger.info('Quantizing parameters') + qarg_params = _quantize_params(qsym, arg_params, th_dict) + + return qsym, qarg_params, aux_params, collector + +def calib_graph(qsym, arg_params, aux_params, collector, + calib_mode='entropy', quantized_dtype='int8', logger=logging): + """User-level API for calibrating a quantized model using a filled collector. + The backend quantized operators are only enabled for Linux systems. Please do not run + inference using the quantized models on Windows for now. + The quantization implementation adopts the TensorFlow's approach: + https://www.tensorflow.org/performance/quantization. + The calibration implementation borrows the idea of Nvidia's 8-bit Inference with TensorRT: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + and adapts the method to MXNet. + Parameters + ---------- + qsym : str or Symbol + Defines the structure of a neural network for INT8 data types. + arg_params : dict + Dictionary of name to `NDArray`. + aux_params : dict + Dictionary of name to `NDArray`. + collector : function + layer collector for naive or entropy calibration. + calib_mode : str + If calib_mode='none', no calibration will be used and the thresholds for + requantization after the corresponding layers will be calculated at runtime by + calling min and max operators. The quantized models generated in this + mode are normally 10-20% slower than those with calibrations during inference. + If calib_mode='naive', the min and max values of the layer outputs from a calibration + dataset will be directly taken as the thresholds for quantization. + If calib_mode='entropy' (default mode), the thresholds for quantization will be + derived such that the KL divergence between the distributions of FP32 layer outputs and + quantized layer outputs is minimized based upon the calibration dataset. + calib_layer : function + Given a layer's output name in string, return True or False for deciding whether to + calibrate this layer. If yes, the statistics of the layer's output will be collected; + otherwise, no information of the layer's output will be collected. 
If not provided, + all the layers' outputs that need requantization will be collected. + quantized_dtype : str + The quantized destination type for input data. Currently support 'int8' + , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. + Default value is 'int8'. + logger : Object + A logging object for printing information during the process of quantization. + Returns + ------- + tuple + A tuple of calibrated symbol, quantized arg_params, aux_params. + ------- + """ + th_dict = {} + if calib_mode is not None and calib_mode != 'none': + if calib_mode == 'entropy': + logger.info('Calculating optimal thresholds for quantization') + th_dict = _get_optimal_thresholds( + collector.nd_dict, quantized_dtype, logger=logger) + elif calib_mode == 'naive': + th_dict = collector.min_max_dict + else: + raise ValueError('unknown calibration mode %s received,' + ' expected `none`, `naive`, or `entropy`' % calib_mode) + logger.info('Calibrating quantized symbol') + qsym = _calibrate_quantized_sym(qsym, th_dict) + else: + raise ValueError('please set calibration mode to naive or entropy.') + + logger.info('Quantizing parameters') + qarg_params = _quantize_params(qsym, arg_params, th_dict) + + return qsym, qarg_params, aux_params diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index 0c4954acbd8b..5fe6a03eae7b 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -213,3 +213,6 @@ def test_control_flow(): def test_amp(): assert _test_tutorial_nb('amp/amp_tutorial') + +def test_mkldnn_quantization(): + assert _test_tutorial_nb('mkldnn/mkldnn_quantization') \ No newline at end of file From 0b1c8f6d31113c6ce1b1a1c35dc03925da77a890 Mon Sep 17 00:00:00 2001 From: dtracz <41399548+dtracz@users.noreply.github.com> Date: Thu, 1 Aug 2019 14:41:13 -0700 Subject: [PATCH 16/24] make TransposeShape infer shape form both sides (#15713) * make TransposeShape infer shape form both sides * small fixes * remove redundant lines * unit tests --- src/operator/tensor/matrix_op-inl.h | 19 +++++++++++++++++-- tests/python/unittest/test_operator.py | 20 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 5cd7bf6652d3..cd98cb020c6b 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -344,19 +344,34 @@ inline bool TransposeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); mxnet::TShape& shp = (*in_attrs)[0]; + mxnet::TShape& out_shp = (*out_attrs)[0]; CHECK_LE(shp.ndim(), 6) << "Transpose support at most 6 dimensions"; - mxnet::TShape ret(shp.ndim(), -1); + CHECK_NE(shp.ndim(), 0) << "Number of dimensions cannot be 0"; + CHECK_NE(out_shp.ndim(), 0) << "Number of dimensions cannot be 0"; + if (shp.ndim() == -1 && out_shp.ndim() == -1) + return false; // none of the shapes is known + if (out_shp.ndim() > 0 && shp.ndim() > 0) + CHECK_EQ(out_shp.ndim(), shp.ndim()); + mxnet::TShape get(std::max(shp.ndim(), out_shp.ndim()), -1); + mxnet::TShape ret(std::max(shp.ndim(), out_shp.ndim()), -1); if (param.axes.ndim() == 0) { for (int i = 0; i < shp.ndim(); ++i) { ret[i] = shp[shp.ndim()-1-i]; } + for (int i = 0; i < out_shp.ndim(); ++i) { + get[shp.ndim()-1-i] = out_shp[i]; + } } else { - CHECK_EQ(shp.ndim(), param.axes.ndim()); + CHECK_EQ(std::max(shp.ndim(), out_shp.ndim()), param.axes.ndim()); for (int i = 0; i < 
shp.ndim(); ++i) { CHECK(param.axes[i] < static_cast(shp.ndim())); ret[i] = shp[param.axes[i]]; } + for (int i = 0; i < out_shp.ndim(); ++i) { + get[param.axes[i]] = out_shp[i]; + } } + SHAPE_ASSIGN_CHECK(*in_attrs, 0, get); SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret); return shape_is_known(ret); } diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index adc52a1dd50f..5d7e51af7467 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -8997,6 +8997,26 @@ def test_get_operator_arguments(): ok_(operator_arguments.narg == 2) +def test_transpose_infer_shape_back(): + o1 = mx.sym.ones(shape=[2,3]) + o2 = mx.sym.ones(shape=[-1,-1]) + t = mx.sym.transpose(o2) + b = o1 + t + x = b.bind(mx.cpu(), args={}) + y = x.forward() + assert(y[0].shape == (2,3)) + + +def test_transpose_infer_shape_mixed(): + o1 = mx.sym.ones(shape=[2,-1]) + o2 = mx.sym.ones(shape=[3,-1]) + t = mx.sym.transpose(o2) + b = o1 + t + x = b.bind(mx.cpu(), args={}) + y = x.forward() + assert(y[0].shape == (2,3)) + + if __name__ == '__main__': import nose nose.runmodule() From a3d32e4e476699edcdfbdb1acba739c19069598b Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Thu, 1 Aug 2019 15:09:04 -0700 Subject: [PATCH 17/24] [MXNET-1358] Fit api tutorial (#15353) * Added tutorial for FIT API * Added tests for Fit API tutorial * Updated index.md for the new tutorial to show up * Addressed PR feedback * Addressed PR feedback * Removed spurious comment for Py2 and Py3 compatibility * Address PR feedback * Addressed PR feedback * Fixed typo * Added example to showcase custom event handler * Fixed imports as estimator moved to contrib package * Added a side note to inform about estimator reference being updated by the handlers * Corrected typo * update tutorial * address comments * new line * fix import * fix cached graph * fix import * address comments * fix doc gen * add softmax * add to website index * fix doc string * Fix doc gen (#12) * fix warining * fix test * fix * fix * fix print * fix test (#13) * fix warning (#14) * fix href (#15) --- docs/api/python/gluon/contrib.md | 30 ++ docs/tutorials/gluon/fit_api_tutorial.md | 271 ++++++++++++++++++ docs/tutorials/index.md | 2 + python/mxnet/gluon/contrib/__init__.py | 2 + .../mxnet/gluon/contrib/estimator/__init__.py | 2 + .../gluon/contrib/estimator/estimator.py | 64 +++-- .../gluon/contrib/estimator/event_handler.py | 23 +- tests/python/unittest/test_gluon_estimator.py | 7 +- tests/tutorials/test_tutorials.py | 3 + 9 files changed, 367 insertions(+), 37 deletions(-) create mode 100644 docs/tutorials/gluon/fit_api_tutorial.md diff --git a/docs/api/python/gluon/contrib.md b/docs/api/python/gluon/contrib.md index a940f697de69..22cdebb53b85 100644 --- a/docs/api/python/gluon/contrib.md +++ b/docs/api/python/gluon/contrib.md @@ -114,6 +114,33 @@ In the rest of this document, we list routines provided by the `gluon.contrib` p WikiText103 ``` +### Estimator + +```eval_rst +.. currentmodule:: mxnet.gluon.contrib.estimator + +.. autosummary:: + :nosignatures: + + Estimator +``` + +#### EventHandler + +```eval_rst +.. currentmodule:: mxnet.gluon.contrib.estimator + +.. autosummary:: + :nosignatures: + + StoppingHandler + MetricHandler + ValidationHandler + LoggingHandler + CheckpointHandler + EarlyStoppingHandler +``` + ## API Reference @@ -144,6 +171,9 @@ In the rest of this document, we list routines provided by the `gluon.contrib` p :members: :imported-members: +.. 
automodule:: mxnet.gluon.contrib.estimator + :members: + :imported-members: ``` diff --git a/docs/tutorials/gluon/fit_api_tutorial.md b/docs/tutorials/gluon/fit_api_tutorial.md new file mode 100644 index 000000000000..bc50690ac1a2 --- /dev/null +++ b/docs/tutorials/gluon/fit_api_tutorial.md @@ -0,0 +1,271 @@ + + + + + + + + + + + + + + + + + + +# MXNet Gluon Fit API + +In this tutorial, you will learn how to use the [Gluon Fit API](https://cwiki.apache.org/confluence/display/MXNET/Gluon+Fit+API+-+Tech+Design) which is the easiest way to train deep learning models using the [Gluon API](http://mxnet.incubator.apache.org/versions/master/gluon/index.html) in Apache MXNet. + +With the Fit API, you can train a deep learning model with a minimal amount of code. Just specify the network, loss function and the data you want to train on. You don't need to worry about the boiler plate code to loop through the dataset in batches (often called as 'training loop'). Advanced users can train with bespoke training loops, and many of these use cases will be covered by the Fit API. + +To demonstrate the Fit API, you will train an image classification model using the [ResNet-18](https://arxiv.org/abs/1512.03385) neural network architecture. The model will be trained using the [Fashion-MNIST dataset](https://research.zalando.com/welcome/mission/research-projects/fashion-mnist/). + +## Prerequisites + +To complete this tutorial, you will need: + +- [MXNet](https://mxnet.incubator.apache.org/install/#overview) (The version of MXNet will be >= 1.5.0, you can use `pip install mxnet` to get 1.5.0 release pip package or build from source with master, refer to [MXNet installation](http://mxnet.incubator.apache.org/versions/master/install/index.html?platform=Linux&language=Python&processor=CPU) +- [Jupyter Notebook](https://jupyter.org/index.html) (For interactively running the provided .ipynb file) + + + + +```python +import mxnet as mx +from mxnet import gluon +from mxnet.gluon.model_zoo import vision +from mxnet.gluon.contrib.estimator import estimator +from mxnet.gluon.contrib.estimator.event_handler import TrainBegin, TrainEnd, EpochEnd, CheckpointHandler + +gpu_count = mx.context.num_gpus() +ctx = [mx.gpu(i) for i in range(gpu_count)] if gpu_count > 0 else mx.cpu() +``` + +## Dataset + +[Fashion-MNIST](https://research.zalando.com/welcome/mission/research-projects/fashion-mnist/) dataset consists of fashion items divided into ten categories: t-shirt/top, trouser, pullover, dress, coat, sandal, shirt, sneaker, bag and ankle boot. + +- It has 60,000 grayscale images of size 28 * 28 for training. +- It has 10,000 grayscale images of size 28 * 28 for testing/validation. + +We will use the ```gluon.data.vision``` package to directly import the Fashion-MNIST dataset and perform pre-processing on it. + + +```python +# Get the training data +fashion_mnist_train = gluon.data.vision.FashionMNIST(train=True) + +# Get the validation data +fashion_mnist_val = gluon.data.vision.FashionMNIST(train=False) +``` + + +```python +transforms = [gluon.data.vision.transforms.Resize(224), # We pick 224 as the model we use takes an input of size 224. + gluon.data.vision.transforms.ToTensor()] + +# Now we will stack all these together. 
+transforms = gluon.data.vision.transforms.Compose(transforms) +``` + + +```python +# Apply the transformations +fashion_mnist_train = fashion_mnist_train.transform_first(transforms) +fashion_mnist_val = fashion_mnist_val.transform_first(transforms) +``` + + +```python +batch_size = 256 # Batch size of the images +num_workers = 4 # The number of parallel workers for loading the data using Data Loaders. + +train_data_loader = gluon.data.DataLoader(fashion_mnist_train, batch_size=batch_size, + shuffle=True, num_workers=num_workers) +val_data_loader = gluon.data.DataLoader(fashion_mnist_val, batch_size=batch_size, + shuffle=False, num_workers=num_workers) +``` + +## Model and Optimizers + +Let's load the resnet-18 model architecture from [Gluon Model Zoo](http://mxnet.apache.org/api/python/gluon/model_zoo.html) and initialize its parameters. The Gluon Model Zoo contains a repository of pre-trained models as well the model architecture definitions. We are using the model architecture from the model zoo in order to train it from scratch. + + +```python +resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes = 10) +resnet_18_v1.initialize(init = mx.init.Xavier(), ctx=ctx) +``` + +We will be using `SoftmaxCrossEntropyLoss` as the loss function since this is a multi-class classification problem. We will be using `sgd` (Stochastic Gradient Descent) as the optimizer. +You can experiment with a [different loss](http://mxnet.incubator.apache.org/versions/master/api/python/gluon/loss.html) or [optimizer](http://mxnet.incubator.apache.org/versions/master/api/python/optimization/optimization.html) as well. + + +```python +loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() +``` + +Let's define the trainer object for training the model. + + +```python +learning_rate = 0.04 # You can experiment with your own learning rate here +num_epochs = 2 # You can run training for more epochs +trainer = gluon.Trainer(resnet_18_v1.collect_params(), + 'sgd', {'learning_rate': learning_rate}) +``` + +## Train using Fit API + +As stated earlier, the Fit API greatly simplifies the boiler plate code and complexity for training using MXNet Gluon. + +In the basic usage example, with just 2 lines of code, we will set up our model for training. + +### Basic Usage + + +```python +train_acc = mx.metric.Accuracy() # Metric to monitor + +# Define the estimator, by passing to it the model, loss function, metrics, trainer object and context +est = estimator.Estimator(net=resnet_18_v1, + loss=loss_fn, + metrics=train_acc, + trainer=trainer, + context=ctx) + +# ignore warnings for nightly test on CI only +import warnings +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Magic line + est.fit(train_data=train_data_loader, + epochs=num_epochs) +``` + + Training begin: using optimizer SGD with current learning rate 0.0400 + Train for 2 epochs. + + [Epoch 0] finished in 25.110s: train_accuracy : 0.7877 train_softmaxcrossentropyloss0 : 0.5905 + + [Epoch 1] finished in 23.595s: train_accuracy : 0.8823 train_softmaxcrossentropyloss0 : 0.3197 + Train finished using total 48s at epoch 1. train_accuracy : 0.8823 train_softmaxcrossentropyloss0 : 0.3197 + + +### Advanced Usage + +The Fit API is also customizable with several `Event Handlers` which give a fine grained control over the steps in training and exposes callback methods that provide control over the stages involved in training. Available callback methods are: `train_begin`, `train_end`, `batch_begin`, `batch_end`, `epoch_begin` and `epoch_end`. 
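+
+As a quick illustration (this is only a sketch: it reuses `est` and `val_data_loader` from above, together with the built-in handlers described below), you can pass pre-configured handlers directly to `fit`, which overrides the default handlers of the same type:
+
+```python
+from mxnet.gluon.contrib.estimator.event_handler import ValidationHandler, LoggingHandler
+
+# Reuse the metric objects the estimator hands to its default handlers,
+# so every handler reports on the same set of metrics.
+est.prepare_loss_and_metrics()
+val_handler = ValidationHandler(val_data=val_data_loader,
+                                eval_fn=est.evaluate,
+                                val_metrics=est.val_metrics)
+log_handler = LoggingHandler(train_metrics=est.train_metrics,
+                             val_metrics=est.val_metrics)
+
+est.fit(train_data=train_data_loader,
+        epochs=num_epochs,
+        event_handlers=[val_handler, log_handler])
+```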
+
+You can use built-in event handlers such as `LoggingHandler`, `CheckpointHandler` or `EarlyStoppingHandler` to log and save the model at certain time-steps during training. You can also stop the training when the model's performance plateaus.
+There are also some utility handlers that will be added to your estimator by default. For example, `StoppingHandler` is used to control when the training ends, based on the number of epochs or the number of batches trained.
+`MetricHandler` is used to calculate training metrics at the end of each batch and epoch.
+`ValidationHandler` is used to validate your model on test data at each epoch's end and then calculate validation metrics.
+You can create these utility handlers with different configurations and pass them to the estimator. This will override the default handler configuration.
+You can create a custom handler by inheriting one or multiple
+[base event handlers](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/contrib/estimator/event_handler.py#L32)
+ including: `TrainBegin`, `TrainEnd`, `EpochBegin`, `EpochEnd`, `BatchBegin`, `BatchEnd`.
+
+
+### Custom Event Handler
+
+Here we will showcase an example custom event handler that inherits features from a few base handler classes.
+Our custom event handler is a simple one: record the loss values at the end of every epoch in our training phase.
+
+Note: For each of these methods, the `Estimator` object is passed along, so you can access training metrics.
+
+```python
+class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd):
+    def __init__(self):
+        super(LossRecordHandler, self).__init__()
+        self.loss_history = {}
+
+    def train_begin(self, estimator, *args, **kwargs):
+        print("Training begin")
+
+    def train_end(self, estimator, *args, **kwargs):
+        # Print all the losses at the end of training
+        print("Training ended")
+        for loss_name in self.loss_history:
+            for i, loss_val in enumerate(self.loss_history[loss_name]):
+                print("Epoch: {}, Loss name: {}, Loss value: {}".format(i, loss_name, loss_val))
+
+    def epoch_end(self, estimator, *args, **kwargs):
+        for metric in estimator.train_metrics:
+            # look for train Loss in training metrics
+            # we wrapped loss value as a metric to record it
+            if isinstance(metric, mx.metric.Loss):
+                loss_name, loss_val = metric.get()
+                # append loss value for this epoch
+                self.loss_history.setdefault(loss_name, []).append(loss_val)
+```
+
+
+```python
+# Let's reset the model, trainer and accuracy objects from above
+
+resnet_18_v1.initialize(force_reinit=True, init=mx.init.Xavier(), ctx=ctx)
+trainer = gluon.Trainer(resnet_18_v1.collect_params(),
+                        'sgd', {'learning_rate': learning_rate})
+train_acc = mx.metric.Accuracy()
+```
+
+
+```python
+# Define the estimator, by passing to it the model, loss function, metrics, trainer object and context
+est = estimator.Estimator(net=resnet_18_v1,
+                          loss=loss_fn,
+                          metrics=train_acc,
+                          trainer=trainer,
+                          context=ctx)
+
+# Define the handlers; let's use the built-in CheckpointHandler
+checkpoint_handler = CheckpointHandler(model_dir='./',
+                                       model_prefix='my_model',
+                                       monitor=train_acc,  # Monitors a metric
+                                       save_best=True)  # Save the best model in terms of
+# Let's instantiate another handler which we defined above
+loss_record_handler = LossRecordHandler()
+# ignore warnings for nightly test on CI only
+import warnings
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    # Magic line
+    est.fit(train_data=train_data_loader,
+            val_data=val_data_loader,
+            epochs=num_epochs,
+            event_handlers=[checkpoint_handler, loss_record_handler])  # Add the event handlers
+```
+
+    Training begin: using optimizer SGD with current learning rate 0.0400
+    Train for 2 epochs.
+
+    [Epoch 0] finished in 25.236s: train_accuracy : 0.7917 train_softmaxcrossentropyloss0 : 0.5741 val_accuracy : 0.6612 val_softmaxcrossentropyloss0 : 0.8627
+
+    [Epoch 1] finished in 24.892s: train_accuracy : 0.8826 train_softmaxcrossentropyloss0 : 0.3229 val_accuracy : 0.8474 val_softmaxcrossentropyloss0 : 0.4262
+
+    Train finished using total 50s at epoch 1. train_accuracy : 0.8826 train_softmaxcrossentropyloss0 : 0.3229 val_accuracy : 0.8474 val_softmaxcrossentropyloss0 : 0.4262
+
+    Training begin
+    Epoch 1, loss 0.5741
+    Epoch 2, loss 0.3229
+
+You can load the saved model by using the `load_parameters` API in Gluon. For more details, refer to the [Loading model parameters from file tutorial](save_load_params.html#saving-model-parameters-to-file).
+
+
+```python
+resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes=10)
+resnet_18_v1.load_parameters('./my_model-best.params', ctx=ctx)
+```
+
+## Summary
+
+- To learn more about deep learning with MXNet, see [Dive Into Deep Learning](http://gluon.io)
+
+## Next Steps
+
+- For more hands-on learning about deep learning, check out [Dive into Deep Learning](https://d2l.ai)
+
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index e01a30dbe68c..f773a79f63a7 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -139,6 +139,8 @@ Select API:
 * [Data Transforms](/tutorials/gluon/transforms.html)
 * [Applying Data Augmentation](/tutorials/gluon/data_augmentation.html)
 * [Data Augmentation with Masks (for Object Segmentation)](https://mxnet.incubator.apache.org/tutorials/python/data_augmentation_with_masks.html)
+    * Fit API
+    * [Using Fit API](/tutorials/gluon/fit_api_tutorial.html)
diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index 83be8a39ba32..7590eb740f67 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -25,3 +25,5 @@ from . import cnn from . import data + +from . import estimator diff --git a/python/mxnet/gluon/contrib/estimator/__init__.py b/python/mxnet/gluon/contrib/estimator/__init__.py index 58600dadffb4..bb0a0917c363 100644 --- a/python/mxnet/gluon/contrib/estimator/__init__.py +++ b/python/mxnet/gluon/contrib/estimator/__init__.py @@ -17,5 +17,7 @@ # pylint: disable=wildcard-import """Gluon Estimator Module""" +from . import estimator +from . import event_handler from .estimator import * from .event_handler import * diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py index da1a3915caec..b6142e100d96 100644 --- a/python/mxnet/gluon/contrib/estimator/estimator.py +++ b/python/mxnet/gluon/contrib/estimator/estimator.py @@ -24,9 +24,15 @@ from .event_handler import MetricHandler, ValidationHandler, LoggingHandler, StoppingHandler from .event_handler import TrainBegin, EpochBegin, BatchBegin, BatchEnd, EpochEnd, TrainEnd -from .... import gluon, autograd +from ...data import DataLoader +from ...loss import SoftmaxCrossEntropyLoss +from ...loss import Loss as gluon_loss +from ...trainer import Trainer +from ...utils import split_and_load +from .... import autograd from ....context import Context, cpu, gpu, num_gpus -from ....metric import EvalMetric, Loss, Accuracy +from ....metric import EvalMetric, Accuracy +from ....metric import Loss as metric_loss __all__ = ['Estimator'] @@ -69,9 +75,9 @@ def __init__(self, net, self.trainer = self._check_trainer(trainer) def _check_loss(self, loss): - if isinstance(loss, gluon.loss.Loss): + if isinstance(loss, gluon_loss): loss = [loss] - elif isinstance(loss, list) and all([isinstance(l, gluon.loss.Loss) for l in loss]): + elif isinstance(loss, list) and all([isinstance(l, gluon_loss) for l in loss]): loss = loss else: raise ValueError("loss must be a Loss or a list of Loss, " @@ -146,9 +152,9 @@ def _check_trainer(self, trainer): if not trainer: warnings.warn("No trainer specified, default SGD optimizer " "with learning rate 0.001 is used.") - trainer = gluon.Trainer(self.net.collect_params(), - 'sgd', {'learning_rate': 0.001}) - elif not isinstance(trainer, gluon.Trainer): + trainer = Trainer(self.net.collect_params(), + 'sgd', {'learning_rate': 0.001}) + elif not isinstance(trainer, Trainer): raise ValueError("Trainer must be a Gluon Trainer instance, refer to " "gluon.Trainer:{}".format(trainer)) return trainer @@ -165,8 +171,8 @@ def _is_initialized(self): def _get_data_and_label(self, batch, ctx, batch_axis=0): data = batch[0] label = batch[1] - data = gluon.utils.split_and_load(data, ctx_list=ctx, batch_axis=batch_axis) - label = gluon.utils.split_and_load(label, ctx_list=ctx, batch_axis=batch_axis) + data = split_and_load(data, ctx_list=ctx, batch_axis=batch_axis) + label = split_and_load(label, ctx_list=ctx, batch_axis=batch_axis) return data, label def prepare_loss_and_metrics(self): @@ -179,13 +185,13 @@ def prepare_loss_and_metrics(self): """ if any(not hasattr(self, attribute) for attribute in ['train_metrics', 'val_metrics']): - # Use default mx.metric.Accuracy() for gluon.loss.SoftmaxCrossEntropyLoss() - if not self.train_metrics and any([isinstance(l, gluon.loss.SoftmaxCrossEntropyLoss) for l in self.loss]): + # Use default mx.metric.Accuracy() 
for SoftmaxCrossEntropyLoss() + if not self.train_metrics and any([isinstance(l, SoftmaxCrossEntropyLoss) for l in self.loss]): self.train_metrics = [Accuracy()] self.val_metrics = [] for loss in self.loss: # remove trailing numbers from loss name to avoid confusion - self.train_metrics.append(Loss(loss.name.rstrip('1234567890'))) + self.train_metrics.append(metric_loss(loss.name.rstrip('1234567890'))) for metric in self.train_metrics: val_metric = copy.deepcopy(metric) metric.name = "train " + metric.name @@ -208,10 +214,10 @@ def evaluate(self, batch_axis : int, default 0 Batch axis to split the validation data into devices. """ - if not isinstance(val_data, gluon.data.DataLoader): + if not isinstance(val_data, DataLoader): raise ValueError("Estimator only support input as Gluon DataLoader. Alternatively, you " "can transform your DataIter or any NDArray into Gluon DataLoader. " - "Refer to gluon.data.dataloader") + "Refer to gluon.data.DataLoader") for metric in val_metrics: metric.reset() @@ -222,7 +228,7 @@ def evaluate(self, loss = [self.loss[0](y_hat, y) for y_hat, y in zip(pred, label)] # update metrics for metric in val_metrics: - if isinstance(metric, Loss): + if isinstance(metric, metric_loss): metric.update(0, loss) else: metric.update(label, pred) @@ -254,7 +260,7 @@ def fit(self, train_data, batch_axis : int, default 0 Batch axis to split the training data into devices. """ - if not isinstance(train_data, gluon.data.DataLoader): + if not isinstance(train_data, DataLoader): raise ValueError("Estimator only support input as Gluon DataLoader. Alternatively, you " "can transform your DataIter or any NDArray into Gluon DataLoader. " "Refer to gluon.data.dataloader") @@ -328,28 +334,36 @@ def fit(self, train_data, def _prepare_default_handlers(self, val_data, event_handlers): event_handlers = event_handlers or [] default_handlers = [] - train_metrics, val_metrics = self.prepare_loss_and_metrics() + self.prepare_loss_and_metrics() # no need to add to default handler check as StoppingHandler does not use metrics event_handlers.append(StoppingHandler(self.max_epoch, self.max_batch)) + default_handlers.append("StoppingHandler") if not any(isinstance(handler, MetricHandler) for handler in event_handlers): - event_handlers.append(MetricHandler(train_metrics=train_metrics)) + event_handlers.append(MetricHandler(train_metrics=self.train_metrics)) default_handlers.append("MetricHandler") - if val_data and not any(isinstance(handler, ValidationHandler) for handler in event_handlers): - event_handlers.append(ValidationHandler(val_data=val_data, eval_fn=self.evaluate, - val_metrics=val_metrics)) - default_handlers.append("ValidationHandler") + if not any(isinstance(handler, ValidationHandler) for handler in event_handlers): + # no validation handler + if val_data: + # add default validation handler if validation data found + event_handlers.append(ValidationHandler(val_data=val_data, eval_fn=self.evaluate, + val_metrics=self.val_metrics)) + default_handlers.append("ValidationHandler") + val_metrics = self.val_metrics + else: + # set validation metrics to None if no validation data and no validation handler + val_metrics = [] if not any(isinstance(handler, LoggingHandler) for handler in event_handlers): - event_handlers.append(LoggingHandler(train_metrics=train_metrics, + event_handlers.append(LoggingHandler(train_metrics=self.train_metrics, val_metrics=val_metrics)) default_handlers.append("LoggingHandler") # if there is a mix of user defined event handlers and default event handlers # they 
should have the same set of loss and metrics - if default_handlers: + if default_handlers and len(event_handlers) != len(default_handlers): msg = "You are training with the following default event handlers: %s. " \ "They use loss and metrics from estimator.prepare_loss_and_metrics(). " \ "Please use the same set of metrics for all your other handlers." % \ @@ -368,7 +382,7 @@ def _prepare_default_handlers(self, val_data, event_handlers): # remove None metric references references = set([ref for ref in references if ref]) for metric in references: - if metric not in train_metrics + val_metrics: + if metric not in self.train_metrics + self.val_metrics: msg = "We have added following default handlers for you: %s and used " \ "estimator.prepare_loss_and_metrics() to pass metrics to " \ "those handlers. Please use the same set of metrics " \ diff --git a/python/mxnet/gluon/contrib/estimator/event_handler.py b/python/mxnet/gluon/contrib/estimator/event_handler.py index ed97c7bc3d19..da2c84455e35 100644 --- a/python/mxnet/gluon/contrib/estimator/event_handler.py +++ b/python/mxnet/gluon/contrib/estimator/event_handler.py @@ -26,7 +26,12 @@ import numpy as np -from ....metric import EvalMetric, Loss +from ....metric import EvalMetric +from ....metric import Loss as metric_loss + +__all__ = ['TrainBegin', 'TrainEnd', 'EpochBegin', 'EpochEnd', 'BatchBegin', 'BatchEnd', + 'StoppingHandler', 'MetricHandler', 'ValidationHandler', + 'LoggingHandler', 'CheckpointHandler', 'EarlyStoppingHandler'] class TrainBegin(object): @@ -127,7 +132,7 @@ def batch_end(self, estimator, *args, **kwargs): label = kwargs['label'] loss = kwargs['loss'] for metric in self.train_metrics: - if isinstance(metric, Loss): + if isinstance(metric, metric_loss): # metric wrapper for loss values metric.update(0, loss) else: @@ -135,7 +140,7 @@ def batch_end(self, estimator, *args, **kwargs): class ValidationHandler(TrainBegin, BatchEnd, EpochEnd): - """"Validation Handler that evaluate model on validation dataset + """Validation Handler that evaluate model on validation dataset :py:class:`ValidationHandler` takes validation dataset, an evaluation function, metrics to be evaluated, and how often to run the validation. You can provide custom @@ -430,7 +435,7 @@ def train_begin(self, estimator, *args, **kwargs): self.current_epoch = 0 self.current_batch = 0 if self.save_best: - self.best = np.Inf if self.monitor_op == np.less else -np.Inf # pylint: disable=comparison-with-callable + self.best = np.Inf if self.monitor_op == np.less else -np.Inf # pylint: disable=comparison-with-callable if self.resume_from_checkpoint: error_msg = "To use resume from checkpoint, you must only specify " \ "the same type of period you used for training." 
\ @@ -506,12 +511,12 @@ def _save_checkpoint(self, estimator): def _save_symbol(self, estimator): symbol_file = os.path.join(self.model_dir, self.model_prefix + '-symbol.json') - if hasattr(estimator.net, '_cached_graph'): + if hasattr(estimator.net, '_cached_graph') and estimator.net._cached_graph: sym = estimator.net._cached_graph[1] sym.save(symbol_file) else: - self.logger.info("Model architecture(symbol file) is not saved, please use HybridBlock" - "to construct your model, can call net.hybridize() before passing to" + self.logger.info("Model architecture(symbol file) is not saved, please use HybridBlock " + "to construct your model, can call net.hybridize() before passing to " "Estimator in order to save model architecture as %s.", symbol_file) def _save_params_and_trainer(self, estimator, file_prefix): @@ -666,7 +671,7 @@ def __init__(self, "if you want otherwise", self.monitor.get()[0]) self.monitor_op = np.less - if self.monitor_op == np.greater: # pylint: disable=comparison-with-callable + if self.monitor_op == np.greater: # pylint: disable=comparison-with-callable self.min_delta *= 1 else: self.min_delta *= -1 @@ -679,7 +684,7 @@ def train_begin(self, estimator, *args, **kwargs): if self.baseline is not None: self.best = self.baseline else: - self.best = np.Inf if self.monitor_op == np.less else -np.Inf # pylint: disable=comparison-with-callable + self.best = np.Inf if self.monitor_op == np.less else -np.Inf # pylint: disable=comparison-with-callable def epoch_end(self, estimator, *args, **kwargs): monitor_name, monitor_value = self.monitor.get() diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py index d2e8c082aa08..ae47d925670f 100644 --- a/tests/python/unittest/test_gluon_estimator.py +++ b/tests/python/unittest/test_gluon_estimator.py @@ -19,11 +19,13 @@ import sys import unittest +import warnings import mxnet as mx from mxnet import gluon from mxnet.gluon import nn from mxnet.gluon.contrib.estimator import * +from mxnet.gluon.contrib.estimator.event_handler import * from nose.tools import assert_raises @@ -335,10 +337,9 @@ def test_default_handlers(): metrics=train_acc, trainer=trainer, context=ctx) - # no handler + # no handler(all default handlers), no warning with warnings.catch_warnings(record=True) as w: est.fit(train_data=train_data, epochs=num_epochs) - assert 'You are training with the' in str(w[-1].message) # handler with prepared loss and metrics # use mix of default and user defined handlers @@ -353,7 +354,7 @@ def test_default_handlers(): # handler with all user defined metrics # use mix of default and user defined handlers metric = MetricHandler(train_metrics=[train_acc]) - logging = LoggingHandler(train_metrics=[train_acc], val_metrics=[mx.metric.RMSE("val acc")]) + logging = LoggingHandler(train_metrics=[train_acc]) est.fit(train_data=train_data, epochs=num_epochs, event_handlers=[metric, logging]) # handler with mixed metrics, some handler use metrics prepared by estimator diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index 5fe6a03eae7b..c2173a7dc071 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -133,6 +133,9 @@ def test_gluon_learning_rate_schedules_advanced(): def test_gluon_info_gan(): assert _test_tutorial_nb('gluon/info_gan') +def test_gluon_fit_api_fashion_mnist(): + assert _test_tutorial_nb('gluon/fit_api_tutorial') + def test_nlp_cnn(): assert _test_tutorial_nb('nlp/cnn') From e15605637ed9c60b00ce257b77059d4dcefa7ce5 
Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Thu, 18 Jul 2019 20:06:12 -0700 Subject: [PATCH 18/24] remove mshadow submodule --- .gitmodules | 3 --- 3rdparty/mshadow | 1 - 2 files changed, 4 deletions(-) delete mode 160000 3rdparty/mshadow diff --git a/.gitmodules b/.gitmodules index e0ffec11bfd0..90ef157f0eec 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "3rdparty/mshadow"] - path = 3rdparty/mshadow - url = https://github.com/dmlc/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow deleted file mode 160000 index 1d79ecfdb4c9..000000000000 --- a/3rdparty/mshadow +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1d79ecfdb4c9234537e1bf5148f44a1af54501ec From 3f60274f30ae2130af33b7cea17010fc877babd4 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Thu, 18 Jul 2019 20:11:57 -0700 Subject: [PATCH 19/24] import mshadow source tree --- 3rdparty/mshadow/.gitignore | 21 + 3rdparty/mshadow/.travis.yml | 43 + 3rdparty/mshadow/CHANGES.md | 12 + 3rdparty/mshadow/CMakeLists.txt | 6 + 3rdparty/mshadow/LICENSE | 13 + 3rdparty/mshadow/README.md | 37 + 3rdparty/mshadow/cmake/Cuda.cmake | 324 +++ 3rdparty/mshadow/cmake/Utils.cmake | 398 +++ 3rdparty/mshadow/cmake/mshadow.cmake | 91 + 3rdparty/mshadow/cmake/mshadowUtils.cmake | 2 + 3rdparty/mshadow/doc/Doxyfile | 2358 +++++++++++++++++ 3rdparty/mshadow/doc/README.md | 321 +++ 3rdparty/mshadow/doc/mkdoc.sh | 4 + 3rdparty/mshadow/guide/.gitignore | 3 + 3rdparty/mshadow/guide/Makefile | 37 + 3rdparty/mshadow/guide/README.md | 226 ++ 3rdparty/mshadow/guide/basic.cpp | 161 ++ 3rdparty/mshadow/guide/basic_stream.cu | 35 + 3rdparty/mshadow/guide/config.mk | 39 + 3rdparty/mshadow/guide/defop.cpp | 49 + .../mshadow/guide/exp-template/.gitignore | 1 + 3rdparty/mshadow/guide/exp-template/Makefile | 20 + 3rdparty/mshadow/guide/exp-template/README.md | 340 +++ .../mshadow/guide/exp-template/exp_lazy.cpp | 45 + .../guide/exp-template/exp_template.cpp | 72 + .../guide/exp-template/exp_template_op.cpp | 92 + 3rdparty/mshadow/guide/mshadow-ps/.gitignore | 4 + .../mshadow/guide/mshadow-ps/2-levels.png | Bin 0 -> 59413 bytes 3rdparty/mshadow/guide/mshadow-ps/Makefile | 45 + 3rdparty/mshadow/guide/mshadow-ps/README.md | 227 ++ 3rdparty/mshadow/guide/mshadow-ps/config.mk | 40 + 3rdparty/mshadow/guide/mshadow-ps/dbstr.h | 35 + .../guide/mshadow-ps/dist_async_sum-inl.h | 124 + .../guide/mshadow-ps/dist_async_sum.cpp | 11 + 3rdparty/mshadow/guide/mshadow-ps/local.sh | 39 + .../mshadow/guide/mshadow-ps/local_sum-inl.h | 119 + .../mshadow/guide/mshadow-ps/local_sum.cpp | 4 + .../mshadow/guide/mshadow-ps/local_sum.cu | 4 + 3rdparty/mshadow/guide/neuralnet/Makefile | 38 + 3rdparty/mshadow/guide/neuralnet/README.md | 16 + 3rdparty/mshadow/guide/neuralnet/config.mk | 35 + 3rdparty/mshadow/guide/neuralnet/convnet.cu | 282 ++ 3rdparty/mshadow/guide/neuralnet/nnet.cu | 202 ++ 3rdparty/mshadow/guide/neuralnet/nnet_ps.cu | 312 +++ 3rdparty/mshadow/guide/neuralnet/util.h | 86 + 3rdparty/mshadow/make/README.md | 18 + 3rdparty/mshadow/make/mshadow.mk | 166 ++ 3rdparty/mshadow/mshadow-ps/.gitignore | 3 + 3rdparty/mshadow/mshadow-ps/README.md | 4 + 3rdparty/mshadow/mshadow-ps/mshadow_ps.h | 358 +++ 3rdparty/mshadow/mshadow-ps/ps_dist-inl.h | 126 + 3rdparty/mshadow/mshadow-ps/ps_local-inl.h | 814 ++++++ 3rdparty/mshadow/mshadow-ps/ps_rabit-inl.h | 113 + 3rdparty/mshadow/mshadow-ps/thread.h | 261 ++ 3rdparty/mshadow/mshadow-ps/thread_util.h | 169 ++ 
3rdparty/mshadow/mshadow/README.md | 8 + 3rdparty/mshadow/mshadow/base.h | 1110 ++++++++ 3rdparty/mshadow/mshadow/cuda/reduce.cuh | 120 + .../mshadow/mshadow/cuda/tensor_gpu-inl.cuh | 828 ++++++ 3rdparty/mshadow/mshadow/dot_engine-inl.h | 936 +++++++ 3rdparty/mshadow/mshadow/expr_engine-inl.h | 482 ++++ 3rdparty/mshadow/mshadow/expr_scalar-inl.h | 165 ++ 3rdparty/mshadow/mshadow/expression.h | 416 +++ 3rdparty/mshadow/mshadow/extension.h | 41 + .../mshadow/mshadow/extension/broadcast.h | 165 ++ .../mshadow/extension/broadcast_with_axis.h | 258 ++ .../mshadow/mshadow/extension/channel_pool.h | 108 + .../mshadow/extension/channel_unpool.h | 137 + 3rdparty/mshadow/mshadow/extension/choose.h | 90 + 3rdparty/mshadow/mshadow/extension/complex.h | 525 ++++ 3rdparty/mshadow/mshadow/extension/concat.h | 194 ++ 3rdparty/mshadow/mshadow/extension/crop.h | 119 + 3rdparty/mshadow/mshadow/extension/fill.h | 103 + 3rdparty/mshadow/mshadow/extension/flip.h | 132 + .../mshadow/mshadow/extension/implicit_gemm.h | 128 + 3rdparty/mshadow/mshadow/extension/mask.h | 97 + 3rdparty/mshadow/mshadow/extension/mirror.h | 62 + 3rdparty/mshadow/mshadow/extension/one_hot.h | 87 + .../mshadow/extension/pack_col2patch.h | 154 ++ 3rdparty/mshadow/mshadow/extension/pad.h | 111 + 3rdparty/mshadow/mshadow/extension/range.h | 118 + .../mshadow/extension/reduce_with_axis.h | 136 + .../mshadow/mshadow/extension/reduceto1d.h | 104 + 3rdparty/mshadow/mshadow/extension/reshape.h | 87 + 3rdparty/mshadow/mshadow/extension/slice.h | 156 ++ 3rdparty/mshadow/mshadow/extension/slice_ex.h | 135 + .../mshadow/mshadow/extension/spatial_pool.h | 152 ++ .../mshadow/extension/spatial_unpool.h | 135 + .../extension/spatial_upsampling_nearest.h | 71 + 3rdparty/mshadow/mshadow/extension/swapaxis.h | 110 + 3rdparty/mshadow/mshadow/extension/take.h | 99 + .../mshadow/mshadow/extension/take_grad.h | 111 + .../mshadow/mshadow/extension/transpose.h | 200 ++ .../mshadow/extension/unpack_patch2col.h | 151 ++ 3rdparty/mshadow/mshadow/half.h | 354 +++ 3rdparty/mshadow/mshadow/half2.h | 143 + 3rdparty/mshadow/mshadow/io.h | 137 + 3rdparty/mshadow/mshadow/logging.h | 234 ++ 3rdparty/mshadow/mshadow/packet-inl.h | 413 +++ 3rdparty/mshadow/mshadow/packet/plain-inl.h | 76 + 3rdparty/mshadow/mshadow/packet/sse-inl.h | 147 + 3rdparty/mshadow/mshadow/random.h | 570 ++++ 3rdparty/mshadow/mshadow/stream_gpu-inl.h | 214 ++ 3rdparty/mshadow/mshadow/tensor.h | 1081 ++++++++ 3rdparty/mshadow/mshadow/tensor_container.h | 208 ++ 3rdparty/mshadow/mshadow/tensor_cpu-inl.h | 627 +++++ 3rdparty/mshadow/mshadow/tensor_gpu-inl.h | 245 ++ 3rdparty/mshadow/scripts/travis_script.sh | 19 + 3rdparty/mshadow/test/Makefile | 35 + 3rdparty/mshadow/test/pairtest.cu | 105 + 3rdparty/mshadow/test/pool.cu | 69 + 3rdparty/mshadow/test/reshape.cu | 74 + 3rdparty/mshadow/test/test.cu | 79 + 3rdparty/mshadow/test/test.h | 67 + 3rdparty/mshadow/test/unpack.cu | 85 + 115 files changed, 21728 insertions(+) create mode 100644 3rdparty/mshadow/.gitignore create mode 100644 3rdparty/mshadow/.travis.yml create mode 100644 3rdparty/mshadow/CHANGES.md create mode 100644 3rdparty/mshadow/CMakeLists.txt create mode 100644 3rdparty/mshadow/LICENSE create mode 100644 3rdparty/mshadow/README.md create mode 100644 3rdparty/mshadow/cmake/Cuda.cmake create mode 100644 3rdparty/mshadow/cmake/Utils.cmake create mode 100644 3rdparty/mshadow/cmake/mshadow.cmake create mode 100644 3rdparty/mshadow/cmake/mshadowUtils.cmake create mode 100644 3rdparty/mshadow/doc/Doxyfile create mode 100644 
3rdparty/mshadow/doc/README.md create mode 100755 3rdparty/mshadow/doc/mkdoc.sh create mode 100644 3rdparty/mshadow/guide/.gitignore create mode 100644 3rdparty/mshadow/guide/Makefile create mode 100644 3rdparty/mshadow/guide/README.md create mode 100644 3rdparty/mshadow/guide/basic.cpp create mode 100644 3rdparty/mshadow/guide/basic_stream.cu create mode 100644 3rdparty/mshadow/guide/config.mk create mode 100644 3rdparty/mshadow/guide/defop.cpp create mode 100644 3rdparty/mshadow/guide/exp-template/.gitignore create mode 100644 3rdparty/mshadow/guide/exp-template/Makefile create mode 100644 3rdparty/mshadow/guide/exp-template/README.md create mode 100644 3rdparty/mshadow/guide/exp-template/exp_lazy.cpp create mode 100644 3rdparty/mshadow/guide/exp-template/exp_template.cpp create mode 100644 3rdparty/mshadow/guide/exp-template/exp_template_op.cpp create mode 100644 3rdparty/mshadow/guide/mshadow-ps/.gitignore create mode 100644 3rdparty/mshadow/guide/mshadow-ps/2-levels.png create mode 100644 3rdparty/mshadow/guide/mshadow-ps/Makefile create mode 100644 3rdparty/mshadow/guide/mshadow-ps/README.md create mode 100644 3rdparty/mshadow/guide/mshadow-ps/config.mk create mode 100644 3rdparty/mshadow/guide/mshadow-ps/dbstr.h create mode 100644 3rdparty/mshadow/guide/mshadow-ps/dist_async_sum-inl.h create mode 100644 3rdparty/mshadow/guide/mshadow-ps/dist_async_sum.cpp create mode 100755 3rdparty/mshadow/guide/mshadow-ps/local.sh create mode 100644 3rdparty/mshadow/guide/mshadow-ps/local_sum-inl.h create mode 100644 3rdparty/mshadow/guide/mshadow-ps/local_sum.cpp create mode 100644 3rdparty/mshadow/guide/mshadow-ps/local_sum.cu create mode 100644 3rdparty/mshadow/guide/neuralnet/Makefile create mode 100644 3rdparty/mshadow/guide/neuralnet/README.md create mode 100644 3rdparty/mshadow/guide/neuralnet/config.mk create mode 100644 3rdparty/mshadow/guide/neuralnet/convnet.cu create mode 100644 3rdparty/mshadow/guide/neuralnet/nnet.cu create mode 100644 3rdparty/mshadow/guide/neuralnet/nnet_ps.cu create mode 100644 3rdparty/mshadow/guide/neuralnet/util.h create mode 100644 3rdparty/mshadow/make/README.md create mode 100644 3rdparty/mshadow/make/mshadow.mk create mode 100644 3rdparty/mshadow/mshadow-ps/.gitignore create mode 100644 3rdparty/mshadow/mshadow-ps/README.md create mode 100644 3rdparty/mshadow/mshadow-ps/mshadow_ps.h create mode 100644 3rdparty/mshadow/mshadow-ps/ps_dist-inl.h create mode 100644 3rdparty/mshadow/mshadow-ps/ps_local-inl.h create mode 100644 3rdparty/mshadow/mshadow-ps/ps_rabit-inl.h create mode 100644 3rdparty/mshadow/mshadow-ps/thread.h create mode 100644 3rdparty/mshadow/mshadow-ps/thread_util.h create mode 100644 3rdparty/mshadow/mshadow/README.md create mode 100755 3rdparty/mshadow/mshadow/base.h create mode 100644 3rdparty/mshadow/mshadow/cuda/reduce.cuh create mode 100755 3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh create mode 100644 3rdparty/mshadow/mshadow/dot_engine-inl.h create mode 100644 3rdparty/mshadow/mshadow/expr_engine-inl.h create mode 100644 3rdparty/mshadow/mshadow/expr_scalar-inl.h create mode 100644 3rdparty/mshadow/mshadow/expression.h create mode 100644 3rdparty/mshadow/mshadow/extension.h create mode 100644 3rdparty/mshadow/mshadow/extension/broadcast.h create mode 100644 3rdparty/mshadow/mshadow/extension/broadcast_with_axis.h create mode 100644 3rdparty/mshadow/mshadow/extension/channel_pool.h create mode 100644 3rdparty/mshadow/mshadow/extension/channel_unpool.h create mode 100644 3rdparty/mshadow/mshadow/extension/choose.h create mode 100644 
3rdparty/mshadow/mshadow/extension/complex.h create mode 100644 3rdparty/mshadow/mshadow/extension/concat.h create mode 100644 3rdparty/mshadow/mshadow/extension/crop.h create mode 100644 3rdparty/mshadow/mshadow/extension/fill.h create mode 100644 3rdparty/mshadow/mshadow/extension/flip.h create mode 100644 3rdparty/mshadow/mshadow/extension/implicit_gemm.h create mode 100644 3rdparty/mshadow/mshadow/extension/mask.h create mode 100644 3rdparty/mshadow/mshadow/extension/mirror.h create mode 100644 3rdparty/mshadow/mshadow/extension/one_hot.h create mode 100644 3rdparty/mshadow/mshadow/extension/pack_col2patch.h create mode 100644 3rdparty/mshadow/mshadow/extension/pad.h create mode 100644 3rdparty/mshadow/mshadow/extension/range.h create mode 100644 3rdparty/mshadow/mshadow/extension/reduce_with_axis.h create mode 100644 3rdparty/mshadow/mshadow/extension/reduceto1d.h create mode 100644 3rdparty/mshadow/mshadow/extension/reshape.h create mode 100644 3rdparty/mshadow/mshadow/extension/slice.h create mode 100644 3rdparty/mshadow/mshadow/extension/slice_ex.h create mode 100644 3rdparty/mshadow/mshadow/extension/spatial_pool.h create mode 100644 3rdparty/mshadow/mshadow/extension/spatial_unpool.h create mode 100644 3rdparty/mshadow/mshadow/extension/spatial_upsampling_nearest.h create mode 100644 3rdparty/mshadow/mshadow/extension/swapaxis.h create mode 100644 3rdparty/mshadow/mshadow/extension/take.h create mode 100644 3rdparty/mshadow/mshadow/extension/take_grad.h create mode 100644 3rdparty/mshadow/mshadow/extension/transpose.h create mode 100644 3rdparty/mshadow/mshadow/extension/unpack_patch2col.h create mode 100644 3rdparty/mshadow/mshadow/half.h create mode 100755 3rdparty/mshadow/mshadow/half2.h create mode 100644 3rdparty/mshadow/mshadow/io.h create mode 100644 3rdparty/mshadow/mshadow/logging.h create mode 100644 3rdparty/mshadow/mshadow/packet-inl.h create mode 100644 3rdparty/mshadow/mshadow/packet/plain-inl.h create mode 100644 3rdparty/mshadow/mshadow/packet/sse-inl.h create mode 100644 3rdparty/mshadow/mshadow/random.h create mode 100644 3rdparty/mshadow/mshadow/stream_gpu-inl.h create mode 100755 3rdparty/mshadow/mshadow/tensor.h create mode 100644 3rdparty/mshadow/mshadow/tensor_container.h create mode 100755 3rdparty/mshadow/mshadow/tensor_cpu-inl.h create mode 100755 3rdparty/mshadow/mshadow/tensor_gpu-inl.h create mode 100755 3rdparty/mshadow/scripts/travis_script.sh create mode 100644 3rdparty/mshadow/test/Makefile create mode 100644 3rdparty/mshadow/test/pairtest.cu create mode 100644 3rdparty/mshadow/test/pool.cu create mode 100644 3rdparty/mshadow/test/reshape.cu create mode 100644 3rdparty/mshadow/test/test.cu create mode 100644 3rdparty/mshadow/test/test.h create mode 100644 3rdparty/mshadow/test/unpack.cu diff --git a/3rdparty/mshadow/.gitignore b/3rdparty/mshadow/.gitignore new file mode 100644 index 000000000000..3da5172aeb2a --- /dev/null +++ b/3rdparty/mshadow/.gitignore @@ -0,0 +1,21 @@ +# Compiled Object files +*.slo +*.lo +*.o + +# Compiled Dynamic libraries +*.so +*.dylib + +# Compiled Static libraries +*.lai +*.la +*.a +*~ +doc/html +doc/latex +rabit +dmlc-core +*.db +*.bak +build diff --git a/3rdparty/mshadow/.travis.yml b/3rdparty/mshadow/.travis.yml new file mode 100644 index 000000000000..a4d6223d8ea7 --- /dev/null +++ b/3rdparty/mshadow/.travis.yml @@ -0,0 +1,43 @@ +# disable sudo to use container based build +sudo: false + +# Use Build Matrix to do lint and build seperately +env: + matrix: + - TASK=lint LINT_LANG=cpp + - TASK=doc + - TASK=build 
CXX=g++ + +# dependent apt packages +addons: + apt: + packages: + - doxygen + - wget + - unzip + - libblas-dev + - python3-pip + +before_install: + - git clone https://github.com/dmlc/dmlc-core + - export TRAVIS=dmlc-core/scripts/travis + - source ${TRAVIS}/travis_setup_env.sh + +install: + - pip3 install --upgrade pip --user + - pip3 install --user cpplint pylint + +script: scripts/travis_script.sh + +before_cache: + - ${TRAVIS}/travis_before_cache.sh + +cache: + directories: + - ${HOME}/.cache/usr + +notifications: + email: + on_success: change + on_failure: always + diff --git a/3rdparty/mshadow/CHANGES.md b/3rdparty/mshadow/CHANGES.md new file mode 100644 index 000000000000..03bb16936acd --- /dev/null +++ b/3rdparty/mshadow/CHANGES.md @@ -0,0 +1,12 @@ +Change Log +===== + +mshadow-1.0 +===== +* Initial release + +mshadow-2.0: in progress +===== +* Support multiple data type +* Great refactoring of code +* Parameter server interface for MultiGPU and distributed learning diff --git a/3rdparty/mshadow/CMakeLists.txt b/3rdparty/mshadow/CMakeLists.txt new file mode 100644 index 000000000000..b89e9028a77b --- /dev/null +++ b/3rdparty/mshadow/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 2.8.7) + +project(mshadow C CXX) + +set(mshadow_LINT_DIRS mshadow mshadow-ps) +add_custom_target(mshadow_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${mshadow_LINT_DIRS} -DPROJECT_SOURCE_DIR=${PROJECT_SOURCE_DIR} -DPROJECT_NAME=mshadow -P ${PROJECT_SOURCE_DIR}/../dmlc-core/cmake/lint.cmake) diff --git a/3rdparty/mshadow/LICENSE b/3rdparty/mshadow/LICENSE new file mode 100644 index 000000000000..ebf9611d76cd --- /dev/null +++ b/3rdparty/mshadow/LICENSE @@ -0,0 +1,13 @@ +Copyright (c) 2014 by Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/3rdparty/mshadow/README.md b/3rdparty/mshadow/README.md new file mode 100644 index 000000000000..cc18964a65f7 --- /dev/null +++ b/3rdparty/mshadow/README.md @@ -0,0 +1,37 @@ +mshadow: Matrix Shadow +====== +[![Build Status](https://travis-ci.org/dmlc/mshadow.svg?branch=master)](https://travis-ci.org/dmlc/mshadow) + +MShadow is a lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. The goal of mshadow is to support ***efficient***, +***device invariant*** and ***simple*** tensor library for machine learning project that aims for maximum performance and control, while also emphasize simplicity. + +MShadow also provides interface that allows writing Multi-GPU and distributed deep learning programs in an easy and unified way. + +* [Contributors](https://github.com/tqchen/mshadow/graphs/contributors) +* [Tutorial](guide) +* [Documentation](doc) +* [Parameter Server Interface for GPU Tensor](guide/mshadow-ps) + +Features +-------- +* Efficient: all the expression you write will be lazily evaluated and compiled into optimized code + - No temporal memory allocation will happen for expression you write + - mshadow will generate specific kernel for every expression you write in compile time. 
+* Device invariant: you can write one code and it will run on both CPU and GPU +* Simple: mshadow allows you to write machine learning code using expressions. +* Whitebox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called +* Lightweight library: light amount of code to support frequently used functions in machine learning +* Extendable: user can write simple functions that plugs into mshadow and run on GPU/CPU, no experience in CUDA is required. +* MultiGPU and Distributed ML: mshadow-ps interface allows user to write efficient MultiGPU and distributed programs in an unified way. + +Version +------- +* This version mshadow-2.x, there are a lot of changes in the interface and it is not backward compatible with mshadow-1.0 + - If you use older version of cxxnet, you will need to use the legacy mshadow code +* For legacy code, refer to [Here](https://github.com/tqchen/mshadow/releases/tag/v1.1) +* Change log in [CHANGES.md](CHANGES.md) + +Projects Using MShadow +---------------------- +* [MXNet: Efficient and Flexible Distributed Deep Learning Framework](https://github.com/dmlc/mxnet) +* [CXXNet: A lightweight C++ based deep learnig framework](https://github.com/dmlc/cxxnet) diff --git a/3rdparty/mshadow/cmake/Cuda.cmake b/3rdparty/mshadow/cmake/Cuda.cmake new file mode 100644 index 000000000000..bc09a3905076 --- /dev/null +++ b/3rdparty/mshadow/cmake/Cuda.cmake @@ -0,0 +1,324 @@ +if(NOT USE_CUDA) + return() +endif() + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) + +################################################################################################ +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# mshadow_detect_installed_gpus(out_variable) +function(mshadow_detect_installed_gpus out_variable) +set(CUDA_gpu_detect_output "") + if(NOT CUDA_gpu_detect_output) + message(STATUS "Running GPU architecture autodetection") + set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${__cufile} "" + "#include \n" + "#include \n" + "using namespace std;\n" + "int main()\n" + "{\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) { return -1; }\n" + " if (count == 0) { cerr << \"No cuda devices detected\" << endl; return -1; }\n" + " for (int device = 0; device < count; ++device)\n" + " {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + if(MSVC) + #find vcvarsall.bat and run it building msvc environment + get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) + find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." 
"${MY_COMPILER_DIR}/../..") + execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out + OUTPUT_STRIP_TRAILING_WHITESPACE) + else() + if(CUDA_LIBRARY_PATH) + set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}") + endif() + execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out + OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() + if(__nvcc_res EQUAL 0) + # nvcc outputs text containing line breaks when building with MSVC. + # The line below prevents CMake from inserting a variable with line + # breaks in the cache + message(STATUS "Found CUDA arch ${__nvcc_out}") + string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") + string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") + set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from mshadow_detect_gpus tool" FORCE) + else() + message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}") + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(WARNING "Automatic GPU detection failed. Building for all known architectures (${mshadow_known_gpu_archs}).") + set(${out_variable} ${mshadow_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +################################################################################################ +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# mshadow_select_nvcc_arch_flags(out_variable) +function(mshadow_select_nvcc_arch_flags out_variable) + # List of arch names + set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual") + set(__archs_name_default "All") + if(NOT CMAKE_CROSSCOMPILING) + list(APPEND __archs_names "Auto") + set(__archs_name_default "Auto") + endif() + + # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) + set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} ) + mark_as_advanced(CUDA_ARCH_NAME) + + # verify CUDA_ARCH_NAME value + if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " __archs_names "${__archs_names}") + message(FATAL_ERROR "Only ${__archs_names} architeture names are supported.") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN ${mshadow_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Fermi") + set(__cuda_arch_bin "20 21(20)") + elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(__cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(__cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(__cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") + set(__cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(__cuda_arch_bin ${mshadow_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} 
STREQUAL "Auto") + mshadow_detect_installed_gpus(__cuda_arch_bin) + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + set(__cuda_arch_bin ${CUDA_ARCH_BIN}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") + string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") + mshadow_list_unique(__cuda_arch_bin __cuda_arch_ptx) + + set(__nvcc_flags "") + set(__nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(__arch ${__cuda_arch_bin}) + if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) + list(APPEND __nvcc_archs_readable sm_${__arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(__arch ${__cuda_arch_ptx}) + list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) + list(APPEND __nvcc_archs_readable compute_${__arch}) + endforeach() + + string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") + set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Short command for cuda comnpilation +# Usage: +# mshadow_cuda_compile( ) +macro(mshadow_cuda_compile objlist_variable) + foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) + set(${var}_backup_in_cuda_compile_ "${${var}}") + + # we remove /EHa as it generates warnings under windows + string(REPLACE "/EHa" "" ${var} "${${var}}") + + endforeach() + if(UNIX OR APPLE) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC) + endif() + + if(APPLE) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function) + endif() + + set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G") + + if(MSVC) + # disable noisy warnings: + # 4819: The file contains a character that cannot be represented in the current code page (number). 
+    list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
+        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+      endif(${flag_var} MATCHES "/MD")
+    endforeach(flag_var)
+  endif()
+
+  # If the build system is a container, make sure the nvcc intermediate files
+  # go into the build output area rather than in /tmp, which may run out of space
+  if(IS_CONTAINER_BUILD)
+    set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+    message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
+    list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
+  endif()
+
+  cuda_compile(cuda_objcs ${ARGN})
+
+  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
+    set(${var} "${${var}_backup_in_cuda_compile_}")
+    unset(${var}_backup_in_cuda_compile_)
+  endforeach()
+
+  set(${objlist_variable} ${cuda_objcs})
+endmacro()
+
+################################################################################################
+# Short command for cuDNN detection. We believe it will soon be part of the CUDA toolkit
+# distribution, which is why this is just a macro here and not a FindcuDNN.cmake file.
+# Usage:
+#   detect_cuDNN()
+function(detect_cuDNN)
+  set(CUDNN_ROOT "" CACHE PATH "CUDNN root folder")
+
+  find_path(CUDNN_INCLUDE cudnn.h
+            PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDA_TOOLKIT_INCLUDE}
+            DOC "Path to cuDNN include directory." )
+
+  get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
+  find_library(CUDNN_LIBRARY NAMES libcudnn.so cudnn.lib # libcudnn_static.a
+               PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist}
+               DOC "Path to cuDNN library.")
+
+  if(CUDNN_INCLUDE AND CUDNN_LIBRARY)
+    set(HAVE_CUDNN TRUE PARENT_SCOPE)
+    set(CUDNN_FOUND TRUE PARENT_SCOPE)
+
+    mark_as_advanced(CUDNN_INCLUDE CUDNN_LIBRARY CUDNN_ROOT)
+    message(STATUS "Found cuDNN (include: ${CUDNN_INCLUDE}, library: ${CUDNN_LIBRARY})")
+  endif()
+endfunction()
+
+
+################################################################################################
+### Non macro section
+################################################################################################
+
+# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so
+if(NOT CUDA_TOOLKIT_ROOT_DIR)
+  find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64)
+  if(CUDA_LIBRARY_PATH)
+    get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY)
+    set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..")
+  endif()
+endif()
+
+find_package(CUDA 5.5 QUIET REQUIRED)
+find_cuda_helper_libs(curand)  # cmake 2.8.7 compatibility which doesn't search for curand
+
+if(NOT CUDA_FOUND)
+  return()
+endif()
+
+set(HAVE_CUDA TRUE)
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
+include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
+list(APPEND mshadow_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
+                                ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
+
+# Known NVIDIA GPU architectures mshadow can be compiled for.
+# This list will be used for CUDA_ARCH_NAME = All option
+if(CUDA_ARCH_ALL)
+  set(mshadow_known_gpu_archs "${CUDA_ARCH_ALL}")
+else()
+  if(${CUDA_VERSION} EQUAL 9.0 OR ${CUDA_VERSION} GREATER 9.0)
+    set(mshadow_known_gpu_archs "30 35 50 52 60 61 70")
+  elseif(${CUDA_VERSION} EQUAL 8.0 OR ${CUDA_VERSION} GREATER 8.0)
+    set(mshadow_known_gpu_archs "30 35 50 52 60 61")
+  else()
+    set(mshadow_known_gpu_archs "30 35 50 52")
+  endif()
+endif()
+
+# cudnn detection
+if(USE_CUDNN)
+  detect_cuDNN()
+  if(HAVE_CUDNN)
+    add_definitions(-DUSE_CUDNN)
+    include_directories(SYSTEM ${CUDNN_INCLUDE})
+    list(APPEND mshadow_LINKER_LIBS ${CUDNN_LIBRARY})
+  endif()
+endif()
+
+# setting nvcc arch flags
+mshadow_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
+list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
+
+# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
+# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
+if(Boost_VERSION EQUAL 105500)
+  message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
+  # avoid warning for CMake >= 2.8.12
+  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
+endif()
+
+# disable some nvcc diagnostics that appear in boost, glog, gflags, opencv, etc.
+foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
+  list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
+endforeach()
+
+# setting default testing device
+if(NOT CUDA_TEST_DEVICE)
+  set(CUDA_TEST_DEVICE -1)
+endif()
+
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
+mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
+
+# Handle clang/libc++ issue
+if(APPLE)
+  mshadow_detect_darwin_version(OSX_VERSION)
+
+  # OSX 10.9 and higher uses clang/libc++ by default which is incompatible with old CUDA toolkits
+  if(OSX_VERSION VERSION_GREATER 10.8)
+    # enabled by default if and only if CUDA version is less than 7.0
+    mshadow_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
+  endif()
+endif()
diff --git a/3rdparty/mshadow/cmake/Utils.cmake b/3rdparty/mshadow/cmake/Utils.cmake
new file mode 100644
index 000000000000..dc464f0092f5
--- /dev/null
+++ b/3rdparty/mshadow/cmake/Utils.cmake
@@ -0,0 +1,398 @@
+################################################################################################
+# Command alias for debugging messages
+# Usage:
+#   dmsg(<message>)
+function(dmsg)
+  message(STATUS ${ARGN})
+endfunction()
+
+################################################################################################
+# Removes duplicates from list(s)
+# Usage:
+#   mshadow_list_unique(<list_variable> [<list_variable>] [...])
+macro(mshadow_list_unique)
+  foreach(__lst ${ARGN})
+    if(${__lst})
+      list(REMOVE_DUPLICATES ${__lst})
+    endif()
+  endforeach()
+endmacro()
+
+################################################################################################
+# Clears variables from list
+# Usage:
+#   mshadow_clear_vars(<variables_list>)
+macro(mshadow_clear_vars)
+  foreach(_var ${ARGN})
+    unset(${_var})
+  endforeach()
+endmacro()
+
+################################################################################################
+# Removes duplicates from string
+# Usage:
+#   mshadow_string_unique(<string_variable>)
+function(mshadow_string_unique __string)
+  if(${__string})
+    set(__list ${${__string}})
+    separate_arguments(__list)
+    list(REMOVE_DUPLICATES __list)
+    foreach(__e ${__list})
+
      set(__str "${__str} ${__e}")
+    endforeach()
+    set(${__string} ${__str} PARENT_SCOPE)
+  endif()
+endfunction()
+
+################################################################################################
+# Prints list element per line
+# Usage:
+#   mshadow_print_list(<list>)
+function(mshadow_print_list)
+  foreach(e ${ARGN})
+    message(STATUS ${e})
+  endforeach()
+endfunction()
+
+################################################################################################
+# Function for merging lists of compiler flags into a single string.
+# Usage:
+#   mshadow_merge_flag_lists(out_variable [<list1>] [<list2>] ...)
+function(mshadow_merge_flag_lists out_var)
+  set(__result "")
+  foreach(__list ${ARGN})
+    foreach(__flag ${${__list}})
+      string(STRIP ${__flag} __flag)
+      set(__result "${__result} ${__flag}")
+    endforeach()
+  endforeach()
+  string(STRIP ${__result} __result)
+  set(${out_var} ${__result} PARENT_SCOPE)
+endfunction()
+
+################################################################################################
+# Converts all paths in list to absolute
+# Usage:
+#   mshadow_convert_absolute_paths(<list_variable>)
+function(mshadow_convert_absolute_paths variable)
+  set(__dlist "")
+  foreach(__s ${${variable}})
+    get_filename_component(__abspath ${__s} ABSOLUTE)
+    list(APPEND __list ${__abspath})
+  endforeach()
+  set(${variable} ${__list} PARENT_SCOPE)
+endfunction()
+
+################################################################################################
+# Reads set of version defines from the header file
+# Usage:
+#   mshadow_parse_header(<file> <file_variable> <define1> <define2> ...)
+macro(mshadow_parse_header FILENAME FILE_VAR)
+  set(vars_regex "")
+  set(__parnet_scope OFF)
+  set(__add_cache OFF)
+  foreach(name ${ARGN})
+    if("${name}" STREQUAL "PARENT_SCOPE")
+      set(__parnet_scope ON)
+    elseif("${name}" STREQUAL "CACHE")
+      set(__add_cache ON)
+    elseif(vars_regex)
+      set(vars_regex "${vars_regex}|${name}")
+    else()
+      set(vars_regex "${name}")
+    endif()
+  endforeach()
+  if(EXISTS "${FILENAME}")
+    file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" )
+  else()
+    unset(${FILE_VAR})
+  endif()
+  foreach(name ${ARGN})
+    if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE")
+      if(${FILE_VAR})
+        if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*")
+          string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}")
+        else()
+          set(${name} "")
+        endif()
+        if(__add_cache)
+          set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE)
+        elseif(__parnet_scope)
+          set(${name} "${${name}}" PARENT_SCOPE)
+        endif()
+      else()
+        unset(${name} CACHE)
+      endif()
+    endif()
+  endforeach()
+endmacro()
+
+################################################################################################
+# Reads single version define from the header file and parses it
+# Usage:
+#   mshadow_parse_header_single_define(<libname> <header_path> <define_name>)
+function(mshadow_parse_header_single_define LIBNAME HDR_PATH VARNAME)
+  set(${LIBNAME}_H "")
+  if(EXISTS "${HDR_PATH}")
+    file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1)
+  endif()
+
+  if(${LIBNAME}_H)
+    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}")
+    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}")
+    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}")
+    set(${LIBNAME}_VERSION_MAJOR
${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) + + # append a TWEAK version if it exists: + set(${LIBNAME}_VERSION_TWEAK "") + if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$") + set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE) + endif() + if(${LIBNAME}_VERSION_TWEAK) + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE) + else() + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE) + endif() + endif() +endfunction() + +######################################################################################################## +# An option that the user can select. Can accept condition to control when option is available for user. +# Usage: +# mshadow_option( "doc string" [IF ]) +function(mshadow_option variable description value) + set(__value ${value}) + set(__condition "") + set(__varname "__value") + foreach(arg ${ARGN}) + if(arg STREQUAL "IF" OR arg STREQUAL "if") + set(__varname "__condition") + else() + list(APPEND ${__varname} ${arg}) + endif() + endforeach() + unset(__varname) + if("${__condition}" STREQUAL "") + set(__condition 2 GREATER 1) + endif() + + if(${__condition}) + if("${__value}" MATCHES ";") + if(${__value}) + option(${variable} "${description}" ON) + else() + option(${variable} "${description}" OFF) + endif() + elseif(DEFINED ${__value}) + if(${__value}) + option(${variable} "${description}" ON) + else() + option(${variable} "${description}" OFF) + endif() + else() + option(${variable} "${description}" ${__value}) + endif() + else() + unset(${variable} CACHE) + endif() +endfunction() + +################################################################################################ +# Utility macro for comparing two lists. Used for CMake debugging purposes +# Usage: +# mshadow_compare_lists( [description]) +function(mshadow_compare_lists list1 list2 desc) + set(__list1 ${${list1}}) + set(__list2 ${${list2}}) + list(SORT __list1) + list(SORT __list2) + list(LENGTH __list1 __len1) + list(LENGTH __list2 __len2) + + if(NOT ${__len1} EQUAL ${__len2}) + message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}") + endif() + + foreach(__i RANGE 1 ${__len1}) + math(EXPR __index "${__i}- 1") + list(GET __list1 ${__index} __item1) + list(GET __list2 ${__index} __item2) + if(NOT ${__item1} STREQUAL ${__item2}) + message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. 
${desc}") + endif() + endforeach() +endfunction() + +################################################################################################ +# Command for disabling warnings for different platforms (see below for gcc and VisualStudio) +# Usage: +# mshadow_warnings_disable( -Wshadow /wd4996 ..,) +macro(mshadow_warnings_disable) + set(_flag_vars "") + set(_msvc_warnings "") + set(_gxx_warnings "") + + foreach(arg ${ARGN}) + if(arg MATCHES "^CMAKE_") + list(APPEND _flag_vars ${arg}) + elseif(arg MATCHES "^/wd") + list(APPEND _msvc_warnings ${arg}) + elseif(arg MATCHES "^-W") + list(APPEND _gxx_warnings ${arg}) + endif() + endforeach() + + if(NOT _flag_vars) + set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + endif() + + if(MSVC AND _msvc_warnings) + foreach(var ${_flag_vars}) + foreach(warning ${_msvc_warnings}) + set(${var} "${${var}} ${warning}") + endforeach() + endforeach() + elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings) + foreach(var ${_flag_vars}) + foreach(warning ${_gxx_warnings}) + if(NOT warning MATCHES "^-Wno-") + string(REPLACE "${warning}" "" ${var} "${${var}}") + string(REPLACE "-W" "-Wno-" warning "${warning}") + endif() + set(${var} "${${var}} ${warning}") + endforeach() + endforeach() + endif() + mshadow_clear_vars(_flag_vars _msvc_warnings _gxx_warnings) +endmacro() + +################################################################################################ +# Helper function get current definitions +# Usage: +# mshadow_get_current_definitions() +function(mshadow_get_current_definitions definitions_var) + get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS) + set(result "") + + foreach(d ${current_definitions}) + list(APPEND result -D${d}) + endforeach() + + mshadow_list_unique(result) + set(${definitions_var} ${result} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function get current includes/definitions +# Usage: +# mshadow_get_current_cflags() +function(mshadow_get_current_cflags cflags_var) + get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) + mshadow_convert_absolute_paths(current_includes) + mshadow_get_current_definitions(cflags) + + foreach(i ${current_includes}) + list(APPEND cflags "-I${i}") + endforeach() + + mshadow_list_unique(cflags) + set(${cflags_var} ${cflags} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function to parse current linker libs into link directories, libflags and osx frameworks +# Usage: +# mshadow_parse_linker_libs( ) +function(mshadow_parse_linker_libs mshadow_LINKER_LIBS_variable folders_var flags_var frameworks_var) + + set(__unspec "") + set(__debug "") + set(__optimized "") + set(__framework "") + set(__varname "__unspec") + + # split libs into debug, optimized, unspecified and frameworks + foreach(list_elem ${${mshadow_LINKER_LIBS_variable}}) + if(list_elem STREQUAL "debug") + set(__varname "__debug") + elseif(list_elem STREQUAL "optimized") + set(__varname "__optimized") + elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)") + list(APPEND __framework -framework ${CMAKE_MATCH_1}) + else() + list(APPEND ${__varname} ${list_elem}) + set(__varname "__unspec") + endif() + endforeach() + + # attach debug or optimized libs to unspecified according to current configuration + if(CMAKE_BUILD_TYPE MATCHES "Debug") + set(__libs ${__unspec} ${__debug}) + else() + 
set(__libs ${__unspec} ${__optimized}) + endif() + + set(libflags "") + set(folders "") + + # convert linker libraries list to link flags + foreach(lib ${__libs}) + if(TARGET ${lib}) + list(APPEND folders $) + list(APPEND libflags -l${lib}) + elseif(lib MATCHES "^-l.*") + list(APPEND libflags ${lib}) + elseif(IS_ABSOLUTE ${lib}) + get_filename_component(name_we ${lib} NAME_WE) + get_filename_component(folder ${lib} PATH) + + string(REGEX MATCH "^lib(.*)" __match ${name_we}) + list(APPEND libflags -l${CMAKE_MATCH_1}) + list(APPEND folders ${folder}) + else() + message(FATAL_ERROR "Logic error. Need to update cmake script") + endif() + endforeach() + + mshadow_list_unique(libflags folders) + + set(${folders_var} ${folders} PARENT_SCOPE) + set(${flags_var} ${libflags} PARENT_SCOPE) + set(${frameworks_var} ${__framework} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, .... +# Usage: +# mshadow_detect_darwin_version() +function(mshadow_detect_darwin_version output_var) + if(APPLE) + execute_process(COMMAND /usr/bin/sw_vers -productVersion + RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + set(${output_var} ${__sw_vers_out} PARENT_SCOPE) + else() + set(${output_var} "" PARENT_SCOPE) + endif() +endfunction() + +################################################################################################ +# Convenient command to setup source group for IDEs that support this feature (VS, XCode) +# Usage: +# caffe_source_group( GLOB[_RECURSE] ) +function(mshadow_source_group group) + cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN}) + if(CAFFE_SOURCE_GROUP_GLOB) + file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB}) + source_group(${group} FILES ${srcs1}) + endif() + + if(CAFFE_SOURCE_GROUP_GLOB_RECURSE) + file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE}) + source_group(${group} FILES ${srcs2}) + endif() +endfunction() \ No newline at end of file diff --git a/3rdparty/mshadow/cmake/mshadow.cmake b/3rdparty/mshadow/cmake/mshadow.cmake new file mode 100644 index 000000000000..1ef76988d8d0 --- /dev/null +++ b/3rdparty/mshadow/cmake/mshadow.cmake @@ -0,0 +1,91 @@ +set(mshadow_LINKER_LIBS "") + +set(BLAS "Open" CACHE STRING "Selected BLAS library") +set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL") + +if(DEFINED USE_BLAS) + set(BLAS "${USE_BLAS}") +else() + if(USE_MKL_IF_AVAILABLE) + if(NOT MKL_FOUND) + find_package(MKL) + endif() + if(MKL_FOUND) + set(BLAS "MKL") + endif() + endif() +endif() + +if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") + find_package(Atlas REQUIRED) + include_directories(SYSTEM ${Atlas_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${Atlas_LIBRARIES}) + add_definitions(-DMSHADOW_USE_CBLAS=1) + add_definitions(-DMSHADOW_USE_MKL=0) +elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") + find_package(OpenBLAS REQUIRED) + include_directories(SYSTEM ${OpenBLAS_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${OpenBLAS_LIB}) + add_definitions(-DMSHADOW_USE_CBLAS=1) + add_definitions(-DMSHADOW_USE_MKL=0) +elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") + find_package(MKL REQUIRED) + include_directories(SYSTEM ${MKL_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${MKL_LIBRARIES}) + add_definitions(-DMSHADOW_USE_CBLAS=0) + add_definitions(-DMSHADOW_USE_MKL=1) +elseif(BLAS STREQUAL "apple") + find_package(Accelerate REQUIRED) + 
include_directories(SYSTEM ${Accelerate_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES}) + add_definitions(-DMSHADOW_USE_MKL=0) + add_definitions(-DMSHADOW_USE_CBLAS=1) +endif() + +if(SUPPORT_MSSE2) + add_definitions(-DMSHADOW_USE_SSE=1) +else() + add_definitions(-DMSHADOW_USE_SSE=0) +endif() + +if(NOT DEFINED SUPPORT_F16C AND NOT MSVC) + check_cxx_compiler_flag("-mf16c" COMPILER_SUPPORT_MF16C) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + execute_process(COMMAND cat /proc/cpuinfo + COMMAND grep flags + COMMAND grep f16c + OUTPUT_VARIABLE CPU_SUPPORT_F16C) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + execute_process(COMMAND sysctl -a + COMMAND grep machdep.cpu.features + COMMAND grep F16C + OUTPUT_VARIABLE CPU_SUPPORT_F16C) + endif() + if(NOT CPU_SUPPORT_F16C) + message("CPU does not support F16C instructions") + endif() + if(CPU_SUPPORT_F16C AND COMPILER_SUPPORT_MF16C) + set(SUPPORT_F16C TRUE) + endif() +endif() + +if(SUPPORT_F16C) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") +else() + add_definitions(-DMSHADOW_USE_F16C=0) +endif() + +if(USE_CUDA) + find_package(CUDA 5.5 QUIET) + find_cuda_helper_libs(curand) + if(NOT CUDA_FOUND) + message(FATAL_ERROR "-- CUDA is disabled.") + endif() + add_definitions(-DMSHADOW_USE_CUDA=1) + add_definitions(-DMSHADOW_FORCE_STREAM) + include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) + list(APPEND mshadow_LINKER_LIBS ${CUDA_CUDART_LIBRARY} + ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) +else() + add_definitions(-DMSHADOW_USE_CUDA=0) +endif() diff --git a/3rdparty/mshadow/cmake/mshadowUtils.cmake b/3rdparty/mshadow/cmake/mshadowUtils.cmake new file mode 100644 index 000000000000..d4b8bfc89b7a --- /dev/null +++ b/3rdparty/mshadow/cmake/mshadowUtils.cmake @@ -0,0 +1,2 @@ +include("${CMAKE_CURRENT_LIST_DIR}/Utils.cmake") + diff --git a/3rdparty/mshadow/doc/Doxyfile b/3rdparty/mshadow/doc/Doxyfile new file mode 100644 index 000000000000..3e83d471844c --- /dev/null +++ b/3rdparty/mshadow/doc/Doxyfile @@ -0,0 +1,2358 @@ +# Doxyfile 1.8.8 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. 
+ +PROJECT_NAME = "mshadow" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify an logo or icon that is included in +# the documentation. The maximum height of the logo should not exceed 55 pixels +# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo +# to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = doc + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. 
Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. 
This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a +# new page for each member. If set to NO, the documentation of a member will be +# part of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 8 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. 
In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by by putting a % sign in front of the word +# or globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. 
Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. 
+ +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO these classes will be included in the various overviews. This option has +# no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES the +# scope will be hidden. +# The default value is: NO. 
+ +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the +# todo list. This list is created by putting \todo commands in the +# documentation. +# The default value is: YES. 
+ +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the +# test list. This list is created by putting \test commands in the +# documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES the list +# will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. 
+ +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO doxygen will only warn about wrong or incomplete parameter +# documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = YES + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. +# Note: If this tag is empty the current directory is searched. 
+ +INPUT = mshadow \ + mshadow-ps + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank the +# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii, +# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, +# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, +# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, +# *.qsf, *.as and *.js. + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = *-inl.* \ + utils.h \ + thread_util.h \ + thread.h \ + kv_array.h + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = mshadow::expr::Plan* \ + mshadow::expr::*Engine* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. 
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER ) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES, then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. 
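+#
+# Illustrative example only (this configuration leaves the option empty):
+# typical compiler flags could hypothetically be passed here as
+#   CLANG_OPTIONS = -std=c++11 -DMSHADOW_USE_CUDA=0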
+ +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefor more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra stylesheet files is of importance (e.g. the last +# stylesheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the stylesheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to NO can help when comparing the output of multiple runs. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. 
+ +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler ( hhc.exe). If non-empty +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated ( +# YES) or that it should be included in the master .chm file ( NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated ( +# YES) or a normal table of contents ( NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. 
For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
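+#
+# For illustration only (not the setting used below): a tree-only navigation
+# layout would combine
+#   DISABLE_INDEX     = YES
+#   GENERATE_TREEVIEW = YES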
+ +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using prerendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
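+#
+# Illustration only (MathJax stays disabled in this configuration): a locally
+# installed copy could hypothetically be enabled with
+#   USE_MATHJAX     = YES
+#   MATHJAX_RELPATH = ../mathjax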
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/