From de81af22ac729f2a5688a96715cd87b84abb1dc7 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Mon, 21 Sep 2015 19:23:38 -0600
Subject: [PATCH 1/5] cifar-recipe draft

---
 example/notebooks/alexnet.ipynb          |   2 +-
 example/notebooks/cifar-recipe.ipynb     | 249 +++++++++++++++++++++++
 example/notebooks/composite_symbol.ipynb |  18 +-
 python/mxnet/visualization.py            |   2 +-
 4 files changed, 260 insertions(+), 11 deletions(-)
 create mode 100644 example/notebooks/cifar-recipe.ipynb

diff --git a/example/notebooks/alexnet.ipynb b/example/notebooks/alexnet.ipynb
index c030d873cd08..e6f2ad94e296 100644
--- a/example/notebooks/alexnet.ipynb
+++ b/example/notebooks/alexnet.ipynb
@@ -401,7 +401,7 @@
     }
    ],
    "source": [
-    "mx.viz.plot_network(\"AlexNet\", softmax)"
+    "mx.viz.plot_network(softmax)"
    ]
   },
   {
diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb
new file mode 100644
index 000000000000..73e7df042945
--- /dev/null
+++ b/example/notebooks/cifar-recipe.ipynb
@@ -0,0 +1,249 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# CIFAR-10 Recipe\n",
+    "In this notebook, we will show how to train a state-of-the-art CIFAR-10 network with MXNet and extract features from the network.\n",
+    "This example wiil cover\n",
+    "\n",
+    "- Network/Data definition \n",
+    "- Model saving and loading\n",
+    "- Learning rate schedule\n",
+    "- Extracting feature from network\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import mxnet as mx\n",
+    "import logging\n",
+    "\n",
+    "# setup logging\n",
+    "logging.basicConfig(level=logging.DEBUG)\n",
+    "console = logging.StreamHandler()\n",
+    "console.setLevel(logging.DEBUG)\n",
+    "logging.getLogger('').addHandler(console)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, let's define some helper functions that let us build a simplified Inception network. More details about how to compose symbols into components can be found in the [component demo](composite_symbol.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Basic Conv + BN + ReLU factory\n",
+    "def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), act_type=\"relu\"):\n",
+    "    conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad)\n",
+    "    bn = mx.symbol.BatchNorm(data=conv)\n",
+    "    act = mx.symbol.Activation(data = bn, act_type=act_type)\n",
+    "    return act"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# A Simple Downsampling Factory\n",
+    "def DownsampleFactory(data, ch_3x3):\n",
+    "    # conv 3x3\n",
+    "    conv = ConvFactory(data=data, kernel=(3, 3), stride=(2, 2), num_filter=ch_3x3, pad=(1, 1))\n",
+    "    # pool\n",
+    "    pool = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type='max')\n",
+    "    # concat\n",
+    "    concat = mx.symbol.Concat(*[conv, pool])\n",
+    "    return concat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# A Simple module\n",
+    "def SimpleFactory(data, ch_1x1, ch_3x3):\n",
+    "    # 1x1\n",
+    "    conv1x1 = ConvFactory(data=data, kernel=(1, 1), pad=(0, 0), num_filter=ch_1x1)\n",
+    "    # 3x3\n",
+    "    conv3x3 = ConvFactory(data=data, kernel=(3, 3), pad=(1, 1), num_filter=ch_3x3)\n",
+    "    #concat\n",
+    "    concat = mx.symbol.Concat(*[conv1x1, conv3x3])\n",
+    "    return concat"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can build a network with these component factories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "data = mx.symbol.Variable(name=\"data\")\n",
+    "conv1 = ConvFactory(data=data, kernel=(3,3), pad=(1,1), num_filter=96, act_type=\"relu\")\n",
+    "in3a = SimpleFactory(conv1, 32, 32)\n",
+    "in3b = SimpleFactory(in3a, 32, 48)\n",
+    "in3c = DownsampleFactory(in3b, 80)\n",
+    "in4a = SimpleFactory(in3c, 112, 48)\n",
+    "in4b = SimpleFactory(in4a, 96, 64)\n",
+    "in4c = SimpleFactory(in4b, 80, 80)\n",
+    "in4d = SimpleFactory(in4c, 48, 96)\n",
+    "in4e = DownsampleFactory(in4d, 96)\n",
+    "in5a = SimpleFactory(in4e, 176, 160)\n",
+    "in5b = SimpleFactory(in5a, 176, 160)\n",
+    "pool = mx.symbol.Pooling(data=in5b, pool_type=\"avg\", kernel=(7,7))\n",
+    "flatten = mx.symbol.Flatten(data=pool)\n",
+    "fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10)\n",
+    "loss = mx.symbol.Softmax(data=fc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# If you'd like to see the network structure, run the plot_network function\n",
+    "# mx.viz.plot_network(loss)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "Find duplicated argument name \"weight\", please make the weight name non-duplicated(using name arguments), arguments are ['data', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'label']",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-13-b112c5ebf964>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;31m# For demo purpose, this model only train 1 round\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m model = mx.model.FeedForward(ctx=mx.gpu(), symbol=loss, num_round = 1,\n\u001b[1;32m----> 4\u001b[1;33m                               learning_rate=0.05, momentum=0.9, wd=0.00001)\n\u001b[0m",
+      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, symbol, ctx, num_round, optimizer, initializer, arg_params, aux_params, **kwargs)\u001b[0m\n\u001b[0;32m    418\u001b[0m                  **kwargs):\n\u001b[0;32m    419\u001b[0m         \u001b[1;31m# check if symbol contain duplicated names.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 420\u001b[1;33m         \u001b[0m_check_arguments\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msymbol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    421\u001b[0m         \u001b[1;31m# basic configuration\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    422\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msymbol\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36m_check_arguments\u001b[1;34m(symbol)\u001b[0m\n\u001b[0;32m     61\u001b[0m             raise ValueError(('Find duplicated argument name \\\"%s\\\", ' +\n\u001b[0;32m     62\u001b[0m                               \u001b[1;34m'please make the weight name non-duplicated(using name arguments), '\u001b[0m \u001b[1;33m+\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 63\u001b[1;33m                               'arguments are %s') % (name, str(arg_names)))\n\u001b[0m\u001b[0;32m     64\u001b[0m         \u001b[0marg_set\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     65\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mValueError\u001b[0m: Find duplicated argument name \"weight\", please make the weight name non-duplicated(using name arguments), arguments are ['data', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'label']"
+     ]
+    }
+   ],
+   "source": [
+    "# We will make model with current current symbol\n",
+    "# For demo purpose, this model only train 1 round\n",
+    "model = mx.model.FeedForward(ctx=mx.gpu(), symbol=loss, num_round = 1,\n",
+    "                             learning_rate=0.05, momentum=0.9, wd=0.00001)\n",
+    "# To make automatic model saving after each round, we can add check_point callback\n",
+    "# model_prefix = \"cifar\"\n",
+    "# model = mx.model.FeedForward(ctx=mx.gpu(), symbol=loss, num_round = 1,\n",
+    "#                              learning_rate=0.05, momentum=0.9, wd=0.00001,\n",
+    "#                              iter_end_callback=mx.model.do_checkpoint(model_prefix))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next step is to declare the data iterators. The original CIFAR-10 data is 3x32x32 in binary format; we provide it in RecordIO format, so we can use the Image RecordIO iterator. For more information about the Image RecordIO iterator, see the [documentation](https://mxnet.readthedocs.org/en/latest/python/io.html)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Use utility function in test to download the data\n",
+    "import sys\n",
+    "sys.path.append(\"../../tests/python/common\")\n",
+    "import get_data\n",
+    "get_data.GetCifar10()\n",
+    "# After we get the data, we can declare our data iterator\n",
+    "# The iterator will automatically create mean image file if it doesn't exist\n",
+    "batch_size = 128\n",
+    "# The train iterator makes batches of 128 images and randomly crops each image to 3x28x28 from the original 3x32x32\n",
+    "train_dataiter = mx.io.ImageRecordIter(\n",
+    "        shuffle=True,\n",
+    "        path_imgrec=\"data/cifar/train.rec\",\n",
+    "        mean_img=\"data/cifar/cifar_mean.bin\",\n",
+    "        rand_crop=True,\n",
+    "        rand_mirror=True,\n",
+    "        data_shape=(3,28,28),\n",
+    "        batch_size=batch_size,\n",
+    "        preprocess_threads=1)\n",
+    "# The test iterator makes batches of 128 images and center-crops each image to 3x28x28 from the original 3x32x32\n",
+    "test_dataiter = mx.io.ImageRecordIter(\n",
+    "        path_imgrec=\"data/cifar/test.rec\",\n",
+    "        mean_img=\"data/cifar/cifar_mean.bin\",\n",
+    "        rand_crop=False,\n",
+    "        rand_mirror=False,\n",
+    "        data_shape=(3,28,28),\n",
+    "        batch_size=batch_size,\n",
+    "        preprocess_threads=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/example/notebooks/composite_symbol.ipynb b/example/notebooks/composite_symbol.ipynb
index dc97fa22e5dc..b43b796ccf9b 100644
--- a/example/notebooks/composite_symbol.ipynb
+++ b/example/notebooks/composite_symbol.ipynb
@@ -71,11 +71,11 @@
        " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
        "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
        " -->\n",
-       "<!-- Title: conv Pages: 1 -->\n",
+       "<!-- Title: plot Pages: 1 -->\n",
        "<svg width=\"102pt\" height=\"348pt\"\n",
        " viewBox=\"0.00 0.00 102.00 348.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
        "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 344)\">\n",
-       "<title>conv</title>\n",
+       "<title>plot</title>\n",
        "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-344 98,-344 98,4 -4,4\"/>\n",
        "<!-- null_0 -->\n",
        "<g id=\"node1\" class=\"node\"><title>null_0</title>\n",
@@ -118,7 +118,7 @@
        "</svg>\n"
       ],
       "text/plain": [
-       "<graphviz.dot.Digraph at 0x7fe1ba8a4978>"
+       "<graphviz.dot.Digraph at 0x7eff39b005f8>"
       ]
      },
      "execution_count": 3,
@@ -129,7 +129,7 @@
    "source": [
     "prev = mx.symbol.Variable(name=\"Previos Output\")\n",
     "conv_comp = ConvFactory(data=prev, num_filter=64, kernel=(7,7), stride=(2, 2))\n",
-    "mx.visualization.plot_network(title=\"conv\", symbol=conv_comp)"
+    "mx.visualization.plot_network(symbol=conv_comp)"
    ]
   },
   {
@@ -187,11 +187,11 @@
        " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
        "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
        " -->\n",
-       "<!-- Title: in3a Pages: 1 -->\n",
+       "<!-- Title: plot Pages: 1 -->\n",
        "<svg width=\"438pt\" height=\"724pt\"\n",
        " viewBox=\"0.00 0.00 438.00 724.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
        "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 720)\">\n",
-       "<title>in3a</title>\n",
+       "<title>plot</title>\n",
        "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-720 434,-720 434,4 -4,4\"/>\n",
        "<!-- null_0 -->\n",
        "<g id=\"node1\" class=\"node\"><title>null_0</title>\n",
@@ -430,7 +430,7 @@
        "</svg>\n"
       ],
       "text/plain": [
-       "<graphviz.dot.Digraph at 0x7fe1ba8a4e48>"
+       "<graphviz.dot.Digraph at 0x7eff39b1a160>"
       ]
      },
      "execution_count": 5,
@@ -441,7 +441,7 @@
    "source": [
     "prev = mx.symbol.Variable(name=\"Previos Output\")\n",
     "in3a = InceptionFactoryA(prev, 64, 64, 64, 64, 96, \"avg\", 32)\n",
-    "mx.visualization.plot_network(title=\"in3a\", symbol=in3a)"
+    "mx.visualization.plot_network(symbol=in3a)"
    ]
   },
   {
@@ -681,7 +681,7 @@
    "source": [
     "prev = mx.symbol.Variable(name=\"Previos Output\")\n",
     "in3c = InceptionFactoryB(prev, 128, 160, 64, 96)\n",
-    "mx.visualization.plot_network(title=\"in3c\", symbol=in3c)"
+    "mx.visualization.plot_network(symbol=in3c)"
    ]
   },
   {
diff --git a/python/mxnet/visualization.py b/python/mxnet/visualization.py
index 86fc53c37311..efa875b472ec 100644
--- a/python/mxnet/visualization.py
+++ b/python/mxnet/visualization.py
@@ -23,7 +23,7 @@ def _str2tuple(string):
     return re.findall(r"\d+", string)
 
 
-def plot_network(title, symbol, shape=None):
+def plot_network(symbol, title="plot", shape=None):
     """convert symbol to dot object for visualization
 
     Parameters

From 632fa7318df8ef4c1b2db4254951b9bf7dd7a921 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Mon, 21 Sep 2015 20:42:34 -0600
Subject: [PATCH 2/5] update draft

---
 example/notebooks/cifar-recipe.ipynb | 215 ++++++++++++++++++++++-----
 1 file changed, 179 insertions(+), 36 deletions(-)

diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb
index 73e7df042945..9d6f06a40d4b 100644
--- a/example/notebooks/cifar-recipe.ipynb
+++ b/example/notebooks/cifar-recipe.ipynb
@@ -9,14 +9,14 @@
     "This example wiil cover\n",
     "\n",
     "- Network/Data definition \n",
+    "- Multi GPU training\n",
     "- Model saving and loading\n",
-    "- Learning rate schedule\n",
-    "- Extracting feature from network\n"
+    "- Prediction/Extracting Feature\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
    "metadata": {
     "collapsed": true
    },
@@ -24,12 +24,11 @@
    "source": [
     "import mxnet as mx\n",
     "import logging\n",
+    "import numpy as np\n",
     "\n",
     "# setup logging\n",
-    "logging.basicConfig(level=logging.DEBUG)\n",
-    "console = logging.StreamHandler()\n",
-    "console.setLevel(logging.DEBUG)\n",
-    "logging.getLogger('').addHandler(console)"
+    "logger = logging.getLogger()\n",
+    "logger.setLevel(logging.DEBUG)"
    ]
   },
   {
@@ -41,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {
     "collapsed": true
    },
@@ -57,7 +56,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {
     "collapsed": true
    },
@@ -76,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {
     "collapsed": true
    },
@@ -102,7 +101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {
     "collapsed": false
    },
@@ -128,7 +127,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {
     "collapsed": true
    },
@@ -140,35 +139,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 7,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [
-    {
-     "ename": "ValueError",
-     "evalue": "Find duplicated argument name \"weight\", please make the weight name non-duplicated(using name arguments), arguments are ['data', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'label']",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
-      "\u001b[1;32m<ipython-input-13-b112c5ebf964>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;31m# For demo purpose, this model only train 1 round\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m model = mx.model.FeedForward(ctx=mx.gpu(), symbol=loss, num_round = 1,\n\u001b[1;32m----> 4\u001b[1;33m                               learning_rate=0.05, momentum=0.9, wd=0.00001)\n\u001b[0m",
-      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, symbol, ctx, num_round, optimizer, initializer, arg_params, aux_params, **kwargs)\u001b[0m\n\u001b[0;32m    418\u001b[0m                  **kwargs):\n\u001b[0;32m    419\u001b[0m         \u001b[1;31m# check if symbol contain duplicated names.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 420\u001b[1;33m         \u001b[0m_check_arguments\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msymbol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    421\u001b[0m         \u001b[1;31m# basic configuration\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    422\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msymbol\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36m_check_arguments\u001b[1;34m(symbol)\u001b[0m\n\u001b[0;32m     61\u001b[0m             raise ValueError(('Find duplicated argument name \\\"%s\\\", ' +\n\u001b[0;32m     62\u001b[0m                               \u001b[1;34m'please make the weight name non-duplicated(using name arguments), '\u001b[0m \u001b[1;33m+\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 63\u001b[1;33m                               'arguments are %s') % (name, str(arg_names)))\n\u001b[0m\u001b[0;32m     64\u001b[0m         \u001b[0marg_set\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     65\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;31mValueError\u001b[0m: Find duplicated argument name \"weight\", please make the weight name non-duplicated(using name arguments), arguments are ['data', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'gamma', 'beta', 'weight', 'bias', 'label']"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# We will make model with current current symbol\n",
     "# For demo purpose, this model only train 1 round\n",
-    "model = mx.model.FeedForward(ctx=mx.gpu(), symbol=loss, num_round = 1,\n",
-    "                             learning_rate=0.05, momentum=0.9, wd=0.00001)\n",
-    "# To make automatic model saving after each round, we can add check_point callback\n",
-    "# model_prefix = \"cifar\"\n",
-    "# model = mx.model.FeedForward(ctx=mx.gpu(), symbol=loss, num_round = 1,\n",
-    "#                              learning_rate=0.05, momentum=0.9, wd=0.00001,\n",
-    "#                              iter_end_callback=mx.model.do_checkpoint(model_prefix))\n"
+    "# We will use the first GPU to do training\n",
+    "num_round = 1\n",
+    "model = mx.model.FeedForward(ctx=mx.gpu(), symbol=loss, num_round=num_round,\n",
+    "                             learning_rate=0.05, momentum=0.9, wd=0.00001)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we have multiple GPUs, for example 4 GPUs, we can utilize them without any difficulty"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# num_devs = 4\n",
+    "# model = mx.model.FeedForward(ctx=[mx.gpu(i) for i in range(num_devs)], symbol=loss, num_round = 1,\n",
+    "#                              learning_rate=0.05, momentum=0.9, wd=0.00001)"
    ]
   },
   {
@@ -180,7 +182,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 9,
    "metadata": {
     "collapsed": true
    },
@@ -215,6 +217,147 @@
     "        preprocess_threads=1)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "Now we can fit the model. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:Start training with 1 devices\n",
+      "INFO:root:Iteration[0] Train-accuracy=0.530910\n",
+      "INFO:root:Iteration[0] Time cost=48.399\n",
+      "INFO:root:Iteration[0] Validation-accuracy=0.660403\n"
+     ]
+    }
+   ],
+   "source": [
+    "# On Titan X with CuDNN, it will takes about 45 second\n",
+    "model.fit(X=train_dataiter,\n",
+    "          eval_data=test_dataiter,\n",
+    "          eval_metric=\"accuracy\")\n",
+    "# If we want to save the model after every round, we can add a checkpoint callback\n",
+    "# model_prefix = './cifar_'\n",
+    "# model.fit(X=train_dataiter,\n",
+    "#           eval_data=test_dataiter,\n",
+    "#           eval_metric=\"accuracy\",\n",
+    "#           iter_end_callback=mx.model.do_checkpoint(model_prefix))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After only 1 epoch, our model is able to acheive 66.04% accuracy on testset.\n",
+    "We can save our model by calling either ```save``` or using ```pickle```.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:Saved checkpoint to \"cifar-0001.params\"\n"
+     ]
+    }
+   ],
+   "source": [
+    "# using pickle\n",
+    "import pickle\n",
+    "smodel = pickle.dumps(model)\n",
+    "# using saving (recommended)\n",
+    "# We get the benefit being able to directly load/save from cloud storage(S3, HDFS)\n",
+    "prefix = \"cifar\"\n",
+    "model.save(prefix)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To load saved model, you can use ```pickle``` if the model is generated by ```pickle```, or use ```load``` if it is generated by ```save```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# use pickle\n",
+    "model2 = pickle.loads(smodel)\n",
+    "# using load method (able to load from S3/HDFS directly)\n",
+    "model3 = mx.model.FeedForward.load(prefix, num_round)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can use the model to do prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:Finish predict...\n",
+      "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:11: DeprecationWarning: elementwise comparison failed; this will raise the error in the future.\n",
+      "INFO:root:final accuracy = 0.000000\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(9984,)\n",
+      "(10112, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "prob = model.predict(test_dataiter)\n",
+    "logging.info('Finish predict...')\n",
+    "# Check the accuracy from prediction\n",
+    "test_dataiter.reset()\n",
+    "# get label\n",
+    "y = np.concatenate([label.asnumpy() for _, label in test_dataiter]).astype('int')\n",
+    "print(y.shape)\n",
+    "print(prob.shape)\n",
+    "# get prediction label from \n",
+    "py = np.argmax(prob, axis=1)\n",
+    "acc1 = float(np.sum(py == y)) / len(y)\n",
+    "logging.info('final accuracy = %f', acc1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

From 26b1b71ee9933a4abda00a0eaedd023278daa4c1 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Tue, 22 Sep 2015 09:39:02 -0600
Subject: [PATCH 3/5] save

---
 example/cifar10/cifar10.py           |  2 +-
 example/notebooks/cifar-recipe.ipynb | 58 +++++++++++++++++++++-------
 mshadow                              |  2 +-
 3 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index 7944985caa4c..e5eb1cb0b41e 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -100,7 +100,7 @@ def SimpleFactory(data, ch_1x1, ch_3x3):
 
 get_data.GetCifar10()
 batch_size = 128
-epoch = 3
+epoch = 10
 num_gpus = 1
 
 train_dataiter = mx.io.ImageRecordIter(
diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb
index 9d6f06a40d4b..c92478ebaaba 100644
--- a/example/notebooks/cifar-recipe.ipynb
+++ b/example/notebooks/cifar-recipe.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 1,
    "metadata": {
     "collapsed": true
    },
@@ -162,7 +162,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {
     "collapsed": true
    },
@@ -182,7 +182,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {
     "collapsed": true
    },
@@ -228,7 +228,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {
     "collapsed": false
    },
@@ -238,14 +238,14 @@
      "output_type": "stream",
      "text": [
       "INFO:root:Start training with 1 devices\n",
-      "INFO:root:Iteration[0] Train-accuracy=0.530910\n",
-      "INFO:root:Iteration[0] Time cost=48.399\n",
-      "INFO:root:Iteration[0] Validation-accuracy=0.660403\n"
+      "INFO:root:Iteration[0] Train-accuracy=0.520540\n",
+      "INFO:root:Iteration[0] Time cost=47.702\n",
+      "INFO:root:Iteration[0] Validation-accuracy=0.651701\n"
      ]
     }
    ],
    "source": [
-    "# On Titan X with CuDNN, it will takes about 45 second\n",
+    "# On Titan X with CuDNN, it takes about 55 seconds\n",
     "model.fit(X=train_dataiter,\n",
     "          eval_data=test_dataiter,\n",
     "          eval_metric=\"accuracy\")\n",
@@ -261,22 +261,28 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "After only 1 epoch, our model is able to acheive 66.04% accuracy on testset.\n",
+    "After only 1 epoch, our model is able to achieve about 66% accuracy on the test set.\n",
     "We can save our model by calling either ```save``` or using ```pickle```.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:root:Saved checkpoint to \"cifar-0001.params\"\n"
+     "ename": "AttributeError",
+     "evalue": "'NoneType' object has no attribute 'items'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-9-f36270579a75>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[1;31m# We get the benefit being able to directly load/save from cloud storage(S3, HDFS)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[0mprefix\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"cifar\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprefix\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36msave\u001b[1;34m(self, prefix, iteration)\u001b[0m\n\u001b[0;32m    599\u001b[0m             \u001b[0miteration\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnum_round\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    600\u001b[0m         \u001b[1;32massert\u001b[0m \u001b[0miteration\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 601\u001b[1;33m         \u001b[0msave_checkpoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprefix\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marg_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maux_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    602\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    603\u001b[0m     \u001b[1;33m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36msave_checkpoint\u001b[1;34m(prefix, iteration, symbol, arg_params, aux_params)\u001b[0m\n\u001b[0;32m    326\u001b[0m     \"\"\"\n\u001b[0;32m    327\u001b[0m     \u001b[0msymbol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'%s-symbol.json'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mprefix\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 328\u001b[1;33m     \u001b[0msave_dict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'arg:%s'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0marg_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    329\u001b[0m     \u001b[0msave_dict\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'aux:%s'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0maux_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    330\u001b[0m     \u001b[0mparam_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'%s-%04d.params'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mprefix\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'items'"
      ]
     }
    ],
@@ -299,7 +305,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 11,
    "metadata": {
     "collapsed": false
    },
@@ -358,6 +364,28 @@
     "logging.info('final accuracy = %f', acc1)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "Extract feature requre bind symbol with the feature layer with "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "group = mx.symbol.Group([pool, loss])\n",
+    "group.list_outputs()\n",
+    "model2 = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, arg_params=model3.arg_params, aux_params=model3.aux_params)\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/mshadow b/mshadow
index bf678e6ac05d..7a3ccdee3018 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit bf678e6ac05d5115f92db0b668e4424401f31b14
+Subproject commit 7a3ccdee30189d9a01d2e6c823c4b76b4c92f558

From d95fdb04c4fd14722f2138bea9e1165548e67fc6 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Tue, 22 Sep 2015 15:56:13 -0600
Subject: [PATCH 4/5] finish notebook

---
 example/cifar10/cifar10.py           |  11 +--
 example/notebooks/cifar-recipe.ipynb | 111 +++++++++++++++++----------
 example/python-howto/data_iter.py    |   6 +-
 include/mxnet/c_api.h                |  10 +++
 include/mxnet/io.h                   |   2 +
 python/mxnet/__init__.py             |   2 +
 python/mxnet/helper.py               |  70 +++++++++++++++++
 python/mxnet/io.py                   |   8 ++
 python/mxnet/model.py                |  35 ++++++++-
 python/mxnet/scheduler.py            |  54 +++++++++++++
 python/mxnet/visualization.py        |   2 +
 src/c_api.cc                         |   7 ++
 src/io/iter_batchloader.h            |   2 +-
 src/io/iter_prefetcher.h             |   2 +
 14 files changed, 270 insertions(+), 52 deletions(-)
 create mode 100644 python/mxnet/helper.py
 create mode 100644 python/mxnet/scheduler.py

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index e5eb1cb0b41e..5f00b367ff9a 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -122,14 +122,15 @@ def SimpleFactory(data, ch_1x1, ch_3x3):
 
 def test_cifar():
     logging.basicConfig(level=logging.DEBUG)
-    console = logging.StreamHandler()
-    console.setLevel(logging.DEBUG)
-    logging.getLogger('').addHandler(console)
+    #console = logging.StreamHandler()
+    #console.setLevel(logging.DEBUG)
+    #logging.getLogger('').addHandler(console)
+    total_batch = 50000 / batch_size + 1
     gpus = [mx.gpu(i) for i in range(num_gpus)]
     model = mx.model.FeedForward(ctx=gpus, symbol=loss, num_round = epoch,
                                  learning_rate=0.05, momentum=0.9, wd=0.00001)
-    model.fit(X=train_dataiter, eval_data=test_dataiter)
-
+    model.fit(X=train_dataiter, eval_data=test_dataiter,
+              epoch_end_callback=mx.helper.Speedometer(batch_size, 100))
 
 if __name__ == "__main__":
     test_cifar()
diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb
index c92478ebaaba..54558de66c43 100644
--- a/example/notebooks/cifar-recipe.ipynb
+++ b/example/notebooks/cifar-recipe.ipynb
@@ -162,7 +162,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {
     "collapsed": true
    },
@@ -182,7 +182,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {
     "collapsed": true
    },
@@ -196,6 +196,7 @@
     "# After we get the data, we can declare our data iterator\n",
     "# The iterator will automatically create mean image file if it doesn't exist\n",
     "batch_size = 128\n",
+    "total_batch = 50000 / 128 + 1\n",
     "# Train iterator make batch of 128 image, and random crop each image into 3x28x28 from original 3x32x32\n",
     "train_dataiter = mx.io.ImageRecordIter(\n",
     "        shuffle=True,\n",
@@ -207,6 +208,7 @@
     "        batch_size=batch_size,\n",
     "        preprocess_threads=1)\n",
     "# test iterator make batch of 128 image, and center crop each image into 3x28x28 from original 3x32x32\n",
+    "# Note: We don't need round batch in test because we only test once at one time\n",
     "test_dataiter = mx.io.ImageRecordIter(\n",
     "        path_imgrec=\"data/cifar/test.rec\",\n",
     "        mean_img=\"data/cifar/cifar_mean.bin\",\n",
@@ -214,6 +216,7 @@
     "        rand_mirror=False,\n",
     "        data_shape=(3,28,28),\n",
     "        batch_size=batch_size,\n",
+    "        round_batch=False,\n",
     "        preprocess_threads=1)"
    ]
   },
@@ -223,12 +226,12 @@
     "collapsed": true
    },
    "source": [
-    "Now we can fit the model. "
+    "Now we can fit the model with data. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {
     "collapsed": false
    },
@@ -238,23 +241,41 @@
      "output_type": "stream",
      "text": [
       "INFO:root:Start training with 1 devices\n",
-      "INFO:root:Iteration[0] Train-accuracy=0.520540\n",
-      "INFO:root:Iteration[0] Time cost=47.702\n",
-      "INFO:root:Iteration[0] Validation-accuracy=0.651701\n"
+      "INFO:root:Batch [50]\tSpeed: 1110.69 samples/sec\n",
+      "INFO:root:Batch [100]\tSpeed: 1094.86 samples/sec\n",
+      "INFO:root:Batch [150]\tSpeed: 1090.16 samples/sec\n",
+      "INFO:root:Batch [200]\tSpeed: 1088.40 samples/sec\n",
+      "INFO:root:Batch [250]\tSpeed: 1083.11 samples/sec\n",
+      "INFO:root:Batch [300]\tSpeed: 1080.53 samples/sec\n",
+      "INFO:root:Batch [350]\tSpeed: 1075.29 samples/sec\n",
+      "INFO:root:Iteration[0] Train-accuracy=0.523477\n",
+      "INFO:root:Iteration[0] Time cost=46.563\n",
+      "INFO:root:Iteration[0] Validation-accuracy=0.649921\n"
      ]
     }
    ],
    "source": [
-    "# On Titan X with CuDNN, it will takes about 55 second\n",
     "model.fit(X=train_dataiter,\n",
     "          eval_data=test_dataiter,\n",
-    "          eval_metric=\"accuracy\")\n",
+    "          eval_metric=\"accuracy\",\n",
+    "          epoch_end_callback=mx.helper.Speedometer(batch_size))\n",
     "# if we want to save model after every round, we can add check_point call back\n",
     "# model_prefix = './cifar_'\n",
     "# model.fit(X=train_dataiter,\n",
     "#           eval_data=test_dataiter,\n",
     "#           eval_metric=\"accuracy\"),\n",
-    "#           iter_end_callback=mx.model.do_checkpoint(model_prefix))"
+    "#           iter_end_callback=mx.model.do_checkpoint(model_prefix))\n",
+    "\n",
+    "# if we want to schedule the learning rate, we can add a scheduler in fit\n",
+    "# model.fit(X=train_dataiter,\n",
+    "#           eval_data=test_dataiter,\n",
+    "#           learning_rate_scheduler=mx.scheduler.Factor(base_lr=0.05, step=3900, factor=0.1))\n",
+    "\n",
+    "# base_lr is the learning rate at the start of training\n",
+    "# The unit for step is one batch\n",
+    "# In this example, we have 50k training samples and batch_size is 128, so there are about 390 batches per round\n",
+    "# Setting step to 3900 therefore multiplies the learning rate by factor after 10 rounds\n",
+    "# So at round 11, the learning rate becomes 0.005 (0.05 * 0.1)"
    ]
   },
   {
@@ -267,22 +288,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
     {
-     "ename": "AttributeError",
-     "evalue": "'NoneType' object has no attribute 'items'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "\u001b[1;32m<ipython-input-9-f36270579a75>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[1;31m# We get the benefit being able to directly load/save from cloud storage(S3, HDFS)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[0mprefix\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"cifar\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprefix\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36msave\u001b[1;34m(self, prefix, iteration)\u001b[0m\n\u001b[0;32m    599\u001b[0m             \u001b[0miteration\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnum_round\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    600\u001b[0m         \u001b[1;32massert\u001b[0m \u001b[0miteration\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 601\u001b[1;33m         \u001b[0msave_checkpoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprefix\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marg_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maux_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    602\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    603\u001b[0m     \u001b[1;33m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36msave_checkpoint\u001b[1;34m(prefix, iteration, symbol, arg_params, aux_params)\u001b[0m\n\u001b[0;32m    326\u001b[0m     \"\"\"\n\u001b[0;32m    327\u001b[0m     \u001b[0msymbol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'%s-symbol.json'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mprefix\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 328\u001b[1;33m     \u001b[0msave_dict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'arg:%s'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0marg_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    329\u001b[0m     \u001b[0msave_dict\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'aux:%s'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0maux_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    330\u001b[0m     \u001b[0mparam_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'%s-%04d.params'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mprefix\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'items'"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:Saved checkpoint to \"cifar-0001.params\"\n"
      ]
     }
    ],
@@ -305,7 +320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {
     "collapsed": false
    },
@@ -314,7 +329,7 @@
     "# use pickle\n",
     "model2 = pickle.loads(smodel)\n",
     "# using load method (able to load from S3/HDFS directly)\n",
-    "model3 = mx.model.FeedForward.load(prefix, num_round)"
+    "model3 = mx.model.FeedForward.load(prefix, num_round, ctx=mx.gpu())"
    ]
   },
   {
@@ -326,7 +341,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 13,
    "metadata": {
     "collapsed": false
    },
@@ -336,28 +351,26 @@
      "output_type": "stream",
      "text": [
       "INFO:root:Finish predict...\n",
-      "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:11: DeprecationWarning: elementwise comparison failed; this will raise the error in the future.\n",
-      "INFO:root:final accuracy = 0.000000\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(9984,)\n",
-      "(10112, 10)\n"
+      "INFO:root:final accuracy = 0.651000\n"
      ]
     }
    ],
    "source": [
-    "prob = model.predict(test_dataiter)\n",
+    "prob = model3.predict(test_dataiter)\n",
     "logging.info('Finish predict...')\n",
     "# Check the accuracy from prediction\n",
     "test_dataiter.reset()\n",
     "# get label\n",
-    "y = np.concatenate([label.asnumpy() for _, label in test_dataiter]).astype('int')\n",
-    "print(y.shape)\n",
-    "print(prob.shape)\n",
+    "# Because the iterator pads every batch to the same shape, we remove the padded samples here\n",
+    "\n",
+    "y_batch = []\n",
+    "for _, label in test_dataiter:\n",
+    "    label = label.asnumpy()\n",
+    "    pad = test_dataiter.getpad()\n",
+    "    real_size = label.shape[0] - pad\n",
+    "    y_batch.append(label[0:real_size])\n",
+    "y = np.concatenate(y_batch)\n",
+    "\n",
     "# get prediction label from \n",
     "py = np.argmax(prob, axis=1)\n",
     "acc1 = float(np.sum(py == y)) / len(y)\n",
@@ -370,20 +383,34 @@
     "collapsed": true
    },
    "source": [
-    "Extract feature requre bind symbol with the feature layer with "
+    "Extracting features requires binding a symbol that exposes the feature layer as an output. We can create a new model object from a grouped output symbol and the original parameters."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(10000, 336, 1, 1)\n"
+     ]
+    }
+   ],
    "source": [
+    "# In the current implementation of model, predict returns only the first output\n",
+    "# So we need to put the desired feature first in the group\n",
+    "# By using a lower-level Python API, we can get all the grouped features at the same time\n",
+    "\n",
     "group = mx.symbol.Group([pool, loss])\n",
     "group.list_outputs()\n",
-    "model2 = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, arg_params=model3.arg_params, aux_params=model3.aux_params)\n"
+    "feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, arg_params=model3.arg_params, aux_params=model3.aux_params)\n",
+    "global_pooling_feature = feature_extractor.predict(test_dataiter)\n",
+    "print(global_pooling_feature.shape)"
    ]
   },
   {
diff --git a/example/python-howto/data_iter.py b/example/python-howto/data_iter.py
index d1cebc0a470d..ea541b6985ef 100644
--- a/example/python-howto/data_iter.py
+++ b/example/python-howto/data_iter.py
@@ -42,7 +42,11 @@
         # Backend Parameter
         # Optional
         # Prefetch buffer size
-        prefetch_buffer=4)
+        prefetch_buffer=4,
+        # Backend Parameter,
+        # Optional
+        # Whether round batch,
+        round_batch=True)
 
 batchidx = 0
 for data, label in dataiter:
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index f6bf4e5ad862..d97b35afbe94 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -675,6 +675,16 @@ MXNET_DLL int MXDataIterBeforeFirst(DataIterHandle handle);
  */
 MXNET_DLL int MXDataIterGetData(DataIterHandle handle,
                                 NDArrayHandle *out);
+
+/*!
+ * \brief Get the padding number in current data batch
+ * \param handle the handle pointer to the data iterator
+ * \param pad pad number ptr
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXDataIterGetPadNum(DataIterHandle handle,
+                                  int *pad);
+
 /*!
  * \brief Get the handle to the NDArray of underlying label
  * \param handle the handle pointer to the data iterator
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 1c9a6bc8d61a..8f65cac2214e 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -62,6 +62,8 @@ struct DataBatch {
   std::vector<NDArray> data;
   /*! \brief extra data to be fed to the network */
   std::string extra_data;
+  /*! \brief num of example padded to batch */
+  int num_batch_padd;
 };  // struct DataBatch
 
 /*! \brief typedef the factory function of data iterator */
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index e9630b678ee0..4791651ca391 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -25,5 +25,7 @@
 from . import visualization
 # use viz as short for mx.ndarray
 from . import visualization as viz
+from . import helper
+from . import scheduler
 
 __version__ = "0.1.0"
diff --git a/python/mxnet/helper.py b/python/mxnet/helper.py
new file mode 100644
index 000000000000..e82c1bd21249
--- /dev/null
+++ b/python/mxnet/helper.py
@@ -0,0 +1,70 @@
+# pylint: disable=logging-not-lazy, blacklisted-name
+"""model helper for knowing training status"""
+import sys
+import math
+import logging
+import time
+
+class Speedometer(object):
+    """Calculate training speed in frequent
+
+    Parameters
+    ----------
+    batch_size: int
+        batch_size of data
+    frequent: int
+        log the speed every `frequent` batches
+    """
+    def __init__(self, batch_size, frequent=50):
+        self.batch_size = batch_size
+        self.frequent = frequent
+        self.init = False
+        self.tic = 0
+
+    def __call__(self, count):
+        """
+        Show speed
+
+        Parameters
+        ----------
+        count: int
+            current batch count
+        """
+
+        if self.init:
+            if count % self.frequent == 0:
+                speed = self.frequent * self.batch_size / (time.time() - self.tic)
+                logging.info("Batch [%d]\tSpeed: %.2f samples/sec" % (count, speed))
+                self.tic = time.time()
+        else:
+            self.init = True
+            self.tic = time.time()
+
+class ProgressBar(object):
+    """Show a progress bar
+
+    Parameters
+    ----------
+    total: int
+        total batch size
+    length: int
+        length of progress bar
+    """
+    def __init__(self, total, length=80):
+        self.bar_len = length
+        self.total = total
+
+    def __call__(self, count):
+        """
+        Update progress bar
+
+        Parameters
+        ----------
+        count: int
+            current batch count
+        """
+
+        filled_len = int(round(self.bar_len * count / float(self.total)))
+        percents = math.ceil(100.0 * count / float(self.total))
+        bar = '=' * filled_len + '-' * (self.bar_len - filled_len)
+        sys.stdout.write('[%s] %s%s\r' % (bar, percents, '%'))
diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index e4e6905aba3a..5ac381d99e38 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -82,6 +82,14 @@ def getlabel(self):
         check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl)))
         return NDArray(hdl, False)
 
+    def getpad(self):
+        """get padded sample num in the batch
+
+        """
+        pad = ctypes.c_int(0)
+        check_call(_LIB.MXDataIterGetPadNum(self.handle, ctypes.byref(pad)))
+        return pad.value
+
 def _make_io_iterator(handle):
     """Create an io iterator by handle."""
     name = ctypes.c_char_p()
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index e19aa580a0a5..319b17926a42 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -121,7 +121,8 @@ def _train_multi_device(symbol, ctx, input_shape,
                         arg_params, aux_params,
                         begin_round, end_round, optimizer,
                         train_data, eval_data=None, eval_metric=None,
-                        iter_end_callback=None, logger=None):
+                        iter_end_callback=None, learning_rate_scheduler=None,
+                        epoch_end_callback=None, logger=None):
     """Internal training function on multiple devices.
 
     This function will also work for single device as well.
@@ -165,6 +166,12 @@ def _train_multi_device(symbol, ctx, input_shape,
         A callback that is invoked at end of each iteration.
         This can be used to checkpoint model each iteration.
 
+    learning_rate_scheduler: Scheduler
+        A Scheduler to adjust learning rate
+
+    epoch_end_callback: callable(nbatch)
+        A callback that is invoked at end of each batch
+
     logger : logging logger
         When not specified, default logger will be used.
 
@@ -230,6 +237,7 @@ def _train_multi_device(symbol, ctx, input_shape,
         train_data.reset()
         optimizer.begin_round(iteration)
         eval_metric.reset()
+        nbatch = 0
         # Iterate over training data.
         for data, label in train_data:
             # Copy data into the target
@@ -258,6 +266,13 @@ def _train_multi_device(symbol, ctx, input_shape,
                 # optimizea
                 for w, g, state in zip(arg_list, grad_list, opt_list):
                     optimizer.update(index, w, g, state)
+            nbatch += 1
+            # epoch callback (for print purpose)
+            if epoch_end_callback:
+                epoch_end_callback(nbatch)
+            # learning rate scheduler
+            if learning_rate_scheduler:
+                learning_rate_scheduler(optimizer, nbatch, iteration)
             # evaluate at end, so out_cpu_array can lazy copy
             eval_metric.update(out_cpu_array, label)
 
@@ -524,11 +539,16 @@ def predict(self, X):
         for data, _ in X:
             data.copyto(self._pred_exec_input)
             self._pred_exec.forward(is_train=False)
-            outputs.append(self._pred_exec.outputs[0].asnumpy())
+            out_batch = self._pred_exec.outputs[0].asnumpy()
+            padded = X.getpad()
+            real_size = out_batch.shape[0] - padded
+            out_batch = out_batch[0:real_size, :]
+            outputs.append(out_batch)
         return np.concatenate(outputs)
 
     def fit(self, X, y=None, eval_data=None, eval_metric='acc',
-            iter_end_callback=None, logger=None):
+            iter_end_callback=None, learning_rate_scheduler=None,
+            epoch_end_callback=None, logger=None):
         """Fit the model.
 
         Parameters
@@ -551,6 +571,13 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc',
             A callback that is invoked at end of each iteration.
             This can be used to checkpoint model each iteration.
 
+        learning_rate_scheduler: Scheduler
+            A Scheduler to adjust learning rate
+
+        epoch_end_callback: callable(nbatch)
+            A callback that is invoked at end of each batch
+            For print purpose
+
         logger : logging logger, optional
             When not specified, default logger will be used.
         """
@@ -573,6 +600,8 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc',
                             train_data=X, eval_data=eval_data,
                             eval_metric=eval_metric,
                             iter_end_callback=iter_end_callback,
+                            learning_rate_scheduler=learning_rate_scheduler,
+                            epoch_end_callback=epoch_end_callback,
                             logger=logger)
 
     def save(self, prefix, iteration=None):
diff --git a/python/mxnet/scheduler.py b/python/mxnet/scheduler.py
new file mode 100644
index 000000000000..1ba93e0a57ed
--- /dev/null
+++ b/python/mxnet/scheduler.py
@@ -0,0 +1,54 @@
+# pylint: disable=invalid-name, logging-not-lazy
+"""learning rate scheduler"""
+
+import math
+import logging
+class Factor(object):
+    """Reduce learning rate in factor
+
+    Parameters
+    ----------
+    base_lr: float
+        learning rate at start time
+    step: int
+        schedule learning rate after every step batches
+    factor: float
+        reduce learning rate factor
+    batch_per_round: int
+        how many batches per round, must set when continue training
+    """
+    def __init__(self, base_lr, step, factor=0.1, batch_per_round=1):
+        self.base_lr = base_lr
+        self.step = step
+        self.factor = factor
+        self.old_lr = base_lr
+        self.batch_per_round = batch_per_round
+        self.epoch = 0
+        self.init = False
+
+    def __call__(self, optimizer, nbatch, iteration):
+        """
+        Call to schedule current learning rate
+
+        Parameters
+        ----------
+        optimizer: Optimizer
+            Optimizer which contains learning rate field
+        nbatch: int
+            Current batch count
+        iteration: int
+            Current iteration count
+        """
+
+        if self.init == False:
+            self.init = True
+            self.epoch = max(self.epoch, iteration * self.batch_per_round + nbatch)
+        self.epoch += 1
+        lr = self.base_lr * math.pow(self.factor, int(self.epoch / self.step))
+        optimizer.learning_rate = lr
+        if lr != self.old_lr:
+            self.old_lr = lr
+            logging.info("At Iteration [%d], Batch [%d]: Swith to new learning rate %.5f" \
+                    % (iteration, nbatch, lr))
+
+
diff --git a/python/mxnet/visualization.py b/python/mxnet/visualization.py
index 15584ee6ce18..3ed08be1d2df 100644
--- a/python/mxnet/visualization.py
+++ b/python/mxnet/visualization.py
@@ -137,3 +137,5 @@ def plot_network(symbol, title="plot", shape=None):
                     dot.edge(tail_name=name, head_name=input_name, **attr)
 
     return dot
+
+
diff --git a/src/c_api.cc b/src/c_api.cc
index 5787ac877f4d..b8621bd530fc 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -921,6 +921,13 @@ int MXDataIterGetData(DataIterHandle handle, NDArrayHandle *out) {
   API_END();
 }
 
+int MXDataIterGetPadNum(DataIterHandle handle, int *pad) {
+  API_BEGIN();
+  const DataBatch& db = static_cast<IIterator<DataBatch>* >(handle)->Value();
+  *pad = db.num_batch_padd;
+  API_END();
+}
+
 int MXKVStoreCreate(const char *type,
                     KVStoreHandle *out) {
   API_BEGIN();
diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h
index 2a082c57f4ff..fdef92880d72 100644
--- a/src/io/iter_batchloader.h
+++ b/src/io/iter_batchloader.h
@@ -105,7 +105,7 @@ class BatchLoader : public IIterator<TBlobBatch> {
       mshadow::Copy(out_.data[0].get<mshadow::cpu, 4, float>()[top],
               d.data[0].get<mshadow::cpu, 3, float>());
       if (++ top >= param_.batch_size) {
-          return true;
+        return true;
       }
     }
     if (top != 0) {
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index 2449d4a38bc5..b3bbdb40c07e 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -66,6 +66,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
         if (*dptr == nullptr) {
           // allocate databatch
           *dptr = new DataBatch();
+          (*dptr)->num_batch_padd = batch.num_batch_padd;
           (*dptr)->data.resize(batch.data.size());
           for (size_t i = 0; i < batch.data.size(); ++i) {
             (*dptr)->data.at(i) = NDArray(batch.data[i].shape_, Context::CPU());
@@ -77,6 +78,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
           CHECK_EQ((*dptr)->data.at(i).shape(), batch.data[i].shape_);
           mshadow::Copy(((*dptr)->data)[i].data().FlatTo2D<cpu, real_t>(),
                         batch.data[i].FlatTo2D<cpu, real_t>());
+          (*dptr)->num_batch_padd = batch.num_batch_padd;
         }
         return true;
       },

From a9b66fb74daa91f446b9a24e71e34a86b15bb9fc Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Tue, 22 Sep 2015 22:39:33 -0600
Subject: [PATCH 5/5] update

---
 example/cifar10/cifar10.py              |  5 +-
 example/notebooks/cifar-recipe.ipynb    | 87 +++++++++++++------------
 python/mxnet/__init__.py                |  4 +-
 python/mxnet/{helper.py => callback.py} | 23 ++++++-
 python/mxnet/misc.py                    | 58 +++++++++++++++++
 python/mxnet/model.py                   | 46 +++++--------
 python/mxnet/optimizer.py               | 22 +++++--
 python/mxnet/scheduler.py               | 54 ---------------
 tests/python/train/test_mlp.py          |  2 +-
 9 files changed, 163 insertions(+), 138 deletions(-)
 rename python/mxnet/{helper.py => callback.py} (74%)
 create mode 100644 python/mxnet/misc.py
 delete mode 100644 python/mxnet/scheduler.py

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index e3d3fcc70a22..92eba39f72b8 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -125,9 +125,10 @@ def test_cifar():
     total_batch = 50000 / batch_size + 1
     gpus = [mx.gpu(i) for i in range(num_gpus)]
     model = mx.model.FeedForward(ctx=gpus, symbol=softmax, num_round = num_round,
-                                 learning_rate=0.05, momentum=0.9, wd=0.00001)
+                                 learning_rate=0.05, momentum=0.9, wd=0.00001,
+                                 lr_scheduler=mx.misc.FactorScheduler(2))
     model.fit(X=train_dataiter, eval_data=test_dataiter,
-              epoch_end_callback=mx.helper.Speedometer(batch_size))
+              epoch_end_callback=mx.callback.Speedometer(batch_size))
 
 if __name__ == "__main__":
     test_cifar()
diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb
index 54558de66c43..fccdfcb47e43 100644
--- a/example/notebooks/cifar-recipe.ipynb
+++ b/example/notebooks/cifar-recipe.ipynb
@@ -119,10 +119,10 @@
     "in4e = DownsampleFactory(in4d, 96)\n",
     "in5a = SimpleFactory(in4e, 176, 160)\n",
     "in5b = SimpleFactory(in5a, 176, 160)\n",
-    "pool = mx.symbol.Pooling(data=in5b, pool_type=\"avg\", kernel=(7,7))\n",
+    "pool = mx.symbol.Pooling(data=in5b, pool_type=\"avg\", kernel=(7,7), name=\"global_avg\")\n",
     "flatten = mx.symbol.Flatten(data=pool)\n",
     "fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10)\n",
-    "loss = mx.symbol.Softmax(data=fc)"
+    "softmax = mx.symbol.Softmax(data=fc)"
    ]
   },
   {
@@ -149,8 +149,14 @@
     "# For demo purpose, this model only train 1 round\n",
     "# We will use the first GPU to do training\n",
     "num_round = 1\n",
-    "model = mx.model.FeedForward(ctx=mx.gpu(), symbol=loss, num_round=num_round,\n",
-    "                             learning_rate=0.05, momentum=0.9, wd=0.00001)\n"
+    "model = mx.model.FeedForward(ctx=mx.gpu(), symbol=softmax, num_round=num_round,\n",
+    "                             learning_rate=0.05, momentum=0.9, wd=0.00001)\n",
+    "\n",
+    "# we can add learning rate scheduler to the model\n",
+    "# model = mx.model.FeedForward(ctx=mx.gpu(), symbol=softmax, num_round=num_round,\n",
+    "#                              learning_rate=0.05, momentum=0.9, wd=0.00001,\n",
+    "#                              lr_scheduler=mx.misc.FactorScheduler(2))\n",
+    "# In this example, the learning rate is multiplied by 0.1 every two rounds"
    ]
   },
   {
@@ -231,7 +237,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {
     "collapsed": false
    },
@@ -240,17 +246,17 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "INFO:root:Start training with 1 devices\n",
-      "INFO:root:Batch [50]\tSpeed: 1110.69 samples/sec\n",
-      "INFO:root:Batch [100]\tSpeed: 1094.86 samples/sec\n",
-      "INFO:root:Batch [150]\tSpeed: 1090.16 samples/sec\n",
-      "INFO:root:Batch [200]\tSpeed: 1088.40 samples/sec\n",
-      "INFO:root:Batch [250]\tSpeed: 1083.11 samples/sec\n",
-      "INFO:root:Batch [300]\tSpeed: 1080.53 samples/sec\n",
-      "INFO:root:Batch [350]\tSpeed: 1075.29 samples/sec\n",
-      "INFO:root:Iteration[0] Train-accuracy=0.523477\n",
-      "INFO:root:Iteration[0] Time cost=46.563\n",
-      "INFO:root:Iteration[0] Validation-accuracy=0.649921\n"
+      "INFO:root:Start training with [gpu(0)]\n",
+      "INFO:root:Batch [50]\tSpeed: 1091.84 samples/sec\n",
+      "INFO:root:Batch [100]\tSpeed: 1084.80 samples/sec\n",
+      "INFO:root:Batch [150]\tSpeed: 1084.55 samples/sec\n",
+      "INFO:root:Batch [200]\tSpeed: 1077.30 samples/sec\n",
+      "INFO:root:Batch [250]\tSpeed: 1074.73 samples/sec\n",
+      "INFO:root:Batch [300]\tSpeed: 1075.67 samples/sec\n",
+      "INFO:root:Batch [350]\tSpeed: 1067.09 samples/sec\n",
+      "INFO:root:Iteration[0] Train-accuracy=0.525695\n",
+      "INFO:root:Iteration[0] Time cost=47.012\n",
+      "INFO:root:Iteration[0] Validation-accuracy=0.660008\n"
      ]
     }
    ],
@@ -258,24 +264,15 @@
     "model.fit(X=train_dataiter,\n",
     "          eval_data=test_dataiter,\n",
     "          eval_metric=\"accuracy\",\n",
-    "          epoch_end_callback=mx.helper.Speedometer(batch_size))\n",
+    "          epoch_end_callback=mx.callback.Speedometer(batch_size))\n",
+    "\n",
     "# if we want to save model after every round, we can add check_point call back\n",
     "# model_prefix = './cifar_'\n",
     "# model.fit(X=train_dataiter,\n",
     "#           eval_data=test_dataiter,\n",
-    "#           eval_metric=\"accuracy\"),\n",
-    "#           iter_end_callback=mx.model.do_checkpoint(model_prefix))\n",
-    "\n",
-    "# if we want to schelue learning rate, we can add scheduler in fit\n",
-    "# model.fit(X=train_dataiter,\n",
-    "#           eval_data=test_dataiter,\n",
-    "#           learning_rate_scheduler=mx.scheduler.factor(base_lr=0.05, step=3900, factor=0.1)\n",
-    "\n",
-    "# base_lr is learning rate at starting\n",
-    "# The unit for step is batch\n",
-    "# In this example, we have 50k training data, and batch_size is 128, so we will have 390 batch per round\n",
-    "# If we set step to 3900, means we will make new learning rate multiply factor after 10 round\n",
-    "# Which means at round 11, the learning rate will be 0.005"
+    "#           eval_metric=\"accuracy\",\n",
+    "#           epoch_end_callback=mx.callback.Speedometer(batch_size),\n",
+    "#           iter_end_callback=mx.callback.do_checkpoint(model_prefix))\n"
    ]
   },
   {
@@ -383,32 +380,38 @@
     "collapsed": true
    },
    "source": [
-    "Extract feature requre bind symbol with the feature symbol. We can create a new model object with grouped output symbol and original parameters."
+    "From any symbol, we can list its internal feature maps and bind a new model to extract one of them."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 17,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(10000, 336, 1, 1)\n"
+     "ename": "TypeError",
+     "evalue": "Symbol only support integer index to fetch i-th output",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-17-0e3d13f4a151>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0minternals\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msoftmax\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_internals\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mfea_symbol\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minternals\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"global_avg_output\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, \n",
+      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/symbol.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, index)\u001b[0m\n\u001b[0;32m    156\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    157\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 158\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Symbol only support integer index to fetch i-th output'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    159\u001b[0m         \u001b[0mhandle\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSymbolHandle\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    160\u001b[0m         check_call(_LIB.MXSymbolGetOutput(\n",
+      "\u001b[1;31mTypeError\u001b[0m: Symbol only support integer index to fetch i-th output"
      ]
     }
    ],
    "source": [
-    "# In current implmentation of model, we can only get one output at one time\n",
-    "# So we need put desired feature at the first place of group\n",
-    "# By using a lower level python API, we are able to get any features we grouped at same time\n",
+    "# predict internal featuremaps\n",
+    "internals = softmax.get_internals()\n",
+    "\n",
+    "fea_symbol = internals[\"global_avg_output\"]\n",
     "\n",
-    "group = mx.symbol.Group([pool, loss])\n",
-    "group.list_outputs()\n",
-    "feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, arg_params=model3.arg_params, aux_params=model3.aux_params)\n",
+    "feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=fea_symbol, \n",
+    "                                         arg_params=model.arg_params, aux_params=model.aux_params,\n",
+    "                                         allow_extra_params=True)\n",
     "global_pooling_feature = feature_extractor.predict(test_dataiter)\n",
     "print(global_pooling_feature.shape)"
    ]
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index 4791651ca391..7bca6efbb46d 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -25,7 +25,7 @@
 from . import visualization
 # use viz as short for mx.ndarray
 from . import visualization as viz
-from . import helper
-from . import scheduler
+from . import callback
+from . import misc
 
 __version__ = "0.1.0"
diff --git a/python/mxnet/helper.py b/python/mxnet/callback.py
similarity index 74%
rename from python/mxnet/helper.py
rename to python/mxnet/callback.py
index e82c1bd21249..dca238a37709 100644
--- a/python/mxnet/helper.py
+++ b/python/mxnet/callback.py
@@ -1,9 +1,28 @@
-# pylint: disable=logging-not-lazy, blacklisted-name
+# pylint: disable=logging-not-lazy, blacklisted-name, invalid-name
 """model helper for knowing training status"""
 import sys
 import math
 import logging
 import time
+from .model import save_checkpoint
+
+def do_checkpoint(prefix):
+    """Build a callback that saves a checkpoint under ``prefix`` each iteration.
+
+    Parameters
+    ----------
+    prefix : str
+        File prefix handed to ``save_checkpoint``.
+
+    Returns
+    -------
+    callback : function
+        Callable suitable for the ``iter_end_callback`` argument of fit.
+    """
+    def _callback(iteration, symbol, arg_params, aux_params):
+        """Save a checkpoint tagged with ``iteration + 1``."""
+        save_checkpoint(prefix, iteration + 1, symbol, arg_params, aux_params)
+    return _callback
 
 class Speedometer(object):
     """Calculate training speed in frequent
@@ -68,3 +87,5 @@ def __call__(self, count):
         percents = math.ceil(100.0 * count / float(self.total))
         bar = '=' * filled_len + '-' * (self.bar_len - filled_len)
         sys.stdout.write('[%s] %s%s\r' % (bar, percents, '%'))
+
+
diff --git a/python/mxnet/misc.py b/python/mxnet/misc.py
new file mode 100644
index 000000000000..43da2e1fc350
--- /dev/null
+++ b/python/mxnet/misc.py
@@ -0,0 +1,58 @@
+# pylint: disable=invalid-name, logging-not-lazy, arguments-differ
+"""learning rate scheduler"""
+
+import math
+import logging
+
+class LearningRateScheduler(object):
+    """Abstract base for learning rate schedulers; subclasses override __call__."""
+    def __init__(self):
+        self.base_lr = 0.01  # default; SGD replaces it with its own learning_rate
+
+    def __call__(self):
+        """Compute the learning rate; subclasses must override this method."""
+        raise NotImplementedError("must override this")
+
+
+class FactorScheduler(LearningRateScheduler):
+    """Reduce the learning rate by a constant factor at fixed round intervals.
+
+    Parameters
+    ----------
+    step: int
+        Number of rounds between successive reductions; must be >= 1.
+    factor: float
+        Multiplier applied at each reduction; must be < 1.
+    """
+    def __init__(self, step, factor=0.1):
+        super(FactorScheduler, self).__init__()
+        if step < 1:
+            raise ValueError("Schedule step must be greater or equal than 1 round")
+        if factor >= 1.0:
+            raise ValueError("Factor must be less than 1 to make lr reduce")
+        self.step = step
+        self.factor = factor
+        self.old_lr = self.base_lr
+        self.init = False
+
+    def __call__(self, iteration):
+        """Return base_lr * factor ** int(iteration / step), logging any change.
+
+        Parameters
+        ----------
+        iteration: int
+            Current iteration count
+        """
+
+        if not self.init:
+            # base_lr may be reassigned after construction; resync on first call
+            self.init = True
+            self.old_lr = self.base_lr
+        lr = self.base_lr * math.pow(self.factor, int(iteration / self.step))
+        if lr != self.old_lr:
+            self.old_lr = lr
+            logging.info("At Iteration [%d]: Switch to new learning rate %.5f",
+                         iteration, lr)
+        return lr
+
+
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 0101a77bfb54..d5672644cab8 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -121,8 +121,8 @@ def _train_multi_device(symbol, ctx, input_shape,
                         arg_params, aux_params,
                         begin_round, end_round, optimizer,
                         train_data, eval_data=None, eval_metric=None,
-                        iter_end_callback=None, learning_rate_scheduler=None,
-                        epoch_end_callback=None, logger=None):
+                        iter_end_callback=None, epoch_end_callback=None,
+                        logger=None):
     """Internal training function on multiple devices.
 
     This function will also work for single device as well.
@@ -268,11 +268,12 @@ def _train_multi_device(symbol, ctx, input_shape,
                     optimizer.update(index, w, g, state)
             nbatch += 1
             # epoch callback (for print purpose)
-            if epoch_end_callback:
-                epoch_end_callback(nbatch)
-            # learning rate sceduler
-            if learning_rate_scheduler:
-                learning_rate_scheduler(optimizer, nbatch, iteration)
+            if epoch_end_callback != None:
+                if isinstance(epoch_end_callback, list):
+                    for call in epoch_end_callback:
+                        call(nbatch)
+                else:
+                    epoch_end_callback(nbatch)
             # evaluate at end, so out_cpu_array can lazy copy
             eval_metric.update(out_cpu_array, label)
 
@@ -308,8 +309,12 @@ def _train_multi_device(symbol, ctx, input_shape,
                 if name in aux_params:
                     weight = sum(w.copyto(cpu()) for w in block) / len(block)
                     weight.copyto(aux_params[name])
-        if iter_end_callback:
-            iter_end_callback(iteration, symbol, arg_params, aux_params)
+        if iter_end_callback != None:
+            if isinstance(iter_end_callback, list):
+                for call in iter_end_callback:
+                    call(iteration, symbol, arg_params, aux_params)
+            else:
+                iter_end_callback(iteration, symbol, arg_params, aux_params)
     # end of all iterations
     return
 
@@ -387,25 +392,6 @@ def load_checkpoint(prefix, iteration):
     return (symbol, arg_params, aux_params)
 
 
-def do_checkpoint(prefix):
-    """Callback to checkpoint the model to prefix every iteration.
-
-    Parameters
-    ----------
-    prefix : str
-        The file prefix to checkpoint to
-
-    Returns
-    -------
-    callback : function
-        The callback function that can be passed as iter_end_callback to fit.
-    """
-    def _callback(iter_no, s, arg, aux):
-        """The checkpoint function."""
-        save_checkpoint(prefix, iter_no + 1, s, arg, aux)
-    return _callback
-
-
 class FeedForward(BASE_ESTIMATOR):
     """Model class of MXNet for training and predicting feedforward nets.
 
@@ -547,8 +533,7 @@ def predict(self, X):
         return np.concatenate(outputs)
 
     def fit(self, X, y=None, eval_data=None, eval_metric='acc',
-            iter_end_callback=None, learning_rate_scheduler=None,
-            epoch_end_callback=None, logger=None):
+            iter_end_callback=None, epoch_end_callback=None, logger=None):
         """Fit the model.
 
         Parameters
@@ -600,7 +585,6 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc',
                             train_data=X, eval_data=eval_data,
                             eval_metric=eval_metric,
                             iter_end_callback=iter_end_callback,
-                            learning_rate_scheduler=learning_rate_scheduler,
                             epoch_end_callback=epoch_end_callback,
                             logger=logger)
 
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index d1f0ae4ef246..5dc444e21620 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -1,9 +1,12 @@
-# pylint: disable=fixme, invalid-name, unused-argument
+# pylint: disable=fixme, invalid-name, unused-argument, too-many-arguments
 """Common Optimization algorithms with regularizations."""
 from .ndarray import NDArray, zeros
 
 class Optimizer(object):
     """Base class of all optimizers."""
+    def __init__(self):
+        self.iteration = 0
+
     def begin_round(self, iteration):
         """Function called to notify beginning of iteration.
 
@@ -12,7 +15,7 @@ def begin_round(self, iteration):
         iteration : int
             The iteration number.
         """
-        pass
+        self.iteration = iteration
 
 
 class SGD(Optimizer):
@@ -33,11 +36,15 @@ class SGD(Optimizer):
         rescaling factor of gradient.
     """
     def __init__(self, learning_rate=0.01, momentum=0.0,
-                 wd=0.0001, rescale_grad=1):
+                 wd=0.0001, rescale_grad=1, lr_scheduler=None):
+        super(SGD, self).__init__()
         self.lr = learning_rate
         self.momentum = momentum
         self.wd = wd
         self.rescale_grad = rescale_grad
+        self.lr_scheduler = lr_scheduler
+        if lr_scheduler != None:
+            self.lr_scheduler.base_lr = learning_rate
         self.momentums = {}
 
     def create_state(self, index, weight):
@@ -74,14 +81,19 @@ def update(self, index, weight, grad, state):
         # TODO(bing) implement wd_bias, wd_gamma, wd_beta
         assert(isinstance(weight, NDArray))
         assert(isinstance(grad, NDArray))
+
+        if self.lr_scheduler != None:
+            lr = self.lr_scheduler(self.iteration)
+        else:
+            lr = self.lr
         if state:
             mom = state
             mom[:] *= self.momentum
-            mom[:] += -self.lr * (grad * self.rescale_grad + self.wd * weight)
+            mom[:] += -lr * (grad * self.rescale_grad + self.wd * weight)
             weight[:] += mom
         else:
             assert self.momentum == 0.0
-            weight[:] += -self.lr * (grad * self.rescale_grad + self.wd * weight)
+            weight[:] += -lr * (grad * self.rescale_grad + self.wd * weight)
 
 
 def create(name, rescale_grad=1, **kwargs):
diff --git a/python/mxnet/scheduler.py b/python/mxnet/scheduler.py
deleted file mode 100644
index 1ba93e0a57ed..000000000000
--- a/python/mxnet/scheduler.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# pylint: disable=invalid-name, logging-not-lazy
-"""learning rate scheduler"""
-
-import math
-import logging
-class Factor(object):
-    """Reduce learning rate in factor
-
-    Parameters
-    ----------
-    base_lr: float
-        learning rate at start time
-    step: int
-        schedule learning rate after every step batches
-    factor: float
-        reduce learning rate factor
-    batch_per_round: int
-        how many batches per round, must set when continue training
-    """
-    def __init__(self, base_lr, step, factor=0.1, batch_per_round=1):
-        self.base_lr = base_lr
-        self.step = step
-        self.factor = factor
-        self.old_lr = base_lr
-        self.batch_per_round = batch_per_round
-        self.epoch = 0
-        self.init = False
-
-    def __call__(self, optimizer, nbatch, iteration):
-        """
-        Call to schedule current learning rate
-
-        Parameters
-        ----------
-        optimizer: Optimizer
-            Optimizer which contains learning rate field
-        nbatch: int
-            Current batch count
-        iteration: int
-            Current iteration count
-        """
-
-        if self.init == False:
-            self.init = True
-            self.epoch = max(self.epoch, iteration * self.batch_per_round + nbatch)
-        self.epoch += 1
-        lr = self.base_lr * math.pow(self.factor, int(self.epoch / self.step))
-        optimizer.learning_rate = lr
-        if lr != self.old_lr:
-            self.old_lr = lr
-            logging.info("At Iteration [%d], Batch [%d]: Swith to new learning rate %.5f" \
-                    % (iteration, nbatch, lr))
-
-
diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py
index 5ad44fe0350b..3287ddb3e73d 100644
--- a/tests/python/train/test_mlp.py
+++ b/tests/python/train/test_mlp.py
@@ -51,7 +51,7 @@ def test_mlp():
         X=train_dataiter,
         eval_data=val_dataiter,
         eval_metric=accuracy,
-        iter_end_callback=mx.model.do_checkpoint(prefix),
+        iter_end_callback=mx.callback.do_checkpoint(prefix),
         ctx=[mx.cpu(i) for i in range(2)],
         num_round=num_round,
         learning_rate=0.01, wd=0.0004,