diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index 1c3b75ccde70..92eba39f72b8 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -100,7 +100,7 @@ def SimpleFactory(data, ch_1x1, ch_3x3):
 
 get_data.GetCifar10()
 batch_size = 128
-num_round = 3
+num_round = 10
 num_gpus = 1
 
 train_dataiter = mx.io.ImageRecordIter(
@@ -120,22 +120,15 @@ def SimpleFactory(data, ch_1x1, ch_3x3):
         batch_size=batch_size,
         preprocess_threads=1)
 
-logging.basicConfig(level=logging.DEBUG)
-
-gpus = [mx.gpu(i) for i in range(num_gpus)]
-# Use create functional style to train a model
-model = mx.model.FeedForward.create(
-    symbol=softmax, ctx=gpus,
-    X=train_dataiter, eval_data=test_dataiter,
-    num_round=num_round,
-    learning_rate=0.05, momentum=0.9, wd=0.00001)
-
-# Alternatively, you can use sklearn-style two-step API, as follows
-"""
-model = mx.model.FeedForward(
-    symbol=softmax, ctx=gpus,
-    num_round=num_round,
-    learning_rate=0.05, momentum=0.9, wd=0.00001)
-
-model.fit(X=train_dataiter, eval_data=test_dataiter)
-"""
+def test_cifar():
+    logging.basicConfig(level=logging.DEBUG)
+    total_batch = 50000 / batch_size + 1
+    gpus = [mx.gpu(i) for i in range(num_gpus)]
+    model = mx.model.FeedForward(ctx=gpus, symbol=softmax, num_round = num_round,
+                                 learning_rate=0.05, momentum=0.9, wd=0.00001,
+                                 lr_scheduler=mx.misc.FactorScheduler(2))
+    model.fit(X=train_dataiter, eval_data=test_dataiter,
+              epoch_end_callback=mx.callback.Speedometer(batch_size))
+
+if __name__ == "__main__":
+    test_cifar()
diff --git a/example/notebooks/alexnet.ipynb b/example/notebooks/alexnet.ipynb
index c030d873cd08..e6f2ad94e296 100644
--- a/example/notebooks/alexnet.ipynb
+++ b/example/notebooks/alexnet.ipynb
@@ -401,7 +401,7 @@
     }
    ],
    "source": [
-    "mx.viz.plot_network(\"AlexNet\", softmax)"
+    "mx.viz.plot_network(softmax)"
    ]
   },
   {
diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb
new file mode 100644
index 000000000000..fccdfcb47e43
--- /dev/null
+++ b/example/notebooks/cifar-recipe.ipynb
@@ -0,0 +1,450 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# CIFAR-10 Recipe\n",
+    "In this notebook, we will show how to train a state-of-the-art CIFAR-10 network with MXNet and extract features from the network.\n",
+    "This example will cover\n",
+    "\n",
+    "- Network/Data definition \n",
+    "- Multi GPU training\n",
+    "- Model saving and loading\n",
+    "- Prediction/Extracting Feature\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import mxnet as mx\n",
+    "import logging\n",
+    "import numpy as np\n",
+    "\n",
+    "# setup logging\n",
+    "logger = logging.getLogger()\n",
+    "logger.setLevel(logging.DEBUG)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, let's define some helper functions that let us build a simplified Inception network. More details about how to compose symbols into components can be found in the [component demo](composite_symbol.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Basic Conv + BN + ReLU factory\n",
+    "def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), act_type=\"relu\"):\n",
+    "    conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad)\n",
+    "    bn = mx.symbol.BatchNorm(data=conv)\n",
+    "    act = mx.symbol.Activation(data = bn, act_type=act_type)\n",
+    "    return act"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# A Simple Downsampling Factory\n",
+    "def DownsampleFactory(data, ch_3x3):\n",
+    "    # conv 3x3\n",
+    "    conv = ConvFactory(data=data, kernel=(3, 3), stride=(2, 2), num_filter=ch_3x3, pad=(1, 1))\n",
+    "    # pool\n",
+    "    pool = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type='max')\n",
+    "    # concat\n",
+    "    concat = mx.symbol.Concat(*[conv, pool])\n",
+    "    return concat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# A Simple module\n",
+    "def SimpleFactory(data, ch_1x1, ch_3x3):\n",
+    "    # 1x1\n",
+    "    conv1x1 = ConvFactory(data=data, kernel=(1, 1), pad=(0, 0), num_filter=ch_1x1)\n",
+    "    # 3x3\n",
+    "    conv3x3 = ConvFactory(data=data, kernel=(3, 3), pad=(1, 1), num_filter=ch_3x3)\n",
+    "    #concat\n",
+    "    concat = mx.symbol.Concat(*[conv1x1, conv3x3])\n",
+    "    return concat"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can build a network with these component factories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "data = mx.symbol.Variable(name=\"data\")\n",
+    "conv1 = ConvFactory(data=data, kernel=(3,3), pad=(1,1), num_filter=96, act_type=\"relu\")\n",
+    "in3a = SimpleFactory(conv1, 32, 32)\n",
+    "in3b = SimpleFactory(in3a, 32, 48)\n",
+    "in3c = DownsampleFactory(in3b, 80)\n",
+    "in4a = SimpleFactory(in3c, 112, 48)\n",
+    "in4b = SimpleFactory(in4a, 96, 64)\n",
+    "in4c = SimpleFactory(in4b, 80, 80)\n",
+    "in4d = SimpleFactory(in4c, 48, 96)\n",
+    "in4e = DownsampleFactory(in4d, 96)\n",
+    "in5a = SimpleFactory(in4e, 176, 160)\n",
+    "in5b = SimpleFactory(in5a, 176, 160)\n",
+    "pool = mx.symbol.Pooling(data=in5b, pool_type=\"avg\", kernel=(7,7), name=\"global_avg\")\n",
+    "flatten = mx.symbol.Flatten(data=pool)\n",
+    "fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10)\n",
+    "softmax = mx.symbol.Softmax(data=fc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# If you'd like to see the network structure, run the plot_network function\n",
+    "# mx.viz.plot_network(softmax)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# We will make a model with the current symbol\n",
+    "# For demo purposes, this model is only trained for 1 round\n",
+    "# We will use the first GPU to do training\n",
+    "num_round = 1\n",
+    "model = mx.model.FeedForward(ctx=mx.gpu(), symbol=softmax, num_round=num_round,\n",
+    "                             learning_rate=0.05, momentum=0.9, wd=0.00001)\n",
+    "\n",
+    "# we can add learning rate scheduler to the model\n",
+    "# model = mx.model.FeedForward(ctx=mx.gpu(), symbol=softmax, num_round=num_round,\n",
+    "#                              learning_rate=0.05, momentum=0.9, wd=0.00001,\n",
+    "#                              lr_scheduler=mx.misc.FactorScheduler(2))\n",
+    "# In this example, the learning rate will be reduced to 0.1 * the previous learning rate every two rounds"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we have multiple GPUs, for example 4 GPUs, we can utilize them without any difficulty"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# num_devs = 4\n",
+    "# model = mx.model.FeedForward(ctx=[mx.gpu(i) for i in range(num_devs)], symbol=softmax, num_round = 1,\n",
+    "#                              learning_rate=0.05, momentum=0.9, wd=0.00001)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next step is declaring the data iterators. The original CIFAR-10 data is 3x32x32 in binary format; we provide it in RecordIO format, so we can use the Image RecordIO iterator. For more information about the Image RecordIO iterator, check the [documentation](https://mxnet.readthedocs.org/en/latest/python/io.html)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Use utility function in test to download the data\n",
+    "import sys\n",
+    "sys.path.append(\"../../tests/python/common\")\n",
+    "import get_data\n",
+    "get_data.GetCifar10()\n",
+    "# After we get the data, we can declare our data iterator\n",
+    "# The iterator will automatically create mean image file if it doesn't exist\n",
+    "batch_size = 128\n",
+    "total_batch = 50000 / 128 + 1\n",
+    "# The train iterator makes batches of 128 images and randomly crops each image to 3x28x28 from the original 3x32x32\n",
+    "train_dataiter = mx.io.ImageRecordIter(\n",
+    "        shuffle=True,\n",
+    "        path_imgrec=\"data/cifar/train.rec\",\n",
+    "        mean_img=\"data/cifar/cifar_mean.bin\",\n",
+    "        rand_crop=True,\n",
+    "        rand_mirror=True,\n",
+    "        data_shape=(3,28,28),\n",
+    "        batch_size=batch_size,\n",
+    "        preprocess_threads=1)\n",
+    "# The test iterator makes batches of 128 images and center-crops each image to 3x28x28 from the original 3x32x32\n",
+    "# Note: we don't need round_batch for testing because we only pass over the test data once at a time\n",
+    "test_dataiter = mx.io.ImageRecordIter(\n",
+    "        path_imgrec=\"data/cifar/test.rec\",\n",
+    "        mean_img=\"data/cifar/cifar_mean.bin\",\n",
+    "        rand_crop=False,\n",
+    "        rand_mirror=False,\n",
+    "        data_shape=(3,28,28),\n",
+    "        batch_size=batch_size,\n",
+    "        round_batch=False,\n",
+    "        preprocess_threads=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "Now we can fit the model with data. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:Start training with [gpu(0)]\n",
+      "INFO:root:Batch [50]\tSpeed: 1091.84 samples/sec\n",
+      "INFO:root:Batch [100]\tSpeed: 1084.80 samples/sec\n",
+      "INFO:root:Batch [150]\tSpeed: 1084.55 samples/sec\n",
+      "INFO:root:Batch [200]\tSpeed: 1077.30 samples/sec\n",
+      "INFO:root:Batch [250]\tSpeed: 1074.73 samples/sec\n",
+      "INFO:root:Batch [300]\tSpeed: 1075.67 samples/sec\n",
+      "INFO:root:Batch [350]\tSpeed: 1067.09 samples/sec\n",
+      "INFO:root:Iteration[0] Train-accuracy=0.525695\n",
+      "INFO:root:Iteration[0] Time cost=47.012\n",
+      "INFO:root:Iteration[0] Validation-accuracy=0.660008\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.fit(X=train_dataiter,\n",
+    "          eval_data=test_dataiter,\n",
+    "          eval_metric=\"accuracy\",\n",
+    "          epoch_end_callback=mx.callback.Speedometer(batch_size))\n",
+    "\n",
+    "# if we want to save the model after every round, we can add a checkpoint callback\n",
+    "# model_prefix = './cifar_'\n",
+    "# model.fit(X=train_dataiter,\n",
+    "#           eval_data=test_dataiter,\n",
+    "#           eval_metric=\"accuracy\",\n",
+    "#           epoch_end_callback=mx.callback.Speedometer(batch_size),\n",
+    "#           iter_end_callback=mx.callback.do_checkpoint(model_prefix))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After only 1 epoch, our model is able to achieve about 66% accuracy on the test set.\n",
+    "We can save our model by calling either ```save``` or using ```pickle```.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:Saved checkpoint to \"cifar-0001.params\"\n"
+     ]
+    }
+   ],
+   "source": [
+    "# using pickle\n",
+    "import pickle\n",
+    "smodel = pickle.dumps(model)\n",
+    "# using saving (recommended)\n",
+    "# We get the benefit being able to directly load/save from cloud storage(S3, HDFS)\n",
+    "prefix = \"cifar\"\n",
+    "model.save(prefix)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To load saved model, you can use ```pickle``` if the model is generated by ```pickle```, or use ```load``` if it is generated by ```save```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# use pickle\n",
+    "model2 = pickle.loads(smodel)\n",
+    "# using load method (able to load from S3/HDFS directly)\n",
+    "model3 = mx.model.FeedForward.load(prefix, num_round, ctx=mx.gpu())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can use the model to do prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:Finish predict...\n",
+      "INFO:root:final accuracy = 0.651000\n"
+     ]
+    }
+   ],
+   "source": [
+    "prob = model3.predict(test_dataiter)\n",
+    "logging.info('Finish predict...')\n",
+    "# Check the accuracy from prediction\n",
+    "test_dataiter.reset()\n",
+    "# get label\n",
+    "# Because the iterator pads each batch to the same shape, we want to remove the padded samples here\n",
+    "\n",
+    "y_batch = []\n",
+    "for _, label in test_dataiter:\n",
+    "    label = label.asnumpy()\n",
+    "    pad = test_dataiter.getpad()\n",
+    "    real_size = label.shape[0] - pad\n",
+    "    y_batch.append(label[0:real_size])\n",
+    "y = np.concatenate(y_batch)\n",
+    "\n",
+    "# get the predicted label from the class probabilities\n",
+    "py = np.argmax(prob, axis=1)\n",
+    "acc1 = float(np.sum(py == y)) / len(y)\n",
+    "logging.info('final accuracy = %f', acc1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "From any symbol, we are able to know its internal feature_maps and bind a new model to extract that feature map"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "Symbol only support integer index to fetch i-th output",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-17-0e3d13f4a151>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0minternals\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msoftmax\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_internals\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mfea_symbol\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minternals\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"global_avg_output\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, \n",
+      "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/symbol.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, index)\u001b[0m\n\u001b[0;32m    156\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    157\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 158\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Symbol only support integer index to fetch i-th output'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    159\u001b[0m         \u001b[0mhandle\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSymbolHandle\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    160\u001b[0m         check_call(_LIB.MXSymbolGetOutput(\n",
+      "\u001b[1;31mTypeError\u001b[0m: Symbol only support integer index to fetch i-th output"
+     ]
+    }
+   ],
+   "source": [
+    "# predict internal featuremaps\n",
+    "internals = softmax.get_internals()\n",
+    "\n",
+    "fea_symbol = internals[\"global_avg_output\"]\n",
+    "\n",
+    "feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, \n",
+    "                                         arg_params=model.arg_params, aux_params=model.aux_params,\n",
+    "                                         allow_extra_params=True)\n",
+    "global_pooling_feature = feature_extractor.predict(test_dataiter)\n",
+    "print(global_pooling_feature.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/example/notebooks/composite_symbol.ipynb b/example/notebooks/composite_symbol.ipynb
index dc97fa22e5dc..b43b796ccf9b 100644
--- a/example/notebooks/composite_symbol.ipynb
+++ b/example/notebooks/composite_symbol.ipynb
@@ -71,11 +71,11 @@
        " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
        "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
        " -->\n",
-       "<!-- Title: conv Pages: 1 -->\n",
+       "<!-- Title: plot Pages: 1 -->\n",
        "<svg width=\"102pt\" height=\"348pt\"\n",
        " viewBox=\"0.00 0.00 102.00 348.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
        "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 344)\">\n",
-       "<title>conv</title>\n",
+       "<title>plot</title>\n",
        "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-344 98,-344 98,4 -4,4\"/>\n",
        "<!-- null_0 -->\n",
        "<g id=\"node1\" class=\"node\"><title>null_0</title>\n",
@@ -118,7 +118,7 @@
        "</svg>\n"
       ],
       "text/plain": [
-       "<graphviz.dot.Digraph at 0x7fe1ba8a4978>"
+       "<graphviz.dot.Digraph at 0x7eff39b005f8>"
       ]
      },
      "execution_count": 3,
@@ -129,7 +129,7 @@
    "source": [
     "prev = mx.symbol.Variable(name=\"Previos Output\")\n",
     "conv_comp = ConvFactory(data=prev, num_filter=64, kernel=(7,7), stride=(2, 2))\n",
-    "mx.visualization.plot_network(title=\"conv\", symbol=conv_comp)"
+    "mx.visualization.plot_network(symbol=conv_comp)"
    ]
   },
   {
@@ -187,11 +187,11 @@
        " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
        "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
        " -->\n",
-       "<!-- Title: in3a Pages: 1 -->\n",
+       "<!-- Title: plot Pages: 1 -->\n",
        "<svg width=\"438pt\" height=\"724pt\"\n",
        " viewBox=\"0.00 0.00 438.00 724.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
        "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 720)\">\n",
-       "<title>in3a</title>\n",
+       "<title>plot</title>\n",
        "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-720 434,-720 434,4 -4,4\"/>\n",
        "<!-- null_0 -->\n",
        "<g id=\"node1\" class=\"node\"><title>null_0</title>\n",
@@ -430,7 +430,7 @@
        "</svg>\n"
       ],
       "text/plain": [
-       "<graphviz.dot.Digraph at 0x7fe1ba8a4e48>"
+       "<graphviz.dot.Digraph at 0x7eff39b1a160>"
       ]
      },
      "execution_count": 5,
@@ -441,7 +441,7 @@
    "source": [
     "prev = mx.symbol.Variable(name=\"Previos Output\")\n",
     "in3a = InceptionFactoryA(prev, 64, 64, 64, 64, 96, \"avg\", 32)\n",
-    "mx.visualization.plot_network(title=\"in3a\", symbol=in3a)"
+    "mx.visualization.plot_network(symbol=in3a)"
    ]
   },
   {
@@ -681,7 +681,7 @@
    "source": [
     "prev = mx.symbol.Variable(name=\"Previos Output\")\n",
     "in3c = InceptionFactoryB(prev, 128, 160, 64, 96)\n",
-    "mx.visualization.plot_network(title=\"in3c\", symbol=in3c)"
+    "mx.visualization.plot_network(symbol=in3c)"
    ]
   },
   {
diff --git a/example/python-howto/data_iter.py b/example/python-howto/data_iter.py
index d1cebc0a470d..ea541b6985ef 100644
--- a/example/python-howto/data_iter.py
+++ b/example/python-howto/data_iter.py
@@ -42,7 +42,11 @@
         # Backend Parameter
         # Optional
         # Prefetch buffer size
-        prefetch_buffer=4)
+        prefetch_buffer=4,
+        # Backend Parameter
+        # Optional
+        # Whether to round the batch
+        round_batch=True)
 
 batchidx = 0
 for data, label in dataiter:
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 39679d5a3f46..5f8e32cc3f7b 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -683,6 +683,16 @@ MXNET_DLL int MXDataIterBeforeFirst(DataIterHandle handle);
  */
 MXNET_DLL int MXDataIterGetData(DataIterHandle handle,
                                 NDArrayHandle *out);
+
+/*!
+ * \brief Get the number of padded examples in the current data batch
+ * \param handle the handle pointer to the data iterator
+ * \param pad pointer used to return the number of padded examples
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXDataIterGetPadNum(DataIterHandle handle,
+                                  int *pad);
+
 /*!
  * \brief Get the handle to the NDArray of underlying label
  * \param handle the handle pointer to the data iterator
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 1c9a6bc8d61a..8f65cac2214e 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -62,6 +62,8 @@ struct DataBatch {
   std::vector<NDArray> data;
   /*! \brief extra data to be fed to the network */
   std::string extra_data;
+  /*! \brief number of examples padded to the batch */
+  int num_batch_padd;
 };  // struct DataBatch
 
 /*! \brief typedef the factory function of data iterator */
diff --git a/mshadow b/mshadow
index bf678e6ac05d..7a3ccdee3018 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit bf678e6ac05d5115f92db0b668e4424401f31b14
+Subproject commit 7a3ccdee30189d9a01d2e6c823c4b76b4c92f558
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index e9630b678ee0..7bca6efbb46d 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -25,5 +25,7 @@
 from . import visualization
 # use viz as short for mx.ndarray
 from . import visualization as viz
+from . import callback
+from . import misc
 
 __version__ = "0.1.0"
diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py
new file mode 100644
index 000000000000..dca238a37709
--- /dev/null
+++ b/python/mxnet/callback.py
@@ -0,0 +1,91 @@
+# pylint: disable=logging-not-lazy, blacklisted-name, invalid-name
+"""model helper for knowing training status"""
+import sys
+import math
+import logging
+import time
+from .model import save_checkpoint
+
+def do_checkpoint(prefix):
+    """Callback to checkpoint the model to prefix every iteration.
+
+    Parameters
+    ----------
+    prefix : str
+        The file prefix to checkpoint to
+
+    Returns
+    -------
+    callback : function
+        The callback function that can be passed as iter_end_callback to fit.
+    """
+    def _callback(iter_no, s, arg, aux):
+        """The checkpoint function."""
+        save_checkpoint(prefix, iter_no + 1, s, arg, aux)
+    return _callback
+
+class Speedometer(object):
+    """Calculate training speed in frequent
+
+    Parameters
+    ----------
+    batch_size: int
+        batch_size of data
+    frequent: int
+        calcutaion frequent
+    """
+    def __init__(self, batch_size, frequent=50):
+        self.batch_size = batch_size
+        self.frequent = frequent
+        self.init = False
+        self.tic = 0
+
+    def __call__(self, count):
+        """
+        Show speed
+
+        Parameters
+        ----------
+        count: int
+            current batch count
+        """
+
+        if self.init:
+            if count % self.frequent == 0:
+                speed = self.frequent * self.batch_size / (time.time() - self.tic)
+                logging.info("Batch [%d]\tSpeed: %.2f samples/sec" % (count, speed))
+                self.tic = time.time()
+        else:
+            self.init = True
+            self.tic = time.time()
+
+class ProgressBar(object):
+    """Show a progress bar
+
+    Parameters
+    ----------
+    total: int
+        total batch size
+    length: int
+        length or progress bar
+    """
+    def __init__(self, total, length=80):
+        self.bar_len = length
+        self.total = total
+
+    def __call__(self, count):
+        """
+        Update progress bar
+
+        Parameters
+        ----------
+        count: int
+            current batch count
+        """
+
+        filled_len = int(round(self.bar_len * count / float(self.total)))
+        percents = math.ceil(100.0 * count / float(self.total))
+        bar = '=' * filled_len + '-' * (self.bar_len - filled_len)
+        sys.stdout.write('[%s] %s%s\r' % (bar, percents, '%'))
+
+
diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index e4e6905aba3a..5ac381d99e38 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -82,6 +82,14 @@ def getlabel(self):
         check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl)))
         return NDArray(hdl, False)
 
+    def getpad(self):
+        """get padded sample num in the batch
+
+        """
+        pad = ctypes.c_int(0)
+        check_call(_LIB.MXDataIterGetPadNum(self.handle, ctypes.byref(pad)))
+        return pad.value
+
 def _make_io_iterator(handle):
     """Create an io iterator by handle."""
     name = ctypes.c_char_p()
diff --git a/python/mxnet/misc.py b/python/mxnet/misc.py
new file mode 100644
index 000000000000..43da2e1fc350
--- /dev/null
+++ b/python/mxnet/misc.py
@@ -0,0 +1,58 @@
+# pylint: disable=invalid-name, logging-not-lazy, arguments-differ
+"""learning rate scheduler"""
+
+import math
+import logging
+
+class LearningRateScheduler(object):
+    """Base class of learning rate scheduler"""
+    def __init__(self):
+        self.base_lr = 0.01
+
+    def __call__(self):
+        """lr calculation function"""
+        raise NotImplementedError("must override this")
+
+
+class FactorScheduler(LearningRateScheduler):
+    """Reduce learning rate in factor
+
+    Parameters
+    ----------
+    step: int
+        schedule learning rate after every round
+    factor: float
+        reduce learning rate factor
+    """
+    def __init__(self, step, factor=0.1):
+        super(FactorScheduler, self).__init__()
+        if step < 1:
+            raise ValueError("Schedule step must be greater or equal than 1 round")
+        if factor >= 1.0:
+            raise ValueError("Factor must be less than 1 to make lr reduce")
+        self.step = step
+        self.factor = factor
+        self.old_lr = self.base_lr
+        self.init = False
+
+    def __call__(self, iteration):
+        """
+        Call to schedule current learning rate
+
+        Parameters
+        ----------
+        iteration: int
+            Current iteration count
+        """
+
+        if self.init == False:
+            self.init = True
+            self.old_lr = self.base_lr
+        lr = self.base_lr * math.pow(self.factor, int(iteration / self.step))
+        if lr != self.old_lr:
+            self.old_lr = lr
+            logging.info("At Iteration [%d]: Swith to new learning rate %.5f" \
+                    % (iteration, lr))
+        return lr
+
+
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index df450be4cb86..d5672644cab8 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -121,7 +121,8 @@ def _train_multi_device(symbol, ctx, input_shape,
                         arg_params, aux_params,
                         begin_round, end_round, optimizer,
                         train_data, eval_data=None, eval_metric=None,
-                        iter_end_callback=None, logger=None):
+                        iter_end_callback=None, epoch_end_callback=None,
+                        logger=None):
     """Internal training function on multiple devices.
 
     This function will also work for single device as well.
@@ -165,6 +166,12 @@ def _train_multi_device(symbol, ctx, input_shape,
         A callback that is invoked at end of each iteration.
         This can be used to checkpoint model each iteration.
 
+    learning_rate_scheduler: Scheduler
+        A Scheduler to adjust learning rate
+
+    epoch_end_callback: callable(nbatch) or list of callable
+        Invoked after each training batch with the number of batches processed so far in the round
+
     logger : logging logger
         When not specified, default logger will be used.
 
@@ -230,6 +237,7 @@ def _train_multi_device(symbol, ctx, input_shape,
         train_data.reset()
         optimizer.begin_round(iteration)
         eval_metric.reset()
+        nbatch = 0
         # Iterate over training data.
         for data, label in train_data:
             # Copy data into the target
@@ -258,6 +266,14 @@ def _train_multi_device(symbol, ctx, input_shape,
                 # optimizea
                 for w, g, state in zip(arg_list, grad_list, opt_list):
                     optimizer.update(index, w, g, state)
+            nbatch += 1
+            # epoch callback (for print purpose)
+            if epoch_end_callback != None:
+                if isinstance(epoch_end_callback, list):
+                    for call in epoch_end_callback:
+                        call(nbatch)
+                else:
+                    epoch_end_callback(nbatch)
             # evaluate at end, so out_cpu_array can lazy copy
             eval_metric.update(out_cpu_array, label)
 
@@ -293,8 +309,12 @@ def _train_multi_device(symbol, ctx, input_shape,
                 if name in aux_params:
                     weight = sum(w.copyto(cpu()) for w in block) / len(block)
                     weight.copyto(aux_params[name])
-        if iter_end_callback:
-            iter_end_callback(iteration, symbol, arg_params, aux_params)
+        if iter_end_callback != None:
+            if isinstance(iter_end_callback, list):
+                for call in iter_end_callback:
+                    call(iteration, symbol, arg_params, aux_params)
+            else:
+                iter_end_callback(iteration, symbol, arg_params, aux_params)
     # end of all iterations
     return
 
@@ -372,25 +392,6 @@ def load_checkpoint(prefix, iteration):
     return (symbol, arg_params, aux_params)
 
 
-def do_checkpoint(prefix):
-    """Callback to checkpoint the model to prefix every iteration.
-
-    Parameters
-    ----------
-    prefix : str
-        The file prefix to checkpoint to
-
-    Returns
-    -------
-    callback : function
-        The callback function that can be passed as iter_end_callback to fit.
-    """
-    def _callback(iter_no, s, arg, aux):
-        """The checkpoint function."""
-        save_checkpoint(prefix, iter_no + 1, s, arg, aux)
-    return _callback
-
-
 class FeedForward(BASE_ESTIMATOR):
     """Model class of MXNet for training and predicting feedforward nets.
 
@@ -524,11 +525,15 @@ def predict(self, X):
         for data, _ in X:
             data.copyto(self._pred_exec_input)
             self._pred_exec.forward(is_train=False)
-            outputs.append(self._pred_exec.outputs[0].asnumpy())
+            out_batch = self._pred_exec.outputs[0].asnumpy()
+            padded = X.getpad()
+            real_size = out_batch.shape[0] - padded
+            out_batch = out_batch[0:real_size, :]
+            outputs.append(out_batch)
         return np.concatenate(outputs)
 
     def fit(self, X, y=None, eval_data=None, eval_metric='acc',
-            iter_end_callback=None, logger=None):
+            iter_end_callback=None, epoch_end_callback=None, logger=None):
         """Fit the model.
 
         Parameters
@@ -551,6 +556,13 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc',
             A callback that is invoked at end of each iteration.
             This can be used to checkpoint model each iteration.
 
+        learning_rate_scheduler: Scheduler
+            A Scheduler to adjust learning rate
+
+        epoch_end_callback : callable(nbatch) or list of callable
+            Called with the running batch count after every batch of a round;
+            intended for progress printing.
+
         logger : logging logger, optional
             When not specified, default logger will be used.
         """
@@ -573,6 +585,7 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc',
                             train_data=X, eval_data=eval_data,
                             eval_metric=eval_metric,
                             iter_end_callback=iter_end_callback,
+                            epoch_end_callback=epoch_end_callback,
                             logger=logger)
 
     def save(self, prefix, iteration=None):
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index d1f0ae4ef246..5dc444e21620 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -1,9 +1,12 @@
-# pylint: disable=fixme, invalid-name, unused-argument
+# pylint: disable=fixme, invalid-name, unused-argument, too-many-arguments
 """Common Optimization algorithms with regularizations."""
 from .ndarray import NDArray, zeros
 
 class Optimizer(object):
     """Base class of all optimizers."""
+    def __init__(self):
+        self.iteration = 0
+
     def begin_round(self, iteration):
         """Function called to notify beginning of iteration.
 
@@ -12,7 +15,7 @@ def begin_round(self, iteration):
         iteration : int
             The iteration number.
         """
-        pass
+        self.iteration = iteration
 
 
 class SGD(Optimizer):
@@ -33,11 +36,15 @@ class SGD(Optimizer):
         rescaling factor of gradient.
     """
     def __init__(self, learning_rate=0.01, momentum=0.0,
-                 wd=0.0001, rescale_grad=1):
+                 wd=0.0001, rescale_grad=1, lr_scheduler=None):
+        super(SGD, self).__init__()
         self.lr = learning_rate
         self.momentum = momentum
         self.wd = wd
         self.rescale_grad = rescale_grad
+        self.lr_scheduler = lr_scheduler
+        if lr_scheduler is not None:  # identity test: None is a singleton
+            self.lr_scheduler.base_lr = learning_rate
         self.momentums = {}
 
     def create_state(self, index, weight):
@@ -74,14 +81,19 @@ def update(self, index, weight, grad, state):
         # TODO(bing) implement wd_bias, wd_gamma, wd_beta
         assert(isinstance(weight, NDArray))
         assert(isinstance(grad, NDArray))
+
+        if self.lr_scheduler != None:
+            lr = self.lr_scheduler(self.iteration)
+        else:
+            lr = self.lr
         if state:
             mom = state
             mom[:] *= self.momentum
-            mom[:] += -self.lr * (grad * self.rescale_grad + self.wd * weight)
+            mom[:] += -lr * (grad * self.rescale_grad + self.wd * weight)
             weight[:] += mom
         else:
             assert self.momentum == 0.0
-            weight[:] += -self.lr * (grad * self.rescale_grad + self.wd * weight)
+            weight[:] += -lr * (grad * self.rescale_grad + self.wd * weight)
 
 
 def create(name, rescale_grad=1, **kwargs):
diff --git a/python/mxnet/visualization.py b/python/mxnet/visualization.py
index 3992a241b69f..3ed08be1d2df 100644
--- a/python/mxnet/visualization.py
+++ b/python/mxnet/visualization.py
@@ -25,7 +25,7 @@ def _str2tuple(string):
     return re.findall(r"\d+", string)
 
 
-def plot_network(title, symbol, shape=None):
+def plot_network(symbol, title="plot", shape=None):
     """convert symbol to dot object for visualization
 
     Parameters
@@ -137,3 +137,5 @@ def plot_network(title, symbol, shape=None):
                     dot.edge(tail_name=name, head_name=input_name, **attr)
 
     return dot
+
+
diff --git a/src/c_api.cc b/src/c_api.cc
index 5df4d266eee8..5154efb8b78e 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -927,6 +927,13 @@ int MXDataIterGetData(DataIterHandle handle, NDArrayHandle *out) {
   API_END();
 }
 
+int MXDataIterGetPadNum(DataIterHandle handle, int *pad) {
+  API_BEGIN();
+  const DataBatch& db = static_cast<IIterator<DataBatch>* >(handle)->Value();
+  *pad = db.num_batch_padd;
+  API_END();
+}
+
 int MXKVStoreCreate(const char *type,
                     KVStoreHandle *out) {
   API_BEGIN();
diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h
index 2a082c57f4ff..fdef92880d72 100644
--- a/src/io/iter_batchloader.h
+++ b/src/io/iter_batchloader.h
@@ -105,7 +105,7 @@ class BatchLoader : public IIterator<TBlobBatch> {
       mshadow::Copy(out_.data[0].get<mshadow::cpu, 4, float>()[top],
               d.data[0].get<mshadow::cpu, 3, float>());
       if (++ top >= param_.batch_size) {
-          return true;
+        return true;
       }
     }
     if (top != 0) {
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index 2449d4a38bc5..b3bbdb40c07e 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -66,6 +66,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
         if (*dptr == nullptr) {
           // allocate databatch
           *dptr = new DataBatch();
+          (*dptr)->num_batch_padd = batch.num_batch_padd;
           (*dptr)->data.resize(batch.data.size());
           for (size_t i = 0; i < batch.data.size(); ++i) {
             (*dptr)->data.at(i) = NDArray(batch.data[i].shape_, Context::CPU());
@@ -77,6 +78,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
           CHECK_EQ((*dptr)->data.at(i).shape(), batch.data[i].shape_);
           mshadow::Copy(((*dptr)->data)[i].data().FlatTo2D<cpu, real_t>(),
                         batch.data[i].FlatTo2D<cpu, real_t>());
+          (*dptr)->num_batch_padd = batch.num_batch_padd;
         }
         return true;
       },
diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py
index 5ad44fe0350b..3287ddb3e73d 100644
--- a/tests/python/train/test_mlp.py
+++ b/tests/python/train/test_mlp.py
@@ -51,7 +51,7 @@ def test_mlp():
         X=train_dataiter,
         eval_data=val_dataiter,
         eval_metric=accuracy,
-        iter_end_callback=mx.model.do_checkpoint(prefix),
+        iter_end_callback=mx.callback.do_checkpoint(prefix),
         ctx=[mx.cpu(i) for i in range(2)],
         num_round=num_round,
         learning_rate=0.01, wd=0.0004,