diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 2934467eaabf..ab141f4abdc6 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -32,6 +32,8 @@ export(mx.io.arrayiter)
 export(mx.io.extract)
 export(mx.kv.create)
 export(mx.metric.accuracy)
+export(mx.metric.custom)
+export(mx.metric.rmse)
 export(mx.model.FeedForward.create)
 export(mx.model.load)
 export(mx.model.save)
diff --git a/R-package/R/io.R b/R-package/R/io.R
index 938c501f689a..5fe51c0eb70e 100644
--- a/R-package/R/io.R
+++ b/R-package/R/io.R
@@ -42,7 +42,7 @@ mx.io.arrayiter <- function(data, label,
   if (shuffle) {
     unif.rnds <- as.array(mx.runif(c(length(label)), ctx=mx.cpu()));
   } else {
-    unif.rnds <- mx.array(0)
+    unif.rnds <- as.array(0)
   }
   mx.io.internal.arrayiter(as.array(data), as.array(label),
diff --git a/R-package/R/metric.R b/R-package/R/metric.R
index 68f574e0d42e..97cc7314977d 100644
--- a/R-package/R/metric.R
+++ b/R-package/R/metric.R
@@ -1,10 +1,12 @@
-# create a customized metric based on feval(label, pred)
+#' Helper function to create a customized metric
+#'
+#' @export
 mx.metric.custom <-function(name, feval) {
   init <- function() {
     c(0, 0)
   }
   update <- function(label, pred, state) {
-    m <- feval(label, pred)
+    m <- feval(as.array(label), as.array(pred))
     state <- c(state[[1]] + 1, state[[2]] + m)
     return(state)
   }
@@ -20,6 +22,14 @@ mx.metric.custom <-function(name, feval) {
 #'
 #' @export
 mx.metric.accuracy <- mx.metric.custom("accuracy", function(label, pred) {
-  ypred = max.col(as.array(pred), tie="first")
-  return(sum((as.array(label) + 1) == ypred) / length(label))
+  ypred = max.col(pred, tie="first")
+  return(sum((label + 1) == ypred) / length(label))
+})
+
+#' RMSE metric
+#'
+#' @export
+mx.metric.rmse <- mx.metric.custom("rmse", function(label, pred) {
+  res <- sqrt(mean((label-pred)^2))
+  return(res)
 })
diff --git a/R-package/R/model.R b/R-package/R/model.R
index af9473970934..d0d0d057b0ec 100644
--- a/R-package/R/model.R
+++ b/R-package/R/model.R
@@ -295,7 +295,7 @@ mx.model.FeedForward.create <- function(symbol, X, y=NULL, ctx=NULL,
                                         num.round=10, optimizer="sgd",
                                         initializer=mx.init.uniform(0.01),
-                                        eval.data=NULL, eval.metric=mx.metric.accuracy,
+                                        eval.data=NULL, eval.metric=NULL,
                                         iter.end.callback=NULL, epoch.end.callback=NULL,
                                         array.batch.size=128, kvstore="local",
diff --git a/R-package/man/mx.metric.custom.Rd b/R-package/man/mx.metric.custom.Rd
new file mode 100644
index 000000000000..5671c931ca2a
--- /dev/null
+++ b/R-package/man/mx.metric.custom.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/metric.R
+\name{mx.metric.custom}
+\alias{mx.metric.custom}
+\title{Helper function to create a customized metric}
+\usage{
+mx.metric.custom(name, feval)
+}
+\description{
+Helper function to create a customized metric
+}
+
diff --git a/R-package/man/mx.metric.rmse.Rd b/R-package/man/mx.metric.rmse.Rd
new file mode 100644
index 000000000000..f6f4cc2d1d87
--- /dev/null
+++ b/R-package/man/mx.metric.rmse.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/metric.R
+\docType{data}
+\name{mx.metric.rmse}
+\alias{mx.metric.rmse}
+\title{RMSE metric}
+\format{\preformatted{List of 3
+ $ init  :function ()
+ $ update:function (label, pred, state)
+ $ get   :function (state)
+ - attr(*, "class")= chr "mx.metric"
+}}
+\usage{
+mx.metric.rmse
+}
+\description{
+RMSE metric
+}
+\keyword{datasets}
+
diff --git a/R-package/man/mx.model.FeedForward.create.Rd b/R-package/man/mx.model.FeedForward.create.Rd
index b5288d878db1..e8b871720a92 100644
--- a/R-package/man/mx.model.FeedForward.create.Rd
+++ b/R-package/man/mx.model.FeedForward.create.Rd
@@ -6,9 +6,9 @@
 \usage{
 mx.model.FeedForward.create(symbol, X, y = NULL, ctx = NULL, num.round = 10,
   optimizer = "sgd", initializer = mx.init.uniform(0.01),
-  eval.data = NULL, eval.metric = mx.metric.accuracy,
-  iter.end.callback = NULL, epoch.end.callback = NULL,
-  array.batch.size = 128, kvstore = "local", ...)
+  eval.data = NULL, eval.metric = NULL, iter.end.callback = NULL,
+  epoch.end.callback = NULL, array.batch.size = 128, kvstore = "local",
+  ...)
 }
 \arguments{
 \item{symbol}{The symbolic configuration of the neural network.}
diff --git a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
new file mode 100644
index 000000000000..287a967813b9
--- /dev/null
+++ b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
@@ -0,0 +1,150 @@
+Neural Network with MXNet in Five Minutes
+=============================================
+
+This is the first tutorial for new users of the R package `mxnet`. In five minutes you will learn how to construct a neural network and use it for both classification and regression tasks. The data we use comes from the package `mlbench`.
+
+## Classification
+
+First of all, let us load the data and preprocess it:
+
+```{r}
+require(mlbench)
+require(mxnet)
+
+data(Sonar, package="mlbench")
+
+Sonar[,61] = as.numeric(Sonar[,61])-1
+train.ind = c(1:50, 100:150)
+train.x = data.matrix(Sonar[train.ind, 1:60])
+train.y = Sonar[train.ind, 61]
+test.x = data.matrix(Sonar[-train.ind, 1:60])
+test.y = Sonar[-train.ind, 61]
+```
+
+The next step is to define the structure of the neural network.
+
+```{r}
+# Define the input data
+data <- mx.symbol.Variable("data")
+# A fully connected hidden layer
+# data: input source
+# name: fc1
+# num_hidden: number of neurons in this hidden layer
+fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=20)
+
+# An activation function
+# fc1: input source
+# name: tanh1
+# act_type: type of the activation function
+act1 <- mx.symbol.Activation(fc1, name="tanh1", act_type="tanh")
+fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=2)
+
+# Softmax function for the output layer
+softmax <- mx.symbol.Softmax(fc2, name="sm")
+```
+
+The comments in the code explain the meaning of each function and its arguments; they can easily be modified to suit your needs.
+
+Before we start to train the model, we can specify where to run our program:
+
+```{r}
+device.cpu = mx.cpu()
+```
+
+Here we choose to run it on the CPU.
+
+After the network configuration, we can start the training process:
+
+```{r}
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
+                                     ctx=device.cpu, num.round=20, array.batch.size=15,
+                                     learning.rate=0.07, momentum=0.9, eval.metric=mx.metric.accuracy,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make predictions and evaluate them:
+
+```{r}
+preds = predict(model, test.x)
+pred.label = max.col(preds)-1
+table(pred.label, test.y)
+```
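An illustrative aside for readers of this new vignette (a sketch, not part of the patch): the confusion table above reduces to a single accuracy number with one line of base R. Everything used here is defined in the chunks above.

```r
# Overall accuracy on the held-out Sonar samples: pred.label and test.y
# are both 0/1 vectors at this point, so a mean of matches suffices.
mean(pred.label == test.y)
# Equivalently, the diagonal of the confusion table over its total:
sum(diag(table(pred.label, test.y))) / length(test.y)
```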
+## Regression
+
+Again, let us preprocess the data first.
+
+```{r}
+data(BostonHousing, package="mlbench")
+
+train.ind = seq(1, 506, 3)
+train.x = data.matrix(BostonHousing[train.ind, -14])
+train.y = BostonHousing[train.ind, 14]
+test.x = data.matrix(BostonHousing[-train.ind, -14])
+test.y = BostonHousing[-train.ind, 14]
+```
+
+We can configure a network similar to the one above. The only difference is in the output activation:
+
+```{r}
+# Define the input data
+data <- mx.symbol.Variable("data")
+# A fully connected hidden layer
+# data: input source
+# name: fc1
+# num_hidden: number of neurons in this hidden layer
+fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=20)
+
+# An activation function
+# fc1: input source
+# name: tanh1
+# act_type: type of the activation function
+act1 <- mx.symbol.Activation(fc1, name="tanh1", act_type="tanh")
+fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=1)
+
+# Use linear regression for the output layer
+lro <- mx.symbol.LinearRegressionOutput(fc2, name="lro")
+```
+
+The main change is the last function: it makes the new network optimize for squared loss. We can now train on this simple data set.
+
+```{r}
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
+                                     ctx=device.cpu, num.round=5, array.batch.size=10,
+                                     learning.rate=0.1, momentum=0.9, eval.metric=mx.metric.rmse,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+It is also easy to make predictions and evaluate them:
+
+```{r}
+preds = predict(model, test.x)
+sqrt(mean((preds-test.y)^2))
+```
+
+Currently we have two pre-defined metrics, "accuracy" and "rmse". One might wonder how to customize the evaluation metric. `mxnet` provides the interface for users to define their own metrics of interest:
+
+```{r}
+demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
+  res <- mean(abs(label-pred))
+  return(res)
+})
+```
+
+This is an example of mean absolute error. We can simply plug it into the training function:
+
+```{r}
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
+                                     ctx=device.cpu, num.round=5, array.batch.size=10,
+                                     learning.rate=0.1, momentum=0.9, eval.metric=demo.metric.mae,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+Congratulations! You have now learned the basics of using `mxnet`.
+
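To make the `metric.R` hunk earlier in this diff easier to follow, here is a reconstruction of the accumulator that `mx.metric.custom` builds around `feval`. This is a sketch, not the literal source: `init` and `update` follow the hunk, while the shape of `get` is inferred from the `\format{}` block of `mx.metric.rmse.Rd` above and should be treated as an assumption.

```r
# Sketch of what mx.metric.custom returns: three closures sharing
# `name` and `feval`, carrying a running (count, sum) state.
make.metric.sketch <- function(name, feval) {
  init <- function() {
    c(0, 0)                                      # c(batch count, running sum)
  }
  update <- function(label, pred, state) {
    m <- feval(as.array(label), as.array(pred))  # per-batch metric value
    c(state[[1]] + 1, state[[2]] + m)
  }
  get <- function(state) {
    # Assumed shape: report the running mean under the metric's name.
    list(name = name, value = state[[2]] / state[[1]])
  }
  ret <- list(init = init, update = update, get = get)
  class(ret) <- "mx.metric"                      # matches attr(*, "class") in the Rd
  ret
}
```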
diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd
index b749bc9cb4e0..20fdd83ddd57 100644
--- a/R-package/vignettes/mnistCompetition.Rmd
+++ b/R-package/vignettes/mnistCompetition.Rmd
@@ -1,9 +1,5 @@
----
-title: "Handwritten Digits Classification Competition"
-author: "Tong He"
-date: "October 17, 2015"
-output: html_document
----
+Handwritten Digits Classification Competition
+======================================================
 
 [MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set. We will present the basic usage of `mxnet` to compete in this challenge.
 
@@ -14,6 +10,7 @@ First, let us download the data from [here](https://www.kaggle.com/c/digit-recog
 
 Then we can read them in R and convert to matrices.
 
 ```{r, eval=FALSE}
+require(mxnet)
 train <- read.csv('data/train.csv', header=TRUE)
 test <- read.csv('data/test.csv', header=TRUE)
 train <- data.matrix(train)
@@ -25,7 +22,7 @@ train.y <- train[,1]
 
 Here every image is represented as a single row in train/test. The greyscale of each image falls in the range [0, 255], we can linearly transform it into [0,1] by
 
-```{r, eval = FALSE}
+```{r, eval=FALSE}
 train.x <- train.x/255
 test <- test/255
 ```
@@ -40,14 +37,14 @@ table(train.y)
 
 Now we have the data. The next step is to configure the structure of our network.
 
-```{r}
+```{r, eval=FALSE}
 data <- mx.symbol.Variable("data")
 fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
 act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
-fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64)
+fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64)
 act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
 fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
-softmax <- mx.symbol.Softmax(fc3, name = "sm")
+softmax <- mx.symbol.Softmax(fc3, name="sm")
 ```
 
 1. In `mxnet`, we use its own data type `symbol` to configure the network. `data <- mx.symbol.Variable("data")` use `data` to represent the input data, i.e. the input layer.
@@ -62,19 +59,19 @@ softmax <- mx.symbol.Softmax(fc3, name = "sm")
 
 We are almost ready for the training process. Before we start the computation, let's decide what device should we use.
 
-```{r}
+```{r, eval=FALSE}
 devices <- lapply(1:2, function(i) {
   mx.cpu(i)
 })
 ```
 
-Here we assign two threads of our CPU to `mxnet`. After all these preparation, you can run the following command to train the neural network!
+Here we assign two threads of our CPU to `mxnet`. After all this preparation, you can run the following command to train the neural network! Note that `mx.set.seed` is the correct function to control the random process in `mxnet`.
 
-```{r}
-set.seed(0)
+```{r, eval=FALSE}
+mx.set.seed(0)
 model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
                                      ctx=devices, num.round=10, array.batch.size=100,
-                                     learning.rate=0.07, momentum=0.9,
+                                     learning.rate=0.07, momentum=0.9, eval.metric=mx.metric.accuracy,
                                      initializer=mx.init.uniform(0.07),
                                      epoch.end.callback=mx.callback.log.train.metric(100))
 ```
@@ -83,31 +80,103 @@ model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
 
 To make prediction, we can simply write
 
-```{r}
+```{r, eval=FALSE}
 preds <- predict(model, test)
 dim(preds)
 ```
 
 It is a matrix with 28000 rows and 10 cols, containing the desired classification probabilities from the output layer. To extract the maximum label for each row, we can use the `max.col` in R:
 
-```{r}
+```{r, eval=FALSE}
 pred.label <- max.col(preds) - 1
 table(pred.label)
 ```
 
 With a little extra effort in the csv format, we can have our submission to the competition!
 
-```{r}
+```{r, eval=FALSE}
 submission <- data.frame(ImageId=1:nrow(test), Label=pred.label)
 write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
 ```
 
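An illustrative aside (a sketch, not part of the patch): since Kaggle limits submission attempts, it can pay to estimate accuracy locally before uploading. This uses only calls already shown in this vignette; the 5000-image split size is an arbitrary choice.

```r
# Hold out the first 5000 training images as a local validation set,
# retrain on the rest, and score the held-out part.
val.ind <- 1:5000
model.val <- mx.model.FeedForward.create(softmax,
                 X=train.x[-val.ind,], y=train.y[-val.ind],
                 ctx=devices, num.round=10, array.batch.size=100,
                 learning.rate=0.07, momentum=0.9,
                 eval.metric=mx.metric.accuracy,
                 initializer=mx.init.uniform(0.07))
val.preds <- predict(model.val, train.x[val.ind,])
mean(max.col(val.preds) - 1 == train.y[val.ind])  # local accuracy estimate
```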
+## LeNet
+
+Next we are going to introduce a new network structure: [LeNet](http://yann.lecun.com/exdb/lenet/). It was proposed by Yann LeCun for recognizing handwritten digits. We will demonstrate how to construct and train a LeNet in `mxnet`.
+
+First we construct the network:
+
+```{r, eval=FALSE}
+# input
+data <- mx.symbol.Variable('data')
+# first conv
+conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20)
+tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh")
+pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max",
+                           kernel=c(2,2), stride=c(2,2))
+# second conv
+conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)
+tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh")
+pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max",
+                           kernel=c(2,2), stride=c(2,2))
+# first fullc
+flatten <- mx.symbol.Flatten(data=pool2)
+fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=500)
+tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh")
+# second fullc
+fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10)
+# loss
+lenet <- mx.symbol.Softmax(data=fc2)
+```
+
+Then let us reshape the matrices into arrays:
+
+```{r, eval=FALSE}
+train.array <- t(train.x)
+dim(train.array) <- c(1,28,28,nrow(train.x))
+train.array <- aperm(train.array, c(4,1,2,3))
+test.array <- t(test)
+dim(test.array) <- c(1,28,28,nrow(test))
+test.array <- aperm(test.array, c(4,1,2,3))
+```
+
+Next we are going to compare the training speed on different devices, so let us define the devices first:
+
+```{r, eval=FALSE}
+device.cpu <- mx.cpu()
+device.gpu <- lapply(1:4, function(i) {
+  mx.gpu(i)
+})
+```
+
+Training on CPU:
+
+```{r, eval=FALSE}
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
+                                     ctx=device.cpu, num.round=5, array.batch.size=100,
+                                     learning.rate=0.05, momentum=0.9, wd=0.00001,
+                                     eval.metric=mx.metric.accuracy,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+Training on GPU:
+
+```{r, eval=FALSE}
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
+                                     ctx=device.gpu, num.round=5, array.batch.size=100,
+                                     learning.rate=0.05, momentum=0.9, wd=0.00001,
+                                     eval.metric=mx.metric.accuracy,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+Finally we can submit the result to Kaggle again to see the improvement of our ranking!
+
+```{r, eval=FALSE}
+preds <- predict(model, test.array)
+pred.label <- max.col(preds) - 1
+submission <- data.frame(ImageId=1:nrow(test), Label=pred.label)
+write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
+```
+
+![](../web-data/mxnet/knitr/mnistCompetition-kaggle-submission.png)
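An illustrative aside (a sketch, not part of the patch): the `t()`/`dim<-`/`aperm` sequence above is the step readers most often get wrong, so here is a small self-check based on base R's column-major filling. `train.array` and `train.x` are the objects from the chunk above.

```r
# After t(), dim<-, and aperm(c(4,1,2,3)), image i lives in
# train.array[i, 1, , ] and should equal the 28x28 matrix built
# directly from row i of train.x.
i <- 1
all.equal(train.array[i, 1, , ], matrix(train.x[i, ], 28, 28))  # TRUE
```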
diff --git a/R-package/vignettes/ndarrayAndSymbolTutorial.Rmd b/R-package/vignettes/ndarrayAndSymbolTutorial.Rmd
index 2b608066b753..69f163ba6ad2 100644
--- a/R-package/vignettes/ndarrayAndSymbolTutorial.Rmd
+++ b/R-package/vignettes/ndarrayAndSymbolTutorial.Rmd
@@ -30,9 +30,8 @@ Let's create `NDArray` on either GPU or CPU
 ```{r}
 require(mxnet)
 a <- mx.nd.zeros(c(2, 3)) # create a 2-by-3 matrix on cpu
-b <- mx.nd.zeros(c(2, 3), mx.gpu()) # create a 2-by-3 matrix on gpu 0
-c <- mx.nd.zeros(c(2, 3), mx.gpu(2)) # create a 2-by-3 matrix on gpu 0
-c$dim()
+b <- mx.nd.zeros(c(2, 3), mx.cpu()) # create a 2-by-3 matrix on cpu
+c <- mx.nd.zeros(c(2, 3), mx.gpu(1)) # create a 2-by-3 matrix on gpu 1
 ```
 
 We can also initialize an `NDArray` object in various ways:
@@ -72,7 +71,7 @@ as.array(d)
 
 If two `NDArray`s sit on different devices, we need to explicitly move them into the same one. For instance:
 
-```{r}
+```{r, eval=FALSE}
 a <- mx.nd.ones(c(2, 3)) * 2
 b <- mx.nd.ones(c(2, 3), mx.gpu()) / 8
 c <- mx.nd.copyto(a, mx.gpu()) * b
diff --git a/doc/R-package/Makefile b/doc/R-package/Makefile
index 7ca47d63776d..5dcd78adbdb3 100644
--- a/doc/R-package/Makefile
+++ b/doc/R-package/Makefile
@@ -5,6 +5,7 @@ PKGROOT=../../R-package
 classifyRealImageWithPretrainedModel.md:
 mnistCompetition.Rmd:
 ndarrayAndSymbolTutorial.Rmd:
+fiveMinutesNeuralNetwork.Rmd:
 
 # General Rules for build rmarkdowns, need knitr
 %.md: $(PKGROOT)/vignettes/%.Rmd
@@ -12,5 +13,5 @@ ndarrayAndSymbolTutorial.Rmd:
 	Rscript -e \
 	"require(knitr);"\
 	"knitr::opts_knit\$$set(root.dir=\".\");"\
-	"knitr::opts_chunk\$$set(fig.path=\"../doc-image/mxnet/knitr/$(basename $@)-\");"\
+	"knitr::opts_chunk\$$set(fig.path=\"../web-data/mxnet/knitr/$(basename $@)-\");"\
 	"knitr::knit(\"$+\")"
diff --git a/doc/R-package/classifyRealImageWithPretrainedModel.md b/doc/R-package/classifyRealImageWithPretrainedModel.md
index eb5480faabe8..16d96f9abbd2 100644
--- a/doc/R-package/classifyRealImageWithPretrainedModel.md
+++ b/doc/R-package/classifyRealImageWithPretrainedModel.md
@@ -90,7 +90,7 @@ im <- load.image(system.file("extdata/parrots.png", package="imager"))
 plot(im)
 ```
 
-![plot of chunk unnamed-chunk-5](../doc-image/mxnet/knitr/classifyRealImageWithPretrainedModel-unnamed-chunk-5-1.png)
+![plot of chunk unnamed-chunk-5](../web-data/mxnet/knitr/classifyRealImageWithPretrainedModel-unnamed-chunk-5-1.png)
 
 Before feeding the image to the deep net, we need to do some preprocessing
 to make the image fit the input requirement of deepnet. The preprocessing
diff --git a/doc/R-package/fiveMinutesNeuralNetwork.md b/doc/R-package/fiveMinutesNeuralNetwork.md
new file mode 100644
index 000000000000..2e386a683d33
--- /dev/null
+++ b/doc/R-package/fiveMinutesNeuralNetwork.md
@@ -0,0 +1,228 @@
+Neural Network with MXNet in Five Minutes
+=============================================
+
+This is the first tutorial for new users of the R package `mxnet`. In five minutes you will learn how to construct a neural network and use it for both classification and regression tasks. The data we use comes from the package `mlbench`.
+
+## Classification
+
+First of all, let us load the data and preprocess it:
+
+
+```r
+require(mlbench)
+```
+
+```
+## Loading required package: mlbench
+```
+
+```r
+require(mxnet)
+```
+
+```
+## Loading required package: mxnet
+## Loading required package: methods
+```
+
+```r
+data(Sonar, package="mlbench")
+
+Sonar[,61] = as.numeric(Sonar[,61])-1
+train.ind = c(1:50, 100:150)
+train.x = data.matrix(Sonar[train.ind, 1:60])
+train.y = Sonar[train.ind, 61]
+test.x = data.matrix(Sonar[-train.ind, 1:60])
+test.y = Sonar[-train.ind, 61]
+```
+
+The next step is to define the structure of the neural network.
+
+
+```r
+# Define the input data
+data <- mx.symbol.Variable("data")
+# A fully connected hidden layer
+# data: input source
+# name: fc1
+# num_hidden: number of neurons in this hidden layer
+fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=20)
+
+# An activation function
+# fc1: input source
+# name: tanh1
+# act_type: type of the activation function
+act1 <- mx.symbol.Activation(fc1, name="tanh1", act_type="tanh")
+fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=2)
+
+# Softmax function for the output layer
+softmax <- mx.symbol.Softmax(fc2, name="sm")
+```
+
+The comments in the code explain the meaning of each function and its arguments; they can easily be modified to suit your needs.
+
+Before we start to train the model, we can specify where to run our program:
+
+
+```r
+device.cpu = mx.cpu()
+```
+
+Here we choose to run it on the CPU.
+
+After the network configuration, we can start the training process:
+
+
+```r
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
+                                     ctx=device.cpu, num.round=20, array.batch.size=15,
+                                     learning.rate=0.07, momentum=0.9, eval.metric=mx.metric.accuracy,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+```
+## Start training with 1 devices
+## [1] Train-accuracy=0.5
+## [2] Train-accuracy=0.514285714285714
+## [3] Train-accuracy=0.514285714285714
+## [4] Train-accuracy=0.514285714285714
+## [5] Train-accuracy=0.514285714285714
+## [6] Train-accuracy=0.609523809523809
+## [7] Train-accuracy=0.676190476190476
+## [8] Train-accuracy=0.695238095238095
+## [9] Train-accuracy=0.723809523809524
+## [10] Train-accuracy=0.780952380952381
+## [11] Train-accuracy=0.8
+## [12] Train-accuracy=0.761904761904762
+## [13] Train-accuracy=0.742857142857143
+## [14] Train-accuracy=0.761904761904762
+## [15] Train-accuracy=0.847619047619047
+## [16] Train-accuracy=0.857142857142857
+## [17] Train-accuracy=0.857142857142857
+## [18] Train-accuracy=0.828571428571429
+## [19] Train-accuracy=0.838095238095238
+## [20] Train-accuracy=0.857142857142857
+```
+
+Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make predictions and evaluate them:
+
+
+```r
+preds = predict(model, test.x)
+pred.label = max.col(preds)-1
+table(pred.label, test.y)
+```
+
+```
+##           test.y
+## pred.label  0  1
+##          0 24 14
+##          1 36 33
+```
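An illustrative aside (a sketch, not part of the patch): the remark about `mx.set.seed` can be checked directly. The call shape for `mx.runif` follows its use in the `io.R` hunk earlier in this diff.

```r
# mxnet draws from its own RNG, so mx.set.seed (not R's set.seed) makes
# runs repeatable; two identically seeded draws should match.
mx.set.seed(0)
a <- as.array(mx.runif(c(3), ctx=mx.cpu()))
mx.set.seed(0)
b <- as.array(mx.runif(c(3), ctx=mx.cpu()))
identical(a, b)  # expected TRUE
```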
+## Regression
+
+Again, let us preprocess the data first.
+
+
+```r
+data(BostonHousing, package="mlbench")
+
+train.ind = seq(1, 506, 3)
+train.x = data.matrix(BostonHousing[train.ind, -14])
+train.y = BostonHousing[train.ind, 14]
+test.x = data.matrix(BostonHousing[-train.ind, -14])
+test.y = BostonHousing[-train.ind, 14]
+```
+
+We can configure a network similar to the one above. The only difference is in the output activation:
+
+
+```r
+# Define the input data
+data <- mx.symbol.Variable("data")
+# A fully connected hidden layer
+# data: input source
+# name: fc1
+# num_hidden: number of neurons in this hidden layer
+fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=20)
+
+# An activation function
+# fc1: input source
+# name: tanh1
+# act_type: type of the activation function
+act1 <- mx.symbol.Activation(fc1, name="tanh1", act_type="tanh")
+fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=1)
+
+# Use linear regression for the output layer
+lro <- mx.symbol.LinearRegressionOutput(fc2, name="lro")
+```
+
+The main change is the last function: it makes the new network optimize for squared loss. We can now train on this simple data set.
+
+
+```r
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
+                                     ctx=device.cpu, num.round=5, array.batch.size=10,
+                                     learning.rate=0.1, momentum=0.9, eval.metric=mx.metric.rmse,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+```
+## Start training with 1 devices
+## [1] Train-rmse=20.8877275599495
+## [2] Train-rmse=12.8786644532322
+## [3] Train-rmse=10.3635559222185
+## [4] Train-rmse=10.5605206622052
+## [5] Train-rmse=10.2502398389275
+```
+
+It is also easy to make predictions and evaluate them:
+
+
+```r
+preds = predict(model, test.x)
+sqrt(mean((preds-test.y)^2))
+```
+
+```
+## [1] 9.49181
+```
+
+Currently we have two pre-defined metrics, "accuracy" and "rmse". One might wonder how to customize the evaluation metric. `mxnet` provides the interface for users to define their own metrics of interest:
+
+
+```r
+demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
+  res <- mean(abs(label-pred))
+  return(res)
+})
+```
+
+This is an example of mean absolute error. We can simply plug it into the training function:
+
+
+```r
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
+                                     ctx=device.cpu, num.round=5, array.batch.size=10,
+                                     learning.rate=0.1, momentum=0.9, eval.metric=demo.metric.mae,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+```
+## Start training with 1 devices
+## [1] Train-mae=19.3546375619262
+## [2] Train-mae=10.5938747770646
+## [3] Train-mae=8.51244305161869
+## [4] Train-mae=8.41277845326592
+## [5] Train-mae=8.23570416674895
+```
+
+Congratulations! You have now learned the basics of using `mxnet`.
+
diff --git a/doc/R-package/mnistCompetition.md b/doc/R-package/mnistCompetition.md
index dd806dfe777b..189f016dd4a4 100644
--- a/doc/R-package/mnistCompetition.md
+++ b/doc/R-package/mnistCompetition.md
@@ -1,9 +1,5 @@
----
-title: "Handwritten Digits Classification Competition"
-author: "Tong He"
-date: "October 17, 2015"
-output: html_document
----
+Handwritten Digits Classification Competition
+======================================================
 
 [MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set. We will present the basic usage of `mxnet` to compete in this challenge.
 
@@ -15,6 +11,7 @@ Then we can read them in R and convert to matrices.
 
 
 ```r
+require(mxnet)
 train <- read.csv('data/train.csv', header=TRUE)
 test <- read.csv('data/test.csv', header=TRUE)
 train <- data.matrix(train)
@@ -46,58 +43,12 @@ Now we have the data. The next step is to configure the structure of our network
 
 
 ```r
 data <- mx.symbol.Variable("data")
-```
-
-```
-## Error in eval(expr, envir, enclos): could not find function "mx.symbol.Variable"
-```
-
-```r
 fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
-```
-
-```
-## Error in eval(expr, envir, enclos): could not find function "mx.symbol.FullyConnected"
-```
-
-```r
 act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
-```
-
-```
-## Error in eval(expr, envir, enclos): could not find function "mx.symbol.Activation"
-```
-
-```r
-fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64)
-```
-
-```
-## Error in eval(expr, envir, enclos): could not find function "mx.symbol.FullyConnected"
-```
-
-```r
+fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64)
 act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
-```
-
-```
-## Error in eval(expr, envir, enclos): could not find function "mx.symbol.Activation"
-```
-
-```r
 fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
-```
-
-```
-## Error in eval(expr, envir, enclos): could not find function "mx.symbol.FullyConnected"
-```
-
-```r
-softmax <- mx.symbol.Softmax(fc3, name = "sm")
-```
-
-```
-## Error in eval(expr, envir, enclos): could not find function "mx.symbol.Softmax"
+softmax <- mx.symbol.Softmax(fc3, name="sm")
 ```
 
 1. In `mxnet`, we use its own data type `symbol` to configure the network. `data <- mx.symbol.Variable("data")` use `data` to represent the input data, i.e. the input layer.
@@ -119,26 +70,18 @@ devices <- lapply(1:2, function(i) {
 })
 ```
 
-```
-## Error in FUN(1:2[[1L]], ...): could not find function "mx.cpu"
-```
-
-Here we assign two threads of our CPU to `mxnet`. After all these preparation, you can run the following command to train the neural network!
+Here we assign two threads of our CPU to `mxnet`. After all this preparation, you can run the following command to train the neural network! Note that `mx.set.seed` is the correct function to control the random process in `mxnet`.
 
 
 ```r
-set.seed(0)
+mx.set.seed(0)
 model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
                                      ctx=devices, num.round=10, array.batch.size=100,
-                                     learning.rate=0.07, momentum=0.9,
+                                     learning.rate=0.07, momentum=0.9, eval.metric=mx.metric.accuracy,
                                      initializer=mx.init.uniform(0.07),
                                      epoch.end.callback=mx.callback.log.train.metric(100))
 ```
 
-```
-## Error in eval(expr, envir, enclos): could not find function "mx.model.FeedForward.create"
-```
-
 ## Prediction and Submission
 
 To make prediction, we can simply write
 
 
 ```r
 preds <- predict(model, test)
-```
-
-```
-## Error in predict(model, test): object 'model' not found
-```
-
-```r
 dim(preds)
 ```
 
-```
-## Error in eval(expr, envir, enclos): object 'preds' not found
-```
-
 It is a matrix with 28000 rows and 10 cols, containing the desired classification probabilities from the output layer. To extract the maximum label for each row, we can use the `max.col` in R:
 
 
 ```r
 pred.label <- max.col(preds) - 1
+table(pred.label)
 ```
 
-```
-## Error in as.matrix(m): object 'preds' not found
-```
-
-
-```r
-table(pred.label)
-```
-
-```
-## Error in table(pred.label): object 'pred.label' not found
-```
-
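An illustrative aside (a sketch, not part of the patch): the `- 1` after `max.col` is the spot newcomers trip over, so here is a two-row toy example in plain base R.

```r
# max.col returns 1-based winning-column indices, but the digit labels
# run 0..9, hence the subtraction.
m <- matrix(c(0.1, 0.8, 0.1,
              0.6, 0.3, 0.1), nrow=2, byrow=TRUE)
max.col(m)      # 2 1 -> winning columns, 1-based
max.col(m) - 1  # 1 0 -> 0-based class labels
```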
 With a little extra effort in the csv format, we can have our submission to the competition!
 
 
 ```r
 submission <- data.frame(ImageId=1:nrow(test), Label=pred.label)
+write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
 ```
 
-```
-## Error in nrow(test): object 'test' not found
-```
-
-
-```r
-write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
-```
-
-```
-## Error in is.data.frame(x): object 'submission' not found
-```
+## LeNet
+
+Next we are going to introduce a new network structure: [LeNet](http://yann.lecun.com/exdb/lenet/). It was proposed by Yann LeCun for recognizing handwritten digits. We will demonstrate how to construct and train a LeNet in `mxnet`.
+
+First we construct the network:
+
+
+```r
+# input
+data <- mx.symbol.Variable('data')
+# first conv
+conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20)
+tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh")
+pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max",
+                           kernel=c(2,2), stride=c(2,2))
+# second conv
+conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)
+tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh")
+pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max",
+                           kernel=c(2,2), stride=c(2,2))
+# first fullc
+flatten <- mx.symbol.Flatten(data=pool2)
+fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=500)
+tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh")
+# second fullc
+fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10)
+# loss
+lenet <- mx.symbol.Softmax(data=fc2)
+```
+
+Then let us reshape the matrices into arrays:
+
+
+```r
+train.array <- t(train.x)
+dim(train.array) <- c(1,28,28,nrow(train.x))
+train.array <- aperm(train.array, c(4,1,2,3))
+test.array <- t(test)
+dim(test.array) <- c(1,28,28,nrow(test))
+test.array <- aperm(test.array, c(4,1,2,3))
+```
+
+Next we are going to compare the training speed on different devices, so let us define the devices first:
+
+
+```r
+device.cpu <- mx.cpu()
+device.gpu <- lapply(1:4, function(i) {
+  mx.gpu(i)
+})
+```
+
+Training on CPU:
+
+
+```r
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
+                                     ctx=device.cpu, num.round=5, array.batch.size=100,
+                                     learning.rate=0.05, momentum=0.9, wd=0.00001,
+                                     eval.metric=mx.metric.accuracy,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+Training on GPU:
+
+
+```r
+mx.set.seed(0)
+model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
+                                     ctx=device.gpu, num.round=5, array.batch.size=100,
+                                     learning.rate=0.05, momentum=0.9, wd=0.00001,
+                                     eval.metric=mx.metric.accuracy,
+                                     epoch.end.callback=mx.callback.log.train.metric(100))
+```
+
+Finally we can submit the result to Kaggle again to see the improvement of our ranking!
+
+
+```r
+preds <- predict(model, test.array)
+pred.label <- max.col(preds) - 1
+submission <- data.frame(ImageId=1:nrow(test), Label=pred.label)
+write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
+```
+
+![](../web-data/mxnet/knitr/mnistCompetition-kaggle-submission.png)
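A final illustrative aside (a sketch, not part of the patch): the LeNet section promises a speed comparison between devices but never shows how to time it. Base R's `system.time` around the calls above gives a rough number; one round is enough for a ballpark, and all arguments mirror the chunks above.

```r
# Elapsed seconds for one training round on each device. eval.metric and
# callbacks are omitted here; eval.metric now defaults to NULL per the
# model.R change in this very diff.
cpu.elapsed <- system.time(
  mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
                              ctx=device.cpu, num.round=1,
                              array.batch.size=100, learning.rate=0.05,
                              momentum=0.9, wd=0.00001)
)["elapsed"]
gpu.elapsed <- system.time(
  mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
                              ctx=device.gpu, num.round=1,
                              array.batch.size=100, learning.rate=0.05,
                              momentum=0.9, wd=0.00001)
)["elapsed"]
c(cpu=cpu.elapsed, gpu=gpu.elapsed)
```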