diff --git a/examples/cifar10/cifar10_full_test.prototxt b/examples/cifar10/cifar10_full_test.prototxt
index 0e1957a9045..1f77b4f0348 100644
--- a/examples/cifar10/cifar10_full_test.prototxt
+++ b/examples/cifar10/cifar10_full_test.prototxt
@@ -166,16 +166,17 @@ layers {
     }
   }
 }
-layers {
-  name: "prob"
-  type: SOFTMAX
-  bottom: "ip1"
-  top: "prob"
-}
 layers {
   name: "accuracy"
   type: ACCURACY
-  bottom: "prob"
+  bottom: "ip1"
   bottom: "label"
   top: "accuracy"
 }
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "ip1"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/examples/cifar10/cifar10_quick_test.prototxt b/examples/cifar10/cifar10_quick_test.prototxt
index a154b9a0ea7..aa82c32aa24 100644
--- a/examples/cifar10/cifar10_quick_test.prototxt
+++ b/examples/cifar10/cifar10_quick_test.prototxt
@@ -160,16 +160,17 @@ layers {
     }
   }
 }
-layers {
-  name: "prob"
-  type: SOFTMAX
-  bottom: "ip2"
-  top: "prob"
-}
 layers {
   name: "accuracy"
   type: ACCURACY
-  bottom: "prob"
+  bottom: "ip2"
   bottom: "label"
   top: "accuracy"
 }
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "ip2"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/examples/feature_extraction/imagenet_val.prototxt b/examples/feature_extraction/imagenet_val.prototxt
index 14bfe770ef8..b0451a1a114 100644
--- a/examples/feature_extraction/imagenet_val.prototxt
+++ b/examples/feature_extraction/imagenet_val.prototxt
@@ -227,3 +227,10 @@ layers {
   bottom: "label"
   top: "accuracy"
 }
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/examples/imagenet/alexnet_val.prototxt b/examples/imagenet/alexnet_val.prototxt
index 3fd6296ef9d..1d8d86b78ff 100644
--- a/examples/imagenet/alexnet_val.prototxt
+++ b/examples/imagenet/alexnet_val.prototxt
@@ -213,15 +213,16 @@ layers {
   top: "fc8"
 }
 layers {
-  name: "prob"
-  type: SOFTMAX
+  name: "accuracy"
+  type: ACCURACY
   bottom: "fc8"
-  top: "prob"
+  bottom: "label"
+  top: "accuracy"
 }
 layers {
-  top: "accuracy"
-  name: "accuracy"
-  type: ACCURACY
-  bottom: "prob"
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "fc8"
   bottom: "label"
+  top: "loss"
 }
diff --git a/examples/imagenet/imagenet_val.prototxt b/examples/imagenet/imagenet_val.prototxt
index dd26f40ea14..8be5150cdd2 100644
--- a/examples/imagenet/imagenet_val.prototxt
+++ b/examples/imagenet/imagenet_val.prototxt
@@ -212,16 +212,17 @@ layers {
     num_output: 1000
   }
 }
-layers {
-  name: "prob"
-  type: SOFTMAX
-  bottom: "fc8"
-  top: "prob"
-}
 layers {
   name: "accuracy"
   type: ACCURACY
-  bottom: "prob"
+  bottom: "fc8"
   bottom: "label"
   top: "accuracy"
 }
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
\ No newline at end of file
diff --git a/examples/mnist/lenet_consolidated_solver.prototxt b/examples/mnist/lenet_consolidated_solver.prototxt
index 07cbc211414..ef851e0f656 100644
--- a/examples/mnist/lenet_consolidated_solver.prototxt
+++ b/examples/mnist/lenet_consolidated_solver.prototxt
@@ -262,19 +262,20 @@ test_net_param {
       }
     }
   }
-  layers {
-    name: "prob"
-    type: SOFTMAX
-    bottom: "ip2"
-    top: "prob"
-  }
   layers {
     name: "accuracy"
     type: ACCURACY
-    bottom: "prob"
+    bottom: "ip2"
     bottom: "label"
     top: "accuracy"
   }
+  layers {
+    name: "loss"
+    type: SOFTMAX_LOSS
+    bottom: "ip2"
+    bottom: "label"
+    top: "loss"
+  }
 }

 # The train set has 60K images, so we run 600 test iters (600 * 100 = 60K).
@@ -385,19 +386,20 @@ test_net_param {
       }
     }
   }
-  layers {
-    name: "prob"
-    type: SOFTMAX
-    bottom: "ip2"
-    top: "prob"
-  }
   layers {
     name: "accuracy"
     type: ACCURACY
-    bottom: "prob"
+    bottom: "ip2"
     bottom: "label"
     top: "accuracy"
   }
+  layers {
+    name: "loss"
+    type: SOFTMAX_LOSS
+    bottom: "ip2"
+    bottom: "label"
+    top: "loss"
+  }
 }

 # Expected results for first and last 500 iterations:
diff --git a/examples/mnist/lenet_test.prototxt b/examples/mnist/lenet_test.prototxt
index 3b59b75513d..2497f02ae86 100644
--- a/examples/mnist/lenet_test.prototxt
+++ b/examples/mnist/lenet_test.prototxt
@@ -102,16 +102,17 @@ layers {
     }
   }
 }
-layers {
-  name: "prob"
-  type: SOFTMAX
-  bottom: "ip2"
-  top: "prob"
-}
 layers {
   name: "accuracy"
   type: ACCURACY
-  bottom: "prob"
+  bottom: "ip2"
   bottom: "label"
   top: "accuracy"
 }
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "ip2"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/examples/mnist/mnist_autoencoder_test.prototxt b/examples/mnist/mnist_autoencoder_test.prototxt
index 5090e82fe0a..b52364c17fc 100644
--- a/examples/mnist/mnist_autoencoder_test.prototxt
+++ b/examples/mnist/mnist_autoencoder_test.prototxt
@@ -142,4 +142,5 @@ layers {
   bottom: "flatdata"
   name: "loss"
   type: EUCLIDEAN_LOSS
+  top: "loss"
 }
diff --git a/examples/pascal-finetuning/pascal_finetune_val.prototxt b/examples/pascal-finetuning/pascal_finetune_val.prototxt
index ff898fe7376..91ded585d85 100644
--- a/examples/pascal-finetuning/pascal_finetune_val.prototxt
+++ b/examples/pascal-finetuning/pascal_finetune_val.prototxt
@@ -313,16 +313,18 @@ layers {
     }
   }
 }
-layers {
-  name: "prob"
-  type: SOFTMAX
-  bottom: "fc8_pascal"
-  top: "prob"
-}
 layers {
   name: "accuracy"
   type: ACCURACY
-  bottom: "prob"
+  bottom: "fc8_pascal"
   bottom: "label"
   top: "accuracy"
 }
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "fc8_pascal"
+  bottom: "label"
+  top: "loss"
+}
+
diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index 381bf0f4f8f..db7c63edca1 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -37,7 +37,50 @@ class LossLayer : public Layer<Dtype> {
       const vector<Blob<Dtype>*>& bottom, vector<Blob<Dtype>*>* top) {}

   virtual inline int ExactNumBottomBlobs() const { return 2; }
-  virtual inline int ExactNumTopBlobs() const { return 0; }
+  virtual inline int MaxTopBlobs() const { return 1; }
+};
+
+// Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer.
+template <typename Dtype> class SoftmaxLayer;
+
+/* SoftmaxWithLossLayer
+  Implements softmax and computes the loss.
+
+  It is preferred over separate softmax + multinomiallogisticloss
+  layers due to more numerically stable gradients.
+
+  In test, this layer could be replaced by simple softmax layer.
+*/
+template <typename Dtype>
+class SoftmaxWithLossLayer : public Layer<Dtype> {
+ public:
+  explicit SoftmaxWithLossLayer(const LayerParameter& param)
+      : Layer<Dtype>(param), softmax_layer_(new SoftmaxLayer<Dtype>(param)) {}
+  virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+
+  virtual inline LayerParameter_LayerType type() const {
+    return LayerParameter_LayerType_SOFTMAX_LOSS;
+  }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
+ protected:
+  virtual Dtype Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual Dtype Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
+
+  shared_ptr<SoftmaxLayer<Dtype> > softmax_layer_;
+  // prob stores the output probability of the layer.
+  Blob<Dtype> prob_;
+  // Vector holders to call the underlying softmax layer forward and backward.
+  vector<Blob<Dtype>*> softmax_bottom_vec_;
+  vector<Blob<Dtype>*> softmax_top_vec_;
 };

 /* SigmoidCrossEntropyLossLayer
@@ -166,7 +209,7 @@ class MultinomialLogisticLossLayer : public LossLayer<Dtype> {

 /* AccuracyLayer
   Note: not an actual loss layer! Does not implement backwards step.
-  Computes the accuracy and logprob of a with respect to b.
+  Computes the accuracy of argmax(a) with respect to b.
 */
 template <typename Dtype>
 class AccuracyLayer : public Layer<Dtype> {
@@ -180,6 +223,9 @@ class AccuracyLayer : public Layer<Dtype> {
     return LayerParameter_LayerType_ACCURACY;
   }

+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
  protected:
   virtual Dtype Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top);
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index fc3dbbe1938..3fd7e2f8bdb 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -405,46 +405,6 @@ class SoftmaxLayer : public Layer<Dtype> {
   Blob<Dtype> scale_;
 };

-/* SoftmaxWithLossLayer
-  Implements softmax and computes the loss.
-
-  It is preferred over separate softmax + multinomiallogisticloss
-  layers due to more numerically stable gradients.
-
-  In test, this layer could be replaced by simple softmax layer.
-*/
-template <typename Dtype>
-class SoftmaxWithLossLayer : public Layer<Dtype> {
- public:
-  explicit SoftmaxWithLossLayer(const LayerParameter& param)
-      : Layer<Dtype>(param), softmax_layer_(new SoftmaxLayer<Dtype>(param)) {}
-  virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
-
-  virtual inline LayerParameter_LayerType type() const {
-    return LayerParameter_LayerType_SOFTMAX_LOSS;
-  }
-  virtual inline int ExactNumBottomBlobs() const { return 2; }
-  virtual inline int ExactNumTopBlobs() const { return 0; }
-
- protected:
-  virtual Dtype Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
-  virtual Dtype Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
-
-  shared_ptr<SoftmaxLayer<Dtype> > softmax_layer_;
-  // prob stores the output probability of the layer.
-  Blob<Dtype> prob_;
-  // Vector holders to call the underlying softmax layer forward and backward.
-  vector<Blob<Dtype>*> softmax_bottom_vec_;
-  vector<Blob<Dtype>*> softmax_top_vec_;
-};
-
 /* SplitLayer
 */
 template <typename Dtype>
diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
index fbc943eaf2d..899750f869a 100644
--- a/src/caffe/layers/accuracy_layer.cpp
+++ b/src/caffe/layers/accuracy_layer.cpp
@@ -23,14 +23,13 @@ void AccuracyLayer<Dtype>::SetUp(
   CHECK_EQ(bottom[1]->channels(), 1);
   CHECK_EQ(bottom[1]->height(), 1);
   CHECK_EQ(bottom[1]->width(), 1);
-  (*top)[0]->Reshape(1, 2, 1, 1);
+  (*top)[0]->Reshape(1, 1, 1, 1);
 }

 template <typename Dtype>
 Dtype AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     vector<Blob<Dtype>*>* top) {
   Dtype accuracy = 0;
-  Dtype logprob = 0;
   const Dtype* bottom_data = bottom[0]->cpu_data();
   const Dtype* bottom_label = bottom[1]->cpu_data();
   int num = bottom[0]->num();
@@ -48,13 +47,10 @@ Dtype AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     if (max_id == static_cast<int>(bottom_label[i])) {
       ++accuracy;
     }
-    Dtype prob = max(bottom_data[i * dim + static_cast<int>(bottom_label[i])],
-                     Dtype(kLOG_THRESHOLD));
-    logprob -= log(prob);
   }
   // LOG(INFO) << "Accuracy: " << accuracy;
   (*top)[0]->mutable_cpu_data()[0] = accuracy / num;
-  (*top)[0]->mutable_cpu_data()[1] = logprob / num;
+  // Accuracy layer should not be used as a loss function.
   return Dtype(0);
 }
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index a894d470c64..766294997f0 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -35,6 +35,9 @@ Dtype EuclideanLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       diff_.mutable_cpu_data());
   Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data());
   Dtype loss = dot / bottom[0]->num() / Dtype(2);
+  if (top->size() == 1) {
+    (*top)[0]->mutable_cpu_data()[0] = loss;
+  }
   return loss;
 }
diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp
index ab6e67d73b1..3e7fc4f812b 100644
--- a/src/caffe/layers/infogain_loss_layer.cpp
+++ b/src/caffe/layers/infogain_loss_layer.cpp
@@ -48,6 +48,9 @@ Dtype InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       loss -= infogain_mat[label * dim + j] * log(prob);
     }
   }
+  if (top->size() == 1) {
+    (*top)[0]->mutable_cpu_data()[0] = loss / num;
+  }
   return loss / num;
 }
diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp
index 14ea975ad0d..ac8ad216732 100644
--- a/src/caffe/layers/loss_layer.cpp
+++ b/src/caffe/layers/loss_layer.cpp
@@ -20,6 +20,10 @@ void LossLayer<Dtype>::SetUp(
   Layer<Dtype>::SetUp(bottom, top);
   CHECK_EQ(bottom[0]->num(), bottom[1]->num())
       << "The data and label should have the same number.";
+  if (top->size() == 1) {
+    // Loss layers should copy the loss into the top blob.
+    (*top)[0]->Reshape(1, 1, 1, 1);
+  }
   FurtherSetUp(bottom, top);
 }
diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp
index 6486621d8aa..5a408795d6d 100644
--- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp
+++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp
@@ -35,6 +35,9 @@ Dtype MultinomialLogisticLossLayer<Dtype>::Forward_cpu(
     Dtype prob = max(bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
     loss -= log(prob);
   }
+  if (top->size() == 1) {
+    (*top)[0]->mutable_cpu_data()[0] = loss / num;
+  }
   return loss / num;
 }
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
index a638684f3b6..955581d8875 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -41,6 +41,9 @@ Dtype SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
     loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
         log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
   }
+  if (top->size() == 1) {
+    (*top)[0]->mutable_cpu_data()[0] = loss / num;
+  }
   return loss / num;
 }
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
index 61004541fce..0caed2b83bd 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
@@ -29,6 +29,9 @@ Dtype SigmoidCrossEntropyLossLayer<Dtype>::Forward_gpu(
     loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
         log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
   }
+  if (top->size() == 1) {
+    (*top)[0]->mutable_cpu_data()[0] = loss / num;
+  }
   return loss / num;
 }
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index ef6eebabadd..bdb3272ee25 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -20,6 +20,15 @@ void SoftmaxWithLossLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
   softmax_bottom_vec_.push_back(bottom[0]);
   softmax_top_vec_.push_back(&prob_);
   softmax_layer_->SetUp(softmax_bottom_vec_, &softmax_top_vec_);
+  if (top->size() >= 1) {
+    // softmax loss (averaged across batch)
+    (*top)[0]->Reshape(1, 1, 1, 1);
+  }
+  if (top->size() == 2) {
+    // softmax output
+    (*top)[1]->Reshape(bottom[0]->num(), bottom[0]->channels(),
+        bottom[0]->height(), bottom[0]->width());
+  }
 }

 template <typename Dtype>
@@ -37,6 +46,12 @@ Dtype SoftmaxWithLossLayer<Dtype>::Forward_cpu(
     loss += -log(max(prob_data[i * dim + static_cast<int>(label[i])],
                      Dtype(FLT_MIN)));
   }
+  if (top->size() >= 1) {
+    (*top)[0]->mutable_cpu_data()[0] = loss / num;
+  }
+  if (top->size() == 2) {
+    (*top)[1]->ShareData(prob_);
+  }
   return loss / num;
 }
diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp
index 1e43a038455..7f2f67b59c6 100644
--- a/src/caffe/test/test_net.cpp
+++ b/src/caffe/test/test_net.cpp
@@ -64,6 +64,7 @@ class NetTest : public ::testing::Test {
         "  type: SOFTMAX_LOSS "
         "  bottom: 'innerproduct' "
         "  bottom: 'label' "
+        "  top: 'top_loss' "
         "} ";
     NetParameter param;
     CHECK(google::protobuf::TextFormat::ParseFromString(proto, &param));
@@ -81,6 +82,7 @@ TYPED_TEST(NetTest, TestHasBlob) {
   EXPECT_TRUE(this->net_->has_blob("label"));
   EXPECT_TRUE(this->net_->has_blob("innerproduct"));
   EXPECT_FALSE(this->net_->has_blob("loss"));
+  EXPECT_TRUE(this->net_->has_blob("top_loss"));
 }

 TYPED_TEST(NetTest, TestGetBlob) {
@@ -88,6 +90,7 @@ TYPED_TEST(NetTest, TestGetBlob) {
   EXPECT_EQ(this->net_->blob_by_name("label"), this->net_->blobs()[1]);
   EXPECT_EQ(this->net_->blob_by_name("innerproduct"), this->net_->blobs()[2]);
   EXPECT_FALSE(this->net_->blob_by_name("loss"));
+  EXPECT_EQ(this->net_->blob_by_name("top_loss"), this->net_->blobs()[3]);
 }

 TYPED_TEST(NetTest, TestHasLayer) {
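With this change a loss layer can expose its scalar loss through an optional top blob, which is what the updated example prototxts above rely on: the accuracy layer now reads the raw scores directly, and a SOFTMAX_LOSS layer reports the loss. Below is a minimal sketch of the resulting tail of a test net, reusing the "ip2" and "label" blob names from the MNIST LeNet examples in this diff:

layers {
  name: "accuracy"
  type: ACCURACY
  bottom: "ip2"      # raw scores from the last inner product layer
  bottom: "label"
  top: "accuracy"    # fraction of correctly classified examples
}
layers {
  name: "loss"
  type: SOFTMAX_LOSS
  bottom: "ip2"      # same raw scores; the softmax is applied internally
  bottom: "label"
  top: "loss"        # optional top blob holding the batch-averaged softmax loss
}

Omitting the top: "loss" line is still valid: the layer then computes and returns the loss internally without exposing it as a blob, as before this change.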