diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index 0692c11c257..25e167819d3 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -41,6 +41,8 @@ void ContrastiveLossLayer<Dtype>::Forward_cpu(
       diff_.mutable_cpu_data());  // a_i-b_i
   const int channels = bottom[0]->channels();
   Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
   Dtype loss(0.0);
   for (int i = 0; i < bottom[0]->num(); ++i) {
     dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
@@ -48,7 +50,12 @@ void ContrastiveLossLayer<Dtype>::Forward_cpu(
     if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
       loss += dist_sq_.cpu_data()[i];
     } else {  // dissimilar pairs
-      loss += std::max(margin-dist_sq_.cpu_data()[i], Dtype(0.0));
+      if (legacy_version) {
+        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+      } else {
+        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
+        loss += dist*dist;
+      }
     }
   }
   loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
@@ -59,6 +66,8 @@ template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
   for (int i = 0; i < 2; ++i) {
     if (propagate_down[i]) {
       const Dtype sign = (i == 0) ? 1 : -1;
@@ -76,10 +85,20 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
               Dtype(0.0),
               bout + (j*channels));
         } else {  // dissimilar pairs
-          if ((margin-dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+          Dtype mdist(0.0);
+          Dtype beta(0.0);
+          if (legacy_version) {
+            mdist = margin - dist_sq_.cpu_data()[j];
+            beta = -alpha;
+          } else {
+            Dtype dist = sqrt(dist_sq_.cpu_data()[j]);
+            mdist = margin - dist;
+            beta = -alpha * mdist / (dist + Dtype(1e-4));
+          }
+          if (mdist > Dtype(0.0)) {
             caffe_cpu_axpby(
                 channels,
-                -alpha,
+                beta,
                 diff_.cpu_data() + (j*channels),
                 Dtype(0.0),
                 bout + (j*channels));
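(Editorial note, a sketch derived from the patch, not part of it.) The backward branches above follow from differentiating half of each per-pair loss: Forward_cpu divides the batch sum by 2N, and alpha already folds in the sign, the top gradient, and the 1/N. Writing m for the margin, d = ||a - b||, and diff = a - b, the dissimilar-pair gradients are

    \frac{\partial}{\partial a}\left[\tfrac{1}{2}\max(m - d^2,\,0)\right] = -(a - b) \qquad \text{while } m - d^2 > 0 \quad \text{(legacy)}

    \frac{\partial}{\partial a}\left[\tfrac{1}{2}\max(m - d,\,0)^2\right] = -\frac{m - d}{d}\,(a - b) \qquad \text{while } m - d > 0 \quad \text{(default)}

which is exactly beta = -alpha and beta = -alpha * mdist / (dist + 1e-4) in the code; the Dtype(1e-4) term only guards the division as d approaches 0.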
diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/contrastive_loss_layer.cu
index 78a55995a0a..931239316ac 100644
--- a/src/caffe/layers/contrastive_loss_layer.cu
+++ b/src/caffe/layers/contrastive_loss_layer.cu
@@ -32,12 +32,20 @@ void ContrastiveLossLayer<Dtype>::Forward_gpu(
       Dtype(0.0),
       dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
   Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
   Dtype loss(0.0);
   for (int i = 0; i < bottom[0]->num(); ++i) {
     if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
       loss += dist_sq_.cpu_data()[i];
     } else {  // dissimilar pairs
-      loss += std::max(margin-dist_sq_.cpu_data()[i], Dtype(0.0));
+      if (legacy_version) {
+        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+      } else {
+        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]),
+                              Dtype(0.0));
+        loss += dist*dist;
+      }
     }
   }
   loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
@@ -45,8 +53,8 @@
 }
 
 template <typename Dtype>
-__global__ void CLLForward(const int count, const int channels,
-    const Dtype margin, const Dtype alpha,
+__global__ void CLLBackward(const int count, const int channels,
+    const Dtype margin, const bool legacy_version, const Dtype alpha,
     const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
     Dtype *bottom_diff) {
   CUDA_KERNEL_LOOP(i, count) {
@@ -54,8 +62,18 @@ __global__ void CLLForward(const int count, const int channels,
     if (static_cast<int>(y[n])) {  // similar pairs
       bottom_diff[i] = alpha * diff[i];
     } else {  // dissimilar pairs
-      if ((margin-dist_sq[n]) > 0.0) {
-        bottom_diff[i] = -alpha * diff[i];
+      Dtype mdist(0.0);
+      Dtype beta(0.0);
+      if (legacy_version) {
+        mdist = (margin - dist_sq[n]);
+        beta = -alpha * diff[i];
+      } else {
+        Dtype dist = sqrt(dist_sq[n]);
+        mdist = (margin - dist);
+        beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
+      }
+      if (mdist > 0.0) {
+        bottom_diff[i] = beta;
       } else {
         bottom_diff[i] = 0;
       }
@@ -71,12 +89,14 @@ void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const int count = bottom[0]->count();
       const int channels = bottom[0]->channels();
       Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+      const bool legacy_version =
+          this->layer_param_.contrastive_loss_param().legacy_version();
       const Dtype sign = (i == 0) ? 1 : -1;
       const Dtype alpha = sign * top[0]->cpu_diff()[0] /
           static_cast<Dtype>(bottom[0]->num());
       // NOLINT_NEXT_LINE(whitespace/operators)
-      CLLForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-          count, channels, margin, alpha,
+      CLLBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+          count, channels, margin, legacy_version, alpha,
           bottom[2]->gpu_data(),  // pair similarity 0 or 1
           diff_.gpu_data(),  // the cached eltwise difference between a and b
           dist_sq_.gpu_data(),  // the cached square distance between a and b
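Both backends implement the same per-element rule: Backward_cpu applies beta to the cached diff_ through caffe_cpu_axpby, while CLLBackward writes each bottom_diff[i] directly, so the legacy branch must also scale by diff[i] just as the CPU path does (the kernel above folds that factor into beta). A minimal host-side sketch of the rule, with a hypothetical helper name, not part of the patch:

    #include <cmath>

    // Per-element gradient for one coordinate of pair n, as computed by both
    // Backward_cpu and CLLBackward.  "alpha" already carries
    // sign * top_diff / num; "diff" is (a_i - b_i) for this coordinate and
    // "dist_sq" is ||a - b||^2 for the pair.
    template <typename Dtype>
    Dtype cll_bottom_diff(bool similar, bool legacy_version, Dtype margin,
                          Dtype alpha, Dtype diff, Dtype dist_sq) {
      if (similar)
        return alpha * diff;  // pull similar pairs together
      if (legacy_version)     // hinge on the squared distance (pre-patch loss)
        return (margin - dist_sq > Dtype(0)) ? -alpha * diff : Dtype(0);
      Dtype dist = std::sqrt(dist_sq);
      Dtype mdist = margin - dist;
      if (mdist <= Dtype(0))
        return Dtype(0);      // pair already past the margin: no gradient
      return -alpha * mdist / (dist + Dtype(1e-4)) * diff;  // eps guards d -> 0
    }

When a dissimilar pair collapses to d = 0 the diff factor is itself 0, so the epsilon keeps the expression finite instead of producing NaN.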
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 5b21cf20028..fe4ce366972 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -401,8 +401,15 @@ message ConcatParameter {
 
 // Message that stores parameters used by ContrastiveLossLayer
 message ContrastiveLossParameter {
-  //margin for dissimilar pair
+  // margin for dissimilar pair
   optional float margin = 1 [default = 1.0];
+  // The first implementation of this cost did not exactly match the cost of
+  // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
+  // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
+  // Hadsell paper. New models should probably use this version.
+  // legacy_version = true uses (margin - d^2). This is kept to support /
+  // reproduce existing models and results
+  optional bool legacy_version = 2 [default = false];
 }
 
 // Message that stores parameters used by ConvolutionLayer
diff --git a/src/caffe/test/test_contrastive_loss_layer.cpp b/src/caffe/test/test_contrastive_loss_layer.cpp
index d269fbc26f2..1e9447cbc51 100644
--- a/src/caffe/test/test_contrastive_loss_layer.cpp
+++ b/src/caffe/test/test_contrastive_loss_layer.cpp
@@ -22,15 +22,15 @@ class ContrastiveLossLayerTest : public MultiDeviceTest<TypeParam> {
 
  protected:
   ContrastiveLossLayerTest()
-      : blob_bottom_data_i_(new Blob<Dtype>(128, 10, 1, 1)),
-        blob_bottom_data_j_(new Blob<Dtype>(128, 10, 1, 1)),
-        blob_bottom_y_(new Blob<Dtype>(128, 1, 1, 1)),
+      : blob_bottom_data_i_(new Blob<Dtype>(512, 2, 1, 1)),
+        blob_bottom_data_j_(new Blob<Dtype>(512, 2, 1, 1)),
+        blob_bottom_y_(new Blob<Dtype>(512, 1, 1, 1)),
         blob_top_loss_(new Blob<Dtype>()) {
     // fill the values
     FillerParameter filler_param;
-    filler_param.set_mean(0.0);
-    filler_param.set_std(0.3);  // distances~=1.0 to test both sides of margin
-    GaussianFiller<Dtype> filler(filler_param);
+    filler_param.set_min(-1.0);
+    filler_param.set_max(1.0);  // distances~=1.0 to test both sides of margin
+    UniformFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_data_i_);
     blob_bottom_vec_.push_back(blob_bottom_data_i_);
     filler.Fill(this->blob_bottom_data_j_);
@@ -79,7 +79,8 @@ TYPED_TEST(ContrastiveLossLayerTest, TestForward) {
     if (this->blob_bottom_y_->cpu_data()[i]) {  // similar pairs
       loss += dist_sq;
     } else {
-      loss += std::max(margin-dist_sq, Dtype(0));
+      Dtype dist = std::max(margin - sqrt(dist_sq), 0.0);
+      loss += dist*dist;
     }
   }
   loss /= static_cast<Dtype>(num) * Dtype(2);
@@ -99,4 +100,47 @@ TYPED_TEST(ContrastiveLossLayerTest, TestGradient) {
       this->blob_top_vec_, 1);
 }
 
+TYPED_TEST(ContrastiveLossLayerTest, TestForwardLegacy) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_contrastive_loss_param()->set_legacy_version(true);
+  ContrastiveLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // manually compute to compare
+  const Dtype margin = layer_param.contrastive_loss_param().margin();
+  const int num = this->blob_bottom_data_i_->num();
+  const int channels = this->blob_bottom_data_i_->channels();
+  Dtype loss(0);
+  for (int i = 0; i < num; ++i) {
+    Dtype dist_sq(0);
+    for (int j = 0; j < channels; ++j) {
+      Dtype diff = this->blob_bottom_data_i_->cpu_data()[i*channels+j] -
+          this->blob_bottom_data_j_->cpu_data()[i*channels+j];
+      dist_sq += diff*diff;
+    }
+    if (this->blob_bottom_y_->cpu_data()[i]) {  // similar pairs
+      loss += dist_sq;
+    } else {
+      loss += std::max(margin - dist_sq, Dtype(0.0));
+    }
+  }
+  loss /= static_cast<Dtype>(num) * Dtype(2);
+  EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], loss, 1e-6);
+}
+
+TYPED_TEST(ContrastiveLossLayerTest, TestGradientLegacy) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_contrastive_loss_param()->set_legacy_version(true);
+  ContrastiveLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  GradientChecker<Dtype> checker(1e-2, 1e-2, 1701);
+  // check the gradient for the first two bottom layers
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 0);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 1);
+}
+
 }  // namespace caffe
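As a usage-level sanity check, a self-contained toy program in the spirit of TestForward and TestForwardLegacy above, comparing the two variants on a single dissimilar pair (illustrative values, not part of the patch):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      const float a[2] = {0.9f, 0.2f}, b[2] = {0.1f, 0.5f};  // one dissimilar pair
      const float margin = 1.0f;
      float dist_sq = 0.0f;
      for (int j = 0; j < 2; ++j) {
        const float diff = a[j] - b[j];
        dist_sq += diff * diff;
      }
      // legacy_version = true: hinge on the squared distance
      const float legacy = std::max(margin - dist_sq, 0.0f);
      // legacy_version = false (default): squared hinge on the Euclidean
      // distance, as in Hadsell et al. 2006
      const float m = std::max(margin - std::sqrt(dist_sq), 0.0f);
      const float hadsell = m * m;
      std::printf("dist_sq=%.4f  legacy=%.4f  default=%.4f\n",
                  dist_sq, legacy, hadsell);
      return 0;
    }

In a model prototxt the old behavior is selected with contrastive_loss_param { legacy_version: true }; new models should leave the default.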