diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index 0692c11c257..25e167819d3 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -41,6 +41,8 @@ void ContrastiveLossLayer<Dtype>::Forward_cpu(
       diff_.mutable_cpu_data());  // a_i-b_i
   const int channels = bottom[0]->channels();
   Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
   Dtype loss(0.0);
   for (int i = 0; i < bottom[0]->num(); ++i) {
     dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
@@ -48,7 +50,12 @@ void ContrastiveLossLayer<Dtype>::Forward_cpu(
     if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
       loss += dist_sq_.cpu_data()[i];
     } else {  // dissimilar pairs
-      loss += std::max(margin-dist_sq_.cpu_data()[i], Dtype(0.0));
+      if (legacy_version) {
+        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+      } else {
+        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
+        loss += dist*dist;
+      }
     }
   }
   loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
@@ -59,6 +66,8 @@ template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
   for (int i = 0; i < 2; ++i) {
     if (propagate_down[i]) {
       const Dtype sign = (i == 0) ? 1 : -1;
@@ -76,10 +85,20 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
               Dtype(0.0),
               bout + (j*channels));
         } else {  // dissimilar pairs
-          if ((margin-dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+          Dtype mdist(0.0);
+          Dtype beta(0.0);
+          if (legacy_version) {
+            mdist = margin - dist_sq_.cpu_data()[j];
+            beta = -alpha;
+          } else {
+            Dtype dist = sqrt(dist_sq_.cpu_data()[j]);
+            mdist = margin - dist;
+            beta = -alpha * mdist / (dist + Dtype(1e-4));
+          }
+          if (mdist > Dtype(0.0)) {
             caffe_cpu_axpby(
                 channels,
-                -alpha,
+                beta,
                 diff_.cpu_data() + (j*channels),
                 Dtype(0.0),
                 bout + (j*channels));
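(Editorial note, a sketch derived from the patch, not part of it.) The backward branches above follow from differentiating half of each per-pair loss: Forward_cpu divides the batch sum by 2N, and alpha already folds in the sign, the top gradient, and the 1/N. Writing m for the margin, d = ||a - b||, and diff = a - b, the dissimilar-pair gradients are

    \frac{\partial}{\partial a}\left[\tfrac{1}{2}\max(m - d^2,\,0)\right] = -(a - b) \qquad \text{while } m - d^2 > 0 \quad \text{(legacy)}

    \frac{\partial}{\partial a}\left[\tfrac{1}{2}\max(m - d,\,0)^2\right] = -\frac{m - d}{d}\,(a - b) \qquad \text{while } m - d > 0 \quad \text{(default)}

which is exactly beta = -alpha and beta = -alpha * mdist / (dist + 1e-4) in the code; the Dtype(1e-4) term only guards the division as d approaches 0.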
diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/contrastive_loss_layer.cu
index 78a55995a0a..931239316ac 100644
--- a/src/caffe/layers/contrastive_loss_layer.cu
+++ b/src/caffe/layers/contrastive_loss_layer.cu
@@ -32,12 +32,20 @@ void ContrastiveLossLayer<Dtype>::Forward_gpu(
       Dtype(0.0),
       dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
   Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
   Dtype loss(0.0);
   for (int i = 0; i < bottom[0]->num(); ++i) {
     if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
       loss += dist_sq_.cpu_data()[i];
     } else {  // dissimilar pairs
-      loss += std::max(margin-dist_sq_.cpu_data()[i], Dtype(0.0));
+      if (legacy_version) {
+        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+      } else {
+        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]),
+                              Dtype(0.0));
+        loss += dist*dist;
+      }
     }
   }
   loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
@@ -45,8 +53,8 @@
 }
 
 template <typename Dtype>
-__global__ void CLLForward(const int count, const int channels,
-    const Dtype margin, const Dtype alpha,
+__global__ void CLLBackward(const int count, const int channels,
+    const Dtype margin, const bool legacy_version, const Dtype alpha,
     const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
     Dtype *bottom_diff) {
   CUDA_KERNEL_LOOP(i, count) {
@@ -54,8 +62,18 @@ __global__ void CLLForward(const int count, const int channels,
     if (static_cast<int>(y[n])) {  // similar pairs
       bottom_diff[i] = alpha * diff[i];
     } else {  // dissimilar pairs
-      if ((margin-dist_sq[n]) > 0.0) {
-        bottom_diff[i] = -alpha * diff[i];
+      Dtype mdist(0.0);
+      Dtype beta(0.0);
+      if (legacy_version) {
+        mdist = (margin - dist_sq[n]);
+        beta = -alpha * diff[i];
+      } else {
+        Dtype dist = sqrt(dist_sq[n]);
+        mdist = (margin - dist);
+        beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
+      }
+      if (mdist > 0.0) {
+        bottom_diff[i] = beta;
       } else {
         bottom_diff[i] = 0;
       }
@@ -71,12 +89,14 @@ void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const int count = bottom[0]->count();
       const int channels = bottom[0]->channels();
       Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+      const bool legacy_version =
+          this->layer_param_.contrastive_loss_param().legacy_version();
       const Dtype sign = (i == 0) ? 1 : -1;
       const Dtype alpha = sign * top[0]->cpu_diff()[0] /
           static_cast<Dtype>(bottom[0]->num());
       // NOLINT_NEXT_LINE(whitespace/operators)
-      CLLForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-          count, channels, margin, alpha,
+      CLLBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+          count, channels, margin, legacy_version, alpha,
           bottom[2]->gpu_data(),  // pair similarity 0 or 1
           diff_.gpu_data(),  // the cached eltwise difference between a and b
           dist_sq_.gpu_data(),  // the cached square distance between a and b
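Both backends implement the same per-element rule: Backward_cpu applies beta to the cached diff_ through caffe_cpu_axpby, while CLLBackward writes each bottom_diff[i] directly, so the legacy branch must also scale by diff[i] just as the CPU path does (the kernel above folds that factor into beta). A minimal host-side sketch of the rule, with a hypothetical helper name, not part of the patch:

    #include <cmath>

    // Per-element gradient for one coordinate of pair n, as computed by both
    // Backward_cpu and CLLBackward.  "alpha" already carries
    // sign * top_diff / num; "diff" is (a_i - b_i) for this coordinate and
    // "dist_sq" is ||a - b||^2 for the pair.
    template <typename Dtype>
    Dtype cll_bottom_diff(bool similar, bool legacy_version, Dtype margin,
                          Dtype alpha, Dtype diff, Dtype dist_sq) {
      if (similar)
        return alpha * diff;  // pull similar pairs together
      if (legacy_version)     // hinge on the squared distance (pre-patch loss)
        return (margin - dist_sq > Dtype(0)) ? -alpha * diff : Dtype(0);
      Dtype dist = std::sqrt(dist_sq);
      Dtype mdist = margin - dist;
      if (mdist <= Dtype(0))
        return Dtype(0);      // pair already past the margin: no gradient
      return -alpha * mdist / (dist + Dtype(1e-4)) * diff;  // eps guards d -> 0
    }

When a dissimilar pair collapses to d = 0 the diff factor is itself 0, so the epsilon keeps the expression finite instead of producing NaN.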
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 5b21cf20028..fe4ce366972 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -401,8 +401,15 @@ message ConcatParameter {
 
 // Message that stores parameters used by ContrastiveLossLayer
 message ContrastiveLossParameter {
-  //margin for dissimilar pair
+  // margin for dissimilar pair
   optional float margin = 1 [default = 1.0];
+  // The first implementation of this cost did not exactly match the cost of
+  // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
+  // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
+  // Hadsell paper. New models should probably use this version.
+  // legacy_version = true uses (margin - d^2). This is kept to support /
+  // reproduce existing models and results
+  optional bool legacy_version = 2 [default = false];
 }
 
 // Message that stores parameters used by ConvolutionLayer
diff --git a/src/caffe/test/test_contrastive_loss_layer.cpp b/src/caffe/test/test_contrastive_loss_layer.cpp
index d269fbc26f2..1e9447cbc51 100644
--- a/src/caffe/test/test_contrastive_loss_layer.cpp
+++ b/src/caffe/test/test_contrastive_loss_layer.cpp
@@ -22,15 +22,15 @@ class ContrastiveLossLayerTest : public MultiDeviceTest<TypeParam> {
 
  protected:
   ContrastiveLossLayerTest()
-      : blob_bottom_data_i_(new Blob<Dtype>(128, 10, 1, 1)),
-        blob_bottom_data_j_(new Blob<Dtype>(128, 10, 1, 1)),
-        blob_bottom_y_(new Blob<Dtype>(128, 1, 1, 1)),
+      : blob_bottom_data_i_(new Blob<Dtype>(512, 2, 1, 1)),
+        blob_bottom_data_j_(new Blob<Dtype>(512, 2, 1, 1)),
+        blob_bottom_y_(new Blob<Dtype>(512, 1, 1, 1)),
         blob_top_loss_(new Blob<Dtype>()) {
     // fill the values
     FillerParameter filler_param;
-    filler_param.set_mean(0.0);
-    filler_param.set_std(0.3);  // distances~=1.0 to test both sides of margin
-    GaussianFiller<Dtype> filler(filler_param);
+    filler_param.set_min(-1.0);
+    filler_param.set_max(1.0);  // distances~=1.0 to test both sides of margin
+    UniformFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_data_i_);
     blob_bottom_vec_.push_back(blob_bottom_data_i_);
     filler.Fill(this->blob_bottom_data_j_);
@@ -79,7 +79,8 @@ TYPED_TEST(ContrastiveLossLayerTest, TestForward) {
     if (this->blob_bottom_y_->cpu_data()[i]) {  // similar pairs
       loss += dist_sq;
     } else {
-      loss += std::max(margin-dist_sq, Dtype(0));
+      Dtype dist = std::max(margin - sqrt(dist_sq), 0.0);
+      loss += dist*dist;
     }
   }
   loss /= static_cast<Dtype>(num) * Dtype(2);
@@ -99,4 +100,47 @@ TYPED_TEST(ContrastiveLossLayerTest, TestGradient) {
       this->blob_top_vec_, 1);
 }
 
+TYPED_TEST(ContrastiveLossLayerTest, TestForwardLegacy) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_contrastive_loss_param()->set_legacy_version(true);
+  ContrastiveLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // manually compute to compare
+  const Dtype margin = layer_param.contrastive_loss_param().margin();
+  const int num = this->blob_bottom_data_i_->num();
+  const int channels = this->blob_bottom_data_i_->channels();
+  Dtype loss(0);
+  for (int i = 0; i < num; ++i) {
+    Dtype dist_sq(0);
+    for (int j = 0; j < channels; ++j) {
+      Dtype diff = this->blob_bottom_data_i_->cpu_data()[i*channels+j] -
+          this->blob_bottom_data_j_->cpu_data()[i*channels+j];
+      dist_sq += diff*diff;
+    }
+    if (this->blob_bottom_y_->cpu_data()[i]) {  // similar pairs
+      loss += dist_sq;
+    } else {
+      loss += std::max(margin - dist_sq, Dtype(0.0));
+    }
+  }
+  loss /= static_cast<Dtype>(num) * Dtype(2);
+  EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], loss, 1e-6);
+}
+
+TYPED_TEST(ContrastiveLossLayerTest, TestGradientLegacy) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_contrastive_loss_param()->set_legacy_version(true);
+  ContrastiveLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  GradientChecker<Dtype> checker(1e-2, 1e-2, 1701);
+  // check the gradient for the first two bottom layers
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 0);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 1);
+}
+
 }  // namespace caffe
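As a usage-level sanity check, a self-contained toy program in the spirit of TestForward and TestForwardLegacy above, comparing the two variants on a single dissimilar pair (illustrative values, not part of the patch):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      const float a[2] = {0.9f, 0.2f}, b[2] = {0.1f, 0.5f};  // one dissimilar pair
      const float margin = 1.0f;
      float dist_sq = 0.0f;
      for (int j = 0; j < 2; ++j) {
        const float diff = a[j] - b[j];
        dist_sq += diff * diff;
      }
      // legacy_version = true: hinge on the squared distance
      const float legacy = std::max(margin - dist_sq, 0.0f);
      // legacy_version = false (default): squared hinge on the Euclidean
      // distance, as in Hadsell et al. 2006
      const float m = std::max(margin - std::sqrt(dist_sq), 0.0f);
      const float hadsell = m * m;
      std::printf("dist_sq=%.4f  legacy=%.4f  default=%.4f\n",
                  dist_sq, legacy, hadsell);
      return 0;
    }

In a model prototxt the old behavior is selected with contrastive_loss_param { legacy_version: true }; new models should leave the default.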