diff --git a/docs/api/python/gluon/loss.md b/docs/api/python/gluon/loss.md
index 3747a0f89bf2..948f4983d370 100644
--- a/docs/api/python/gluon/loss.md
+++ b/docs/api/python/gluon/loss.md
@@ -25,6 +25,7 @@ This package includes several commonly used loss functions in neural networks.
     LogisticLoss
     TripletLoss
     CTCLoss
+    CosineEmbeddingLoss
    PoissonNLLLoss
 ```
diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py
index 5d3ebb7caf5b..da43b62a1c34 100644
--- a/python/mxnet/gluon/loss.py
+++ b/python/mxnet/gluon/loss.py
@@ -23,7 +23,7 @@
            'SigmoidBinaryCrossEntropyLoss', 'SigmoidBCELoss',
            'SoftmaxCrossEntropyLoss', 'SoftmaxCELoss', 'KLDivLoss', 'CTCLoss', 'HuberLoss', 'HingeLoss',
-           'SquaredHingeLoss', 'LogisticLoss', 'TripletLoss', 'PoissonNLLLoss']
+           'SquaredHingeLoss', 'LogisticLoss', 'TripletLoss', 'PoissonNLLLoss', 'CosineEmbeddingLoss']
 
 import numpy as np
 from .. import ndarray
@@ -767,3 +767,71 @@ def hybrid_forward(self, F, pred, target, sample_weight=None, epsilon=1e-08):
             loss += stirling_factor
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
         return F.mean(loss)
+
+
+class CosineEmbeddingLoss(Loss):
+    r"""For a target label 1 or -1 and a pair of vectors input1 and input2, the
+    function computes a loss based on the cosine similarity between the vectors,
+    i.e. on how similar or dissimilar the two inputs are.
+
+    .. math::
+
+        L_i = \begin{cases} 1 - cos\_sim({input1}_i, {input2}_i) & \text{ if } {label}_i = 1 \\
+              \max(0, cos\_sim({input1}_i, {input2}_i) - margin) & \text{ if } {label}_i = -1 \end{cases}\\
+        cos\_sim({input1}_i, {input2}_i) = \frac{{input1}_i \cdot {input2}_i}{\lVert {input1}_i \rVert \, \lVert {input2}_i \rVert}
+
+    `input1` and `input2` can have arbitrary shape as long as they have the same
+    number of elements.
+
+    Parameters
+    ----------
+    weight : float or None
+        Global scalar weight for loss.
+    batch_axis : int, default 0
+        The axis that represents mini-batch.
+    margin : float, default 0
+        Cosine-similarity margin below which a dissimilar pair (label -1)
+        incurs no loss.
+
+
+    Inputs:
+        - **input1**: a tensor with arbitrary shape
+        - **input2**: another tensor with the same number of elements as input1,
+          to which input1 is compared for similarity and loss calculation
+        - **label**: a 1-D tensor of 1s and -1s indicating, for each pair, whether
+          input1 and input2 should be similar (1) or dissimilar (-1)
+        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
+          to the same shape as input1. For example, if input1 has shape (64, 10)
+          and you want to weigh each sample in the batch separately,
+          sample_weight should have shape (64, 1).
+
+    Outputs:
+        - **loss**: The loss tensor with shape (batch_size, 1).
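+
+    Example (an illustrative sketch; the shapes are arbitrary, and ``mx`` /
+    ``gluon`` are assumed to be imported from ``mxnet``)::
+
+        >>> input1 = mx.nd.random.randn(3, 2)
+        >>> input2 = mx.nd.random.randn(3, 2)
+        >>> label = mx.nd.array([1, -1, 1])
+        >>> loss_fn = gluon.loss.CosineEmbeddingLoss()
+        >>> loss = loss_fn(input1, input2, label)  # shape (3, 1): one value per pair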
+ """ + def __init__(self, weight=None, batch_axis=0, margin=0, **kwargs): + super(CosineEmbeddingLoss, self).__init__(weight, batch_axis, **kwargs) + self._margin = margin + + def hybrid_forward(self, F, input1, input2, label, sample_weight=None): + input1 = _reshape_like(F, input1, input2) + label = label.reshape((-1, 1)) + cos_sim = self._cosine_similarity(F, input1, input2) + y_1 = label == 1 + y_minus_1 = label == -1 + cos_sim_a = (1 - cos_sim) * y_1 + + if F is ndarray: + z_array = F.array([0]) + else: + z_array = F.zeros((1, 1)) + cos_sim_b = F.broadcast_maximum(z_array, y_minus_1 * (cos_sim - self._margin), axis=1) + loss = cos_sim_a + cos_sim_b + loss = _apply_weighting(F, loss, self._weight, sample_weight) + return loss + + def _cosine_similarity(self, F, x, y, axis=-1): + # Calculates the cosine similarity between 2 vectors + x_norm = F.norm(x, axis=axis).reshape(-1, 1) + y_norm = F.norm(y, axis=axis).reshape(-1, 1) + x_dot_y = F.sum(x*y, axis=axis).reshape(-1, 1) + if F is ndarray: + eps_arr = F.array([1e-12]) + else: + eps_arr = F.full((1, 1), 1e-12) + return (x_dot_y / F.broadcast_maximum(x_norm * y_norm, eps_arr)) diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index 2b062fba5ec0..18d1ebf8fb11 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -349,6 +349,23 @@ def test_triplet_loss(): assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 @with_seed() +def test_cosine_loss(): + #Generating samples + input1 = mx.nd.random.randn(3, 2) + input2 = mx.nd.random.randn(3, 2) + label = mx.nd.sign(mx.nd.random.randn(input1.shape[0])) + #Calculating loss from cosine embedding loss function in Gluon + Loss = gluon.loss.CosineEmbeddingLoss() + loss = Loss(input1, input2, label) + + # Calculating the loss Numpy way + numerator = mx.nd.sum(input1 * input2, keepdims=True, axis=1) + denominator = mx.nd.sqrt(mx.nd.sum(input1**2, axis=1, keepdims=True)) \ + * mx.nd.sqrt(mx.nd.sum(input2**2, axis=1, keepdims=True)) + numpy_loss = mx.nd.where(label == 1, 1-numerator/denominator, \ + mx.nd.broadcast_maximum(mx.nd.array([0]), numerator/denominator, axis=1)) + assert_almost_equal(loss.asnumpy(), numpy_loss.asnumpy(), rtol=1e-3, atol=1e-5) + def test_poisson_nllloss(): pred = mx.nd.random.normal(shape=(3, 4)) min_pred = mx.nd.min(pred) @@ -404,6 +421,7 @@ def test_poisson_nllloss_mod(): optimizer='adam') assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + if __name__ == '__main__': import nose nose.runmodule()