From 79afc4a063b77b7e0ca8f1ceb7794c4b1b408f07 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Feb 2018 18:00:43 -0800 Subject: [PATCH 01/10] add macro/micro f1 and test and binary abstraction --- python/mxnet/metric.py | 131 +++++++++++++++++++-------- tests/python/unittest/test_metric.py | 47 ++++++++++ 2 files changed, 141 insertions(+), 37 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 8bb3f6ee0a81..86e0863c7cc3 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -155,6 +155,9 @@ def get_name_value(self): value = [value] return list(zip(name, value)) + def macro(self): + return MacroMetric(self) + # pylint: disable=invalid-name register = registry.get_register_func(EvalMetric, 'metric') alias = registry.get_alias_func(EvalMetric, 'metric') @@ -475,8 +478,72 @@ def update(self, labels, preds): self.num_inst += num_samples +class BinaryClassificationMetric(EvalMetric): + + def __init__(self): + self.num_inst = 0 + self.true_positives = 0 + self.false_negatives = 0 + self.false_positives = 0 + + def _update_binary_stats(self, label, pred): + """Updates the internal evaluation result. + + Parameters + ---------- + labels : list of `NDArray` + The labels of the data. + + preds : list of `NDArray` + Predicted values. + """ + pred = pred.asnumpy() + label = label.asnumpy().astype('int32') + pred_label = numpy.argmax(pred, axis=1) + + check_label_shapes(label, pred) + if len(numpy.unique(label)) > 2: + raise ValueError("F1 currently only supports binary classification.") + + for y_pred, y_true in zip(pred_label, label): + if y_pred == 1 and y_true == 1: + self.true_positives += 1. + elif y_pred == 1 and y_true == 0: + self.false_positives += 1. + elif y_pred == 0 and y_true == 1: + self.false_negatives += 1. + self.num_inst += label.shape[0] + + + @property + def _precision(self): + if self.true_positives + self.false_positives > 0: + return self.true_positives / (self.true_positives + self.false_positives) + else: + return 0. + + @property + def _recall(self): + if self.true_positives + self.false_negatives > 0: + return self.true_positives / (self.true_positives + self.false_negatives) + else: + return 0. + + @property + def _fscore(self): + if self._precision + self._recall > 0: + return 2 * self._precision * self._recall / (self._precision + self._recall) + else: + return 0. + + def reset(self): + self.num_inst = 0 + self.false_positives = 0 + self.false_negatives = 0 + self.true_positives = 0 + @register -class F1(EvalMetric): +class F1(BinaryClassificationMetric): """Computes the F1 score of a binary classification problem. The F1 score is equivalent to weighted average of the precision and recall, @@ -508,16 +575,16 @@ class F1(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0., 1., 1.])] - >>> acc = mx.metric.F1() - >>> acc.update(preds = predicts, labels = labels) - >>> print acc.get() + >>> f1 = mx.metric.F1() + >>> f1.update(preds = predicts, labels = labels) + >>> print f1.get() ('f1', 0.8) """ - def __init__(self, name='f1', - output_names=None, label_names=None): - super(F1, self).__init__( - name, output_names=output_names, label_names=label_names) + def __init__(self, name='f1', output_names=None, label_names=None): + BinaryClassificationMetric.__init__(self) + EvalMetric.__init__(self, + name, output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -533,41 +600,31 @@ def update(self, labels, preds): check_label_shapes(labels, preds) for label, pred in zip(labels, preds): - pred = pred.asnumpy() - label = label.asnumpy().astype('int32') - pred_label = numpy.argmax(pred, axis=1) + self._update_binary_stats(label, pred) - check_label_shapes(label, pred) - if len(numpy.unique(label)) > 2: - raise ValueError("F1 currently only supports binary classification.") - - true_positives, false_positives, false_negatives = 0., 0., 0. + def get(self): + if self.num_inst == 0: + return self.name, float('nan') + else: + return self.name, self._fscore - for y_pred, y_true in zip(pred_label, label): - if y_pred == 1 and y_true == 1: - true_positives += 1. - elif y_pred == 1 and y_true == 0: - false_positives += 1. - elif y_pred == 0 and y_true == 1: - false_negatives += 1. +class MacroMetric(EvalMetric): - if true_positives + false_positives > 0: - precision = true_positives / (true_positives + false_positives) - else: - precision = 0. + def __init__(self, base_metric): + super(MacroMetric, self).__init__("macro_" + base_metric.name, output_names=base_metric.output_names, + label_names=base_metric.label_names) + self.base_metric = base_metric - if true_positives + false_negatives > 0: - recall = true_positives / (true_positives + false_negatives) - else: - recall = 0. + def update(self, labels, preds): + self.base_metric.update(labels, preds) + self.sum_metric += self.base_metric.get()[1] + self.num_inst += 1 + self.base_metric.reset() - if precision + recall > 0: - f1_score = 2 * precision * recall / (precision + recall) - else: - f1_score = 0. +class MacroF1(MacroMetric): - self.sum_metric += f1_score - self.num_inst += 1 + def __init__(self, name, output_names, label_names): + super(MacroF1, self).__init__(F1(name, output_names=output_names, label_names=label_names)) @register diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 0f2f27f9eb24..090d29d8c94e 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -26,6 +26,53 @@ def check_metric(metric, *args, **kwargs): assert metric.get_config() == metric2.get_config() +def test_f1(): + microF1 = mx.metric.F1() + macroF1 = mx.metric.F1().macro() + + assert np.isnan(macroF1.get()[1]) + assert np.isnan(microF1.get()[1]) + + # check divide by zero + pred = mx.nd.array([[0.9, 0.1], + [0.8, 0.2]]) + label = mx.nd.array([0, 0]) + macroF1.update([label], [pred]) + microF1.update([label], [pred]) + assert macroF1.get()[1] == 0.0 + assert microF1.get()[1] == 0.0 + macroF1.reset() + microF1.reset() + + pred11 = mx.nd.array([[0.1, 0.9], + [0.5, 0.5]]) + label11 = mx.nd.array([1, 0]) + pred12 = mx.nd.array([[0.85, 0.15], + [1.0, 0.0]]) + label12 = mx.nd.array([1, 0]) + pred21 = mx.nd.array([[0.6, 0.4]]) + label21 = mx.nd.array([0]) + pred22 = mx.nd.array([[0.2, 0.8]]) + label22 = mx.nd.array([1]) + + microF1.update([label11, label12], [pred11, pred12]) + macroF1.update([label11, label12], [pred11, pred12]) + assert microF1.num_inst == 4 + assert macroF1.num_inst == 1 + # f1 = 2 * tp / (2 * tp + fp + fn) + fscore1 = 2 * (1) / (2 * 1 + 1 + 0) + assert microF1.get()[1] == fscore1 + assert macroF1.get()[1] == fscore1 + + microF1.update([label21, label22], [pred21, pred22]) + macroF1.update([label21, label22], [pred21, pred22]) + assert microF1.num_inst == 6 + assert macroF1.num_inst == 2 + fscore2 = 2 * (1) / (2 * 1 + 0 + 0) + assert macroF1.get()[1] == (fscore1 + fscore2) / 2 + fscore_total = 2 * (1 + 1) / (2 * (1 + 1) + (1 + 0) + 
(0 + 0)) + assert microF1.get()[1] == fscore_total + def test_metrics(): check_metric('acc', axis=0) From bc44e9d744f133b4814939605accec006c61efd2 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Feb 2018 19:11:06 -0800 Subject: [PATCH 02/10] make average an option --- python/mxnet/metric.py | 98 ++++++++++++++-------------- tests/python/unittest/test_metric.py | 12 ++-- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 86e0863c7cc3..bd94ad0c0391 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -155,9 +155,6 @@ def get_name_value(self): value = [value] return list(zip(name, value)) - def macro(self): - return MacroMetric(self) - # pylint: disable=invalid-name register = registry.get_register_func(EvalMetric, 'metric') alias = registry.get_alias_func(EvalMetric, 'metric') @@ -478,23 +475,28 @@ def update(self, labels, preds): self.num_inst += num_samples -class BinaryClassificationMetric(EvalMetric): +class _BinaryClassificationMixin(object): + """ + Private class for keeping track of TPR, FPR, TNR, FNR counts for a classification metric. This + class is not intended to be instantiated directly, but extended by concrete metric types. + """ def __init__(self): - self.num_inst = 0 - self.true_positives = 0 - self.false_negatives = 0 - self.false_positives = 0 + self._true_positives = 0 + self._false_negatives = 0 + self._false_positives = 0 + self._true_negatives = 0 def _update_binary_stats(self, label, pred): - """Updates the internal evaluation result. + """ + Update various binary classification counts for a single (label, pred) pair. Parameters ---------- - labels : list of `NDArray` + labels : `NDArray` The labels of the data. - preds : list of `NDArray` + preds : `NDArray` Predicted values. """ pred = pred.asnumpy() @@ -507,25 +509,26 @@ def _update_binary_stats(self, label, pred): for y_pred, y_true in zip(pred_label, label): if y_pred == 1 and y_true == 1: - self.true_positives += 1. + self._true_positives += 1. elif y_pred == 1 and y_true == 0: - self.false_positives += 1. + self._false_positives += 1. elif y_pred == 0 and y_true == 1: - self.false_negatives += 1. - self.num_inst += label.shape[0] + self._false_negatives += 1. + else: + self._true_negatives += 1. @property def _precision(self): - if self.true_positives + self.false_positives > 0: - return self.true_positives / (self.true_positives + self.false_positives) + if self._true_positives + self._false_positives > 0: + return self._true_positives / (self._true_positives + self._false_positives) else: return 0. @property def _recall(self): - if self.true_positives + self.false_negatives > 0: - return self.true_positives / (self.true_positives + self.false_negatives) + if self._true_positives + self._false_negatives > 0: + return self._true_positives / (self._true_positives + self._false_negatives) else: return 0. @@ -536,14 +539,17 @@ def _fscore(self): else: return 0. - def reset(self): - self.num_inst = 0 - self.false_positives = 0 - self.false_negatives = 0 - self.true_positives = 0 + def _total_examples(self): + return self._false_negatives + self._false_positives + self._true_negatives + self._true_positives + + def _reset_stats(self): + self._false_positives = 0 + self._false_negatives = 0 + self._true_positives = 0 + self._true_negatives = 0 @register -class F1(BinaryClassificationMetric): +class F1(EvalMetric, _BinaryClassificationMixin): """Computes the F1 score of a binary classification problem. 
The F1 score is equivalent to weighted average of the precision and recall, @@ -570,6 +576,10 @@ class F1(BinaryClassificationMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. + average : str + Strategy to be used for aggregating across micro-batches. + "macro": average the F1 scores for each batch + "micro": compute a single F1 score across all batches Examples -------- @@ -581,10 +591,11 @@ class F1(BinaryClassificationMetric): ('f1', 0.8) """ - def __init__(self, name='f1', output_names=None, label_names=None): - BinaryClassificationMetric.__init__(self) + def __init__(self, name='f1', output_names=None, label_names=None, average="macro"): + _BinaryClassificationMixin.__init__(self) EvalMetric.__init__(self, name, output_names=output_names, label_names=label_names) + self.average = average def update(self, labels, preds): """Updates the internal evaluation result. @@ -601,30 +612,19 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): self._update_binary_stats(label, pred) - - def get(self): - if self.num_inst == 0: - return self.name, float('nan') + if self.average == "macro": + self.sum_metric += self._fscore + self.num_inst += 1 + self._reset_stats() else: - return self.name, self._fscore - -class MacroMetric(EvalMetric): - - def __init__(self, base_metric): - super(MacroMetric, self).__init__("macro_" + base_metric.name, output_names=base_metric.output_names, - label_names=base_metric.label_names) - self.base_metric = base_metric + self.sum_metric = self._fscore * self._total_examples() + self.num_inst = self._total_examples() - def update(self, labels, preds): - self.base_metric.update(labels, preds) - self.sum_metric += self.base_metric.get()[1] - self.num_inst += 1 - self.base_metric.reset() - -class MacroF1(MacroMetric): - - def __init__(self, name, output_names, label_names): - super(MacroF1, self).__init__(F1(name, output_names=output_names, label_names=label_names)) + def reset(self): + """Resets the internal evaluation result to initial state.""" + self.sum_metric = 0. + self.num_inst = 0. 
+ self._reset_stats() @register diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 090d29d8c94e..75ab117052e8 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -27,8 +27,8 @@ def check_metric(metric, *args, **kwargs): assert metric.get_config() == metric2.get_config() def test_f1(): - microF1 = mx.metric.F1() - macroF1 = mx.metric.F1().macro() + microF1 = mx.metric.F1(average="micro") + macroF1 = mx.metric.F1(average="macro") assert np.isnan(macroF1.get()[1]) assert np.isnan(microF1.get()[1]) @@ -61,17 +61,17 @@ def test_f1(): assert macroF1.num_inst == 1 # f1 = 2 * tp / (2 * tp + fp + fn) fscore1 = 2 * (1) / (2 * 1 + 1 + 0) - assert microF1.get()[1] == fscore1 - assert macroF1.get()[1] == fscore1 + np.testing.assert_almost_equal(microF1.get()[1], fscore1) + np.testing.assert_almost_equal(macroF1.get()[1], fscore1) microF1.update([label21, label22], [pred21, pred22]) macroF1.update([label21, label22], [pred21, pred22]) assert microF1.num_inst == 6 assert macroF1.num_inst == 2 fscore2 = 2 * (1) / (2 * 1 + 0 + 0) - assert macroF1.get()[1] == (fscore1 + fscore2) / 2 fscore_total = 2 * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) - assert microF1.get()[1] == fscore_total + np.testing.assert_almost_equal(microF1.get()[1], fscore_total) + np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2) def test_metrics(): From 7ec5a8861a622b450492094ce23a55fb0d35224d Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Feb 2018 19:54:57 -0800 Subject: [PATCH 03/10] use metric.create --- python/mxnet/metric.py | 26 ++++++---- tests/python/unittest/test_metric.py | 75 +++++++++++----------------- 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index bd94ad0c0391..d0400da0ff6c 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -477,8 +477,7 @@ def update(self, labels, preds): class _BinaryClassificationMixin(object): """ - Private class for keeping track of TPR, FPR, TNR, FNR counts for a classification metric. This - class is not intended to be instantiated directly, but extended by concrete metric types. + Private mixin for keeping track of TPR, FPR, TNR, FNR counts for a classification metric. """ def __init__(self): @@ -489,14 +488,15 @@ def __init__(self): def _update_binary_stats(self, label, pred): """ - Update various binary classification counts for a single (label, pred) pair. + Update various binary classification counts for a single (label, pred) + pair. Parameters ---------- - labels : `NDArray` + label : `NDArray` The labels of the data. - preds : `NDArray` + pred : `NDArray` Predicted values. """ pred = pred.asnumpy() @@ -539,8 +539,10 @@ def _fscore(self): else: return 0. 
+ @property def _total_examples(self): - return self._false_negatives + self._false_positives + self._true_negatives + self._true_positives + return self._false_negatives + self._false_positives + \ + self._true_negatives + self._true_positives def _reset_stats(self): self._false_positives = 0 @@ -591,10 +593,11 @@ class F1(EvalMetric, _BinaryClassificationMixin): ('f1', 0.8) """ - def __init__(self, name='f1', output_names=None, label_names=None, average="macro"): + def __init__(self, name='f1', + output_names=None, label_names=None, average="macro"): _BinaryClassificationMixin.__init__(self) - EvalMetric.__init__(self, - name, output_names=output_names, label_names=label_names) + EvalMetric.__init__(self, name=name, + output_names=output_names, label_names=label_names) self.average = average def update(self, labels, preds): @@ -612,13 +615,14 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): self._update_binary_stats(label, pred) + if self.average == "macro": self.sum_metric += self._fscore self.num_inst += 1 self._reset_stats() else: - self.sum_metric = self._fscore * self._total_examples() - self.num_inst = self._total_examples() + self.sum_metric = self._fscore * self._total_examples + self.num_inst = self._total_examples def reset(self): """Resets the internal evaluation result to initial state.""" diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 75ab117052e8..ad0d35d642fc 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -26,8 +26,36 @@ def check_metric(metric, *args, **kwargs): assert metric.get_config() == metric2.get_config() +def test_metrics(): + check_metric('acc', axis=0) + check_metric('f1') + check_metric('perplexity', -1) + check_metric('pearsonr') + check_metric('nll_loss') + composite = mx.metric.create(['acc', 'f1']) + check_metric(composite) + +def test_nll_loss(): + metric = mx.metric.create('nll_loss') + pred = mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) + label = mx.nd.array([2, 1]) + metric.update([label], [pred]) + _, loss = metric.get() + expected_loss = 0.0 + expected_loss = -(np.log(pred[0][2].asscalar()) + np.log(pred[1][1].asscalar())) / 2 + assert loss == expected_loss + +def test_acc(): + pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) + label = mx.nd.array([0, 1, 1]) + metric = mx.metric.create('acc') + metric.update([label], [pred]) + _, acc = metric.get() + expected_acc = (np.argmax(pred, axis=1) == label).sum().asscalar() / label.size + assert acc == expected_acc + def test_f1(): - microF1 = mx.metric.F1(average="micro") + microF1 = mx.metric.create("f1", average="micro") macroF1 = mx.metric.F1(average="macro") assert np.isnan(macroF1.get()[1]) @@ -45,7 +73,7 @@ def test_f1(): microF1.reset() pred11 = mx.nd.array([[0.1, 0.9], - [0.5, 0.5]]) + [0.5, 0.5]]) label11 = mx.nd.array([1, 0]) pred12 = mx.nd.array([[0.85, 0.15], [1.0, 0.0]]) @@ -73,49 +101,6 @@ def test_f1(): np.testing.assert_almost_equal(microF1.get()[1], fscore_total) np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2) - -def test_metrics(): - check_metric('acc', axis=0) - check_metric('f1') - check_metric('perplexity', -1) - check_metric('pearsonr') - check_metric('nll_loss') - composite = mx.metric.create(['acc', 'f1']) - check_metric(composite) - -def test_nll_loss(): - metric = mx.metric.create('nll_loss') - pred = mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) - label = mx.nd.array([2, 1]) - metric.update([label], [pred]) - _, loss = 
metric.get() - expected_loss = 0.0 - expected_loss = -(np.log(pred[0][2].asscalar()) + np.log(pred[1][1].asscalar())) / 2 - assert loss == expected_loss - -def test_acc(): - pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) - label = mx.nd.array([0, 1, 1]) - metric = mx.metric.create('acc') - metric.update([label], [pred]) - _, acc = metric.get() - expected_acc = (np.argmax(pred, axis=1) == label).sum().asscalar() / label.size - assert acc == expected_acc - -def test_f1(): - pred = mx.nd.array([[0.3, 0.7], [1., 0], [0.4, 0.6], [0.6, 0.4], [0.9, 0.1]]) - label = mx.nd.array([0, 1, 1, 1, 1]) - positives = np.argmax(pred, axis=1).sum().asscalar() - true_positives = (np.argmax(pred, axis=1) == label).sum().asscalar() - precision = true_positives / positives - overall_positives = label.sum().asscalar() - recall = true_positives / overall_positives - f1_expected = 2 * (precision * recall) / (precision + recall) - metric = mx.metric.create('f1') - metric.update([label], [pred]) - _, f1 = metric.get() - assert f1 == f1_expected - def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) label = mx.nd.array([0, 1, 1]) From dc0f0e84aa6dce9635ce8315bce48453de64b287 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Feb 2018 23:01:06 -0800 Subject: [PATCH 04/10] add decimal for float division --- tests/python/unittest/test_metric.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index ad0d35d642fc..fee8b66e3af5 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -88,7 +88,7 @@ def test_f1(): assert microF1.num_inst == 4 assert macroF1.num_inst == 1 # f1 = 2 * tp / (2 * tp + fp + fn) - fscore1 = 2 * (1) / (2 * 1 + 1 + 0) + fscore1 = 2. * (1) / (2 * 1 + 1 + 0) np.testing.assert_almost_equal(microF1.get()[1], fscore1) np.testing.assert_almost_equal(macroF1.get()[1], fscore1) @@ -96,10 +96,10 @@ def test_f1(): macroF1.update([label21, label22], [pred21, pred22]) assert microF1.num_inst == 6 assert macroF1.num_inst == 2 - fscore2 = 2 * (1) / (2 * 1 + 0 + 0) - fscore_total = 2 * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) + fscore2 = 2. * (1) / (2 * 1 + 0 + 0) + fscore_total = 2. * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) np.testing.assert_almost_equal(microF1.get()[1], fscore_total) - np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2) + np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) From e448019cbb919f3cfc61e78f1b8cf6ec02bc9186 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 13 Feb 2018 09:31:53 -0800 Subject: [PATCH 05/10] add default in docstring, reference generic base class in error msg --- python/mxnet/metric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index d0400da0ff6c..76c168024e31 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -505,7 +505,8 @@ def _update_binary_stats(self, label, pred): check_label_shapes(label, pred) if len(numpy.unique(label)) > 2: - raise ValueError("F1 currently only supports binary classification.") + raise ValueError("%s currently only supports binary classification." 
+ % self.__class__.__name__) for y_pred, y_true in zip(pred_label, label): if y_pred == 1 and y_true == 1: @@ -578,7 +579,7 @@ class F1(EvalMetric, _BinaryClassificationMixin): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - average : str + average : str, default 'macro' Strategy to be used for aggregating across micro-batches. "macro": average the F1 scores for each batch "micro": compute a single F1 score across all batches From 3c86317a7b0ee5f48733854b10d384b9908c5cd4 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 13 Feb 2018 12:20:14 -0800 Subject: [PATCH 06/10] expand on docstring --- python/mxnet/metric.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 76c168024e31..28976ae185cf 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -477,7 +477,9 @@ def update(self, labels, preds): class _BinaryClassificationMixin(object): """ - Private mixin for keeping track of TPR, FPR, TNR, FNR counts for a classification metric. + Private mixin for classification metrics. True/false positive rate and true/false negative + rate are sufficient statistics for various concrete metrics. This class provides the machinery + to track those statistics across mini-batches of (label, prediction) pairs. """ def __init__(self): From 797c01c360ed20b633b47e5c68f3ef53b2186cde Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 14 Feb 2018 08:51:09 -0800 Subject: [PATCH 07/10] use scikit in test --- tests/python/unittest/test_metric.py | 61 ++++++++++++++++------------ 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index fee8b66e3af5..81d6ab5f9e37 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -19,6 +19,8 @@ import numpy as np import json +from sklearn.metrics import f1_score as scikit_f1 + def check_metric(metric, *args, **kwargs): metric = mx.metric.create(metric, *args, **kwargs) str_metric = json.dumps(metric.get_config()) @@ -55,22 +57,22 @@ def test_acc(): assert acc == expected_acc def test_f1(): - microF1 = mx.metric.create("f1", average="micro") - macroF1 = mx.metric.F1(average="macro") + micro_f1 = mx.metric.create("f1", average="micro") + macro_f1 = mx.metric.F1(average="macro") - assert np.isnan(macroF1.get()[1]) - assert np.isnan(microF1.get()[1]) + assert np.isnan(macro_f1.get()[1]) + assert np.isnan(micro_f1.get()[1]) # check divide by zero pred = mx.nd.array([[0.9, 0.1], [0.8, 0.2]]) label = mx.nd.array([0, 0]) - macroF1.update([label], [pred]) - microF1.update([label], [pred]) - assert macroF1.get()[1] == 0.0 - assert microF1.get()[1] == 0.0 - macroF1.reset() - microF1.reset() + macro_f1.update([label], [pred]) + micro_f1.update([label], [pred]) + assert macro_f1.get()[1] == 0.0 + assert micro_f1.get()[1] == 0.0 + macro_f1.reset() + micro_f1.reset() pred11 = mx.nd.array([[0.1, 0.9], [0.5, 0.5]]) @@ -83,23 +85,28 @@ def test_f1(): pred22 = mx.nd.array([[0.2, 0.8]]) label22 = mx.nd.array([1]) - microF1.update([label11, label12], [pred11, pred12]) - macroF1.update([label11, label12], [pred11, pred12]) - assert microF1.num_inst == 4 - assert macroF1.num_inst == 1 - # f1 = 2 * tp / (2 * tp + fp + fn) - fscore1 = 2. 
* (1) / (2 * 1 + 1 + 0) - np.testing.assert_almost_equal(microF1.get()[1], fscore1) - np.testing.assert_almost_equal(macroF1.get()[1], fscore1) - - microF1.update([label21, label22], [pred21, pred22]) - macroF1.update([label21, label22], [pred21, pred22]) - assert microF1.num_inst == 6 - assert macroF1.num_inst == 2 - fscore2 = 2. * (1) / (2 * 1 + 0 + 0) - fscore_total = 2. * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) - np.testing.assert_almost_equal(microF1.get()[1], fscore_total) - np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) + micro_f1.update([label11, label12], [pred11, pred12]) + macro_f1.update([label11, label12], [pred11, pred12]) + assert micro_f1.num_inst == 4 + assert macro_f1.num_inst == 1 + np_pred1 = np.concatenate([mx.nd.argmax(pred11, axis=1).asnumpy(), + mx.nd.argmax(pred12, axis=1).asnumpy()]) + np_label1 = np.concatenate([label11.asnumpy(), label12.asnumpy()]) + np.testing.assert_almost_equal(micro_f1.get()[1], scikit_f1(np_label1, np_pred1)) + np.testing.assert_almost_equal(macro_f1.get()[1], scikit_f1(np_label1, np_pred1)) + + micro_f1.update([label21, label22], [pred21, pred22]) + macro_f1.update([label21, label22], [pred21, pred22]) + assert micro_f1.num_inst == 6 + assert macro_f1.num_inst == 2 + np_pred2 = np.concatenate([mx.nd.argmax(pred21, axis=1).asnumpy(), + mx.nd.argmax(pred22, axis=1).asnumpy()]) + np_pred_total = np.concatenate([np_pred1, np_pred2]) + np_label2 = np.concatenate([label21.asnumpy(), label22.asnumpy()]) + np_label_total = np.concatenate([np_label1, np_label2]) + np.testing.assert_almost_equal(micro_f1.get()[1], scikit_f1(np_label_total, np_pred_total)) + np.testing.assert_almost_equal(macro_f1.get()[1], (scikit_f1(np_label1, np_pred1) + + scikit_f1(np_label2, np_pred2)) / 2) def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) From 115a6352e6aa5ed68e3f5238b126b0231971e17a Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 14 Feb 2018 11:17:35 -0800 Subject: [PATCH 08/10] Revert "use scikit in test" This reverts commit 797c01c360ed20b633b47e5c68f3ef53b2186cde. 
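Patch 07 added a hard scikit-learn import to the unit tests; this patch removes it
again, returning to hand-computed expected scores. For anyone who still wants the
scikit-learn cross-check locally, a minimal sketch is shown below. It assumes pytest
is the test runner and treats sklearn as optional via `pytest.importorskip`, so the
suite keeps passing when scikit-learn is not installed; the test name and data are
illustrative and not part of this series.

    import numpy as np
    import pytest

    def test_f1_against_sklearn():
        # Skip (rather than fail) when scikit-learn is unavailable.
        sklearn_metrics = pytest.importorskip('sklearn.metrics')
        import mxnet as mx

        micro_f1 = mx.metric.create('f1', average='micro')
        pred = mx.nd.array([[0.1, 0.9], [0.5, 0.5], [0.85, 0.15], [1.0, 0.0]])
        label = mx.nd.array([1, 0, 1, 0])
        micro_f1.update([label], [pred])

        # Compare the pooled (micro) F1 against scikit-learn's reference value.
        np_pred = np.argmax(pred.asnumpy(), axis=1)
        np.testing.assert_almost_equal(micro_f1.get()[1],
                                       sklearn_metrics.f1_score(label.asnumpy(), np_pred))
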
--- tests/python/unittest/test_metric.py | 61 ++++++++++++---------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 81d6ab5f9e37..fee8b66e3af5 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -19,8 +19,6 @@ import numpy as np import json -from sklearn.metrics import f1_score as scikit_f1 - def check_metric(metric, *args, **kwargs): metric = mx.metric.create(metric, *args, **kwargs) str_metric = json.dumps(metric.get_config()) @@ -57,22 +55,22 @@ def test_acc(): assert acc == expected_acc def test_f1(): - micro_f1 = mx.metric.create("f1", average="micro") - macro_f1 = mx.metric.F1(average="macro") + microF1 = mx.metric.create("f1", average="micro") + macroF1 = mx.metric.F1(average="macro") - assert np.isnan(macro_f1.get()[1]) - assert np.isnan(micro_f1.get()[1]) + assert np.isnan(macroF1.get()[1]) + assert np.isnan(microF1.get()[1]) # check divide by zero pred = mx.nd.array([[0.9, 0.1], [0.8, 0.2]]) label = mx.nd.array([0, 0]) - macro_f1.update([label], [pred]) - micro_f1.update([label], [pred]) - assert macro_f1.get()[1] == 0.0 - assert micro_f1.get()[1] == 0.0 - macro_f1.reset() - micro_f1.reset() + macroF1.update([label], [pred]) + microF1.update([label], [pred]) + assert macroF1.get()[1] == 0.0 + assert microF1.get()[1] == 0.0 + macroF1.reset() + microF1.reset() pred11 = mx.nd.array([[0.1, 0.9], [0.5, 0.5]]) @@ -85,28 +83,23 @@ def test_f1(): pred22 = mx.nd.array([[0.2, 0.8]]) label22 = mx.nd.array([1]) - micro_f1.update([label11, label12], [pred11, pred12]) - macro_f1.update([label11, label12], [pred11, pred12]) - assert micro_f1.num_inst == 4 - assert macro_f1.num_inst == 1 - np_pred1 = np.concatenate([mx.nd.argmax(pred11, axis=1).asnumpy(), - mx.nd.argmax(pred12, axis=1).asnumpy()]) - np_label1 = np.concatenate([label11.asnumpy(), label12.asnumpy()]) - np.testing.assert_almost_equal(micro_f1.get()[1], scikit_f1(np_label1, np_pred1)) - np.testing.assert_almost_equal(macro_f1.get()[1], scikit_f1(np_label1, np_pred1)) - - micro_f1.update([label21, label22], [pred21, pred22]) - macro_f1.update([label21, label22], [pred21, pred22]) - assert micro_f1.num_inst == 6 - assert macro_f1.num_inst == 2 - np_pred2 = np.concatenate([mx.nd.argmax(pred21, axis=1).asnumpy(), - mx.nd.argmax(pred22, axis=1).asnumpy()]) - np_pred_total = np.concatenate([np_pred1, np_pred2]) - np_label2 = np.concatenate([label21.asnumpy(), label22.asnumpy()]) - np_label_total = np.concatenate([np_label1, np_label2]) - np.testing.assert_almost_equal(micro_f1.get()[1], scikit_f1(np_label_total, np_pred_total)) - np.testing.assert_almost_equal(macro_f1.get()[1], (scikit_f1(np_label1, np_pred1) + - scikit_f1(np_label2, np_pred2)) / 2) + microF1.update([label11, label12], [pred11, pred12]) + macroF1.update([label11, label12], [pred11, pred12]) + assert microF1.num_inst == 4 + assert macroF1.num_inst == 1 + # f1 = 2 * tp / (2 * tp + fp + fn) + fscore1 = 2. * (1) / (2 * 1 + 1 + 0) + np.testing.assert_almost_equal(microF1.get()[1], fscore1) + np.testing.assert_almost_equal(macroF1.get()[1], fscore1) + + microF1.update([label21, label22], [pred21, pred22]) + macroF1.update([label21, label22], [pred21, pred22]) + assert microF1.num_inst == 6 + assert macroF1.num_inst == 2 + fscore2 = 2. * (1) / (2 * 1 + 0 + 0) + fscore_total = 2. 
* (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) + np.testing.assert_almost_equal(microF1.get()[1], fscore_total) + np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) From 85b85031ba41eafb268d788f34049ae91ce35146 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 14 Feb 2018 15:54:14 -0800 Subject: [PATCH 09/10] use composition --- python/mxnet/metric.py | 81 +++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 28976ae185cf..c4940e04c285 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -475,20 +475,21 @@ def update(self, labels, preds): self.num_inst += num_samples -class _BinaryClassificationMixin(object): +class _BinaryClassificationMetrics(object): """ - Private mixin for classification metrics. True/false positive rate and true/false negative - rate are sufficient statistics for various concrete metrics. This class provides the machinery - to track those statistics across mini-batches of (label, prediction) pairs. + Private container class for classification metric statistics. True/false positive and + true/false negative counts are sufficient statistics for various classification metrics. + This class provides the machinery to track those statistics across mini-batches of + (label, prediction) pairs. """ def __init__(self): - self._true_positives = 0 - self._false_negatives = 0 - self._false_positives = 0 - self._true_negatives = 0 + self.true_positives = 0 + self.false_negatives = 0 + self.false_positives = 0 + self.true_negatives = 0 - def _update_binary_stats(self, label, pred): + def update_binary_stats(self, label, pred): """ Update various binary classification counts for a single (label, pred) pair. @@ -512,49 +513,49 @@ def _update_binary_stats(self, label, pred): for y_pred, y_true in zip(pred_label, label): if y_pred == 1 and y_true == 1: - self._true_positives += 1. + self.true_positives += 1. elif y_pred == 1 and y_true == 0: - self._false_positives += 1. + self.false_positives += 1. elif y_pred == 0 and y_true == 1: - self._false_negatives += 1. + self.false_negatives += 1. else: - self._true_negatives += 1. - + self.true_negatives += 1. @property - def _precision(self): - if self._true_positives + self._false_positives > 0: - return self._true_positives / (self._true_positives + self._false_positives) + def precision(self): + if self.true_positives + self.false_positives > 0: + return self.true_positives / (self.true_positives + self.false_positives) else: return 0. @property - def _recall(self): - if self._true_positives + self._false_negatives > 0: - return self._true_positives / (self._true_positives + self._false_negatives) + def recall(self): + if self.true_positives + self.false_negatives > 0: + return self.true_positives / (self.true_positives + self.false_negatives) else: return 0. @property - def _fscore(self): - if self._precision + self._recall > 0: - return 2 * self._precision * self._recall / (self._precision + self._recall) + def fscore(self): + if self.precision + self.recall > 0: + return 2 * self.precision * self.recall / (self.precision + self.recall) else: return 0. 
@property - def _total_examples(self): - return self._false_negatives + self._false_positives + \ - self._true_negatives + self._true_positives + def total_examples(self): + return self.false_negatives + self.false_positives + \ + self.true_negatives + self.true_positives + + def reset_stats(self): + self.false_positives = 0 + self.false_negatives = 0 + self.true_positives = 0 + self.true_negatives = 0 - def _reset_stats(self): - self._false_positives = 0 - self._false_negatives = 0 - self._true_positives = 0 - self._true_negatives = 0 @register -class F1(EvalMetric, _BinaryClassificationMixin): +class F1(EvalMetric): """Computes the F1 score of a binary classification problem. The F1 score is equivalent to weighted average of the precision and recall, @@ -598,10 +599,10 @@ class F1(EvalMetric, _BinaryClassificationMixin): def __init__(self, name='f1', output_names=None, label_names=None, average="macro"): - _BinaryClassificationMixin.__init__(self) + self.average = average + self.metrics = _BinaryClassificationMetrics() EvalMetric.__init__(self, name=name, output_names=output_names, label_names=label_names) - self.average = average def update(self, labels, preds): """Updates the internal evaluation result. @@ -617,21 +618,21 @@ def update(self, labels, preds): check_label_shapes(labels, preds) for label, pred in zip(labels, preds): - self._update_binary_stats(label, pred) + self.metrics.update_binary_stats(label, pred) if self.average == "macro": - self.sum_metric += self._fscore + self.sum_metric += self.metrics.fscore self.num_inst += 1 - self._reset_stats() + self.metrics.reset_stats() else: - self.sum_metric = self._fscore * self._total_examples - self.num_inst = self._total_examples + self.sum_metric = self.metrics.fscore * self.metrics.total_examples + self.num_inst = self.metrics.total_examples def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. self.num_inst = 0. - self._reset_stats() + self.metrics.reset_stats() @register From 2a19389eabf9bb4fe1969ab6059ac6d8e54b7b1c Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 15 Feb 2018 08:21:19 -0800 Subject: [PATCH 10/10] minibatches --- python/mxnet/metric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index c4940e04c285..0a02b80a1c06 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -583,9 +583,9 @@ class F1(EvalMetric): Name of labels that should be used when updating with update_dict. By default include all labels. average : str, default 'macro' - Strategy to be used for aggregating across micro-batches. - "macro": average the F1 scores for each batch - "micro": compute a single F1 score across all batches + Strategy to be used for aggregating across mini-batches. + "macro": average the F1 scores for each batch. + "micro": compute a single F1 score across all batches. Examples --------
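The series ends with this docstring cleanup. As a closing illustration of the
macro/micro distinction it introduces, here is a sketch assuming an MXNet build with
these ten patches applied; the batch values are illustrative. Two mini-batches of
unequal size make the two averaging strategies disagree:

    import mxnet as mx

    micro_f1 = mx.metric.F1(average='micro')
    macro_f1 = mx.metric.F1(average='macro')

    # Batch 1 is predicted perfectly (per-batch F1 = 1.0); batch 2 misses
    # its single positive example (per-batch F1 = 0.0).
    batches = [
        (mx.nd.array([1, 0]), mx.nd.array([[0.1, 0.9], [0.5, 0.5]])),
        (mx.nd.array([1]),    mx.nd.array([[0.8, 0.2]])),
    ]
    for label, pred in batches:
        micro_f1.update([label], [pred])
        macro_f1.update([label], [pred])

    print(macro_f1.get())  # ('f1', 0.5)     -- mean of the per-batch scores
    print(micro_f1.get())  # ('f1', 0.6666.) -- one F1 over the pooled counts:
                           #    2*tp / (2*tp + fp + fn) = 2*1 / (2*1 + 0 + 1)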