From 79afc4a063b77b7e0ca8f1ceb7794c4b1b408f07 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Feb 2018 18:00:43 -0800 Subject: [PATCH 01/10] add macro/micro f1 and test and binary abstraction --- python/mxnet/metric.py | 131 +++++++++++++++++++-------- tests/python/unittest/test_metric.py | 47 ++++++++++ 2 files changed, 141 insertions(+), 37 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 8bb3f6ee0a81..86e0863c7cc3 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -155,6 +155,9 @@ def get_name_value(self): value = [value] return list(zip(name, value)) + def macro(self): + return MacroMetric(self) + # pylint: disable=invalid-name register = registry.get_register_func(EvalMetric, 'metric') alias = registry.get_alias_func(EvalMetric, 'metric') @@ -475,8 +478,72 @@ def update(self, labels, preds): self.num_inst += num_samples +class BinaryClassificationMetric(EvalMetric): + + def __init__(self): + self.num_inst = 0 + self.true_positives = 0 + self.false_negatives = 0 + self.false_positives = 0 + + def _update_binary_stats(self, label, pred): + """Updates the internal evaluation result. + + Parameters + ---------- + labels : list of `NDArray` + The labels of the data. + + preds : list of `NDArray` + Predicted values. + """ + pred = pred.asnumpy() + label = label.asnumpy().astype('int32') + pred_label = numpy.argmax(pred, axis=1) + + check_label_shapes(label, pred) + if len(numpy.unique(label)) > 2: + raise ValueError("F1 currently only supports binary classification.") + + for y_pred, y_true in zip(pred_label, label): + if y_pred == 1 and y_true == 1: + self.true_positives += 1. + elif y_pred == 1 and y_true == 0: + self.false_positives += 1. + elif y_pred == 0 and y_true == 1: + self.false_negatives += 1. + self.num_inst += label.shape[0] + + + @property + def _precision(self): + if self.true_positives + self.false_positives > 0: + return self.true_positives / (self.true_positives + self.false_positives) + else: + return 0. + + @property + def _recall(self): + if self.true_positives + self.false_negatives > 0: + return self.true_positives / (self.true_positives + self.false_negatives) + else: + return 0. + + @property + def _fscore(self): + if self._precision + self._recall > 0: + return 2 * self._precision * self._recall / (self._precision + self._recall) + else: + return 0. + + def reset(self): + self.num_inst = 0 + self.false_positives = 0 + self.false_negatives = 0 + self.true_positives = 0 + @register -class F1(EvalMetric): +class F1(BinaryClassificationMetric): """Computes the F1 score of a binary classification problem. The F1 score is equivalent to weighted average of the precision and recall, @@ -508,16 +575,16 @@ class F1(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0., 1., 1.])] - >>> acc = mx.metric.F1() - >>> acc.update(preds = predicts, labels = labels) - >>> print acc.get() + >>> f1 = mx.metric.F1() + >>> f1.update(preds = predicts, labels = labels) + >>> print f1.get() ('f1', 0.8) """ - def __init__(self, name='f1', - output_names=None, label_names=None): - super(F1, self).__init__( - name, output_names=output_names, label_names=label_names) + def __init__(self, name='f1', output_names=None, label_names=None): + BinaryClassificationMetric.__init__(self) + EvalMetric.__init__(self, + name, output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -533,41 +600,31 @@ def update(self, labels, preds): check_label_shapes(labels, preds) for label, pred in zip(labels, preds): - pred = pred.asnumpy() - label = label.asnumpy().astype('int32') - pred_label = numpy.argmax(pred, axis=1) + self._update_binary_stats(label, pred) - check_label_shapes(label, pred) - if len(numpy.unique(label)) > 2: - raise ValueError("F1 currently only supports binary classification.") - - true_positives, false_positives, false_negatives = 0., 0., 0. + def get(self): + if self.num_inst == 0: + return self.name, float('nan') + else: + return self.name, self._fscore - for y_pred, y_true in zip(pred_label, label): - if y_pred == 1 and y_true == 1: - true_positives += 1. - elif y_pred == 1 and y_true == 0: - false_positives += 1. - elif y_pred == 0 and y_true == 1: - false_negatives += 1. +class MacroMetric(EvalMetric): - if true_positives + false_positives > 0: - precision = true_positives / (true_positives + false_positives) - else: - precision = 0. + def __init__(self, base_metric): + super(MacroMetric, self).__init__("macro_" + base_metric.name, output_names=base_metric.output_names, + label_names=base_metric.label_names) + self.base_metric = base_metric - if true_positives + false_negatives > 0: - recall = true_positives / (true_positives + false_negatives) - else: - recall = 0. + def update(self, labels, preds): + self.base_metric.update(labels, preds) + self.sum_metric += self.base_metric.get()[1] + self.num_inst += 1 + self.base_metric.reset() - if precision + recall > 0: - f1_score = 2 * precision * recall / (precision + recall) - else: - f1_score = 0. +class MacroF1(MacroMetric): - self.sum_metric += f1_score - self.num_inst += 1 + def __init__(self, name, output_names, label_names): + super(MacroF1, self).__init__(F1(name, output_names=output_names, label_names=label_names)) @register diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 0f2f27f9eb24..090d29d8c94e 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -26,6 +26,53 @@ def check_metric(metric, *args, **kwargs): assert metric.get_config() == metric2.get_config() +def test_f1(): + microF1 = mx.metric.F1() + macroF1 = mx.metric.F1().macro() + + assert np.isnan(macroF1.get()[1]) + assert np.isnan(microF1.get()[1]) + + # check divide by zero + pred = mx.nd.array([[0.9, 0.1], + [0.8, 0.2]]) + label = mx.nd.array([0, 0]) + macroF1.update([label], [pred]) + microF1.update([label], [pred]) + assert macroF1.get()[1] == 0.0 + assert microF1.get()[1] == 0.0 + macroF1.reset() + microF1.reset() + + pred11 = mx.nd.array([[0.1, 0.9], + [0.5, 0.5]]) + label11 = mx.nd.array([1, 0]) + pred12 = mx.nd.array([[0.85, 0.15], + [1.0, 0.0]]) + label12 = mx.nd.array([1, 0]) + pred21 = mx.nd.array([[0.6, 0.4]]) + label21 = mx.nd.array([0]) + pred22 = mx.nd.array([[0.2, 0.8]]) + label22 = mx.nd.array([1]) + + microF1.update([label11, label12], [pred11, pred12]) + macroF1.update([label11, label12], [pred11, pred12]) + assert microF1.num_inst == 4 + assert macroF1.num_inst == 1 + # f1 = 2 * tp / (2 * tp + fp + fn) + fscore1 = 2 * (1) / (2 * 1 + 1 + 0) + assert microF1.get()[1] == fscore1 + assert macroF1.get()[1] == fscore1 + + microF1.update([label21, label22], [pred21, pred22]) + macroF1.update([label21, label22], [pred21, pred22]) + assert microF1.num_inst == 6 + assert macroF1.num_inst == 2 + fscore2 = 2 * (1) / (2 * 1 + 0 + 0) + assert macroF1.get()[1] == (fscore1 + fscore2) / 2 + fscore_total = 2 * (1 + 1) / (2 * (1 + 1) + (1 + 0) + 
(0 + 0)) + assert microF1.get()[1] == fscore_total + def test_metrics(): check_metric('acc', axis=0) From bc44e9d744f133b4814939605accec006c61efd2 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Feb 2018 19:11:06 -0800 Subject: [PATCH 02/10] make average an option --- python/mxnet/metric.py | 98 ++++++++++++++-------------- tests/python/unittest/test_metric.py | 12 ++-- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 86e0863c7cc3..bd94ad0c0391 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -155,9 +155,6 @@ def get_name_value(self): value = [value] return list(zip(name, value)) - def macro(self): - return MacroMetric(self) - # pylint: disable=invalid-name register = registry.get_register_func(EvalMetric, 'metric') alias = registry.get_alias_func(EvalMetric, 'metric') @@ -478,23 +475,28 @@ def update(self, labels, preds): self.num_inst += num_samples -class BinaryClassificationMetric(EvalMetric): +class _BinaryClassificationMixin(object): + """ + Private class for keeping track of TPR, FPR, TNR, FNR counts for a classification metric. This + class is not intended to be instantiated directly, but extended by concrete metric types. + """ def __init__(self): - self.num_inst = 0 - self.true_positives = 0 - self.false_negatives = 0 - self.false_positives = 0 + self._true_positives = 0 + self._false_negatives = 0 + self._false_positives = 0 + self._true_negatives = 0 def _update_binary_stats(self, label, pred): - """Updates the internal evaluation result. + """ + Update various binary classification counts for a single (label, pred) pair. Parameters ---------- - labels : list of `NDArray` + labels : `NDArray` The labels of the data. - preds : list of `NDArray` + preds : `NDArray` Predicted values. """ pred = pred.asnumpy() @@ -507,25 +509,26 @@ def _update_binary_stats(self, label, pred): for y_pred, y_true in zip(pred_label, label): if y_pred == 1 and y_true == 1: - self.true_positives += 1. + self._true_positives += 1. elif y_pred == 1 and y_true == 0: - self.false_positives += 1. + self._false_positives += 1. elif y_pred == 0 and y_true == 1: - self.false_negatives += 1. - self.num_inst += label.shape[0] + self._false_negatives += 1. + else: + self._true_negatives += 1. @property def _precision(self): - if self.true_positives + self.false_positives > 0: - return self.true_positives / (self.true_positives + self.false_positives) + if self._true_positives + self._false_positives > 0: + return self._true_positives / (self._true_positives + self._false_positives) else: return 0. @property def _recall(self): - if self.true_positives + self.false_negatives > 0: - return self.true_positives / (self.true_positives + self.false_negatives) + if self._true_positives + self._false_negatives > 0: + return self._true_positives / (self._true_positives + self._false_negatives) else: return 0. @@ -536,14 +539,17 @@ def _fscore(self): else: return 0. - def reset(self): - self.num_inst = 0 - self.false_positives = 0 - self.false_negatives = 0 - self.true_positives = 0 + def _total_examples(self): + return self._false_negatives + self._false_positives + self._true_negatives + self._true_positives + + def _reset_stats(self): + self._false_positives = 0 + self._false_negatives = 0 + self._true_positives = 0 + self._true_negatives = 0 @register -class F1(BinaryClassificationMetric): +class F1(EvalMetric, _BinaryClassificationMixin): """Computes the F1 score of a binary classification problem. 
The F1 score is equivalent to weighted average of the precision and recall, @@ -570,6 +576,10 @@ class F1(BinaryClassificationMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. + average : str + Strategy to be used for aggregating across micro-batches. + "macro": average the F1 scores for each batch + "micro": compute a single F1 score across all batches Examples -------- @@ -581,10 +591,11 @@ class F1(BinaryClassificationMetric): ('f1', 0.8) """ - def __init__(self, name='f1', output_names=None, label_names=None): - BinaryClassificationMetric.__init__(self) + def __init__(self, name='f1', output_names=None, label_names=None, average="macro"): + _BinaryClassificationMixin.__init__(self) EvalMetric.__init__(self, name, output_names=output_names, label_names=label_names) + self.average = average def update(self, labels, preds): """Updates the internal evaluation result. @@ -601,30 +612,19 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): self._update_binary_stats(label, pred) - - def get(self): - if self.num_inst == 0: - return self.name, float('nan') + if self.average == "macro": + self.sum_metric += self._fscore + self.num_inst += 1 + self._reset_stats() else: - return self.name, self._fscore - -class MacroMetric(EvalMetric): - - def __init__(self, base_metric): - super(MacroMetric, self).__init__("macro_" + base_metric.name, output_names=base_metric.output_names, - label_names=base_metric.label_names) - self.base_metric = base_metric + self.sum_metric = self._fscore * self._total_examples() + self.num_inst = self._total_examples() - def update(self, labels, preds): - self.base_metric.update(labels, preds) - self.sum_metric += self.base_metric.get()[1] - self.num_inst += 1 - self.base_metric.reset() - -class MacroF1(MacroMetric): - - def __init__(self, name, output_names, label_names): - super(MacroF1, self).__init__(F1(name, output_names=output_names, label_names=label_names)) + def reset(self): + """Resets the internal evaluation result to initial state.""" + self.sum_metric = 0. + self.num_inst = 0. 
+ self._reset_stats() @register diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 090d29d8c94e..75ab117052e8 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -27,8 +27,8 @@ def check_metric(metric, *args, **kwargs): assert metric.get_config() == metric2.get_config() def test_f1(): - microF1 = mx.metric.F1() - macroF1 = mx.metric.F1().macro() + microF1 = mx.metric.F1(average="micro") + macroF1 = mx.metric.F1(average="macro") assert np.isnan(macroF1.get()[1]) assert np.isnan(microF1.get()[1]) @@ -61,17 +61,17 @@ def test_f1(): assert macroF1.num_inst == 1 # f1 = 2 * tp / (2 * tp + fp + fn) fscore1 = 2 * (1) / (2 * 1 + 1 + 0) - assert microF1.get()[1] == fscore1 - assert macroF1.get()[1] == fscore1 + np.testing.assert_almost_equal(microF1.get()[1], fscore1) + np.testing.assert_almost_equal(macroF1.get()[1], fscore1) microF1.update([label21, label22], [pred21, pred22]) macroF1.update([label21, label22], [pred21, pred22]) assert microF1.num_inst == 6 assert macroF1.num_inst == 2 fscore2 = 2 * (1) / (2 * 1 + 0 + 0) - assert macroF1.get()[1] == (fscore1 + fscore2) / 2 fscore_total = 2 * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) - assert microF1.get()[1] == fscore_total + np.testing.assert_almost_equal(microF1.get()[1], fscore_total) + np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2) def test_metrics(): From 7ec5a8861a622b450492094ce23a55fb0d35224d Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Feb 2018 19:54:57 -0800 Subject: [PATCH 03/10] use metric.create --- python/mxnet/metric.py | 26 ++++++---- tests/python/unittest/test_metric.py | 75 +++++++++++----------------- 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index bd94ad0c0391..d0400da0ff6c 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -477,8 +477,7 @@ def update(self, labels, preds): class _BinaryClassificationMixin(object): """ - Private class for keeping track of TPR, FPR, TNR, FNR counts for a classification metric. This - class is not intended to be instantiated directly, but extended by concrete metric types. + Private mixin for keeping track of TPR, FPR, TNR, FNR counts for a classification metric. """ def __init__(self): @@ -489,14 +488,15 @@ def __init__(self): def _update_binary_stats(self, label, pred): """ - Update various binary classification counts for a single (label, pred) pair. + Update various binary classification counts for a single (label, pred) + pair. Parameters ---------- - labels : `NDArray` + label : `NDArray` The labels of the data. - preds : `NDArray` + pred : `NDArray` Predicted values. """ pred = pred.asnumpy() @@ -539,8 +539,10 @@ def _fscore(self): else: return 0. 
+ @property def _total_examples(self): - return self._false_negatives + self._false_positives + self._true_negatives + self._true_positives + return self._false_negatives + self._false_positives + \ + self._true_negatives + self._true_positives def _reset_stats(self): self._false_positives = 0 @@ -591,10 +593,11 @@ class F1(EvalMetric, _BinaryClassificationMixin): ('f1', 0.8) """ - def __init__(self, name='f1', output_names=None, label_names=None, average="macro"): + def __init__(self, name='f1', + output_names=None, label_names=None, average="macro"): _BinaryClassificationMixin.__init__(self) - EvalMetric.__init__(self, - name, output_names=output_names, label_names=label_names) + EvalMetric.__init__(self, name=name, + output_names=output_names, label_names=label_names) self.average = average def update(self, labels, preds): @@ -612,13 +615,14 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): self._update_binary_stats(label, pred) + if self.average == "macro": self.sum_metric += self._fscore self.num_inst += 1 self._reset_stats() else: - self.sum_metric = self._fscore * self._total_examples() - self.num_inst = self._total_examples() + self.sum_metric = self._fscore * self._total_examples + self.num_inst = self._total_examples def reset(self): """Resets the internal evaluation result to initial state.""" diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 75ab117052e8..ad0d35d642fc 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -26,8 +26,36 @@ def check_metric(metric, *args, **kwargs): assert metric.get_config() == metric2.get_config() +def test_metrics(): + check_metric('acc', axis=0) + check_metric('f1') + check_metric('perplexity', -1) + check_metric('pearsonr') + check_metric('nll_loss') + composite = mx.metric.create(['acc', 'f1']) + check_metric(composite) + +def test_nll_loss(): + metric = mx.metric.create('nll_loss') + pred = mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) + label = mx.nd.array([2, 1]) + metric.update([label], [pred]) + _, loss = metric.get() + expected_loss = 0.0 + expected_loss = -(np.log(pred[0][2].asscalar()) + np.log(pred[1][1].asscalar())) / 2 + assert loss == expected_loss + +def test_acc(): + pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) + label = mx.nd.array([0, 1, 1]) + metric = mx.metric.create('acc') + metric.update([label], [pred]) + _, acc = metric.get() + expected_acc = (np.argmax(pred, axis=1) == label).sum().asscalar() / label.size + assert acc == expected_acc + def test_f1(): - microF1 = mx.metric.F1(average="micro") + microF1 = mx.metric.create("f1", average="micro") macroF1 = mx.metric.F1(average="macro") assert np.isnan(macroF1.get()[1]) @@ -45,7 +73,7 @@ def test_f1(): microF1.reset() pred11 = mx.nd.array([[0.1, 0.9], - [0.5, 0.5]]) + [0.5, 0.5]]) label11 = mx.nd.array([1, 0]) pred12 = mx.nd.array([[0.85, 0.15], [1.0, 0.0]]) @@ -73,49 +101,6 @@ def test_f1(): np.testing.assert_almost_equal(microF1.get()[1], fscore_total) np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2) - -def test_metrics(): - check_metric('acc', axis=0) - check_metric('f1') - check_metric('perplexity', -1) - check_metric('pearsonr') - check_metric('nll_loss') - composite = mx.metric.create(['acc', 'f1']) - check_metric(composite) - -def test_nll_loss(): - metric = mx.metric.create('nll_loss') - pred = mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) - label = mx.nd.array([2, 1]) - metric.update([label], [pred]) - _, loss = 
metric.get() - expected_loss = 0.0 - expected_loss = -(np.log(pred[0][2].asscalar()) + np.log(pred[1][1].asscalar())) / 2 - assert loss == expected_loss - -def test_acc(): - pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) - label = mx.nd.array([0, 1, 1]) - metric = mx.metric.create('acc') - metric.update([label], [pred]) - _, acc = metric.get() - expected_acc = (np.argmax(pred, axis=1) == label).sum().asscalar() / label.size - assert acc == expected_acc - -def test_f1(): - pred = mx.nd.array([[0.3, 0.7], [1., 0], [0.4, 0.6], [0.6, 0.4], [0.9, 0.1]]) - label = mx.nd.array([0, 1, 1, 1, 1]) - positives = np.argmax(pred, axis=1).sum().asscalar() - true_positives = (np.argmax(pred, axis=1) == label).sum().asscalar() - precision = true_positives / positives - overall_positives = label.sum().asscalar() - recall = true_positives / overall_positives - f1_expected = 2 * (precision * recall) / (precision + recall) - metric = mx.metric.create('f1') - metric.update([label], [pred]) - _, f1 = metric.get() - assert f1 == f1_expected - def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) label = mx.nd.array([0, 1, 1]) From dc0f0e84aa6dce9635ce8315bce48453de64b287 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Feb 2018 23:01:06 -0800 Subject: [PATCH 04/10] add decimal for float division --- tests/python/unittest/test_metric.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index ad0d35d642fc..fee8b66e3af5 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -88,7 +88,7 @@ def test_f1(): assert microF1.num_inst == 4 assert macroF1.num_inst == 1 # f1 = 2 * tp / (2 * tp + fp + fn) - fscore1 = 2 * (1) / (2 * 1 + 1 + 0) + fscore1 = 2. * (1) / (2 * 1 + 1 + 0) np.testing.assert_almost_equal(microF1.get()[1], fscore1) np.testing.assert_almost_equal(macroF1.get()[1], fscore1) @@ -96,10 +96,10 @@ def test_f1(): macroF1.update([label21, label22], [pred21, pred22]) assert microF1.num_inst == 6 assert macroF1.num_inst == 2 - fscore2 = 2 * (1) / (2 * 1 + 0 + 0) - fscore_total = 2 * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) + fscore2 = 2. * (1) / (2 * 1 + 0 + 0) + fscore_total = 2. * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) np.testing.assert_almost_equal(microF1.get()[1], fscore_total) - np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2) + np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) From e448019cbb919f3cfc61e78f1b8cf6ec02bc9186 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 13 Feb 2018 09:31:53 -0800 Subject: [PATCH 05/10] add default in docstring, reference generic base class in error msg --- python/mxnet/metric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index d0400da0ff6c..76c168024e31 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -505,7 +505,8 @@ def _update_binary_stats(self, label, pred): check_label_shapes(label, pred) if len(numpy.unique(label)) > 2: - raise ValueError("F1 currently only supports binary classification.") + raise ValueError("%s currently only supports binary classification." 
+ % self.__class__.__name__) for y_pred, y_true in zip(pred_label, label): if y_pred == 1 and y_true == 1: @@ -578,7 +579,7 @@ class F1(EvalMetric, _BinaryClassificationMixin): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - average : str + average : str, default 'macro' Strategy to be used for aggregating across micro-batches. "macro": average the F1 scores for each batch "micro": compute a single F1 score across all batches From 3c86317a7b0ee5f48733854b10d384b9908c5cd4 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 13 Feb 2018 12:20:14 -0800 Subject: [PATCH 06/10] expand on docstring --- python/mxnet/metric.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 76c168024e31..28976ae185cf 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -477,7 +477,9 @@ def update(self, labels, preds): class _BinaryClassificationMixin(object): """ - Private mixin for keeping track of TPR, FPR, TNR, FNR counts for a classification metric. + Private mixin for classification metrics. True/false positive rate and true/false negative + rate are sufficient statistics for various concrete metrics. This class provides the machinery + to track those statistics across mini-batches of (label, prediction) pairs. """ def __init__(self): From 797c01c360ed20b633b47e5c68f3ef53b2186cde Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 14 Feb 2018 08:51:09 -0800 Subject: [PATCH 07/10] use scikit in test --- tests/python/unittest/test_metric.py | 61 ++++++++++++++++------------ 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index fee8b66e3af5..81d6ab5f9e37 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -19,6 +19,8 @@ import numpy as np import json +from sklearn.metrics import f1_score as scikit_f1 + def check_metric(metric, *args, **kwargs): metric = mx.metric.create(metric, *args, **kwargs) str_metric = json.dumps(metric.get_config()) @@ -55,22 +57,22 @@ def test_acc(): assert acc == expected_acc def test_f1(): - microF1 = mx.metric.create("f1", average="micro") - macroF1 = mx.metric.F1(average="macro") + micro_f1 = mx.metric.create("f1", average="micro") + macro_f1 = mx.metric.F1(average="macro") - assert np.isnan(macroF1.get()[1]) - assert np.isnan(microF1.get()[1]) + assert np.isnan(macro_f1.get()[1]) + assert np.isnan(micro_f1.get()[1]) # check divide by zero pred = mx.nd.array([[0.9, 0.1], [0.8, 0.2]]) label = mx.nd.array([0, 0]) - macroF1.update([label], [pred]) - microF1.update([label], [pred]) - assert macroF1.get()[1] == 0.0 - assert microF1.get()[1] == 0.0 - macroF1.reset() - microF1.reset() + macro_f1.update([label], [pred]) + micro_f1.update([label], [pred]) + assert macro_f1.get()[1] == 0.0 + assert micro_f1.get()[1] == 0.0 + macro_f1.reset() + micro_f1.reset() pred11 = mx.nd.array([[0.1, 0.9], [0.5, 0.5]]) @@ -83,23 +85,28 @@ def test_f1(): pred22 = mx.nd.array([[0.2, 0.8]]) label22 = mx.nd.array([1]) - microF1.update([label11, label12], [pred11, pred12]) - macroF1.update([label11, label12], [pred11, pred12]) - assert microF1.num_inst == 4 - assert macroF1.num_inst == 1 - # f1 = 2 * tp / (2 * tp + fp + fn) - fscore1 = 2. 
* (1) / (2 * 1 + 1 + 0) - np.testing.assert_almost_equal(microF1.get()[1], fscore1) - np.testing.assert_almost_equal(macroF1.get()[1], fscore1) - - microF1.update([label21, label22], [pred21, pred22]) - macroF1.update([label21, label22], [pred21, pred22]) - assert microF1.num_inst == 6 - assert macroF1.num_inst == 2 - fscore2 = 2. * (1) / (2 * 1 + 0 + 0) - fscore_total = 2. * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) - np.testing.assert_almost_equal(microF1.get()[1], fscore_total) - np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) + micro_f1.update([label11, label12], [pred11, pred12]) + macro_f1.update([label11, label12], [pred11, pred12]) + assert micro_f1.num_inst == 4 + assert macro_f1.num_inst == 1 + np_pred1 = np.concatenate([mx.nd.argmax(pred11, axis=1).asnumpy(), + mx.nd.argmax(pred12, axis=1).asnumpy()]) + np_label1 = np.concatenate([label11.asnumpy(), label12.asnumpy()]) + np.testing.assert_almost_equal(micro_f1.get()[1], scikit_f1(np_label1, np_pred1)) + np.testing.assert_almost_equal(macro_f1.get()[1], scikit_f1(np_label1, np_pred1)) + + micro_f1.update([label21, label22], [pred21, pred22]) + macro_f1.update([label21, label22], [pred21, pred22]) + assert micro_f1.num_inst == 6 + assert macro_f1.num_inst == 2 + np_pred2 = np.concatenate([mx.nd.argmax(pred21, axis=1).asnumpy(), + mx.nd.argmax(pred22, axis=1).asnumpy()]) + np_pred_total = np.concatenate([np_pred1, np_pred2]) + np_label2 = np.concatenate([label21.asnumpy(), label22.asnumpy()]) + np_label_total = np.concatenate([np_label1, np_label2]) + np.testing.assert_almost_equal(micro_f1.get()[1], scikit_f1(np_label_total, np_pred_total)) + np.testing.assert_almost_equal(macro_f1.get()[1], (scikit_f1(np_label1, np_pred1) + + scikit_f1(np_label2, np_pred2)) / 2) def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) From 115a6352e6aa5ed68e3f5238b126b0231971e17a Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 14 Feb 2018 11:17:35 -0800 Subject: [PATCH 08/10] Revert "use scikit in test" This reverts commit 797c01c360ed20b633b47e5c68f3ef53b2186cde. 
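Patch 07 added a hard scikit-learn import to the unit tests; this patch removes it
again, returning to hand-computed expected scores. For anyone who still wants the
scikit-learn cross-check locally, a minimal sketch is shown below. It assumes pytest
is the test runner and treats sklearn as optional via `pytest.importorskip`, so the
suite keeps passing when scikit-learn is not installed; the test name and data are
illustrative and not part of this series.

    import numpy as np
    import pytest

    def test_f1_against_sklearn():
        # Skip (rather than fail) when scikit-learn is unavailable.
        sklearn_metrics = pytest.importorskip('sklearn.metrics')
        import mxnet as mx

        micro_f1 = mx.metric.create('f1', average='micro')
        pred = mx.nd.array([[0.1, 0.9], [0.5, 0.5], [0.85, 0.15], [1.0, 0.0]])
        label = mx.nd.array([1, 0, 1, 0])
        micro_f1.update([label], [pred])

        # Compare the pooled (micro) F1 against scikit-learn's reference value.
        np_pred = np.argmax(pred.asnumpy(), axis=1)
        np.testing.assert_almost_equal(micro_f1.get()[1],
                                       sklearn_metrics.f1_score(label.asnumpy(), np_pred))
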
--- tests/python/unittest/test_metric.py | 61 ++++++++++++---------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 81d6ab5f9e37..fee8b66e3af5 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -19,8 +19,6 @@ import numpy as np import json -from sklearn.metrics import f1_score as scikit_f1 - def check_metric(metric, *args, **kwargs): metric = mx.metric.create(metric, *args, **kwargs) str_metric = json.dumps(metric.get_config()) @@ -57,22 +55,22 @@ def test_acc(): assert acc == expected_acc def test_f1(): - micro_f1 = mx.metric.create("f1", average="micro") - macro_f1 = mx.metric.F1(average="macro") + microF1 = mx.metric.create("f1", average="micro") + macroF1 = mx.metric.F1(average="macro") - assert np.isnan(macro_f1.get()[1]) - assert np.isnan(micro_f1.get()[1]) + assert np.isnan(macroF1.get()[1]) + assert np.isnan(microF1.get()[1]) # check divide by zero pred = mx.nd.array([[0.9, 0.1], [0.8, 0.2]]) label = mx.nd.array([0, 0]) - macro_f1.update([label], [pred]) - micro_f1.update([label], [pred]) - assert macro_f1.get()[1] == 0.0 - assert micro_f1.get()[1] == 0.0 - macro_f1.reset() - micro_f1.reset() + macroF1.update([label], [pred]) + microF1.update([label], [pred]) + assert macroF1.get()[1] == 0.0 + assert microF1.get()[1] == 0.0 + macroF1.reset() + microF1.reset() pred11 = mx.nd.array([[0.1, 0.9], [0.5, 0.5]]) @@ -85,28 +83,23 @@ def test_f1(): pred22 = mx.nd.array([[0.2, 0.8]]) label22 = mx.nd.array([1]) - micro_f1.update([label11, label12], [pred11, pred12]) - macro_f1.update([label11, label12], [pred11, pred12]) - assert micro_f1.num_inst == 4 - assert macro_f1.num_inst == 1 - np_pred1 = np.concatenate([mx.nd.argmax(pred11, axis=1).asnumpy(), - mx.nd.argmax(pred12, axis=1).asnumpy()]) - np_label1 = np.concatenate([label11.asnumpy(), label12.asnumpy()]) - np.testing.assert_almost_equal(micro_f1.get()[1], scikit_f1(np_label1, np_pred1)) - np.testing.assert_almost_equal(macro_f1.get()[1], scikit_f1(np_label1, np_pred1)) - - micro_f1.update([label21, label22], [pred21, pred22]) - macro_f1.update([label21, label22], [pred21, pred22]) - assert micro_f1.num_inst == 6 - assert macro_f1.num_inst == 2 - np_pred2 = np.concatenate([mx.nd.argmax(pred21, axis=1).asnumpy(), - mx.nd.argmax(pred22, axis=1).asnumpy()]) - np_pred_total = np.concatenate([np_pred1, np_pred2]) - np_label2 = np.concatenate([label21.asnumpy(), label22.asnumpy()]) - np_label_total = np.concatenate([np_label1, np_label2]) - np.testing.assert_almost_equal(micro_f1.get()[1], scikit_f1(np_label_total, np_pred_total)) - np.testing.assert_almost_equal(macro_f1.get()[1], (scikit_f1(np_label1, np_pred1) + - scikit_f1(np_label2, np_pred2)) / 2) + microF1.update([label11, label12], [pred11, pred12]) + macroF1.update([label11, label12], [pred11, pred12]) + assert microF1.num_inst == 4 + assert macroF1.num_inst == 1 + # f1 = 2 * tp / (2 * tp + fp + fn) + fscore1 = 2. * (1) / (2 * 1 + 1 + 0) + np.testing.assert_almost_equal(microF1.get()[1], fscore1) + np.testing.assert_almost_equal(macroF1.get()[1], fscore1) + + microF1.update([label21, label22], [pred21, pred22]) + macroF1.update([label21, label22], [pred21, pred22]) + assert microF1.num_inst == 6 + assert macroF1.num_inst == 2 + fscore2 = 2. * (1) / (2 * 1 + 0 + 0) + fscore_total = 2. 
* (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) + np.testing.assert_almost_equal(microF1.get()[1], fscore_total) + np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) From 85b85031ba41eafb268d788f34049ae91ce35146 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 14 Feb 2018 15:54:14 -0800 Subject: [PATCH 09/10] use composition --- python/mxnet/metric.py | 81 +++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 28976ae185cf..c4940e04c285 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -475,20 +475,21 @@ def update(self, labels, preds): self.num_inst += num_samples -class _BinaryClassificationMixin(object): +class _BinaryClassificationMetrics(object): """ - Private mixin for classification metrics. True/false positive rate and true/false negative - rate are sufficient statistics for various concrete metrics. This class provides the machinery - to track those statistics across mini-batches of (label, prediction) pairs. + Private container class for classification metric statistics. True/false positive and + true/false negative counts are sufficient statistics for various classification metrics. + This class provides the machinery to track those statistics across mini-batches of + (label, prediction) pairs. """ def __init__(self): - self._true_positives = 0 - self._false_negatives = 0 - self._false_positives = 0 - self._true_negatives = 0 + self.true_positives = 0 + self.false_negatives = 0 + self.false_positives = 0 + self.true_negatives = 0 - def _update_binary_stats(self, label, pred): + def update_binary_stats(self, label, pred): """ Update various binary classification counts for a single (label, pred) pair. @@ -512,49 +513,49 @@ def _update_binary_stats(self, label, pred): for y_pred, y_true in zip(pred_label, label): if y_pred == 1 and y_true == 1: - self._true_positives += 1. + self.true_positives += 1. elif y_pred == 1 and y_true == 0: - self._false_positives += 1. + self.false_positives += 1. elif y_pred == 0 and y_true == 1: - self._false_negatives += 1. + self.false_negatives += 1. else: - self._true_negatives += 1. - + self.true_negatives += 1. @property - def _precision(self): - if self._true_positives + self._false_positives > 0: - return self._true_positives / (self._true_positives + self._false_positives) + def precision(self): + if self.true_positives + self.false_positives > 0: + return self.true_positives / (self.true_positives + self.false_positives) else: return 0. @property - def _recall(self): - if self._true_positives + self._false_negatives > 0: - return self._true_positives / (self._true_positives + self._false_negatives) + def recall(self): + if self.true_positives + self.false_negatives > 0: + return self.true_positives / (self.true_positives + self.false_negatives) else: return 0. @property - def _fscore(self): - if self._precision + self._recall > 0: - return 2 * self._precision * self._recall / (self._precision + self._recall) + def fscore(self): + if self.precision + self.recall > 0: + return 2 * self.precision * self.recall / (self.precision + self.recall) else: return 0. 
@property - def _total_examples(self): - return self._false_negatives + self._false_positives + \ - self._true_negatives + self._true_positives + def total_examples(self): + return self.false_negatives + self.false_positives + \ + self.true_negatives + self.true_positives + + def reset_stats(self): + self.false_positives = 0 + self.false_negatives = 0 + self.true_positives = 0 + self.true_negatives = 0 - def _reset_stats(self): - self._false_positives = 0 - self._false_negatives = 0 - self._true_positives = 0 - self._true_negatives = 0 @register -class F1(EvalMetric, _BinaryClassificationMixin): +class F1(EvalMetric): """Computes the F1 score of a binary classification problem. The F1 score is equivalent to weighted average of the precision and recall, @@ -598,10 +599,10 @@ class F1(EvalMetric, _BinaryClassificationMixin): def __init__(self, name='f1', output_names=None, label_names=None, average="macro"): - _BinaryClassificationMixin.__init__(self) + self.average = average + self.metrics = _BinaryClassificationMetrics() EvalMetric.__init__(self, name=name, output_names=output_names, label_names=label_names) - self.average = average def update(self, labels, preds): """Updates the internal evaluation result. @@ -617,21 +618,21 @@ def update(self, labels, preds): check_label_shapes(labels, preds) for label, pred in zip(labels, preds): - self._update_binary_stats(label, pred) + self.metrics.update_binary_stats(label, pred) if self.average == "macro": - self.sum_metric += self._fscore + self.sum_metric += self.metrics.fscore self.num_inst += 1 - self._reset_stats() + self.metrics.reset_stats() else: - self.sum_metric = self._fscore * self._total_examples - self.num_inst = self._total_examples + self.sum_metric = self.metrics.fscore * self.metrics.total_examples + self.num_inst = self.metrics.total_examples def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. self.num_inst = 0. - self._reset_stats() + self.metrics.reset_stats() @register From 2a19389eabf9bb4fe1969ab6059ac6d8e54b7b1c Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 15 Feb 2018 08:21:19 -0800 Subject: [PATCH 10/10] minibatches --- python/mxnet/metric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index c4940e04c285..0a02b80a1c06 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -583,9 +583,9 @@ class F1(EvalMetric): Name of labels that should be used when updating with update_dict. By default include all labels. average : str, default 'macro' - Strategy to be used for aggregating across micro-batches. - "macro": average the F1 scores for each batch - "micro": compute a single F1 score across all batches + Strategy to be used for aggregating across mini-batches. + "macro": average the F1 scores for each batch. + "micro": compute a single F1 score across all batches. Examples --------
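The series ends with this docstring cleanup. As a closing illustration of the
macro/micro distinction it introduces, here is a sketch assuming an MXNet build with
these ten patches applied; the batch values are illustrative. Two mini-batches of
unequal size make the two averaging strategies disagree:

    import mxnet as mx

    micro_f1 = mx.metric.F1(average='micro')
    macro_f1 = mx.metric.F1(average='macro')

    # Batch 1 is predicted perfectly (per-batch F1 = 1.0); batch 2 misses
    # its single positive example (per-batch F1 = 0.0).
    batches = [
        (mx.nd.array([1, 0]), mx.nd.array([[0.1, 0.9], [0.5, 0.5]])),
        (mx.nd.array([1]),    mx.nd.array([[0.8, 0.2]])),
    ]
    for label, pred in batches:
        micro_f1.update([label], [pred])
        macro_f1.update([label], [pred])

    print(macro_f1.get())  # ('f1', 0.5)     -- mean of the per-batch scores
    print(micro_f1.get())  # ('f1', 0.6666.) -- one F1 over the pooled counts:
                           #    2*tp / (2*tp + fp + fn) = 2*1 / (2*1 + 0 + 1)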