BUG: Avoid cancellations in nanskew/nankurt.

Joris Vankerschaver · Joris Vankerschaver · commit 7b310dad98dc · 2016-01-24T16:26:32.000-08:00
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -540,3 +540,5 @@ of columns didn't match the number of series provided (:issue:`12039`).
 
 - Bug in ``.loc`` setitem indexer preventing the use of a TZ-aware DatetimeIndex (:issue:`12050`)
 - Big in ``.style`` indexes and multi-indexes not appearing (:issue:`11655`)
+
+- Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -474,6 +474,13 @@ def nanargmin(values, axis=None, skipna=True):
 
 @disallow('M8', 'm8')
 def nanskew(values, axis=None, skipna=True):
+    """ Compute the sample skewness.
+
+    The statistic computed here is the adjusted Fisher-Pearson standardized
+    moment coefficient G1. The algorithm computes this coefficient directly
+    from the second and third central moment.
+
+    """
 
     mask = isnull(values)
     if not is_float_dtype(values.dtype):
@@ -486,32 +493,48 @@ def nanskew(values, axis=None, skipna=True):
         values = values.copy()
         np.putmask(values, mask, 0)
 
-    typ = values.dtype.type
-    A = values.sum(axis) / count
-    B = (values**2).sum(axis) / count - A**typ(2)
-    C = (values**3).sum(axis) / count - A**typ(3) - typ(3) * A * B
+    mean = values.sum(axis, dtype=np.float64) / count
+    if axis is not None:
+        mean = np.expand_dims(mean, axis)
+
+    adjusted = values - mean
+    if skipna:
+        np.putmask(adjusted, mask, 0)
+    adjusted2 = adjusted ** 2
+    adjusted3 = adjusted2 * adjusted
+    m2 = adjusted2.sum(axis, dtype=np.float64)
+    m3 = adjusted3.sum(axis, dtype=np.float64)
 
     # floating point error
-    B = _zero_out_fperr(B)
-    C = _zero_out_fperr(C)
+    m2 = _zero_out_fperr(m2)
+    m3 = _zero_out_fperr(m3)
 
-    result = ((np.sqrt(count * count - count) * C) /
-              ((count - typ(2)) * np.sqrt(B)**typ(3)))
+    result = (count * (count-1) ** 0.5 / (count-2)) * (m3 / m2 ** 1.5)
+
+    dtype = values.dtype
+    if is_float_dtype(dtype):
+        result = result.astype(dtype)
 
     if isinstance(result, np.ndarray):
-        result = np.where(B == 0, 0, result)
+        result = np.where(m2 == 0, 0, result)
         result[count < 3] = np.nan
         return result
     else:
-        result = 0 if B == 0 else result
+        result = 0 if m2 == 0 else result
         if count < 3:
             return np.nan
         return result
 
 
 @disallow('M8', 'm8')
 def nankurt(values, axis=None, skipna=True):
+    """ Compute the sample skewness.
+
+    The statistic computed here is the adjusted Fisher-Pearson standardized
+    moment coefficient G2, computed directly from the second and fourth
+    central moment.
 
+    """
     mask = isnull(values)
     if not is_float_dtype(values.dtype):
         values = values.astype('f8')
@@ -523,30 +546,43 @@ def nankurt(values, axis=None, skipna=True):
         values = values.copy()
         np.putmask(values, mask, 0)
 
-    typ = values.dtype.type
-    A = values.sum(axis) / count
-    B = (values**2).sum(axis) / count - A**typ(2)
-    C = (values**3).sum(axis) / count - A**typ(3) - typ(3) * A * B
-    D = ((values**4).sum(axis) / count - A**typ(4) -
-         typ(6) * B * A * A - typ(4) * C * A)
+    mean = values.sum(axis, dtype=np.float64) / count
+    if axis is not None:
+        mean = np.expand_dims(mean, axis)
+
+    adjusted = values - mean
+    if skipna:
+        np.putmask(adjusted, mask, 0)
+    adjusted2 = adjusted ** 2
+    adjusted4 = adjusted2 ** 2
+    m2 = adjusted2.sum(axis, dtype=np.float64)
+    m4 = adjusted4.sum(axis, dtype=np.float64)
+
+    adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
+    numer = count * (count + 1) * (count - 1) * m4
+    denom = (count - 2) * (count - 3) * m2**2
+    result = numer / denom - adj
 
-    B = _zero_out_fperr(B)
-    D = _zero_out_fperr(D)
+    # floating point error
+    numer = _zero_out_fperr(numer)
+    denom = _zero_out_fperr(denom)
 
-    if not isinstance(B, np.ndarray):
-        # if B is a scalar, check these corner cases first before doing
-        # division
+    if not isinstance(denom, np.ndarray):
+        # if ``denom`` is a scalar, check these corner cases first before
+        # doing division
         if count < 4:
             return np.nan
-        if B == 0:
+        if denom == 0:
             return 0
 
-    result = (((count * count - typ(1)) * D / (B * B) - typ(3) *
-               ((count - typ(1))**typ(2))) / ((count - typ(2)) *
-                                              (count - typ(3))))
+    result = numer / denom - adj
+
+    dtype = values.dtype
+    if is_float_dtype(dtype):
+        result = result.astype(dtype)
 
     if isinstance(result, np.ndarray):
-        result = np.where(B == 0, 0, result)
+        result = np.where(denom == 0, 0, result)
         result[count < 4] = np.nan
 
     return result
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -888,6 +888,106 @@ def prng(self):
         return np.random.RandomState(1234)
 
 
+class TestNanskewFixedValues(tm.TestCase):
+
+    # xref GH 11974
+
+    def setUp(self):
+        # Test data + skewness value (computed with scipy.stats.skew)
+        self.samples = np.sin(np.linspace(0, 1, 200))
+        self.actual_skew = -0.1875895205961754
+
+    def test_constant_series(self):
+        # xref GH 11974
+        for val in [3075.2, 3075.3, 3075.5]:
+            data = val * np.ones(300)
+            skew = nanops.nanskew(data)
+            self.assertEqual(skew, 0.0)
+
+    def test_all_finite(self):
+        alpha, beta = 0.3, 0.1
+        left_tailed = self.prng.beta(alpha, beta, size=100)
+        self.assertLess(nanops.nanskew(left_tailed), 0)
+
+        alpha, beta = 0.1, 0.3
+        right_tailed = self.prng.beta(alpha, beta, size=100)
+        self.assertGreater(nanops.nanskew(right_tailed), 0)
+
+    def test_ground_truth(self):
+        skew = nanops.nanskew(self.samples)
+        self.assertAlmostEqual(skew, self.actual_skew)
+
+    def test_axis(self):
+        samples = np.vstack([self.samples,
+                             np.nan * np.ones(len(self.samples))])
+        skew = nanops.nanskew(samples, axis=1)
+        tm.assert_almost_equal(skew, [self.actual_skew, np.nan])
+
+    def test_nans(self):
+        samples = np.hstack([self.samples, np.nan])
+        skew = nanops.nanskew(samples, skipna=False)
+        self.assertTrue(np.isnan(skew))
+
+    def test_nans_skipna(self):
+        samples = np.hstack([self.samples, np.nan])
+        skew = nanops.nanskew(samples, skipna=True)
+        tm.assert_almost_equal(skew, self.actual_skew)
+
+    @property
+    def prng(self):
+        return np.random.RandomState(1234)
+
+
+class TestNankurtFixedValues(tm.TestCase):
+
+    # xref GH 11974
+
+    def setUp(self):
+        # Test data + kurtosis value (computed with scipy.stats.kurtosis)
+        self.samples = np.sin(np.linspace(0, 1, 200))
+        self.actual_kurt = -1.2058303433799713
+
+    def test_constant_series(self):
+        # xref GH 11974
+        for val in [3075.2, 3075.3, 3075.5]:
+            data = val * np.ones(300)
+            kurt = nanops.nankurt(data)
+            self.assertEqual(kurt, 0.0)
+
+    def test_all_finite(self):
+        alpha, beta = 0.3, 0.1
+        left_tailed = self.prng.beta(alpha, beta, size=100)
+        self.assertLess(nanops.nankurt(left_tailed), 0)
+
+        alpha, beta = 0.1, 0.3
+        right_tailed = self.prng.beta(alpha, beta, size=100)
+        self.assertGreater(nanops.nankurt(right_tailed), 0)
+
+    def test_ground_truth(self):
+        kurt = nanops.nankurt(self.samples)
+        self.assertAlmostEqual(kurt, self.actual_kurt)
+
+    def test_axis(self):
+        samples = np.vstack([self.samples,
+                             np.nan * np.ones(len(self.samples))])
+        kurt = nanops.nankurt(samples, axis=1)
+        tm.assert_almost_equal(kurt, [self.actual_kurt, np.nan])
+
+    def test_nans(self):
+        samples = np.hstack([self.samples, np.nan])
+        kurt = nanops.nankurt(samples, skipna=False)
+        self.assertTrue(np.isnan(kurt))
+
+    def test_nans_skipna(self):
+        samples = np.hstack([self.samples, np.nan])
+        kurt = nanops.nankurt(samples, skipna=True)
+        tm.assert_almost_equal(kurt, self.actual_kurt)
+
+    @property
+    def prng(self):
+        return np.random.RandomState(1234)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'