BUG: GH13219 Fixed. Allow unicode values in usecols

hassanshamim · hassanshamim · commit c30eeb52193d · 2016-05-31T14:51:53.000-07:00
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -372,3 +372,4 @@ Bug Fixes
 
 
 - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
+- Bug in ``pd.read_csv()`` that prevents the ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -882,12 +882,13 @@ def _validate_usecols_arg(usecols):
     or strings (column by name). Raises a ValueError
     if that is not the case.
     """
+    msg = ("The elements of 'usecols' must "
+           "either be all strings, all unicode, or all integers")
+
     if usecols is not None:
         usecols_dtype = lib.infer_dtype(usecols)
-        if usecols_dtype not in ('integer', 'string'):
-            raise ValueError(("The elements of 'usecols' "
-                              "must either be all strings "
-                              "or all integers"))
+        if usecols_dtype not in ('integer', 'string', 'unicode'):
+            raise ValueError(msg)
 
     return usecols
 
diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py
@@ -6,6 +6,7 @@
 """
 
 from datetime import datetime
+import nose
 
 import pandas.util.testing as tm
 
@@ -22,9 +23,8 @@ def test_raise_on_mixed_dtype_usecols(self):
         1000,2000,3000
         4000,5000,6000
         """
-        msg = ("The elements of \'usecols\' "
-               "must either be all strings "
-               "or all integers")
+        msg = ("The elements of 'usecols' must "
+               "either be all strings, all unicode, or all integers")
         usecols = [0, 'b', 2]
 
         with tm.assertRaisesRegexp(ValueError, msg):
@@ -254,3 +254,103 @@ def test_usecols_with_parse_dates_and_usecol_names(self):
                            usecols=[3, 0, 2],
                            parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_unicode_strings(self):
+        # see gh-13219
+
+        s = '''AAA,BBB,CCC,DDD
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        data = {
+            'AAA': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'BBB': {0: 8, 1: 2, 2: 7}
+        }
+        expected = DataFrame(data)
+
+        df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB'])
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_single_byte_unicode_strings(self):
+        # see gh-13219
+
+        s = '''A,B,C,D
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        data = {
+            'A': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'B': {0: 8, 1: 2, 2: 7}
+        }
+        expected = DataFrame(data)
+
+        df = self.read_csv(StringIO(s), usecols=[u'A', u'B'])
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_mixed_encoding_strings(self):
+        s = '''AAA,BBB,CCC,DDD
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        msg = ("The elements of 'usecols' must "
+               "either be all strings, all unicode, or all integers")
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])
+
+    def test_usecols_with_multibyte_characters(self):
+        s = '''あああ,いい,ううう,ええええ
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+        data = {
+            'あああ': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'いい': {0: 8, 1: 2, 2: 7}
+        }
+        expected = DataFrame(data)
+
+        df = self.read_csv(StringIO(s), usecols=['あああ', 'いい'])
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_multibyte_unicode_characters(self):
+        raise nose.SkipTest('TODO: see gh-13253')
+
+        s = '''あああ,いい,ううう,ええええ
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+        data = {
+            'あああ': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'いい': {0: 8, 1: 2, 2: 7}
+        }
+        expected = DataFrame(data)
+
+        df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい'])
+        tm.assert_frame_equal(df, expected)

Original file line number	Diff line number	Diff line change
`@@ -372,3 +372,4 @@ Bug Fixes`
`372`	`372`
`373`	`373`
`374`	`374`	- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
	`375`	+- Bug in ``pd.read_csv()`` that prevents the ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`)