3636from pandas .util ._decorators import Appender
3737from pandas .util ._decorators import deprecate_kwarg
3838
39- VALID_ENCODINGS = ('ascii' , 'us-ascii' , 'latin-1' , 'latin_1' , 'iso-8859-1' ,
40- 'iso8859-1' , '8859' , 'cp819' , 'latin' , 'latin1' , 'L1' )
41-
4239_version_error = ("Version of given Stata file is not 104, 105, 108, "
4340 "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
4441 "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" )
169166
170167
171168@Appender (_read_stata_doc )
169+ @deprecate_kwarg (old_arg_name = 'encoding' , new_arg_name = None )
172170@deprecate_kwarg (old_arg_name = 'index' , new_arg_name = 'index_col' )
173171def read_stata (filepath_or_buffer , convert_dates = True ,
174172 convert_categoricals = True , encoding = None , index_col = None ,
@@ -182,7 +180,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182180 preserve_dtypes = preserve_dtypes ,
183181 columns = columns ,
184182 order_categoricals = order_categoricals ,
185- chunksize = chunksize , encoding = encoding )
183+ chunksize = chunksize )
186184
187185 if iterator or chunksize :
188186 data = reader
@@ -838,15 +836,8 @@ def get_base_missing_value(cls, dtype):
838836
839837
840838class StataParser (object ):
841- _default_encoding = 'latin-1'
842-
843- def __init__ (self , encoding ):
844- if encoding is not None :
845- if encoding not in VALID_ENCODINGS :
846- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
847- 'supported.' )
848839
849- self . _encoding = encoding
840+ def __init__ ( self ):
850841
851842 # type code.
852843 # --------------------
@@ -959,12 +950,13 @@ def __init__(self, encoding):
959950class StataReader (StataParser , BaseIterator ):
960951 __doc__ = _stata_reader_doc
961952
953+ @deprecate_kwarg (old_arg_name = 'encoding' , new_arg_name = None )
962954 @deprecate_kwarg (old_arg_name = 'index' , new_arg_name = 'index_col' )
963955 def __init__ (self , path_or_buf , convert_dates = True ,
964956 convert_categoricals = True , index_col = None ,
965957 convert_missing = False , preserve_dtypes = True ,
966958 columns = None , order_categoricals = True ,
967- encoding = 'latin-1' , chunksize = None ):
959+ encoding = None , chunksize = None ):
968960 super (StataReader , self ).__init__ (encoding )
969961 self .col_sizes = ()
970962
@@ -977,10 +969,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977969 self ._preserve_dtypes = preserve_dtypes
978970 self ._columns = columns
979971 self ._order_categoricals = order_categoricals
980- if encoding is not None :
981- if encoding not in VALID_ENCODINGS :
982- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
983- 'supported.' )
984972 self ._encoding = encoding
985973 self ._chunksize = chunksize
986974
@@ -998,18 +986,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998986 path_or_buf = _stringify_path (path_or_buf )
999987 if isinstance (path_or_buf , str ):
1000988 path_or_buf , encoding , _ , should_close = get_filepath_or_buffer (
1001- path_or_buf , encoding = self ._default_encoding
1002- )
989+ path_or_buf )
1003990
1004991 if isinstance (path_or_buf , (str , text_type , bytes )):
1005992 self .path_or_buf = open (path_or_buf , 'rb' )
1006993 else :
1007994 # Copy to BytesIO, and ensure no encoding
1008995 contents = path_or_buf .read ()
1009- try :
1010- contents = contents .encode (self ._default_encoding )
1011- except :
1012- pass
1013996 self .path_or_buf = BytesIO (contents )
1014997
1015998 self ._read_header ()
@@ -1030,6 +1013,15 @@ def close(self):
10301013 except IOError :
10311014 pass
10321015
1016+ def _set_encoding (self ):
1017+ """
1018+ Set string encoding which depends on file version
1019+ """
1020+ if self .format_version < 118 :
1021+ self ._encoding = 'latin-1'
1022+ else :
1023+ self ._encoding = 'utf-8'
1024+
10331025 def _read_header (self ):
10341026 first_char = self .path_or_buf .read (1 )
10351027 if struct .unpack ('c' , first_char )[0 ] == b'<' :
@@ -1049,6 +1041,7 @@ def _read_new_header(self, first_char):
10491041 self .format_version = int (self .path_or_buf .read (3 ))
10501042 if self .format_version not in [117 , 118 ]:
10511043 raise ValueError (_version_error )
1044+ self ._set_encoding ()
10521045 self .path_or_buf .read (21 ) # </release><byteorder>
10531046 self .byteorder = self .path_or_buf .read (3 ) == b'MSF' and '>' or '<'
10541047 self .path_or_buf .read (15 ) # </byteorder><K>
@@ -1235,6 +1228,7 @@ def _read_old_header(self, first_char):
12351228 self .format_version = struct .unpack ('b' , first_char )[0 ]
12361229 if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
12371230 raise ValueError (_version_error )
1231+ self ._set_encoding ()
12381232 self .byteorder = struct .unpack ('b' , self .path_or_buf .read (1 ))[
12391233 0 ] == 0x1 and '>' or '<'
12401234 self .filetype = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
@@ -1338,16 +1332,9 @@ def _decode(self, s):
13381332 return s .decode ('utf-8' )
13391333
13401334 def _null_terminate (self , s ):
1341- if compat .PY3 or self ._encoding is not None :
1342- # have bytes not strings, so must decode
1343- s = s .partition (b"\0 " )[0 ]
1344- return s .decode (self ._encoding or self ._default_encoding )
1345- else :
1346- null_byte = "\0 "
1347- try :
1348- return s .lstrip (null_byte )[:s .index (null_byte )]
1349- except :
1350- return s
1335+ # have bytes not strings, so must decode
1336+ s = s .partition (b"\0 " )[0 ]
1337+ return s .decode (self ._encoding )
13511338
13521339 def _read_value_labels (self ):
13531340 if self ._value_labels_read :
@@ -1433,10 +1420,7 @@ def _read_strls(self):
14331420 self .path_or_buf .read (4 ))[0 ]
14341421 va = self .path_or_buf .read (length )
14351422 if typ == 130 :
1436- encoding = 'utf-8'
1437- if self .format_version == 117 :
1438- encoding = self ._encoding or self ._default_encoding
1439- va = va [0 :- 1 ].decode (encoding )
1423+ va = va [0 :- 1 ].decode (self ._encoding )
14401424 # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411425 self .GSO [str (v_o )] = va
14421426
0 commit comments