3636from pandas .util ._decorators import Appender
3737from pandas .util ._decorators import deprecate_kwarg
3838
39- VALID_ENCODINGS = ('ascii' , 'us-ascii' , 'latin-1' , 'latin_1' , 'iso-8859-1' ,
39+ # Allowed encodings of Stata dta files. Preferred is first entry
40+ VALID_ENCODINGS = ('latin-1' , 'latin_1' , 'ascii' , 'us-ascii' , 'iso-8859-1' ,
4041 'iso8859-1' , '8859' , 'cp819' , 'latin' , 'latin1' , 'L1' )
4142
43+ VALID_ENCODINGS_118 = ('utf8' , 'utf-8' )
44+
4245_version_error = ("Version of given Stata file is not 104, 105, 108, "
4346 "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
4447 "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" )
169172
170173
171174@Appender (_read_stata_doc )
175+ @deprecate_kwarg (old_arg_name = 'encoding' , new_arg_name = None )
172176@deprecate_kwarg (old_arg_name = 'index' , new_arg_name = 'index_col' )
173177def read_stata (filepath_or_buffer , convert_dates = True ,
174178 convert_categoricals = True , encoding = None , index_col = None ,
@@ -182,7 +186,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182186 preserve_dtypes = preserve_dtypes ,
183187 columns = columns ,
184188 order_categoricals = order_categoricals ,
185- chunksize = chunksize , encoding = encoding )
189+ chunksize = chunksize )
186190
187191 if iterator or chunksize :
188192 data = reader
@@ -399,16 +403,19 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
399403 elif infer_dtype (dates ) == 'datetime' :
400404 if delta :
401405 delta = dates .values - stata_epoch
402- f = lambda x : \
403- US_PER_DAY * x .days + 1000000 * x .seconds + x .microseconds
406+
407+ def f (x ):
408+ return US_PER_DAY * x .days + \
409+ 1000000 * x .seconds + x .microseconds
404410 v = np .vectorize (f )
405411 d ['delta' ] = v (delta )
406412 if year :
407413 year_month = dates .apply (lambda x : 100 * x .year + x .month )
408414 d ['year' ] = year_month .values // 100
409415 d ['month' ] = (year_month .values - d ['year' ] * 100 )
410416 if days :
411- f = lambda x : (x - datetime .datetime (x .year , 1 , 1 )).days
417+ def f (x ):
418+ return (x - datetime .datetime (x .year , 1 , 1 )).days
412419 v = np .vectorize (f )
413420 d ['days' ] = v (dates )
414421 else :
@@ -838,7 +845,6 @@ def get_base_missing_value(cls, dtype):
838845
839846
840847class StataParser (object ):
841- _default_encoding = 'latin-1'
842848
843849 def __init__ (self , encoding ):
844850 if encoding is not None :
@@ -959,12 +965,13 @@ def __init__(self, encoding):
959965class StataReader (StataParser , BaseIterator ):
960966 __doc__ = _stata_reader_doc
961967
968+ @deprecate_kwarg (old_arg_name = 'encoding' , new_arg_name = None )
962969 @deprecate_kwarg (old_arg_name = 'index' , new_arg_name = 'index_col' )
963970 def __init__ (self , path_or_buf , convert_dates = True ,
964971 convert_categoricals = True , index_col = None ,
965972 convert_missing = False , preserve_dtypes = True ,
966973 columns = None , order_categoricals = True ,
967- encoding = 'latin-1' , chunksize = None ):
974+ encoding = None , chunksize = None ):
968975 super (StataReader , self ).__init__ (encoding )
969976 self .col_sizes = ()
970977
@@ -977,10 +984,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977984 self ._preserve_dtypes = preserve_dtypes
978985 self ._columns = columns
979986 self ._order_categoricals = order_categoricals
980- if encoding is not None :
981- if encoding not in VALID_ENCODINGS :
982- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
983- 'supported.' )
984987 self ._encoding = encoding
985988 self ._chunksize = chunksize
986989
@@ -998,18 +1001,13 @@ def __init__(self, path_or_buf, convert_dates=True,
9981001 path_or_buf = _stringify_path (path_or_buf )
9991002 if isinstance (path_or_buf , str ):
10001003 path_or_buf , encoding , _ , should_close = get_filepath_or_buffer (
1001- path_or_buf , encoding = self ._default_encoding
1002- )
1004+ path_or_buf )
10031005
10041006 if isinstance (path_or_buf , (str , text_type , bytes )):
10051007 self .path_or_buf = open (path_or_buf , 'rb' )
10061008 else :
10071009 # Copy to BytesIO, and ensure no encoding
10081010 contents = path_or_buf .read ()
1009- try :
1010- contents = contents .encode (self ._default_encoding )
1011- except :
1012- pass
10131011 self .path_or_buf = BytesIO (contents )
10141012
10151013 self ._read_header ()
@@ -1030,6 +1028,15 @@ def close(self):
10301028 except IOError :
10311029 pass
10321030
1031+ def _set_encoding (self ):
1032+ """
1033+ Check validity of user-set encoding set the default encoding
1034+ """
1035+ if self .format_version < 118 :
1036+ self ._encoding = 'latin-1'
1037+ else :
1038+ self ._encoding = 'utf-8'
1039+
10331040 def _read_header (self ):
10341041 first_char = self .path_or_buf .read (1 )
10351042 if struct .unpack ('c' , first_char )[0 ] == b'<' :
@@ -1049,6 +1056,7 @@ def _read_new_header(self, first_char):
10491056 self .format_version = int (self .path_or_buf .read (3 ))
10501057 if self .format_version not in [117 , 118 ]:
10511058 raise ValueError (_version_error )
1059+ self ._set_encoding ()
10521060 self .path_or_buf .read (21 ) # </release><byteorder>
10531061 self .byteorder = self .path_or_buf .read (3 ) == b'MSF' and '>' or '<'
10541062 self .path_or_buf .read (15 ) # </byteorder><K>
@@ -1235,6 +1243,7 @@ def _read_old_header(self, first_char):
12351243 self .format_version = struct .unpack ('b' , first_char )[0 ]
12361244 if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
12371245 raise ValueError (_version_error )
1246+ self ._set_encoding ()
12381247 self .byteorder = struct .unpack ('b' , self .path_or_buf .read (1 ))[
12391248 0 ] == 0x1 and '>' or '<'
12401249 self .filetype = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
@@ -1338,16 +1347,9 @@ def _decode(self, s):
13381347 return s .decode ('utf-8' )
13391348
13401349 def _null_terminate (self , s ):
1341- if compat .PY3 or self ._encoding is not None :
1342- # have bytes not strings, so must decode
1343- s = s .partition (b"\0 " )[0 ]
1344- return s .decode (self ._encoding or self ._default_encoding )
1345- else :
1346- null_byte = "\0 "
1347- try :
1348- return s .lstrip (null_byte )[:s .index (null_byte )]
1349- except :
1350- return s
1350+ # have bytes not strings, so must decode
1351+ s = s .partition (b"\0 " )[0 ]
1352+ return s .decode (self ._encoding )
13511353
13521354 def _read_value_labels (self ):
13531355 if self ._value_labels_read :
@@ -1433,10 +1435,7 @@ def _read_strls(self):
14331435 self .path_or_buf .read (4 ))[0 ]
14341436 va = self .path_or_buf .read (length )
14351437 if typ == 130 :
1436- encoding = 'utf-8'
1437- if self .format_version == 117 :
1438- encoding = self ._encoding or self ._default_encoding
1439- va = va [0 :- 1 ].decode (encoding )
1438+ va = va [0 :- 1 ].decode (self ._encoding )
14401439 # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411440 self .GSO [str (v_o )] = va
14421441
0 commit comments