@@ -98,30 +98,33 @@ def _get_skiprows(skiprows):
9898 type (skiprows ).__name__ )
9999
100100
def _read(obj):
    """Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, or file-like
        A URL, an object exposing ``read()``, a path to an existing file,
        or the raw text itself.

    Returns
    -------
    raw_text : str or bytes
        Whatever ``read()`` yields for URLs and file-like objects, the raw
        bytes of the file when ``obj`` names an existing file (the file is
        opened in binary mode so decoding is left to the caller), or
        ``obj`` itself when it is a string that is not a file path.

    Raises
    ------
    TypeError
        If ``obj`` is not a URL, has no ``read`` attribute and is not a
        string.
    """
    if _is_url(obj):
        with urlopen(obj) as url:
            return url.read()

    if hasattr(obj, 'read'):
        return obj.read()

    if not isinstance(obj, string_types):
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)

    # A string is either a path or the markup itself.  Probing the
    # filesystem with raw markup can fail instead of returning False:
    # ``os.path.isfile`` may raise TypeError or, on some Python versions,
    # ValueError (e.g. an embedded NUL character).  Either way the string
    # is not a usable path, so it is treated as the text to parse.
    try:
        is_file = os.path.isfile(obj)
    except (TypeError, ValueError):
        is_file = False

    if is_file:
        with open(obj, 'rb') as f:
            return f.read()
    return obj
125128
126129
127130class _HtmlFrameParser (object ):
@@ -165,10 +168,11 @@ class _HtmlFrameParser(object):
165168 See each method's respective documentation for details on their
166169 functionality.
167170 """
168- def __init__ (self , io , match , attrs ):
171+ def __init__ (self , io , match , attrs , encoding ):
169172 self .io = io
170173 self .match = match
171174 self .attrs = attrs
175+ self .encoding = encoding
172176
173177 def parse_tables (self ):
174178 tables = self ._parse_tables (self ._build_doc (), self .match , self .attrs )
@@ -422,7 +426,8 @@ def _setup_build_doc(self):
422426
def _build_doc(self):
    """Build the BeautifulSoup document for this parser.

    The markup comes from ``self._setup_build_doc()`` and is parsed with
    the ``'html5lib'`` feature set; ``self.encoding`` is forwarded as
    ``from_encoding``.

    Returns
    -------
    BeautifulSoup
        The parsed document.
    """
    # Imported lazily, as in the original, so bs4 is only needed when
    # this flavor is actually used.
    from bs4 import BeautifulSoup

    markup = self._setup_build_doc()
    return BeautifulSoup(markup, features='html5lib',
                         from_encoding=self.encoding)
426431
427432
428433def _build_xpath_expr (attrs ):
@@ -519,7 +524,7 @@ def _build_doc(self):
519524 from lxml .html import parse , fromstring , HTMLParser
520525 from lxml .etree import XMLSyntaxError
521526
522- parser = HTMLParser (recover = False )
527+ parser = HTMLParser (recover = False , encoding = self . encoding )
523528
524529 try :
525530 # try to parse the input in the simplest way
@@ -689,15 +694,15 @@ def _validate_flavor(flavor):
689694
690695
691696def _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
692- parse_dates , tupleize_cols , thousands , attrs ):
697+ parse_dates , tupleize_cols , thousands , attrs , encoding ):
693698 flavor = _validate_flavor (flavor )
694699 compiled_match = re .compile (match ) # you can pass a compiled regex here
695700
696701 # hack around python 3 deleting the exception variable
697702 retained = None
698703 for flav in flavor :
699704 parser = _parser_dispatch (flav )
700- p = parser (io , compiled_match , attrs )
705+ p = parser (io , compiled_match , attrs , encoding )
701706
702707 try :
703708 tables = p .parse_tables ()
@@ -715,7 +720,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
715720
716721def read_html (io , match = '.+' , flavor = None , header = None , index_col = None ,
717722 skiprows = None , infer_types = None , attrs = None , parse_dates = False ,
718- tupleize_cols = False , thousands = ',' ):
723+ tupleize_cols = False , thousands = ',' , encoding = None ):
719724 r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
720725
721726 Parameters
@@ -792,6 +797,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
792797 thousands : str, optional
793798 Separator to use to parse thousands. Defaults to ``','``.
794799
800+ encoding : str or None, optional
801+ The encoding used to decode the web page. Defaults to ``None``. ``None``
802+ preserves the previous encoding behavior, which depends on the
803+ underlying parser library (e.g., the parser library will try to use
804+ the encoding provided by the document).
805+
795806 Returns
796807 -------
797808 dfs : list of DataFrames
@@ -837,4 +848,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
837848 raise ValueError ('cannot skip rows starting from the end of the '
838849 'data (you passed a negative value)' )
839850 return _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
840- parse_dates , tupleize_cols , thousands , attrs )
851+ parse_dates , tupleize_cols , thousands , attrs , encoding )
0 commit comments