diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index 158fa1561eb30..97b7555d833f8 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -108,6 +108,9 @@ Enhancements
 - ``read_html`` now sports an ``encoding`` argument that is passed to the
   underlying parser library. You can use this to read non-ascii encoded web
   pages (:issue:`7323`).
 
+- ``read_excel`` now supports reading from URLs in the same way
+  that ``read_csv`` does. (:issue:`6809`)
+
 - Support for dateutil timezones, which can now be used in the same way as
   pytz timezones across pandas. (:issue:`4688`)
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
index 67107ee20b336..f81cf6502a0e6 100644
--- a/pandas/io/excel.py
+++ b/pandas/io/excel.py
@@ -10,6 +10,7 @@
 import numpy as np
 
 from pandas.io.parsers import TextParser
+from pandas.io.common import _is_url, _urlopen
 from pandas.tseries.period import Period
 from pandas import json
 from pandas.compat import map, zip, reduce, range, lrange, u, add_metaclass
@@ -56,8 +57,10 @@ def read_excel(io, sheetname=0, **kwds):
 
     Parameters
     ----------
-    io : string, file-like object or xlrd workbook
-        If a string, expected to be a path to xls or xlsx file
+    io : string, file-like object, or xlrd workbook
+        The string could be a URL. Valid URL schemes include http, ftp, s3,
+        and file. For file URLs, a host is expected. For instance, a local
+        file could be file://localhost/path/to/workbook.xlsx
     sheetname : string or int, default 0
         Name of Excel sheet or the page number of the sheet
     header : int, default 0
@@ -98,6 +101,7 @@ def read_excel(io, sheetname=0, **kwds):
     -------
     parsed : DataFrame
         DataFrame from the passed in Excel file
+
     """
     if 'kind' in kwds:
         kwds.pop('kind')
@@ -139,11 +143,22 @@ def __init__(self, io, **kwds):
             raise ValueError("Unknown engine: %s" % engine)
 
         if isinstance(io, compat.string_types):
-            self.book = xlrd.open_workbook(io)
-        elif engine == "xlrd" and isinstance(io, xlrd.Book):
+            if _is_url(io):
+                # Read the whole workbook into memory, then release the
+                # handle even if the read itself raises.
+                url_handle = _urlopen(io)
+                try:
+                    data = url_handle.read()
+                finally:
+                    url_handle.close()
+                self.book = xlrd.open_workbook(file_contents=data)
+            else:
+                self.book = xlrd.open_workbook(io)
+        elif engine == 'xlrd' and isinstance(io, xlrd.Book):
             self.book = io
-        elif hasattr(io, "read"):
-            data = io.read()
+        elif not isinstance(io, xlrd.Book) and hasattr(io, "read"):
+            # N.B. xlrd.Book has a read attribute too
+            data = io.read()
             self.book = xlrd.open_workbook(file_contents=data)
         else:
             raise ValueError('Must explicitly set engine if not passing in'
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
index b45897dff9aa2..96db535347921 100644
--- a/pandas/io/tests/test_excel.py
+++ b/pandas/io/tests/test_excel.py
@@ -2,6 +2,8 @@
 
 from pandas.compat import u, range, map, openpyxl_compat
 from datetime import datetime, date, time
+import sys
 import os
+import platform
 from distutils.version import LooseVersion
 
@@ -11,6 +13,7 @@
 
 from numpy import nan
 import numpy as np
+from numpy.testing.decorators import slow
 
 from pandas import DataFrame, Index, MultiIndex
 from pandas.io.parsers import read_csv
@@ -18,6 +21,7 @@
     ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _OpenpyxlWriter,
     register_writer, _XlsxWriter
 )
+from pandas.io.common import URLError
 from pandas.util.testing import ensure_clean
 from pandas.core.config import set_option, get_option
 import pandas.util.testing as tm
@@ -280,6 +284,39 @@ def test_read_xlrd_Book(self):
         result = read_excel(book, sheetname="SheetA", engine="xlrd")
         tm.assert_frame_equal(df, result)
 
+    @tm.network
+    def test_read_from_http_url(self):
+        _skip_if_no_xlrd()
+
+        # the remote workbook is expected to match the local test.xlsx
+        url = ('https://raw.github.com/pydata/pandas/master/'
+               'pandas/io/tests/data/test.xlsx')
+        url_table = read_excel(url)
+        dirpath = tm.get_data_path()
+        localtable = os.path.join(dirpath, 'test.xlsx')
+        local_table = read_excel(localtable)
+        tm.assert_frame_equal(url_table, local_table)
+
+    @slow
+    def test_read_from_file_url(self):
+        _skip_if_no_xlrd()
+
+        # FILE
+        if sys.version_info[:2] < (2, 6):
+            raise nose.SkipTest("file:// not supported with Python < 2.6")
+        dirpath = tm.get_data_path()
+        localtable = os.path.join(dirpath, 'test.xlsx')
+        local_table = read_excel(localtable)
+
+        try:
+            url_table = read_excel('file://localhost/' + localtable)
+        except URLError:
+            # fails on some systems
+            raise nose.SkipTest("failing on %s" %
+                                ' '.join(platform.uname()).strip())
+
+        tm.assert_frame_equal(url_table, local_table)
+
     def test_xlsx_table(self):
         _skip_if_no_xlrd()
         _skip_if_no_openpyxl()