Skip to content

Commit ae5fe34

Browse files
authored
BUG: read_excel trailing blank rows and columns (#41227)
1 parent e4ee3d3 commit ae5fe34

File tree

9 files changed

+40
-7
lines changed

9 files changed

+40
-7
lines changed

doc/source/whatsnew/v1.3.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,7 @@ I/O
829829
- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
830830
- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
831831
- Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
832+
- Bug in :func:`read_excel` loading trailing empty rows/columns for some filetypes (:issue:`41167`)
832833
- Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`)
833834
- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
834835
- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`)

pandas/io/excel/_openpyxl.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -571,15 +571,18 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
571571
last_row_with_data = -1
572572
for row_number, row in enumerate(sheet.rows):
573573
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
574-
if not all(cell == "" for cell in converted_row):
574+
while converted_row and converted_row[-1] == "":
575+
# trim trailing empty elements
576+
converted_row.pop()
577+
if converted_row:
575578
last_row_with_data = row_number
576579
data.append(converted_row)
577580

578581
# Trim trailing empty rows
579582
data = data[: last_row_with_data + 1]
580583

581-
if self.book.read_only and len(data) > 0:
582-
# With dimension reset, openpyxl no longer pads rows
584+
if len(data) > 0:
585+
# extend rows to max width
583586
max_width = max(len(data_row) for data_row in data)
584587
if min(len(data_row) for data_row in data) < max_width:
585588
empty_cell: list[Scalar] = [""]

pandas/io/excel/_pyxlsb.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,27 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
7575
return cell.v
7676

7777
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
    """
    Read a pyxlsb worksheet into a rectangular list of rows.

    Trailing empty cells are trimmed from every row and trailing empty
    rows are dropped. The remaining rows are then right-padded with ""
    so that they all share the width of the widest row.

    Parameters
    ----------
    sheet
        Worksheet object exposing ``rows(sparse=True)``.
    convert_float : bool
        Forwarded unchanged to ``self._convert_cell`` for every cell.

    Returns
    -------
    list of list of Scalar
        One inner list per sheet row; empty if the sheet holds no data.
    """
    data: list[list[Scalar]] = []
    previous_row_number = -1
    # When sparse=True the rows can have different lengths and empty rows are
    # not returned. The cells are namedtuples of row, col, value (r, c, v).
    for row in sheet.rows(sparse=True):
        if not row:
            # A zero-length row has no cell to read the row index from and
            # carries no data, so there is nothing to record for it.
            continue
        row_number = row[0].r
        converted_row = [self._convert_cell(cell, convert_float) for cell in row]
        while converted_row and converted_row[-1] == "":
            # trim trailing empty elements
            converted_row.pop()
        if converted_row:
            # Re-create the rows skipped since the last row with data. They
            # are only added once a later non-empty row is seen, which is
            # what keeps trailing blank rows out of the result. Each gap row
            # is its own list so that no two rows alias each other.
            data.extend([] for _ in range(row_number - previous_row_number - 1))
            data.append(converted_row)
            previous_row_number = row_number
    if data:
        # extend rows to max_width
        max_width = max(len(data_row) for data_row in data)
        if min(len(data_row) for data_row in data) < max_width:
            empty_cell: list[Scalar] = [""]
            data = [
                data_row + (max_width - len(data_row)) * empty_cell
                for data_row in data
            ]
    return data
2.99 KB
Binary file not shown.
25 KB
Binary file not shown.
7.9 KB
Binary file not shown.
8.62 KB
Binary file not shown.
8.6 KB
Binary file not shown.

pandas/tests/io/excel/test_readers.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,6 +1205,15 @@ def test_multiheader_two_blank_lines(self, read_ext):
12051205
)
12061206
tm.assert_frame_equal(result, expected)
12071207

1208+
def test_trailing_blanks(self, read_ext):
    """
    Blank cells that hold no data must not be loaded.

    Some of our readers used to include such cells, which produced many
    empty rows and columns; the fixture is expected to load as 3 x 3.
    """
    frame = pd.read_excel("trailing_blanks" + read_ext)
    expected_shape = (3, 3)
    assert frame.shape == expected_shape
1216+
12081217

12091218
class TestExcelFileRead:
12101219
@pytest.fixture(autouse=True)

0 commit comments

Comments
 (0)