From 1a35bedef8a72bfa464940081aea18949f2e8f73 Mon Sep 17 00:00:00 2001 From: John Jolly Date: Sat, 30 Dec 2017 11:07:51 -0700 Subject: [PATCH 1/4] bpo-28494: Improve zipfile.is_zipfile reliability The zipfile.is_zipfile function would only search for the EndOfZipfile section header. This failed to correctly identify non-zipfiles that contained this header. Now the zipfile.is_zipfile function verifies the first central directory entry. Changes: * Extended zipfile.is_zipfile to verify zipfile catalog * Added tests to validate failure of binary non-zipfiles --- Lib/test/test_zipfile/test_core.py | 19 +++++++++++++++++++ Lib/zipfile/__init__.py | 14 ++++++++++++-- .../2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst | 13 +++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 43056978848c03..e93603998f979e 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1991,6 +1991,25 @@ def test_is_zip_erroneous_file(self): self.assertFalse(zipfile.is_zipfile(fp)) fp.seek(0, 0) self.assertFalse(zipfile.is_zipfile(fp)) + # - passing non-zipfile with ZIP header elements + # data created using pyPNG like so: + # d = [(ord('P'), ord('K'), 5, 6), (ord('P'), ord('K'), 6, 6)] + # w = png.Writer(1,2,alpha=True,compression=0) + # f = open('onepix.png', 'wb') + # w.write(f, d) + # w.close() + data = (b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00" + b"\x00\x02\x08\x06\x00\x00\x00\x99\x81\xb6'\x00\x00\x00\x15I" + b"DATx\x01\x01\n\x00\xf5\xff\x00PK\x05\x06\x00PK\x06\x06\x07" + b"\xac\x01N\xc6|a\r\x00\x00\x00\x00IEND\xaeB`\x82") + # - passing a filename + with open(TESTFN, "wb") as fp: + fp.write(data) + self.assertFalse(zipfile.is_zipfile(TESTFN)) + # - passing a file-like object + fp = io.BytesIO() + fp.write(data) + self.assertFalse(zipfile.is_zipfile(fp)) def test_damaged_zipfile(self): """Check that zipfiles with missing bytes at the end raise BadZipFile.""" diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 894b4d37233923..9a233b7520faef 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -234,8 +234,18 @@ def strip(cls, data, xids): def _check_zipfile(fp): try: - if _EndRecData(fp): - return True # file has correct magic number + endrec = _EndRecData(fp) + if endrec: + if endrec[_ECD_ENTRIES_TOTAL] == 0 and endrec[_ECD_SIZE] == 0 and endrec[_ECD_OFFSET] == 0: + return True # Empty zipfiles are still zipfiles + elif endrec[_ECD_DISK_NUMBER] == endrec[_ECD_DISK_START]: + fp.seek(endrec[_ECD_OFFSET]) # Central directory is on the same disk + if fp.tell() == endrec[_ECD_OFFSET] and endrec[_ECD_SIZE] >= sizeCentralDir: + data = fp.read(sizeCentralDir) # CD is where we expect it to be + if len(data) == sizeCentralDir: + centdir = struct.unpack(structCentralDir, data) # CD is the right size + if centdir[_CD_SIGNATURE] == stringCentralDir: + return True # First central directory entry has correct magic number except OSError: pass return False diff --git a/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst new file mode 100644 index 00000000000000..d1aaf5d0de3bd8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst @@ -0,0 +1,13 @@ +Improve zipfile validation in `zipfile.is_zipfile`. + +Before this change `zipfile.is_zipfile()` only checked the End Central Directory +signature. If the signature could be found in the last 64k of the file, +success! This produced false positives on any file with `'PK\x05\x06'` in the +last 64k of the file - including PDFs and PNGs. + +This is now corrected by actually validating the Central Directory location +and size based on the information provided by the End Central Directory +along with verifying the Central Directory signature of the first entry. + +This should be sufficient for the vast number of zipfiles with fewer +false positives. From c981205cd7403bb5fba44c9628a50afd2718276d Mon Sep 17 00:00:00 2001 From: Tim Hatch Date: Mon, 19 May 2025 08:21:08 -0700 Subject: [PATCH 2/4] Reuse 'concat' handling for is_zipfile --- Lib/zipfile/__init__.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 9a233b7520faef..18caeb3e04a2b5 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -239,8 +239,9 @@ def _check_zipfile(fp): if endrec[_ECD_ENTRIES_TOTAL] == 0 and endrec[_ECD_SIZE] == 0 and endrec[_ECD_OFFSET] == 0: return True # Empty zipfiles are still zipfiles elif endrec[_ECD_DISK_NUMBER] == endrec[_ECD_DISK_START]: - fp.seek(endrec[_ECD_OFFSET]) # Central directory is on the same disk - if fp.tell() == endrec[_ECD_OFFSET] and endrec[_ECD_SIZE] >= sizeCentralDir: + # Central directory is on the same disk + fp.seek(sum(_handle_prepended_data(endrec))) + if endrec[_ECD_SIZE] >= sizeCentralDir: data = fp.read(sizeCentralDir) # CD is where we expect it to be if len(data) == sizeCentralDir: centdir = struct.unpack(structCentralDir, data) # CD is the right size @@ -268,6 +269,22 @@ def is_zipfile(filename): pass return result +def _handle_prepended_data(endrec, debug=0): + size_cd = endrec[_ECD_SIZE] # bytes in central directory + offset_cd = endrec[_ECD_OFFSET] # offset of central directory + + # "concat" is zero, unless zip was concatenated to another file + concat = endrec[_ECD_LOCATION] - size_cd - offset_cd + if endrec[_ECD_SIGNATURE] == stringEndArchive64: + # If Zip64 extension structures are present, account for them + concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) + + if debug > 2: + inferred = concat + offset_cd + print("given, inferred, offset", offset_cd, inferred, concat) + + return offset_cd, concat + def _EndRecData64(fpin, offset, endrec): """ Read the ZIP64 end-of-archive records and use that to update endrec @@ -1511,28 +1528,21 @@ def _RealGetContents(self): raise BadZipFile("File is not a zip file") if self.debug > 1: print(endrec) - size_cd = endrec[_ECD_SIZE] # bytes in central directory - offset_cd = endrec[_ECD_OFFSET] # offset of central directory self._comment = endrec[_ECD_COMMENT] # archive comment - # "concat" is zero, unless zip was concatenated to another file - concat = endrec[_ECD_LOCATION] - size_cd - offset_cd - if endrec[_ECD_SIGNATURE] == stringEndArchive64: - # If Zip64 extension structures are present, account for them - concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) + offset_cd, concat = _handle_prepended_data(endrec, self.debug) + + # self.start_dir: Position of start of central directory + self.start_dir = offset_cd + concat # store the offset to the beginning of data for the # .data_offset property self._data_offset = concat - if self.debug > 2: - inferred = concat + offset_cd - print("given, inferred, offset", offset_cd, inferred, concat) - # self.start_dir: Position of start of central directory - self.start_dir = offset_cd + concat if self.start_dir < 0: raise BadZipFile("Bad offset for central directory") fp.seek(self.start_dir, 0) + size_cd = endrec[_ECD_SIZE] data = fp.read(size_cd) fp = io.BytesIO(data) total = 0 From 4bfec3e29d339231116cecb9c16b9dfca03c1110 Mon Sep 17 00:00:00 2001 From: Tim Hatch Date: Mon, 19 May 2025 11:23:53 -0700 Subject: [PATCH 3/4] Improve NEWS entry --- .../Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst index d1aaf5d0de3bd8..ad6066fbf65a1e 100644 --- a/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst +++ b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst @@ -1,13 +1,13 @@ -Improve zipfile validation in `zipfile.is_zipfile`. +Improve Zip file validation in :func:`zipfile.is_zipfile`. -Before this change `zipfile.is_zipfile()` only checked the End Central Directory +Before this change :func:`zipfile.is_zipfile` only checked the End Central Directory signature. If the signature could be found in the last 64k of the file, -success! This produced false positives on any file with `'PK\x05\x06'` in the +success! This produced false positives on any file with ``'PK\x05\x06'`` in the last 64k of the file - including PDFs and PNGs. This is now corrected by actually validating the Central Directory location -and size based on the information provided by the End Central Directory +and size based on the information provided by the End of Central Directory along with verifying the Central Directory signature of the first entry. -This should be sufficient for the vast number of zipfiles with fewer +This should be sufficient for the vast number of Zip files with fewer false positives. From c354a2f27c5880fc0caee0ee1bb7b66880e5f7d2 Mon Sep 17 00:00:00 2001 From: Tim Hatch Date: Mon, 19 May 2025 15:05:49 -0700 Subject: [PATCH 4/4] Shorten changelog entry --- .../2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst index ad6066fbf65a1e..0c518983770d5d 100644 --- a/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst +++ b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst @@ -1,13 +1 @@ -Improve Zip file validation in :func:`zipfile.is_zipfile`. - -Before this change :func:`zipfile.is_zipfile` only checked the End Central Directory -signature. If the signature could be found in the last 64k of the file, -success! This produced false positives on any file with ``'PK\x05\x06'`` in the -last 64k of the file - including PDFs and PNGs. - -This is now corrected by actually validating the Central Directory location -and size based on the information provided by the End of Central Directory -along with verifying the Central Directory signature of the first entry. - -This should be sufficient for the vast number of Zip files with fewer -false positives. +Improve Zip file validation false positive rate in :func:`zipfile.is_zipfile`.