From e35a25418e39f64a6ed27aa0bb39541f91495092 Mon Sep 17 00:00:00 2001 From: Kevin Hanselman Date: Thu, 30 Jan 2020 17:39:54 -0500 Subject: [PATCH] utils: more robust text file detection on checksum In the case where binary files have a large text header, the current checksum routine will treat said files as text files and normalize line-endings before performing the checksum. Not only is it dangerous to manipulate binary files like this, it also doubles the runtime of the checksum routine, as every block of data must be read twice. This patch makes the process for detecting text files more robust by increasing the number of bytes that DVC inspects to classify the file. --- dvc/istextfile.py | 20 ++++++++++++-------- dvc/utils/__init__.py | 9 ++++++--- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/dvc/istextfile.py b/dvc/istextfile.py index 2e10592d9c..fc2bc516f3 100644 --- a/dvc/istextfile.py +++ b/dvc/istextfile.py @@ -7,15 +7,11 @@ TEXT_CHARS = bytes(range(32, 127)) + b"\n\r\t\f\b" -def istextfile(fname, blocksize=512): - """ Uses heuristics to guess whether the given file is text or binary, - by reading a single block of bytes from the file. - If more than 30% of the chars in the block are non-text, or there - are NUL ('\x00') bytes in the block, assume this is a binary file. +def istext(block): + """ Uses heuristics to guess whether the given block of bytes is text or + binary. If more than 30% of the chars in the block are non-text, or there + are NUL ('\x00') bytes in the block, assume this is a binary file. 
""" - with open(fname, "rb") as fobj: - block = fobj.read(blocksize) - if not block: # An empty file is considered a valid text file return True @@ -28,3 +24,11 @@ def istextfile(fname, blocksize=512): # occurrences of TEXT_CHARS from the block nontext = block.translate(None, TEXT_CHARS) return float(len(nontext)) / len(block) <= 0.30 + + +def istextfile(fname, blocksize=2048): + """ Uses heuristics on the first 'blocksize' bytes in a file to guess + whether the given file is text or binary. + """ + with open(fname, "rb") as fobj: + return istext(fobj.read(blocksize)) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index bf02ecdb24..0011770435 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -32,13 +32,13 @@ def dos2unix(data): def file_md5(fname): """ get the (md5 hexdigest, md5 digest) of a file """ from dvc.progress import Tqdm - from dvc.istextfile import istextfile + from dvc.istextfile import istext fname = fspath_py35(fname) if os.path.exists(fname): hash_md5 = hashlib.md5() - binary = not istextfile(fname) + is_file_binary = None size = os.path.getsize(fname) no_progress_bar = True if size >= LARGE_FILE_SIZE: @@ -62,7 +62,10 @@ def file_md5(fname): if not data: break - if binary: + if is_file_binary is None: + is_file_binary = not istext(data) + + if is_file_binary: chunk = data else: chunk = dos2unix(data)